In [57]:
import numpy as np

import pandas as pd

import sys, os

# Make the project's shared `utils` package importable from this notebook.
# `module_path` is the project root (two levels above the notebook directory).
module_path = os.path.abspath(os.path.join('../..'))
utils_path = os.path.join(module_path, "utils")
# Guard on the exact entry that is appended, so re-running this cell never
# adds the same directory to sys.path twice.
if utils_path not in sys.path:
    sys.path.append(utils_path)

from tqdm import tqdm

import semantic_term_placement

from collections import Counter
from bertopic import BERTopic
from sklearn.isotonic import IsotonicRegression

from collections import deque

from scipy import stats

from matplotlib import pyplot as plt
from sklearn.metrics import PrecisionRecallDisplay
import math


# Load Test Candidates

In [14]:
def load_test_data(file_path):
    """Read the tab-separated test-candidate file and split it into arrays.

    The file must provide the columns ``name``, ``top_terms`` and ``Label``.
    The per-label sample counts are printed as a side effect.

    Parameters
    ----------
    file_path : str
        Location of the TSV file.

    Returns
    -------
    tuple of numpy.ndarray
        ``(names, contexts, labels)`` taken from the ``name``, ``top_terms``
        and ``Label`` columns, in file order.
    """
    frame = pd.read_csv(file_path, sep='\t')

    names = frame["name"].to_numpy()
    contexts = frame["top_terms"].to_numpy()
    labels = frame["Label"].to_numpy()

    # Show how many candidates carry each label.
    print(Counter(labels))
    return names, contexts, labels


def generate_binary_label(test_candidates, lsf_labels=None):
    """Collapse multi-class candidate labels into binary LSF / non-LSF targets.

    Parameters
    ----------
    test_candidates : iterable
        The class label of every test candidate.
    lsf_labels : container, optional
        Labels that count as LSF. Defaults to the integers 0..8; every other
        label (e.g. -1) is treated as non-LSF.

    Returns
    -------
    binary_labels : list of int
        1 for LSF samples and 0 for non-LSF samples, in input order.
    ratio : int
        ``ceil(number of non-LSF samples / number of LSF samples)``.

    Raises
    ------
    ValueError
        If no candidate carries an LSF label, because the negative-to-positive
        ratio is undefined in that case.
    """
    if lsf_labels is None:
        lsf_labels = tuple(range(9))

    # assign 1 to LSF samples and 0 to non-LSF samples
    binary_labels = [1 if label in lsf_labels else 0 for label in test_candidates]

    # Only the class sizes are needed for the ratio, so count instead of
    # collecting the positions of every sample.
    n_lsf = sum(binary_labels)
    n_non_lsf = len(binary_labels) - n_lsf
    if n_lsf == 0:
        raise ValueError(
            "no LSF samples found: the non-LSF to LSF ratio is undefined"
        )

    ratio = math.ceil(n_non_lsf / n_lsf)
    return binary_labels, ratio



# Load the test candidates (names, keyword contexts, class labels); the label
# distribution is printed by load_test_data.
file_path='../../data/test/test.tsv'
test_candidates_names,test_candidates_context,test_labels=load_test_data(file_path)
# binary_labels: 1 = LSF, 0 = non-LSF; ratio = ceil(#non-LSF / #LSF) of the test set.
binary_labels,ratio=generate_binary_label(test_labels)

Counter({-1: 110, 1: 15, 3: 9, 0: 3, 2: 3, 5: 2, 8: 2, 6: 1})


# Predict using Annoy Index

### Create Annoy Index

* Create two indices


    * Context around each sample is generated by the "Tagger"
    * "top_terms" are pre-generated keywords for each sample, extracted from the context around each sample using "KeyBERT"
    * The "label" of a candidate is copied from an existing LSF or non-LSF sample if the candidate has a close neighbour among them; otherwise the candidate remains unlabeled (-1 for BERTopic)

In [4]:
# Reference samples used to build the two nearest-neighbour indices.
df_context_all = pd.read_csv('../../data/Final_Context.tsv', sep='\t')

# Rows with serial < 200000 feed the LSF index; rows with
# 200000 <= serial < 300000 feed the non-LSF index.
is_lsf_row = df_context_all["serial"] < 200000
is_non_lsf_row = (df_context_all["serial"] >= 200000) & (df_context_all["serial"] < 300000)

lsf_rows = df_context_all[is_lsf_row]
non_lsf_rows = df_context_all[is_non_lsf_row]

index_lsf_names = lsf_rows["name"].tolist()
index_lsf_labels = lsf_rows["label"].tolist()

index_non_lsf_names = non_lsf_rows["name"].tolist()
index_non_lsf_labels = non_lsf_rows["label"].tolist()

# One Annoy index per group, built from the sample names.
index_lsf = semantic_term_placement.build_annoy_index(index_lsf_names)
index_non_lsf = semantic_term_placement.build_annoy_index(index_non_lsf_names)

Building the index with 100 trees...
Index is successfully built.
Building the index with 100 trees...
Index is successfully built.


In [15]:
def predict_by_KNN(candidates, index_lsf, index_lsf_names, index_non_lsf, index_non_lsf_names, num_matches=3):
    """Score each candidate by its proximity to the LSF vs. the non-LSF index.

    For every candidate name both indices are queried for their
    ``num_matches`` nearest neighbours and the returned distances are averaged
    per index. The two mean distances ``d_lsf`` and ``d_non`` are combined with
    inverse-squared-distance weighting::

        p = (1/d_lsf**2) / (1/d_lsf**2 + 1/d_non**2)
          = d_non**2 / (d_lsf**2 + d_non**2)

    The second form is the one evaluated, because it stays finite when one of
    the mean distances is 0 (an exact match in an index); the first form
    divides by zero there and ends up producing NaN.

    Parameters
    ----------
    candidates : iterable of str
        Candidate names to score.
    index_lsf, index_non_lsf
        Indices handed to ``semantic_term_placement.find_neighbors``.
    index_lsf_names, index_non_lsf_names : list
        Names belonging to the respective index.
    num_matches : int, optional
        Number of neighbours requested from each index (default 3).

    Returns
    -------
    list of float
        One LSF probability per candidate, in input order. 0.5 is returned
        when both mean distances are 0, i.e. the candidate is equally close to
        both groups.
    """
    probs_index = []
    for name in candidates:
        # find_neighbors returns a pair; only the second element (distances) is used.
        _, distance_lsf = semantic_term_placement.find_neighbors(
            index_lsf, index_lsf_names, query_name=name, num_matches=num_matches)
        _, distance_non_lsf = semantic_term_placement.find_neighbors(
            index_non_lsf, index_non_lsf_names, query_name=name, num_matches=num_matches)

        sq_lsf = np.mean(distance_lsf) ** 2
        sq_non_lsf = np.mean(distance_non_lsf) ** 2
        total = sq_lsf + sq_non_lsf

        if total == 0:
            # Exact match in both indices: no evidence either way.
            probs_index.append(0.5)
        else:
            probs_index.append(sq_non_lsf / total)
    return probs_index

probs_KNN=predict_by_KNN(test_candidates_names,index_lsf,index_lsf_names,index_non_lsf,index_non_lsf_names)


# Predict using BERTopic model

In [20]:
%%capture

def load_and_predict_by_BERTopic_model(model_path, test_candidates_context, lsf_topics=None):
    """Load a saved BERTopic model and score each context as LSF / non-LSF.

    Each document is passed to ``BERTopic_model.transform`` on its own. Its
    LSF score is the sum of the probabilities of the LSF topics; when the
    assigned topic is itself an LSF topic, the probability mass not covered by
    any topic (``1 - sum(all topic probabilities)``) is added as well.

    Parameters
    ----------
    model_path : str
        Location of the saved BERTopic model.
    test_candidates_context : iterable of str
        One context (keyword string) per test candidate.
    lsf_topics : sequence of int, optional
        Topic ids that represent LSF concepts. The default list was selected
        manually for one specific trained model and must be revised whenever
        the model is retrained.

    Returns
    -------
    BERTopic_model
        The loaded model.
    predicted_probs : numpy.ndarray
        One LSF score per input context, in input order.
    """
    if lsf_topics is None:
        # these topics are selected manually and should be changed if the model is retrained
        lsf_topics = [3,4,6,8,9,10,13,14,17,18,19,20,21,23,25,27,30,31,33,36,39,40,45]
    lsf_topics = list(lsf_topics)

    BERTopic_model = BERTopic.load(model_path)

    # Predict test samples
    predicted_probs = []
    for doc in test_candidates_context:
        topics, probs = BERTopic_model.transform(doc)
        # NOTE(review): assumes `probs` is a 2-D array with one column per
        # topic id (row 0 belongs to this single document) -- confirm against
        # the settings the model was trained with.
        topic_probs = probs[0]

        prob = sum(topic_probs[lsf_topics])
        if topics[0] in lsf_topics:
            # Credit the unassigned (outlier) mass to the LSF side when the
            # document's assigned topic is an LSF topic.
            prob = prob + (1 - sum(topic_probs))
        predicted_probs.append(prob)

    return BERTopic_model, np.array(predicted_probs)


# Developer-local model location, kept for reference only (absolute path, not portable):
#model_path="/Users/dzq660/LOCAL/LSF_Ontology/Trained_Topic_Models/model_unsupervised_guided"
# Repo-relative location of the saved BERTopic model used by this notebook.
model_path='../../model/BERTopic_Model'
# Load the model and compute one LSF score per test-candidate context.
BERTopic_model,BERTopic_predictions=load_and_predict_by_BERTopic_model(model_path,test_candidates_context)



# Consensus of BERTopic and KNN using calibrated probabilities

* To calibrate the probabilities we have used approaches such as isotonic calibration and a manual function fit; here, for simplicity, a linear regression is provided

In [37]:

def prepare_data(labels,probs,treshold):
    """Collect the predicted positives, flagged as true or false positive.

    A sample is a predicted positive when ``probs[i] >= treshold``. It is a
    true positive when its label is 1 and a false positive when its label
    is 0; samples that are neither are dropped.

    Parameters
    ----------
    labels : sequence
        Binary ground-truth labels.
    probs : sequence of float
        Predicted score per sample, aligned with ``labels``.
    treshold : float
        Minimum score for a sample to count as predicted positive.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``TP`` and ``FP`` (flags stored as 0.0 / 1.0),
        sorted by ``score`` in descending order; the index holds the original
        sample positions.
    """
    true_pos = []
    false_pos = []
    for position, label in enumerate(labels):
        predicted_positive = probs[position] >= treshold
        true_pos.append(1.0 if (predicted_positive and label == 1) else 0.0)
        false_pos.append(1.0 if (predicted_positive and label == 0) else 0.0)

    table = pd.DataFrame({
        'score': probs,
        'TP': np.array(true_pos, dtype=float),
        'FP': np.array(false_pos, dtype=float),
    })

    # Keep only rows that were flagged as TP or FP, best score first.
    flagged = (table['TP'] == 1) | (table['FP'] == 1)
    return table[flagged].sort_values(by=['score'], ascending=False)


def produce_curve_data(labels,probs,treshold,window_size):
    """Build a (mean score, precision) curve with a sliding window.

    The predicted positives returned by ``prepare_data`` (sorted by score,
    highest first) are scanned with a window of ``window_size`` consecutive
    rows. Every time the window is full, one point is emitted: the average
    score of the rows in the window and the share of true positives among
    them.

    Parameters
    ----------
    labels, probs, treshold
        Forwarded unchanged to ``prepare_data``.
    window_size : int
        Number of consecutive rows per window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(score_av, precision)``, one entry per full window. Both arrays are
        empty when fewer than ``window_size`` rows are available.
    """
    ranked = prepare_data(labels, probs, treshold)

    window = deque(maxlen=window_size)
    window_filled = False

    # Running totals over the rows currently inside the window.
    running_score = 0
    running_tp = 0
    running_fp = 0

    mean_scores = []
    precisions = []

    for score_in, tp_in, fp_in in ranked.itertuples(index=False):
        # Once the window has been full, the oldest row leaves on this step.
        if window_filled and len(window) > 0:
            score_out, tp_out, fp_out = window[0]
        else:
            score_out = tp_out = fp_out = 0

        # Update the totals with the entering and the leaving row.
        running_score += score_in - score_out
        running_tp += tp_in - tp_out
        running_fp += fp_in - fp_out

        # Slide the window (the deque discards its oldest entry when full).
        window.append((score_in, tp_in, fp_in))

        if len(window) != window_size:
            continue

        mean_score = running_score / window_size
        flagged = running_tp + running_fp
        window_precision = 0 if flagged == 0 else running_tp / flagged

        mean_scores.append(mean_score)
        precisions.append(window_precision)
        window_filled = True

    return (np.array(mean_scores), np.array(precisions))


def calibrate_linear(x,y,x_test):
    """Map scores through a least-squares line fitted to ``(x, y)``.

    Parameters
    ----------
    x, y : array-like
        Points the line is fitted to.
    x_test : array-like or float
        Scores to transform.

    Returns
    -------
    ``slope * x_test + intercept`` of the fitted line. The result is not
    clipped, so it can fall outside [0, 1].
    """
    fit = stats.linregress(x, y)
    return fit.slope * x_test + fit.intercept



# Index (KNN) scores are calibrated with a linear fit; calibrated values below
# the floor are replaced by the floor. The default floor of 0.3 is the
# positive:negative ratio of the data set, as noted by the original authors.
def calibrate_Index(x,y,x_test,min_prob=0.3,max_prob=1.0):
    """Map scores through a least-squares line fitted to ``(x, y)`` and clamp them.

    Parameters
    ----------
    x, y : array-like
        Points the line is fitted to.
    x_test : array-like of float
        Scores to transform.
    min_prob : float, optional
        Lower bound applied to the calibrated values (default 0.3).
    max_prob : float, optional
        Upper bound applied to the calibrated values (default 1.0).

    Returns
    -------
    numpy.ndarray
        ``slope * x_test + intercept`` limited to ``[min_prob, max_prob]``.
    """
    # Linear regression model
    fit = stats.linregress(x, y)

    # Line of best fit, evaluated for all test scores at once and clamped.
    calibrated = fit.slope * np.asarray(x_test, dtype=float) + fit.intercept
    return np.clip(calibrated, min_prob, max_prob)


def consensus_index_context_calibrated(probs_index_callibrated,probs_context_callibrated):
    """Combine two calibrated probabilities as ``1 - (1 - p_index) * (1 - p_context)``.

    The result is the complement of the product of both complements, so it is
    at least as large as either input when both lie in [0, 1].
    """
    complement_index = 1 - probs_index_callibrated
    complement_context = 1 - probs_context_callibrated
    return 1 - complement_index * complement_context




In [54]:
# KNN branch: sliding-window (mean score, precision) curve over the candidates
# whose raw KNN score is >= 0.5, using windows of 50 consecutive candidates.
# NOTE(review): the curve is empty when fewer than 50 candidates pass the
# threshold, which leaves the fit below without data -- confirm the test set
# is large enough for this window size.
xData_KNN,yData_KNN=produce_curve_data(binary_labels,probs_KNN,0.5,window_size=50)

# Fit precision against score on that curve and map every raw KNN probability
# through the fitted line (calibrate_Index also clamps the result).
probs_KNN_calibrated=calibrate_Index(xData_KNN,yData_KNN,probs_KNN)

# BERTopic branch: same curve with threshold 0.0, followed by an unclamped linear fit.
xData_BERTopic,yData_BERToic=produce_curve_data(binary_labels,BERTopic_predictions,0.0,window_size=50)
probs_BERTopic_calibrated=calibrate_linear(xData_BERTopic,yData_BERToic, BERTopic_predictions)
# Consensus of both calibrated scores: 1 - (1 - p_KNN) * (1 - p_BERTopic).
probs_consensus_KNN_BERTOpic_calibrated=consensus_index_context_calibrated(probs_KNN_calibrated,probs_BERTopic_calibrated) 


In [59]:

# display = PrecisionRecallDisplay.from_predictions(binary_labels, probs_BERTopic_calibrated,ax= plt.gca(), name="BERTopic-Context")

# _ = PrecisionRecallDisplay.from_predictions(binary_labels, probs_consensus_KNN_BERTOpic_calibrated,ax= plt.gca(), name="Consensus",color='red')

# _ = PrecisionRecallDisplay.from_predictions(binary_labels, probs_KNN_calibrated,ax= plt.gca(), name="Index",color='green')



# _ = display.ax_.set_title("Precision-Recall curve - (Positive to Negative Ratio) < (1:" + str(round(ratio))+')')

# plt.legend(loc='upper right')
# #plt.legend(loc='lower left')
