# Co-training

1. We assume that only a portion of the samples are labeled; the rest are treated as unlabeled.

2. We only consider the binary case (i.e., a name that refers to exactly two authors).

3. When we apply 10-fold cross-validation with co-training, the first iteration of each fold serves as the baseline against which co-training is compared.

In [1]:
import os
import sys
import warnings

#warnings.filterwarnings('error')
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
np.set_printoptions(precision=4, suppress=True)

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import com_func

# ---------------------------------------------------------------------------
# Experiment configuration (module-level constants read by later cells).
# ---------------------------------------------------------------------------

# Threshold for selecting the set of name groups.
# NOTE(review): not used in the visible part of the notebook; presumably a
# minimum group size -- confirm against the cell that consumes it.
threshold_select_name_group = 100
# Bounds for the per-name-group sample threshold sweep: a later cell iterates
# range(threshold_lower, threshold_upper, 10), so with these values only the
# single threshold 100 is evaluated.
threshold_lower = 100
threshold_upper = 110

# NOTE(review): presumably toggles whether the threshold above is applied to
# the samples inside each name group; its use is outside the visible code.
apply_threshold_to_name_group_samples = True

# Text-embedding types to evaluate for view one; each entry is passed as
# `emb_type` to com_func.read_text_embedding.
# Other values tried previously: ["lsa", "pv_dm", "pv_dbow"]
pp_text = ["pv_dbow"]
# Citation-embedding type for view two; passed as `emb_type` to
# com_func.read_citation_embedding_sorted.
pp_citation = "n2v"

# Dataset name; used to build the input directory "../Data/<Dataset>/canopies_labeled/".
Dataset = "pubmed"

In [2]:
import numpy as np
import warnings
import matplotlib.pyplot as plt

from collections import defaultdict

# create co training classifier
class Co_training_clf(object):
    """Binary co-training over two feature views of the same samples.

    Two classifiers (one per view) are trained on the currently labeled
    samples.  In every round each classifier nominates up to ``p`` samples for
    the class in column 0 of ``predict_proba`` and up to ``n`` samples for the
    class in column 1, taken from a small pool ``U'`` of unlabeled samples.
    The nominated samples are added to the labeled set with the predicted
    label, the pool is refilled from the remaining unlabeled samples, and the
    process repeats for at most ``k`` rounds.

    Unlabeled samples are marked with the label ``-1``.
    """

    # Imported in the class body (and reached through ``self.copy``) so the
    # class does not depend on a module-level ``copy`` import from another
    # notebook cell.
    import copy

    def __init__(self, clf1, clf2=None, p=1, n=1, k=30, u=75):
        """Store the two base classifiers and the co-training parameters.

        Args:
            clf1: classifier for view one (needs ``fit``, ``predict``,
                ``predict_proba`` and a ``classes_`` attribute after fitting).
            clf2: classifier for view two; a deep copy of ``clf1`` when omitted.
            p: samples each classifier may nominate per round for class column 0.
            n: samples each classifier may nominate per round for class column 1.
            k: maximum number of co-training rounds.
            u: size of the unlabeled pool ``U'``.
        """
        self.clf1 = clf1
        # With a single estimator given, co-train two independent copies of it.
        self.clf2 = self.copy.deepcopy(clf1) if clf2 is None else clf2
        self.p = p
        self.n = n
        self.k = k
        self.u = u

    def init_L_U_U_prime(self, labels):
        """Split sample positions into labeled (L), unlabeled (U) and pool (U').

        Args:
            labels: sequence of labels; ``-1`` marks an unlabeled sample.

        Returns:
            ``(L, U, U_prime)`` -- lists of positions into ``labels``.  ``U`` is
            shuffled with the global ``random`` module and its last
            ``min(len(U), self.u)`` entries are moved into ``U_prime``.
        """
        L = [i for i, label_i in enumerate(labels) if label_i != -1]
        U = [i for i, label_i in enumerate(labels) if label_i == -1]
        print("Initial L size: ", len(L))
        print("Initial U size: ", len(U))
        # random drawing sample from U
        random.shuffle(U)
        # Use an explicit split position: slicing with ``U[-0:]`` would return
        # the whole list, i.e. u == 0 would move everything into the pool.
        pool_size = max(0, min(len(U), self.u))
        split = len(U) - pool_size
        U_prime = U[split:]
        U = U[:split]
        return L, U, U_prime

    def check_iter_label_mapping(self, iter_clf1, iter_clf2):
        """Verify both fitted classifiers use the same class order and store it.

        Both classifiers are trained on the same labels, so their ``classes_``
        should agree; the shared order is saved in ``self.class_`` because it
        maps ``predict_proba`` columns back to labels.  The process is
        terminated via ``sys.exit`` when the two orders differ.
        """
        dv1_class_label = iter_clf1.classes_
        dv2_class_label = iter_clf2.classes_
        # array_equal also handles arrays of different length, where an
        # element-wise ``==`` does not yield a per-element result.
        if np.array_equal(dv1_class_label, dv2_class_label):
            self.class_ = dv1_class_label
        else:
            sys.exit("Two view classifier label not mapping")

    def _pick_confident(self, proba, ranked_idx, class_pos, quota, tag):
        """Return up to ``quota`` pool rows, most confident first, with P(class) > 0.5."""
        picked = []
        for sample_idx in ranked_idx:
            if len(picked) >= quota:
                break
            if proba[sample_idx][class_pos] > 0.5:
                print(tag, sample_idx, " : ", proba[sample_idx])
                picked.append(sample_idx)
        return picked

    def label_p_n_samples(self, proba, rank):
        """Select the most confident pool samples for each of the two classes.

        Args:
            proba: per-sample class probabilities of the pool (rows = samples).
            rank: ``rank[c]`` lists pool row indices ordered by decreasing
                probability of class column ``c``.

        Returns:
            ``[positives, negatives]`` -- row indices into the pool; at most
            ``self.p`` entries for column 0 and ``self.n`` for column 1, each
            with a probability above 0.5.  Columns beyond the second are
            reported and skipped.
        """
        pool_size = len(proba)
        self_trained_labels = []
        for label, conf_measure in enumerate(rank):
            if label == 0:
                quota, tag = self.p, 'P: '
            elif label == 1:
                quota, tag = self.n, 'N: '
            else:
                print("Class label error")
                continue
            # Bounding the candidates by the pool size also makes an empty
            # pool yield an empty selection instead of an IndexError.
            self_trained_labels.append(
                self._pick_confident(proba, conf_measure[:pool_size], label, quota, tag))
        return self_trained_labels

    @staticmethod
    def _merge_view_picks(picks_dv1, picks_dv2):
        """Union both views' picks per class and drop samples claimed by two classes.

        Returns:
            ``(per_class, contested)`` -- ``per_class[c]`` is the sorted list of
            pool rows assigned to class column ``c``; ``contested`` is the
            sorted list of rows the views put into different classes.
        """
        merged = [{int(i) for i in a} | {int(i) for i in b}
                  for a, b in zip(picks_dv1, picks_dv2)]
        seen, contested = set(), set()
        for members in merged:
            contested |= seen & members
            seen |= members
        return [sorted(members - contested) for members in merged], sorted(contested)

    def get_self_labeled_sample(self):
        """Return the self-labeled sample positions collected by ``fit``.

        Returns:
            dict mapping each class label to a list with one entry per
            co-training round; each entry is the list of sample positions that
            received that label in the round.
        """
        return self.new_labeled_idx

    def plot_co_training_process(self, iterCount, data, iter_train_label, unlabeled_idx, new_sample_idx, plotSavingPath, name):
        """Save a 2-D PCA scatter plot of the current co-training state.

        Args:
            iterCount: round number shown in the title and file name.
            data: feature matrix of one view (all training samples).
            iter_train_label: pandas Series of the currently labeled samples;
                its index values are used as row positions into ``data``.
            unlabeled_idx: positions of the samples currently in the pool.
            new_sample_idx: positions of self-labeled samples to highlight.
            plotSavingPath: output directory (created when missing); the file
                name is appended by plain concatenation.
            name: file-name prefix.
        """
        os.makedirs(plotSavingPath, exist_ok=True)
        # apply PCA on input data
        pca = PCA(n_components=2)
        pca_transformed = pca.fit_transform(X=data)
        pca_one = pca_transformed[:, 0]
        pca_two = pca_transformed[:, 1]
        # Layer 1: labeled samples, one colour per author
        fig, ax = plt.subplots(figsize=(9, 7))
        for author in np.unique(iter_train_label):
            ix = iter_train_label.index[iter_train_label == author].tolist()
            ax.scatter(pca_one[ix], pca_two[ix], label=author, s=50, alpha=0.5)
        # Layer 2: unlabeled samples in the pool
        ax.scatter(pca_one[unlabeled_idx], pca_two[unlabeled_idx], color='grey', label="unlabeled", s=50, alpha=0.5)
        # Layer 3: ring around the self-labeled samples
        temp = ax.scatter(pca_one[new_sample_idx], pca_two[new_sample_idx], edgecolor='black', linewidth=1, s=50)
        temp.set_facecolor("none")
        temp.set_label("self-labeled")
        legend = ax.legend(loc='upper left', bbox_to_anchor=(0, 1.2), ncol=2)
        plt.title('Co-training iteration: ' + str(iterCount), fontsize=14)
        plt.xlabel("PCA one", fontsize=14)
        plt.ylabel("PCA two", fontsize=14)
        plt.savefig(plotSavingPath + name + "_PCA_i-" + str(iterCount) + ".png", dpi=100, bbox_extra_artists=(legend,), bbox_inches='tight')
        plt.close("all")

    def _plot_both_views(self, iterCount, dataView1, dataView2, train_label, pool_idx, new_idx, plot_save_path, plot_save_name):
        """Save the PCA state plot for view one ("_dv1") and view two ("_dv2")."""
        self.plot_co_training_process(iterCount, dataView1, train_label, pool_idx, new_idx,
                                      plot_save_path, plot_save_name + "_dv1")
        self.plot_co_training_process(iterCount, dataView2, train_label, pool_idx, new_idx,
                                      plot_save_path, plot_save_name + "_dv2")

    def fit(self, dataView1, dataView2, labels, dv1_test, dv2_test, label_test, plot_save_name=None, plot_save_path=None):
        """Run co-training and fit the final classifiers on the enlarged labeled set.

        Args:
            dataView1, dataView2: pandas DataFrames holding the two views of
                the training samples, row-aligned with ``labels``.
            labels: sequence of training labels, ``-1`` for unlabeled samples.
            dv1_test, dv2_test, label_test: held-out data; the macro F1 of
                each view's classifier is recorded per round in
                ``self.f1_on_test_dv1`` / ``self.f1_on_test_dv2``.
            plot_save_name: file-name prefix for plots.
            plot_save_path: directory for plots; nothing is plotted when None.

        Side effects:
            Sets ``self.new_labeled_idx``, the F1 histories and ``self.class_``
            and fits ``self.clf1`` / ``self.clf2`` in place.
        """
        # positions of self-labeled samples, per class and per round
        self.new_labeled_idx = defaultdict(list)
        # F1 on the test samples for every round
        self.f1_on_test_dv1 = []
        self.f1_on_test_dv2 = []

        print("P value: ", self.p, " N value: ", self.n)

        L, U, U_prime = self.init_L_U_U_prime(labels)

        # L, U and U' hold row positions, so keep the labels on a positional
        # (0..n-1) index; this stays correct even when the data frames carry a
        # non-default index.
        labels = pd.Series(list(labels))
        iterCount = 0
        do_plot = plot_save_path is not None
        # --------- plot initial stage -------------- #
        if do_plot:
            self._plot_both_views(iterCount, dataView1, dataView2, labels.iloc[L], U_prime, [],
                                  plot_save_path, plot_save_name)

        # loop until the pool is exhausted or the round limit is reached
        while iterCount < self.k and U_prime:
            # ------------- labeled samples for training ------------- #
            iter_train_d1 = dataView1.iloc[L]
            iter_train_d2 = dataView2.iloc[L]
            iter_train_label = labels.iloc[L]
            # ------------- unlabeled samples in the pool ------------ #
            iter_unlabeled_d1 = dataView1.iloc[U_prime]
            iter_unlabeled_d2 = dataView2.iloc[U_prime]
            # ------------- train one classifier per view ------------ #
            iter_clf1 = self.copy.deepcopy(self.clf1)
            iter_clf2 = self.copy.deepcopy(self.clf2)
            iter_clf1.fit(iter_train_d1, iter_train_label.values)
            iter_clf2.fit(iter_train_d2, iter_train_label.values)
            self.check_iter_label_mapping(iter_clf1, iter_clf2)
            # ------------- F1 on the test data for this round ------- #
            y1 = iter_clf1.predict(dv1_test)
            y2 = iter_clf2.predict(dv2_test)
            self.f1_on_test_dv1.append(f1_score(label_test, y1, average='macro'))
            self.f1_on_test_dv2.append(f1_score(label_test, y2, average='macro'))
            # rank the pool by decreasing probability, separately per class column
            dv1_proba = iter_clf1.predict_proba(iter_unlabeled_d1)
            dv2_proba = iter_clf2.predict_proba(iter_unlabeled_d2)
            dv1_proba_rank = [(-class_proba).argsort() for class_proba in dv1_proba.T]
            dv2_proba_rank = [(-class_proba).argsort() for class_proba in dv2_proba.T]
            # nominations of each view's classifier
            newly_labeled_dv1 = self.label_p_n_samples(dv1_proba, dv1_proba_rank)
            newly_labeled_dv2 = self.label_p_n_samples(dv2_proba, dv2_proba_rank)
            roundNew = list(zip(newly_labeled_dv1, newly_labeled_dv2))
            # A sample the two views assign to different classes is ambiguous:
            # leave it in the pool instead of adding it to L twice.
            per_class_picks, contested = self._merge_view_picks(newly_labeled_dv1, newly_labeled_dv2)
            if contested:
                print("views disagree, skipped (u' idx): ", contested)
            # auto label the samples and remove them from U_prime
            round_auto_labeled = []
            for label, round_new in enumerate(per_class_picks):
                auto_labeled = [U_prime[x] for x in round_new]
                round_auto_labeled.extend(auto_labeled)
                self.new_labeled_idx[self.class_[label]].append(auto_labeled)
                if auto_labeled:
                    labels.iloc[auto_labeled] = self.class_[label]
                print(self.class_[label], " (u' idx): ", round_new)
                print(self.class_[label], " (U idx): ", auto_labeled)
            print(roundNew)
            print(round_auto_labeled)
            # extend the labeled set and shrink the pool
            L.extend(round_auto_labeled)
            just_labeled = set(round_auto_labeled)
            U_prime = [x for x in U_prime if x not in just_labeled]
            # move up to 2p+2n samples from the (shuffled) tail of U into the pool
            refill = max(0, min(len(U), 2 * self.p + 2 * self.n))
            split = len(U) - refill
            U_prime.extend(U[split:])
            U = U[:split]
            iterCount += 1
            # ----------- plot the co-training process -------------- #
            if do_plot:
                # every sample self-labeled so far, over all classes and rounds
                self_labeled_so_far = [val
                                       for per_class in self.new_labeled_idx.values()
                                       for per_round in per_class
                                       for val in per_round]
                self._plot_both_views(iterCount, dataView1, dataView2, labels.iloc[L], U_prime,
                                      self_labeled_so_far, plot_save_path, plot_save_name)
        print("Total Labeled number: ", len(L), " Still unlabeled number: ", len(U_prime))
        print(self.k)
        print(iterCount)
        # final train on the original plus self-labeled samples
        final_labels = labels.iloc[L]
        self.clf1.fit(dataView1.iloc[L], final_labels)
        self.clf2.fit(dataView2.iloc[L], final_labels)
        # predict() maps probability columns through self.class_, so make sure
        # it is set even when no co-training round was executed.
        self.check_iter_label_mapping(self.clf1, self.clf2)
        # ------ save f1 vs number of iteration plot ------- #
        # Needs at least one recorded round (the first one is the baseline).
        if do_plot and iterCount > 0:
            default_text_based = [self.f1_on_test_dv1[0]] * iterCount
            default_citation_based = [self.f1_on_test_dv2[0]] * iterCount
            default_step = np.arange(0, iterCount)
            co_train_text_based = self.f1_on_test_dv1[1:]
            co_train_citation_based = self.f1_on_test_dv2[1:]
            co_training_step = np.arange(1, iterCount)

            plt.figure()
            ax = plt.axes()
            plt.plot(default_step, default_text_based, linestyle='dashed', label="Text based default")
            plt.plot(default_step, default_citation_based, linestyle='dashdot', label="Citation based default")
            plt.plot(co_training_step, co_train_text_based, linestyle='solid', marker="*", label="Text based")
            plt.plot(co_training_step, co_train_citation_based, linestyle='dotted', marker="+", label="Citation based")
            ax.autoscale_view()
            legend = ax.legend(loc='upper left', bbox_to_anchor=(0, 1.2), ncol=2)
            plt.xlabel('Co-Training Iterations')
            plt.ylabel('F1 score')
            plt.savefig((plot_save_path + plot_save_name + "_diff_iter_f1.png"), dpi=300, bbox_extra_artists=(legend,), bbox_inches='tight')
            plt.close("all")

    def supports_proba(self, clf, x):
        """Return True when ``clf.predict_proba`` can be called on the single sample ``x``."""
        try:
            clf.predict_proba([x])
            return True
        except Exception:
            # Any failure of the probe is treated as "no probability support";
            # KeyboardInterrupt / SystemExit are deliberately not swallowed.
            return False

    def predict(self, dataView1, dataView2):
        """Predict labels by combining both view classifiers.

        When the two classifiers agree, their label is used.  When they
        disagree, the per-class probabilities are multiplied and the class
        with the largest product wins.  If the classifiers disagree and do not
        support ``predict_proba``, the process is terminated via ``sys.exit``.

        Returns:
            numpy array with one predicted label per row.
        """
        y1 = self.clf1.predict(dataView1)
        y2 = self.clf2.predict(dataView2)
        proba_supported = self.supports_proba(self.clf1, dataView1.iloc[0]) and self.supports_proba(self.clf2, dataView2.iloc[0])
        # pre-fill with "-1" so unresolved samples would be identifiable
        y_pred = ["-1"] * dataView1.shape[0]
        for i, (y1_i, y2_i) in enumerate(zip(y1, y2)):
            if y1_i == y2_i:
                # both views agree on the label
                y_pred[i] = y1_i
            elif proba_supported:
                # disagreement: multiply the class probabilities of both views
                y1_probas = self.clf1.predict_proba([dataView1.iloc[i]])[0]
                y2_probas = self.clf2.predict_proba([dataView2.iloc[i]])[0]
                print("y1 disagree on", i, " Proba: ", y1_probas)
                print("y2 not aggreed on ", i, "Proba: ", y2_probas)
                prod_y_probas = [proba_y1 * proba_y2 for (proba_y1, proba_y2) in zip(y1_probas, y2_probas)]
                print("product probas:", prod_y_probas)
                max_prob_idx = prod_y_probas.index(max(prod_y_probas))
                y_pred[i] = self.class_[max_prob_idx]
                print("result idx: ", max_prob_idx, " result: ", y_pred[i])
            else:
                # the classifiers disagree and cannot provide probabilities
                sys.exit("classifiers disagree with label, result may not accurate")
        # convert final result to np array
        return np.asarray(y_pred)

    def predict_proba(self, dataView1, dataView2):
        """Return the element-wise product of both classifiers' class probabilities.

        The rows are not re-normalised, so they generally do not sum to one.
        """
        y1_probas = self.clf1.predict_proba(dataView1)
        y2_probas = self.clf2.predict_proba(dataView2)
        return y1_probas * y2_probas


In [3]:
import copy
import random

import seaborn as sns

from collections import Counter
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score

# cross validation
def k_fold_cv_co_train_binary(dataview1, dataview2, label, init_labeled_size, clf, k=10, plot_save_name=None, plot_save_path=None):
    """Stratified k-fold evaluation of co-training against two supervised baselines.

    In every fold only about ``init_labeled_size`` training samples keep their
    label (split across classes by the class ratio of the full data); the
    remaining training labels are replaced by ``-1``.  A deep copy of ``clf``
    is co-trained on the two views, while a logistic regression and a linear
    SVM are trained on the concatenated views of the labeled samples only.
    All three are scored on the fold's test split.

    Args:
        dataview1, dataview2: pandas DataFrames with the two views, row-aligned.
        label: pandas Series of true labels, row-aligned with the views.
        init_labeled_size: total number of samples that stay labeled per fold.
        clf: co-training classifier providing ``fit``, ``predict`` and
            ``get_self_labeled_sample`` as called below.
        k: number of folds.
        plot_save_name: prefix / sub-directory name for plots.
        plot_save_path: base directory for plots; no plots when None.

    Returns:
        ``(LR_f1, SVM_f1, co_train_f1, all_fold_statistic)`` -- macro F1 over
        the pooled predictions of all folds, plus a list with one statistics
        dict per fold.

    Side effects:
        Re-seeds the global ``random`` module with 1 and prints progress
        reports.
    """
    random.seed(1)
    kf = StratifiedKFold(n_splits=k)
    allTrueLabel = []
    allPredLabel_co_train = []
    allPredLabel_LR = []
    allPredLabel_SVM = []

    # class ratio of the complete data set
    c = Counter(label)
    data_ratio = [(i, c[i] / len(label)) for i in c]
    print(data_ratio)
    # Per-class number of samples that stay labeled; identical for every fold.
    per_class_quota = [(class_label, round(ratio * init_labeled_size)) for class_label, ratio in data_ratio]

    all_fold_statistic = []
    fold = 0
    make_plots = plot_save_path is not None

    for train_index, test_index in kf.split(dataview1, label):
        fold += 1
        if make_plots:
            detailed_plot_path = plot_save_path + plot_save_name + "/fold" + str(fold) + "/"
        else:
            detailed_plot_path = plot_save_path
        # --------------- split train and test -------------------- #
        dv1_train, dv1_test = dataview1.iloc[train_index], dataview1.iloc[test_index]
        dv2_train, dv2_test = dataview2.iloc[train_index], dataview2.iloc[test_index]
        all_label_train, label_test = label.iloc[train_index], label.iloc[test_index]
        # ----------- set some labeled data as unlabeled ------------ #
        co_train_per_class_size = list(per_class_quota)
        final_train_label = all_label_train.tolist()
        train_sample_idx = []
        for unique_label, training_size in co_train_per_class_size:
            curr_label_idx = [i for i, x in enumerate(final_train_label) if x == unique_label]
            # A fold may hold fewer samples of a class than its quota; then
            # all of them stay labeled (a negative sample size would raise).
            unlabeled_size = max(0, len(curr_label_idx) - training_size)
            unlabel_item_idx = random.sample(curr_label_idx, unlabeled_size)
            hidden = set(unlabel_item_idx)
            train_sample_idx += [x for x in curr_label_idx if x not in hidden]
            for unlabel_idx in unlabel_item_idx:
                final_train_label[unlabel_idx] = -1
        print(final_train_label)
        unlabeled_sample_size = len(final_train_label) - len(train_sample_idx)
        # positional index so that list positions and row labels coincide
        final_dv1 = dv1_train.reset_index(drop=True)
        final_dv2 = dv2_train.reset_index(drop=True)
        # -------------- train binary co-training ------------------- #
        per_fold_clf = copy.deepcopy(clf)
        per_fold_clf.fit(final_dv1, final_dv2, final_train_label, dv1_test, dv2_test, label_test, plot_save_name, detailed_plot_path)
        # positions of the samples the co-training labeled by itself
        self_labeled_index = per_fold_clf.get_self_labeled_sample()
        print("Self labeled sample index: ", self_labeled_index)
        all_self_labeled_index = [val
                                  for per_class in self_labeled_index.values()
                                  for per_round in per_class
                                  for val in per_round]
        # -------- use concatenated features for comparsion -------- #
        concatenated_train = pd.concat([final_dv1.iloc[train_sample_idx], final_dv2.iloc[train_sample_idx]], axis=1, ignore_index=True)
        train_label = [final_train_label[i] for i in train_sample_idx]
        # LR on concatenated features, initially labeled samples only
        LR_clf = LogisticRegression(solver="liblinear")
        LR_clf.fit(concatenated_train, train_label)
        # linear SVM on concatenated features, initially labeled samples only
        SVM_clf = SVC(gamma="auto", kernel='linear')
        SVM_clf.fit(concatenated_train, train_label)
        # ------------ generate concatenated test dataset ------------ #
        concatenated_test = pd.concat([dv1_test, dv2_test], axis=1, ignore_index=True)
        # ------------- get predicted label for test set ------------- #
        co_lr_label_predict = per_fold_clf.predict(dv1_test, dv2_test)
        LR_predict = LR_clf.predict(concatenated_test)
        SVM_predict = SVM_clf.predict(concatenated_test)

        print("co-train f1: ", metrics.classification_report(label_test, co_lr_label_predict))
        print("LR f1: ", metrics.classification_report(label_test, LR_predict))
        print("SVM f1: ", metrics.classification_report(label_test, SVM_predict))

        fold_true = label_test.values.tolist()
        allTrueLabel.extend(fold_true)
        allPredLabel_co_train.extend(co_lr_label_predict)
        allPredLabel_LR.extend(LR_predict)
        allPredLabel_SVM.extend(SVM_predict)
        # collect per fold statistic
        curr_fold_statistic = {'author': plot_save_name, 'fold': fold, 'train_size': co_train_per_class_size, 'test_size': dv1_test.shape[0],
                               'total_self_labeled_train': len(all_self_labeled_index), "unlabeled size": unlabeled_sample_size,
                               'co-train f1': f1_score(fold_true, co_lr_label_predict, average='macro'),
                               'LR f1': f1_score(fold_true, LR_predict, average='macro'),
                               'SVM f1': f1_score(fold_true, SVM_predict, average='macro')}
        all_fold_statistic.append(curr_fold_statistic)

    if make_plots:
        # --------------- plot per fold result f1 variance --------------- #
        all_per_fold_f1_score_variance_plot = pd.DataFrame(all_fold_statistic)
        plot_temp_data = all_per_fold_f1_score_variance_plot[['co-train f1', 'LR f1', 'SVM f1']].copy()
        plot_temp_data = pd.melt(plot_temp_data, var_name='methods', value_name='f1')
        ax = sns.boxplot(x="methods", y="f1", data=plot_temp_data)
        ax = sns.swarmplot(x="methods", y="f1", data=plot_temp_data, color=".25")
        # report the actual number of folds rather than a fixed "10"
        ax.set_title(plot_save_name + " result variance with " + str(k) + " fold")
        plt.savefig(plot_save_path + plot_save_name + "/" + plot_save_name + "_result_variance.png", dpi=300)
        plt.show()
        # release the figure so the next call does not draw on the same axes
        plt.close("all")

    # ---------- scores over the pooled predictions of all folds ---------- #
    co_train_f1 = f1_score(allTrueLabel, allPredLabel_co_train, average='macro')
    print(metrics.classification_report(allTrueLabel, allPredLabel_co_train))
    print(metrics.confusion_matrix(allTrueLabel, allPredLabel_co_train).ravel())

    LR_f1 = f1_score(allTrueLabel, allPredLabel_LR, average='macro')
    print(metrics.classification_report(allTrueLabel, allPredLabel_LR))
    print(metrics.confusion_matrix(allTrueLabel, allPredLabel_LR).ravel())

    SVM_f1 = f1_score(allTrueLabel, allPredLabel_SVM, average='macro')
    print(metrics.classification_report(allTrueLabel, allPredLabel_SVM))
    print(metrics.confusion_matrix(allTrueLabel, allPredLabel_SVM).ravel())

    return LR_f1, SVM_f1, co_train_f1, all_fold_statistic

In [4]:
import collections
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

# Experiment driver: for every text embedding (view one) and every productivity
# threshold, run 10-fold co-training on each *binary* name group and collect the
# macro F1 of co-training and of the LR / SVM results returned alongside it.

# fix random seed for reproducibility (pandas `sample` below draws from numpy's
# global RNG when no random_state is given, so the shuffle is seeded as well)
np.random.seed(1)

# loop through all files in directory add name to name list
# (each file is handled as one name group)
fileDir = "../Data/"+Dataset+"/canopies_labeled/"
listfiles = os.listdir(fileDir)

# passed to k_fold_cv_co_train_binary as the initial labeled-set size
init_labeled_size = 10

# one entry per text embedding; each entry is a list (one item per threshold)
# of per-name-group co-training F1 lists
co_lr_diff_embedding_result = []

#---------------- load different embeddings for view one ---------------#
for select_emb in pp_text:
    print("Load text embedding: ", select_emb)
    # read viewone embeddings and put the paper id in front as column 0
    viewone_text_emb, viewone_emb_pid = com_func.read_text_embedding(emb_type=select_emb, training_size = "140k")
    viewone_text_emb = np.column_stack((viewone_emb_pid,viewone_text_emb))
    # read viewtwo embedding, notice here we only use labeled data
    print("Load citation embedding: ", pp_citation)
    viewtwo_citation_embedding = com_func.read_citation_embedding_sorted(emb_type = pp_citation, labeled_only = True)
    # print(viewone_text_emb[0])
    # print(viewtwo_citation_embedding[0])

    threshold_change_all_co_lr_f1s = []
    threshold_change = []

    # -------------- different threshold (step by 10) -----------------------#
    for step_threshold in range(threshold_lower, threshold_upper, 10):
        #plot_save_path = "../../plot/co_train_detail_plots/threshold="+str(step_threshold)+"/binary_sample=140k/"+select_emb+"/"
        threshold_change.append(step_threshold)
        # collect statistic to output (one item per selected binary name group)
        name_group, total_sample_size, train_sample_size, test_sample_size = [], [], [], []
        unlabeled_count, co_train_self_labeled = [], []

        all_LR_f1, all_SVM_f1, all_co_LR_f1 = [], [], []
        all_per_fold_f1_score_variance = []

        total_selected_group = 0

        # ------- different name group in all name group --------------------#
        for file in listfiles:
            # group name is built from the 2nd and the last "_"-separated
            # pieces of the file name
            temp = file.split("_")
            name = temp[1]+"_"+temp[-1]
            print("For name: ",name)
            # read labeled pid and aid from file; rows whose authorID is the
            # string "-1" are dropped as unlabeled
            data = com_func.read_pid_aid(fileDir+file)
            labeled_mask = data["authorID"] != "-1"
            labeled_data = data[labeled_mask]
            print(labeled_data.shape)
            # ---------------- collect all labeled sample -------------------- #
            # ---------------- if use all samples as negative --------------- #
            # NOTE(review): not read again in this cell; kept in case a later
            # notebook cell relies on it -- confirm before removing
            all_labeled_samples = labeled_data["paperID"].tolist()
            authorCounter = com_func.select_productive_groups(labeled_data, threshold_select_name_group)
            # if only have one class or no class pass the threshold, not applicable
            if len(authorCounter) < 2:
                print(name," pass")
                continue

            # --------------for each name group---------------- #
            if apply_threshold_to_name_group_samples:
                # ---------- only use sample pass threshold ------- #
                #-------- only select authors in name group are very productive (more than threshold)---------#
                labeled_data, author_list, _= com_func.only_select_productive_authors(labeled_data, step_threshold)
                # ----------------- if use filtered samples as negative  --------- #
                # NOTE(review): not read again in this cell -- see note above
                filtered_all_labeled_samples = labeled_data["paperID"].tolist()
            else:
                # ----------- use all sample in name group --------- #
                author_list = com_func.productive_authors_list(labeled_data, step_threshold)
                print(name, " name group sample size: ",labeled_data.shape)

            # -------------- extract all samples for name group -------------- #
            # pull the embedding rows of this name group's papers for both views
            labeled_viewone_text = com_func.extract_sorted_embedding(viewone_text_emb, labeled_data["paperID"])
            print(labeled_viewone_text.shape)
            labeled_viewtwo_citation = com_func.extract_sorted_embedding(viewtwo_citation_embedding, labeled_data["paperID"])
            print(labeled_viewtwo_citation.shape)
            print("Labeled: ",len(labeled_viewone_text), " : ", len(labeled_viewtwo_citation))
            # ---------------- shuffle the data ----------------- #
            labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
            # ------------------ alignment ---------------------- #
            # left-merge both views onto the shuffled rows so the two views and
            # the labels share one row order (column 0 of the extracted
            # embeddings presumably holds the paper id -- TODO confirm)
            labeled_viewone_text = pd.merge(labeled_data, labeled_viewone_text, left_on="paperID", right_on = [0], how = "left")
            labeled_viewtwo_citation = pd.merge(labeled_data, labeled_viewtwo_citation, left_on="paperID", right_on = [0], how = "left")
            # any NaN left in view two after the merge is replaced by 0
            labeled_viewtwo_citation.fillna(0, inplace=True)
            # rename raw author ids to "<name>_<idx>" in order of first appearance
            unique_labels = labeled_viewone_text.authorID.unique()
            map_dict = {unique_label: name+"_"+str(idx) for idx, unique_label in enumerate(unique_labels)}
            true_label = labeled_viewone_text["authorID"].replace(map_dict)

            '''
            only work on binary case, ignored multi-class case
            We need to check whether the name group only contain binary case or not
            '''
            if len(author_list) != 2:
                print(name+ " is multi-class case, ignored")
                continue

            total_selected_group+= 1
            name_group.append(name)
            print(name + " is binary case")
            # keep only the embedding columns as features
            non_feature_columns = ["paperID", "authorID", 0]
            viewone_text_final = labeled_viewone_text.drop(non_feature_columns, axis=1)
            viewtwo_citation_final = labeled_viewtwo_citation.drop(non_feature_columns, axis=1)
            # apply co-training
            co_logistic_clf = Co_training_clf(clf1=LogisticRegression(solver= "liblinear"),p=1,n=1, k=30)
            LR_f1, SVM_f1, co_lr_f1, name_per_fold_status= k_fold_cv_co_train_binary(viewone_text_final, viewtwo_citation_final,
                                                                                     true_label,init_labeled_size, co_logistic_clf,10)
            # size statistics are taken from the first fold only
            first_fold_status = name_per_fold_status[0]
            total_sample_size.append(len(true_label))
            train_sample_size.append(first_fold_status["train_size"])
            test_sample_size.append(first_fold_status["test_size"])
            unlabeled_count.append(first_fold_status["unlabeled size"])
            co_train_self_labeled.append(first_fold_status["total_self_labeled_train"])
            all_LR_f1.append(LR_f1)
            all_SVM_f1.append(SVM_f1)
            all_co_LR_f1.append(co_lr_f1)

        # (disabled) export of the per-name-group statistics collected above
#         # write evaluation result to excel
#         output = pd.DataFrame({'Name':name_group, "Total sample size":total_sample_size, "train size":train_sample_size,
#                                "test size":test_sample_size, "unlabeled sample size": unlabeled_count, 
#                                "total self labeled sample":co_train_self_labeled,
#                                "LR F1": all_LR_f1, "SVM F1": all_SVM_f1, "co_logisticRegression F1": all_co_LR_f1})
#         savePath = "../../result/"+Dataset+"/co_train_binary_140k/"
#         filename = "(Labeled_size="+str(init_labeled_size)+") V1TextEmb="+select_emb+"_V2CitationEmb="+pp_citation+"_threshold="+str(step_threshold)+"_namegroupcount="+str(total_selected_group)+".csv"
#         com_func.write_csv_df(savePath, filename, output)
#         print("Done")

        threshold_change_all_co_lr_f1s.append(all_co_LR_f1)

    co_lr_diff_embedding_result.append(threshold_change_all_co_lr_f1s)

Load text embedding:  pv_dbow
Total text vector records: 135796
Vector dimension:  100
Load citation embedding:  n2v
Total citation vector records: 124922
Vector dimension:  101
For name:  j_read
(136, 2)
j_read  pass
For name:  f_esteves
(34, 2)
f_esteves  pass
For name:  c_miller
(252, 2)
c_miller  pass
For name:  r_jha
(11, 2)
r_jha  pass
For name:  a_lowe
(102, 2)
a_lowe  pass
For name:  a_vega
(20, 2)
a_vega  pass
For name:  k_smith
(338, 2)
k_smith  pass
For name:  j_gordon
(19, 2)
j_gordon  pass
For name:  s_liao
(104, 2)
s_liao  pass
For name:  j_qian
(17, 2)
j_qian  pass
For name:  s_bernardi
(91, 2)
s_bernardi  pass
For name:  t_hill
(15, 2)
t_hill  pass
For name:  s_schindler
(51, 2)
s_schindler  pass
For name:  j_williams
(625, 2)
j_williams  pass
For name:  s_jacobson
(28, 2)
s_jacobson  pass
For name:  e_andrade
(17, 2)
e_andrade  pass
For name:  t_santos
(45, 2)
t_santos  pass
For name:  k_kim
(1111, 2)
Total sample size before apply threshold:  1111
Counter({'0000-0002-

Total missing sample:  0
(252, 101)
Total missing sample:  6
(252, 101)
Labeled:  252  :  252
(252, 103)
(252, 103)
p_robinson is binary case
[('p_robinson_0', 0.4722222222222222), ('p_robinson_1', 0.5277777777777778)]
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'p_robinson_0', -1, 'p_robinson_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'p_robinson_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'p_robinson_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'p_robinson_0', -1, -1, -1, -1, 'p_robinson_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'p_robinson_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 

N:  69  :  [0.0183 0.9817]
p_robinson_0  (u' idx):  {74, 84}
p_robinson_0  (U idx):  [96, 3]
p_robinson_1  (u' idx):  {68, 69}
p_robinson_1  (U idx):  [6, 221]
[([84], [74]), ([68], [69])]
[96, 3, 6, 221]
Total Labeled number:  120  Still unlabeled number:  85
30
30
Self labeled sample index:  defaultdict(<class 'list'>, {'p_robinson_0': [[9, 68], [46, 105], [85, 37], [165, 168], [55, 53], [137], [112, 56], [198, 130], [45, 133], [206, 104], [52, 155], [107, 58], [212], [146, 26], [50], [195, 124], [182], [11, 1], [186], [111, 211], [90, 147], [119, 157], [66, 136], [67, 73], [209, 30], [0, 83], [41], [100, 47], [202, 194], [96, 3]], 'p_robinson_1': [[190, 94], [43, 163], [103, 23], [185, 82], [173, 28], [207, 59], [166, 89], [88, 7], [84, 154], [218, 175], [176, 77], [169, 108], [109, 204], [162, 135], [126, 150], [134], [203, 38], [31, 110], [181], [223, 95], [138, 125], [36, 79], [93], [49], [98, 86], [101, 193], [116, 219], [42, 167], [57, 91], [6, 221]]})
y1 disagree on 0  Proba: 

P:  75  :  [0.9905 0.0095]
N:  20  :  [0.0358 0.9642]
P:  75  :  [0.9953 0.0047]
N:  50  :  [0.0363 0.9637]
p_robinson_0  (u' idx):  {75}
p_robinson_0  (U idx):  [130]
p_robinson_1  (u' idx):  {50, 20}
p_robinson_1  (U idx):  [95, 144]
[([75], [75]), ([20], [50])]
[130, 95, 144]
P:  55  :  [0.9629 0.0371]
N:  76  :  [0.0069 0.9931]
P:  72  :  [0.9754 0.0246]
N:  76  :  [0.0137 0.9863]
p_robinson_0  (u' idx):  {72, 55}
p_robinson_0  (U idx):  [129, 114]
p_robinson_1  (u' idx):  {76}
p_robinson_1  (U idx):  [49]
[([55], [72]), ([76], [76])]
[129, 114, 49]
P:  79  :  [0.9893 0.0107]
N:  26  :  [0.0318 0.9682]
P:  80  :  [0.9784 0.0216]
N:  28  :  [0.0315 0.9685]
p_robinson_0  (u' idx):  {80, 79}
p_robinson_0  (U idx):  [198, 133]
p_robinson_1  (u' idx):  {26, 28}
p_robinson_1  (U idx):  [57, 33]
[([79], [80]), ([26], [28])]
[198, 133, 57, 33]
P:  46  :  [0.966 0.034]
N:  60  :  [0.0311 0.9689]
P:  5  :  [0.9709 0.0291]
N:  41  :  [0.0329 0.9671]
p_robinson_0  (u' idx):  {5, 46}
p_robinson

P:  10  :  [0.9644 0.0356]
N:  45  :  [0.0487 0.9513]
P:  20  :  [0.9647 0.0353]
N:  34  :  [0.0366 0.9634]
p_robinson_0  (u' idx):  {10, 20}
p_robinson_0  (U idx):  [147, 8]
p_robinson_1  (u' idx):  {34, 45}
p_robinson_1  (U idx):  [218, 224]
[([10], [20]), ([45], [34])]
[147, 8, 218, 224]
P:  6  :  [0.957 0.043]
N:  74  :  [0.0463 0.9537]
P:  6  :  [0.9652 0.0348]
N:  22  :  [0.0399 0.9601]
p_robinson_0  (u' idx):  {6}
p_robinson_0  (U idx):  [214]
p_robinson_1  (u' idx):  {74, 22}
p_robinson_1  (U idx):  [77, 162]
[([6], [6]), ([74], [22])]
[214, 77, 162]
P:  56  :  [0.9559 0.0441]
N:  75  :  [0.0314 0.9686]
P:  56  :  [0.9532 0.0468]
N:  57  :  [0.0489 0.9511]
p_robinson_0  (u' idx):  {56}
p_robinson_0  (U idx):  [53]
p_robinson_1  (u' idx):  {57, 75}
p_robinson_1  (U idx):  [50, 47]
[([56], [56]), ([75], [57])]
[53, 50, 47]
P:  11  :  [0.9433 0.0567]
N:  77  :  [0.0253 0.9747]
P:  13  :  [0.9455 0.0545]
N:  75  :  [0.0136 0.9864]
p_robinson_0  (u' idx):  {11, 13}
p_robinson_0  (U 

p_robinson_0  (u' idx):  {72}
p_robinson_0  (U idx):  [37]
p_robinson_1  (u' idx):  {3, 53}
p_robinson_1  (U idx):  [110, 109]
[([72], [72]), ([53], [3])]
[37, 110, 109]
P:  35  :  [0.9435 0.0565]
N:  47  :  [0.043 0.957]
P:  34  :  [0.9531 0.0469]
N:  30  :  [0.0345 0.9655]
p_robinson_0  (u' idx):  {34, 35}
p_robinson_0  (U idx):  [41, 35]
p_robinson_1  (u' idx):  {30, 47}
p_robinson_1  (U idx):  [182, 176]
[([35], [34]), ([47], [30])]
[41, 35, 182, 176]
P:  54  :  [0.957 0.043]
N:  27  :  [0.0457 0.9543]
P:  28  :  [0.9582 0.0418]
N:  72  :  [0.0308 0.9692]
p_robinson_0  (u' idx):  {28, 54}
p_robinson_0  (U idx):  [130, 74]
p_robinson_1  (u' idx):  {72, 27}
p_robinson_1  (U idx):  [75, 90]
[([54], [28]), ([27], [72])]
[130, 74, 75, 90]
P:  40  :  [0.9289 0.0711]
N:  2  :  [0.0546 0.9454]
P:  1  :  [0.9626 0.0374]
N:  34  :  [0.0272 0.9728]
p_robinson_0  (u' idx):  {40, 1}
p_robinson_0  (U idx):  [97, 125]
p_robinson_1  (u' idx):  {2, 34}
p_robinson_1  (U idx):  [174, 167]
[([40], [1]

y1 disagree on 9  Proba:  [0.243 0.757]
y2 not aggreed on  9 Proba:  [0.8973 0.1027]
product probas: [0.21802348854380793, 0.07774788881662098]
result idx:  0  result:  p_robinson_0
y1 disagree on 12  Proba:  [0.8962 0.1038]
y2 not aggreed on  12 Proba:  [0.1204 0.8796]
product probas: [0.10788753922484062, 0.09134204831313825]
result idx:  0  result:  p_robinson_0
co-train f1:                precision    recall  f1-score   support

p_robinson_0       0.92      1.00      0.96        12
p_robinson_1       1.00      0.92      0.96        13

   micro avg       0.96      0.96      0.96        25
   macro avg       0.96      0.96      0.96        25
weighted avg       0.96      0.96      0.96        25

LR f1:                precision    recall  f1-score   support

p_robinson_0       0.92      0.92      0.92        12
p_robinson_1       0.92      0.92      0.92        13

   micro avg       0.92      0.92      0.92        25
   macro avg       0.92      0.92      0.92        25
weighted av

P:  72  :  [0.9653 0.0347]
N:  86  :  [0.0256 0.9744]
P:  78  :  [0.9743 0.0257]
N:  86  :  [0.0211 0.9789]
p_robinson_0  (u' idx):  {72, 78}
p_robinson_0  (U idx):  [73, 1]
p_robinson_1  (u' idx):  {86}
p_robinson_1  (U idx):  [117]
[([72], [78]), ([86], [86])]
[73, 1, 117]
P:  87  :  [0.9897 0.0103]
N:  37  :  [0.032 0.968]
P:  87  :  [0.9948 0.0052]
N:  68  :  [0.0212 0.9788]
p_robinson_0  (u' idx):  {87}
p_robinson_0  (U idx):  [131]
p_robinson_1  (u' idx):  {68, 37}
p_robinson_1  (U idx):  [100, 114]
[([87], [87]), ([37], [68])]
[131, 100, 114]
P:  82  :  [0.9843 0.0157]
N:  63  :  [0.0332 0.9668]
P:  47  :  [0.973 0.027]
N:  85  :  [0.001 0.999]
p_robinson_0  (u' idx):  {82, 47}
p_robinson_0  (U idx):  [166, 52]
p_robinson_1  (u' idx):  {85, 63}
p_robinson_1  (U idx):  [171, 142]
[([82], [47]), ([63], [85])]
[166, 52, 171, 142]
P:  86  :  [0.9943 0.0057]
N:  88  :  [0.0155 0.9845]
P:  85  :  [0.9744 0.0256]
N:  88  :  [0.0044 0.9956]
p_robinson_0  (u' idx):  {85, 86}
p_robinson_0

p_robinson_0  (u' idx):  {18, 55}
p_robinson_0  (U idx):  [111, 17]
p_robinson_1  (u' idx):  {77}
p_robinson_1  (U idx):  [103]
[([18], [55]), ([77], [77])]
[111, 17, 103]
P:  76  :  [0.9886 0.0114]
N:  77  :  [0.0184 0.9816]
P:  76  :  [0.9914 0.0086]
N:  77  :  [0.0065 0.9935]
p_robinson_0  (u' idx):  {76}
p_robinson_0  (U idx):  [24]
p_robinson_1  (u' idx):  {77}
p_robinson_1  (U idx):  [5]
[([76], [76]), ([77], [77])]
[24, 5]
P:  56  :  [0.9668 0.0332]
N:  71  :  [0.0216 0.9784]
P:  9  :  [0.9667 0.0333]
N:  77  :  [0.0035 0.9965]
p_robinson_0  (u' idx):  {56, 9}
p_robinson_0  (U idx):  [81, 63]
p_robinson_1  (u' idx):  {77, 71}
p_robinson_1  (U idx):  [171, 160]
[([56], [9]), ([71], [77])]
[81, 63, 171, 160]
P:  80  :  [0.9606 0.0394]
N:  79  :  [0.0177 0.9823]
P:  56  :  [0.9708 0.0292]
N:  79  :  [0.011 0.989]
p_robinson_0  (u' idx):  {80, 56}
p_robinson_0  (U idx):  [166, 41]
p_robinson_1  (u' idx):  {79}
p_robinson_1  (U idx):  [220]
[([80], [56]), ([79], [79])]
[166, 41, 220]

P:  7  :  [0.896 0.104]
N:  74  :  [0.0552 0.9448]
P:  9  :  [0.96 0.04]
N:  1  :  [0.0351 0.9649]
p_robinson_0  (u' idx):  {9, 7}
p_robinson_0  (U idx):  [144, 159]
p_robinson_1  (u' idx):  {1, 74}
p_robinson_1  (U idx):  [171, 182]
[([7], [9]), ([74], [1])]
[144, 159, 171, 182]
P:  68  :  [0.9136 0.0864]
N:  26  :  [0.046 0.954]
P:  4  :  [0.9683 0.0317]
N:  71  :  [0.027 0.973]
p_robinson_0  (u' idx):  {68, 4}
p_robinson_0  (U idx):  [157, 169]
p_robinson_1  (u' idx):  {26, 71}
p_robinson_1  (U idx):  [85, 54]
[([68], [4]), ([26], [71])]
[157, 169, 85, 54]
P:  34  :  [0.9246 0.0754]
N:  9  :  [0.0473 0.9527]
P:  0  :  [0.9546 0.0454]
N:  40  :  [0.0392 0.9608]
p_robinson_0  (u' idx):  {0, 34}
p_robinson_0  (U idx):  [125, 41]
p_robinson_1  (u' idx):  {40, 9}
p_robinson_1  (U idx):  [164, 176]
[([34], [0]), ([9], [40])]
[125, 41, 164, 176]
P:  73  :  [0.9739 0.0261]
N:  68  :  [0.0418 0.9582]
P:  53  :  [0.9536 0.0464]
N:  49  :  [0.0419 0.9581]
p_robinson_0  (u' idx):  {73, 53}
p_ro

P value:  1  N value:  1
Initial L size:  10
Initial U size:  217
P:  9  :  [0.8608 0.1392]
N:  25  :  [0.0636 0.9364]
P:  57  :  [0.937 0.063]
N:  59  :  [0.0651 0.9349]
p_robinson_0  (u' idx):  {9, 57}
p_robinson_0  (U idx):  [63, 163]
p_robinson_1  (u' idx):  {25, 59}
p_robinson_1  (U idx):  [127, 5]
[([9], [57]), ([25], [59])]
[63, 163, 127, 5]
P:  54  :  [0.9239 0.0761]
N:  10  :  [0.0669 0.9331]
P:  70  :  [0.9581 0.0419]
N:  71  :  [0.0547 0.9453]
p_robinson_0  (u' idx):  {70, 54}
p_robinson_0  (U idx):  [94, 76]
p_robinson_1  (u' idx):  {10, 71}
p_robinson_1  (U idx):  [6, 191]
[([54], [70]), ([10], [71])]
[94, 76, 6, 191]
P:  43  :  [0.9227 0.0773]
N:  65  :  [0.0575 0.9425]
P:  38  :  [0.9616 0.0384]
N:  54  :  [0.0422 0.9578]
p_robinson_0  (u' idx):  {43, 38}
p_robinson_0  (U idx):  [113, 37]
p_robinson_1  (u' idx):  {65, 54}
p_robinson_1  (U idx):  [115, 14]
[([43], [38]), ([65], [54])]
[113, 37, 115, 14]
P:  74  :  [0.9485 0.0515]
N:  5  :  [0.0555 0.9445]
P:  61  :  [0.96

P:  29  :  [0.9328 0.0672]
N:  4  :  [0.0423 0.9577]
p_robinson_0  (u' idx):  {48, 29}
p_robinson_0  (U idx):  [76, 125]
p_robinson_1  (u' idx):  {4, 62}
p_robinson_1  (U idx):  [5, 115]
[([48], [29]), ([62], [4])]
[76, 125, 5, 115]
P:  7  :  [0.9423 0.0577]
N:  62  :  [0.0579 0.9421]
P:  7  :  [0.9328 0.0672]
N:  67  :  [0.0359 0.9641]
p_robinson_0  (u' idx):  {7}
p_robinson_0  (U idx):  [63]
p_robinson_1  (u' idx):  {67, 62}
p_robinson_1  (U idx):  [189, 205]
[([7], [7]), ([62], [67])]
[63, 189, 205]
P:  16  :  [0.9417 0.0583]
N:  43  :  [0.0473 0.9527]
P:  74  :  [0.9653 0.0347]
N:  44  :  [0.0343 0.9657]
p_robinson_0  (u' idx):  {16, 74}
p_robinson_0  (U idx):  [74, 157]
p_robinson_1  (u' idx):  {43, 44}
p_robinson_1  (U idx):  [6, 69]
[([16], [74]), ([43], [44])]
[74, 157, 6, 69]
P:  74  :  [0.9606 0.0394]
N:  50  :  [0.0467 0.9533]
P:  59  :  [0.9459 0.0541]
N:  4  :  [0.0283 0.9717]
p_robinson_0  (u' idx):  {74, 59}
p_robinson_0  (U idx):  [191, 155]
p_robinson_1  (u' idx):  {50

co-train f1:                precision    recall  f1-score   support

p_robinson_0       0.92      1.00      0.96        12
p_robinson_1       1.00      0.92      0.96        13

   micro avg       0.96      0.96      0.96        25
   macro avg       0.96      0.96      0.96        25
weighted avg       0.96      0.96      0.96        25

LR f1:                precision    recall  f1-score   support

p_robinson_0       0.92      0.92      0.92        12
p_robinson_1       0.92      0.92      0.92        13

   micro avg       0.92      0.92      0.92        25
   macro avg       0.92      0.92      0.92        25
weighted avg       0.92      0.92      0.92        25

SVM f1:                precision    recall  f1-score   support

p_robinson_0       0.90      0.75      0.82        12
p_robinson_1       0.80      0.92      0.86        13

   micro avg       0.84      0.84      0.84        25
   macro avg       0.85      0.84      0.84        25
weighted avg       0.85      0.84      0.84

P:  89  :  [0.9889 0.0111]
N:  7  :  [0.047 0.953]
P:  88  :  [0.9955 0.0045]
N:  65  :  [0.0404 0.9596]
p_robinson_0  (u' idx):  {88, 89}
p_robinson_0  (U idx):  [17, 76]
p_robinson_1  (u' idx):  {65, 7}
p_robinson_1  (U idx):  [215, 23]
[([89], [88]), ([7], [65])]
[17, 76, 215, 23]
P:  4  :  [0.9789 0.0211]
N:  89  :  [0.0128 0.9872]
P:  87  :  [0.9952 0.0048]
N:  91  :  [0.0129 0.9871]
p_robinson_0  (u' idx):  {4, 87}
p_robinson_0  (U idx):  [58, 125]
p_robinson_1  (u' idx):  {89, 91}
p_robinson_1  (U idx):  [103, 195]
[([4], [87]), ([89], [91])]
[58, 125, 103, 195]
Total Labeled number:  113  Still unlabeled number:  92
30
30
Self labeled sample index:  defaultdict(<class 'list'>, {'p_robinson_0': [[163, 74], [37, 212], [173, 150], [184, 157], [191, 67], [138, 113], [94], [123, 56], [48, 1], [137, 116], [98, 183], [78, 96], [13, 104], [4, 209], [24], [181], [101, 3], [81], [72], [71, 8], [19], [126, 99], [224, 130], [131], [10, 52], [82, 208], [144], [156], [17, 76], [58, 125]], 'p

P:  39  :  [0.9753 0.0247]
N:  73  :  [0.0201 0.9799]
P:  75  :  [0.9769 0.0231]
N:  73  :  [0.0271 0.9729]
t_smith_0  (u' idx):  {75, 39}
t_smith_0  (U idx):  [14, 230]
t_smith_1  (u' idx):  {73}
t_smith_1  (U idx):  [206]
[([39], [75]), ([73], [73])]
[14, 230, 206]
P:  61  :  [0.9732 0.0268]
N:  75  :  [0.011 0.989]
P:  4  :  [0.9801 0.0199]
N:  5  :  [0.0363 0.9637]
t_smith_0  (u' idx):  {4, 61}
t_smith_0  (U idx):  [68, 88]
t_smith_1  (u' idx):  {75, 5}
t_smith_1  (U idx):  [48, 234]
[([61], [4]), ([75], [5])]
[68, 88, 48, 234]
P:  5  :  [0.9719 0.0281]
N:  76  :  [0.0094 0.9906]
P:  47  :  [0.9784 0.0216]
N:  61  :  [0.0326 0.9674]
t_smith_0  (u' idx):  {5, 47}
t_smith_0  (U idx):  [218, 214]
t_smith_1  (u' idx):  {76, 61}
t_smith_1  (U idx):  [71, 158]
[([5], [47]), ([76], [61])]
[218, 214, 71, 158]
P:  63  :  [0.9752 0.0248]
N:  74  :  [0.016 0.984]
P:  75  :  [0.998 0.002]
N:  61  :  [0.0331 0.9669]
t_smith_0  (u' idx):  {75, 63}
t_smith_0  (U idx):  [46, 37]
t_smith_1  (u' idx

[([22], [42]), ([38], [35])]
[123, 231, 44, 189]
P:  3  :  [0.9571 0.0429]
N:  51  :  [0.033 0.967]
P:  41  :  [0.9746 0.0254]
N:  43  :  [0.0394 0.9606]
t_smith_0  (u' idx):  {41, 3}
t_smith_0  (U idx):  [193, 68]
t_smith_1  (u' idx):  {43, 51}
t_smith_1  (U idx):  [159, 152]
[([3], [41]), ([51], [43])]
[193, 68, 159, 152]
P:  42  :  [0.9653 0.0347]
N:  45  :  [0.0321 0.9679]
P:  5  :  [0.9747 0.0253]
N:  72  :  [0.0331 0.9669]
t_smith_0  (u' idx):  {42, 5}
t_smith_0  (U idx):  [72, 160]
t_smith_1  (u' idx):  {72, 45}
t_smith_1  (U idx):  [42, 171]
[([42], [5]), ([45], [72])]
[72, 160, 42, 171]
P:  21  :  [0.9659 0.0341]
N:  73  :  [0.0127 0.9873]
P:  21  :  [0.9772 0.0228]
N:  58  :  [0.034 0.966]
t_smith_0  (u' idx):  {21}
t_smith_0  (U idx):  [113]
t_smith_1  (u' idx):  {73, 58}
t_smith_1  (U idx):  [71, 210]
[([21], [21]), ([73], [58])]
[113, 71, 210]
P:  40  :  [0.9629 0.0371]
N:  33  :  [0.0262 0.9738]
P:  27  :  [0.9765 0.0235]
N:  46  :  [0.0361 0.9639]
t_smith_0  (u' idx):  {

P:  16  :  [0.9651 0.0349]
N:  64  :  [0.0634 0.9366]
P:  45  :  [0.9738 0.0262]
N:  25  :  [0.0539 0.9461]
t_smith_0  (u' idx):  {16, 45}
t_smith_0  (U idx):  [32, 145]
t_smith_1  (u' idx):  {64, 25}
t_smith_1  (U idx):  [46, 10]
[([16], [45]), ([64], [25])]
[32, 145, 46, 10]
P:  11  :  [0.9574 0.0426]
N:  13  :  [0.0699 0.9301]
P:  42  :  [0.9765 0.0235]
N:  23  :  [0.0625 0.9375]
t_smith_0  (u' idx):  {42, 11}
t_smith_0  (U idx):  [51, 173]
t_smith_1  (u' idx):  {13, 23}
t_smith_1  (U idx):  [85, 95]
[([11], [42]), ([13], [23])]
[51, 173, 85, 95]
P:  57  :  [0.9509 0.0491]
N:  44  :  [0.0484 0.9516]
P:  22  :  [0.9727 0.0273]
N:  35  :  [0.0479 0.9521]
t_smith_0  (u' idx):  {57, 22}
t_smith_0  (U idx):  [224, 155]
t_smith_1  (u' idx):  {35, 44}
t_smith_1  (U idx):  [70, 131]
[([57], [22]), ([44], [35])]
[224, 155, 70, 131]
P:  6  :  [0.9621 0.0379]
N:  8  :  [0.0438 0.9562]
P:  40  :  [0.9754 0.0246]
N:  42  :  [0.0432 0.9568]
t_smith_0  (u' idx):  {40, 6}
t_smith_0  (U idx):  [113,

P:  40  :  [0.9682 0.0318]
N:  55  :  [0.0439 0.9561]
P:  73  :  [0.9782 0.0218]
N:  67  :  [0.0588 0.9412]
t_smith_0  (u' idx):  {40, 73}
t_smith_0  (U idx):  [166, 124]
t_smith_1  (u' idx):  {67, 55}
t_smith_1  (U idx):  [72, 168]
[([40], [73]), ([55], [67])]
[166, 124, 72, 168]
P:  52  :  [0.9717 0.0283]
N:  29  :  [0.0443 0.9557]
P:  23  :  [0.9808 0.0192]
N:  68  :  [0.0563 0.9437]
t_smith_0  (u' idx):  {52, 23}
t_smith_0  (U idx):  [30, 138]
t_smith_1  (u' idx):  {68, 29}
t_smith_1  (U idx):  [208, 165]
[([52], [23]), ([29], [68])]
[30, 138, 208, 165]
P:  38  :  [0.9735 0.0265]
N:  72  :  [0.0384 0.9616]
P:  23  :  [0.9829 0.0171]
N:  75  :  [0.0387 0.9613]
t_smith_0  (u' idx):  {38, 23}
t_smith_0  (U idx):  [14, 225]
t_smith_1  (u' idx):  {72, 75}
t_smith_1  (U idx):  [153, 228]
[([38], [23]), ([72], [75])]
[14, 225, 153, 228]
P:  67  :  [0.97 0.03]
N:  11  :  [0.04 0.96]
P:  3  :  [0.9834 0.0166]
N:  61  :  [0.0453 0.9547]
t_smith_0  (u' idx):  {3, 67}
t_smith_0  (U idx):  [127

LR f1:                precision    recall  f1-score   support

   t_smith_0       1.00      1.00      1.00        16
   t_smith_1       1.00      1.00      1.00        11

   micro avg       1.00      1.00      1.00        27
   macro avg       1.00      1.00      1.00        27
weighted avg       1.00      1.00      1.00        27

SVM f1:                precision    recall  f1-score   support

   t_smith_0       1.00      1.00      1.00        16
   t_smith_1       1.00      1.00      1.00        11

   micro avg       1.00      1.00      1.00        27
   macro avg       1.00      1.00      1.00        27
weighted avg       1.00      1.00      1.00        27

[-1, -1, -1, -1, 't_smith_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 't_smith_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 't_smith_1', -1, -

P:  83  :  [0.9954 0.0046]
N:  75  :  [0.0123 0.9877]
P:  81  :  [0.9963 0.0037]
N:  75  :  [0.0155 0.9845]
t_smith_0  (u' idx):  {81, 83}
t_smith_0  (U idx):  [157, 12]
t_smith_1  (u' idx):  {75}
t_smith_1  (U idx):  [182]
[([83], [81]), ([75], [75])]
[157, 12, 182]
P:  84  :  [0.9872 0.0128]
N:  79  :  [0.0137 0.9863]
P:  81  :  [0.9935 0.0065]
N:  82  :  [0.0177 0.9823]
t_smith_0  (u' idx):  {81, 84}
t_smith_0  (U idx):  [24, 81]
t_smith_1  (u' idx):  {82, 79}
t_smith_1  (U idx):  [56, 131]
[([84], [81]), ([79], [82])]
[24, 81, 56, 131]
P:  83  :  [0.9881 0.0119]
N:  6  :  [0.0125 0.9875]
P:  76  :  [0.99 0.01]
N:  84  :  [0.0067 0.9933]
t_smith_0  (u' idx):  {83, 76}
t_smith_0  (U idx):  [92, 183]
t_smith_1  (u' idx):  {84, 6}
t_smith_1  (U idx):  [229, 3]
[([83], [76]), ([6], [84])]
[92, 183, 229, 3]
Total Labeled number:  120  Still unlabeled number:  85
30
30
Self labeled sample index:  defaultdict(<class 'list'>, {'t_smith_0': [[61, 165], [195, 184], [32, 175], [51, 8], [125, 1

P:  71  :  [0.99 0.01]
N:  76  :  [0.0069 0.9931]
t_smith_0  (u' idx):  {75, 71}
t_smith_0  (U idx):  [30, 183]
t_smith_1  (u' idx):  {76}
t_smith_1  (U idx):  [70]
[([75], [71]), ([76], [76])]
[30, 183, 70]
P:  77  :  [0.993 0.007]
N:  79  :  [0.0119 0.9881]
P:  78  :  [0.9975 0.0025]
N:  54  :  [0.0262 0.9738]
t_smith_0  (u' idx):  {77, 78}
t_smith_0  (U idx):  [0, 147]
t_smith_1  (u' idx):  {54, 79}
t_smith_1  (U idx):  [156, 142]
[([77], [78]), ([79], [54])]
[0, 147, 156, 142]
P:  79  :  [0.9862 0.0138]
N:  77  :  [0.0059 0.9941]
P:  79  :  [0.9935 0.0065]
N:  80  :  [0.0065 0.9935]
t_smith_0  (u' idx):  {79}
t_smith_0  (U idx):  [14]
t_smith_1  (u' idx):  {80, 77}
t_smith_1  (U idx):  [212, 141]
[([79], [79]), ([77], [80])]
[14, 212, 141]
P:  1  :  [0.9861 0.0139]
N:  78  :  [0.008 0.992]
P:  71  :  [0.9888 0.0112]
N:  77  :  [0.015 0.985]
t_smith_0  (u' idx):  {1, 71}
t_smith_0  (U idx):  [235, 2]
t_smith_1  (u' idx):  {77, 78}
t_smith_1  (U idx):  [43, 122]
[([1], [71]), ([78], 

P:  25  :  [0.9831 0.0169]
N:  76  :  [0.0124 0.9876]
P:  77  :  [0.9982 0.0018]
N:  76  :  [0.0081 0.9919]
t_smith_0  (u' idx):  {25, 77}
t_smith_0  (U idx):  [138, 133]
t_smith_1  (u' idx):  {76}
t_smith_1  (U idx):  [123]
[([25], [77]), ([76], [76])]
[138, 133, 123]
P:  24  :  [0.981 0.019]
N:  78  :  [0.011 0.989]
P:  79  :  [0.9929 0.0071]
N:  78  :  [0.0058 0.9942]
t_smith_0  (u' idx):  {24, 79}
t_smith_0  (U idx):  [30, 147]
t_smith_1  (u' idx):  {78}
t_smith_1  (U idx):  [203]
[([24], [79]), ([78], [78])]
[30, 147, 203]
P:  44  :  [0.9832 0.0168]
N:  73  :  [0.0181 0.9819]
P:  79  :  [0.9939 0.0061]
N:  73  :  [0.0242 0.9758]
t_smith_0  (u' idx):  {44, 79}
t_smith_0  (U idx):  [42, 164]
t_smith_1  (u' idx):  {73}
t_smith_1  (U idx):  [43]
[([44], [79]), ([73], [73])]
[42, 164, 43]
P:  63  :  [0.983 0.017]
N:  78  :  [0.0055 0.9945]
P:  58  :  [0.9895 0.0105]
N:  78  :  [0.0238 0.9762]
t_smith_0  (u' idx):  {58, 63}
t_smith_0  (U idx):  [217, 149]
t_smith_1  (u' idx):  {78}
t_sm

P:  76  :  [0.9785 0.0215]
N:  75  :  [0.0168 0.9832]
P:  62  :  [0.9781 0.0219]
N:  75  :  [0.01 0.99]
t_smith_0  (u' idx):  {76, 62}
t_smith_0  (U idx):  [178, 213]
t_smith_1  (u' idx):  {75}
t_smith_1  (U idx):  [98]
[([76], [62]), ([75], [75])]
[178, 213, 98]
P:  75  :  [0.9928 0.0072]
N:  76  :  [0.0179 0.9821]
P:  41  :  [0.9771 0.0229]
N:  44  :  [0.0367 0.9633]
t_smith_0  (u' idx):  {41, 75}
t_smith_0  (U idx):  [103, 8]
t_smith_1  (u' idx):  {76, 44}
t_smith_1  (U idx):  [106, 62]
[([75], [41]), ([76], [44])]
[103, 8, 106, 62]
P:  75  :  [0.9903 0.0097]
N:  74  :  [0.0121 0.9879]
P:  75  :  [0.9931 0.0069]
N:  74  :  [0.0327 0.9673]
t_smith_0  (u' idx):  {75}
t_smith_0  (U idx):  [136]
t_smith_1  (u' idx):  {74}
t_smith_1  (U idx):  [113]
[([75], [75]), ([74], [74])]
[136, 113]
P:  44  :  [0.9792 0.0208]
N:  7  :  [0.02 0.98]
P:  79  :  [0.9966 0.0034]
N:  17  :  [0.0365 0.9635]
t_smith_0  (u' idx):  {44, 79}
t_smith_0  (U idx):  [61, 74]
t_smith_1  (u' idx):  {17, 7}
t_smith_

P:  13  :  [0.9689 0.0311]
N:  9  :  [0.031 0.969]
P:  6  :  [0.9802 0.0198]
N:  3  :  [0.0452 0.9548]
t_smith_0  (u' idx):  {13, 6}
t_smith_0  (U idx):  [210, 165]
t_smith_1  (u' idx):  {9, 3}
t_smith_1  (U idx):  [78, 199]
[([13], [6]), ([9], [3])]
[210, 165, 78, 199]
P:  64  :  [0.9746 0.0254]
N:  56  :  [0.0332 0.9668]
P:  22  :  [0.9813 0.0187]
N:  63  :  [0.0473 0.9527]
t_smith_0  (u' idx):  {64, 22}
t_smith_0  (U idx):  [138, 184]
t_smith_1  (u' idx):  {56, 63}
t_smith_1  (U idx):  [114, 110]
[([64], [22]), ([56], [63])]
[138, 184, 114, 110]
P:  28  :  [0.9668 0.0332]
N:  48  :  [0.0296 0.9704]
P:  22  :  [0.9827 0.0173]
N:  67  :  [0.0421 0.9579]
t_smith_0  (u' idx):  {28, 22}
t_smith_0  (U idx):  [75, 188]
t_smith_1  (u' idx):  {48, 67}
t_smith_1  (U idx):  [106, 111]
[([28], [22]), ([48], [67])]
[75, 188, 106, 111]
P:  73  :  [0.975 0.025]
N:  53  :  [0.0288 0.9712]
P:  74  :  [0.9851 0.0149]
N:  21  :  [0.0382 0.9618]
t_smith_0  (u' idx):  {73, 74}
t_smith_0  (U idx):  [193,

SVM f1:                precision    recall  f1-score   support

   t_smith_0       1.00      0.93      0.97        15
   t_smith_1       0.92      1.00      0.96        11

   micro avg       0.96      0.96      0.96        26
   macro avg       0.96      0.97      0.96        26
weighted avg       0.96      0.96      0.96        26

[-1, -1, -1, 't_smith_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, 't_smith_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 't_smith_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, 't_smith_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 't_smith_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 't_smith_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 't_smith_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 't_smith_0', -1, -1, 't_smith_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

P:  80  :  [0.9959 0.0041]
N:  79  :  [0.0039 0.9961]
P:  80  :  [0.9884 0.0116]
N:  18  :  [0.0268 0.9732]
t_smith_0  (u' idx):  {80}
t_smith_0  (U idx):  [12]
t_smith_1  (u' idx):  {18, 79}
t_smith_1  (U idx):  [11, 99]
[([80], [80]), ([79], [18])]
[12, 11, 99]
P:  80  :  [0.993 0.007]
N:  81  :  [0.0087 0.9913]
P:  84  :  [0.9945 0.0055]
N:  81  :  [0.0055 0.9945]
t_smith_0  (u' idx):  {80, 84}
t_smith_0  (U idx):  [177, 198]
t_smith_1  (u' idx):  {81}
t_smith_1  (U idx):  [229]
[([80], [84]), ([81], [81])]
[177, 198, 229]
P:  80  :  [0.9909 0.0091]
N:  29  :  [0.0134 0.9866]
P:  82  :  [0.9992 0.0008]
N:  5  :  [0.0277 0.9723]
t_smith_0  (u' idx):  {80, 82}
t_smith_0  (U idx):  [61, 74]
t_smith_1  (u' idx):  {5, 29}
t_smith_1  (U idx):  [181, 239]
[([80], [82]), ([29], [5])]
[61, 74, 181, 239]
P:  43  :  [0.9848 0.0152]
N:  83  :  [0.0039 0.9961]
P:  82  :  [0.9935 0.0065]
N:  84  :  [0.0105 0.9895]
t_smith_0  (u' idx):  {82, 43}
t_smith_0  (U idx):  [216, 47]
t_smith_1  (u' idx): 

P:  29  :  [0.9815 0.0185]
N:  73  :  [0.0284 0.9716]
P:  75  :  [0.9924 0.0076]
N:  30  :  [0.1313 0.8687]
d_richardson_0  (u' idx):  {75, 29}
d_richardson_0  (U idx):  [176, 317]
d_richardson_1  (u' idx):  {73, 30}
d_richardson_1  (U idx):  [299, 320]
[([29], [75]), ([73], [30])]
[176, 317, 299, 320]
P:  74  :  [0.986 0.014]
N:  72  :  [0.0307 0.9693]
P:  44  :  [0.9885 0.0115]
N:  38  :  [0.1235 0.8765]
d_richardson_0  (u' idx):  {74, 44}
d_richardson_0  (U idx):  [275, 230]
d_richardson_1  (u' idx):  {72, 38}
d_richardson_1  (U idx):  [229, 53]
[([74], [44]), ([72], [38])]
[275, 230, 229, 53]
P:  50  :  [0.9848 0.0152]
N:  58  :  [0.031 0.969]
P:  73  :  [0.9954 0.0046]
N:  15  :  [0.1205 0.8795]
d_richardson_0  (u' idx):  {73, 50}
d_richardson_0  (U idx):  [28, 20]
d_richardson_1  (u' idx):  {58, 15}
d_richardson_1  (U idx):  [189, 148]
[([50], [73]), ([58], [15])]
[28, 20, 189, 148]
P:  47  :  [0.9851 0.0149]
N:  52  :  [0.0298 0.9702]
P:  74  :  [0.9956 0.0044]
N:  49  :  [0.127

d_richardson_0  (u' idx):  {40, 66}
d_richardson_0  (U idx):  [148, 66]
d_richardson_1  (u' idx):  {43, 46}
d_richardson_1  (U idx):  [301, 167]
[([66], [40]), ([46], [43])]
[148, 66, 301, 167]
P:  13  :  [0.9674 0.0326]
N:  43  :  [0.0532 0.9468]
P:  6  :  [0.9785 0.0215]
N:  20  :  [0.2454 0.7546]
d_richardson_0  (u' idx):  {13, 6}
d_richardson_0  (U idx):  [78, 26]
d_richardson_1  (u' idx):  {43, 20}
d_richardson_1  (U idx):  [335, 162]
[([13], [6]), ([43], [20])]
[78, 26, 335, 162]
P:  27  :  [0.9723 0.0277]
N:  12  :  [0.0505 0.9495]
P:  25  :  [0.9826 0.0174]
N:  32  :  [0.2062 0.7938]
d_richardson_0  (u' idx):  {25, 27}
d_richardson_0  (U idx):  [348, 276]
d_richardson_1  (u' idx):  {32, 12}
d_richardson_1  (U idx):  [42, 14]
[([27], [25]), ([12], [32])]
[348, 276, 42, 14]
P:  14  :  [0.9679 0.0321]
N:  64  :  [0.0414 0.9586]
P:  66  :  [0.9858 0.0142]
N:  47  :  [0.2137 0.7863]
d_richardson_0  (u' idx):  {66, 14}
d_richardson_0  (U idx):  [331, 280]
d_richardson_1  (u' idx):  {

P:  77  :  [0.9878 0.0122]
N:  75  :  [0.0077 0.9923]
P:  74  :  [0.9949 0.0051]
N:  71  :  [0.0733 0.9267]
d_richardson_0  (u' idx):  {74, 77}
d_richardson_0  (U idx):  [271, 122]
d_richardson_1  (u' idx):  {75, 71}
d_richardson_1  (U idx):  [99, 291]
[([77], [74]), ([75], [71])]
[271, 122, 99, 291]
P:  75  :  [0.9933 0.0067]
N:  39  :  [0.0117 0.9883]
P:  74  :  [0.9992 0.0008]
N:  73  :  [0.0848 0.9152]
d_richardson_0  (u' idx):  {74, 75}
d_richardson_0  (U idx):  [31, 120]
d_richardson_1  (u' idx):  {73, 39}
d_richardson_1  (U idx):  [72, 20]
[([75], [74]), ([39], [73])]
[31, 120, 72, 20]
Total Labeled number:  127  Still unlabeled number:  78
30
30
Self labeled sample index:  defaultdict(<class 'list'>, {'d_richardson_0': [[148, 66], [78, 26], [348, 276], [331, 280], [88, 165], [79, 217], [313, 11], [181, 222], [240, 245], [202, 191], [259, 257], [232, 135], [231, 325], [106, 157], [315, 93], [269, 283], [32, 285], [83, 108], [151, 267], [137, 211], [234, 159], [196, 166], [324], 

P:  1  :  [0.9756 0.0244]
N:  31  :  [0.0288 0.9712]
P:  64  :  [0.9937 0.0063]
N:  63  :  [0.1419 0.8581]
d_richardson_0  (u' idx):  {64, 1}
d_richardson_0  (U idx):  [182, 107]
d_richardson_1  (u' idx):  {63, 31}
d_richardson_1  (U idx):  [62, 36]
[([1], [64]), ([31], [63])]
[182, 107, 62, 36]
P:  6  :  [0.9765 0.0235]
N:  11  :  [0.0275 0.9725]
P:  3  :  [0.99 0.01]
N:  76  :  [0.0619 0.9381]
d_richardson_0  (u' idx):  {3, 6}
d_richardson_0  (U idx):  [45, 153]
d_richardson_1  (u' idx):  {11, 76}
d_richardson_1  (U idx):  [147, 224]
[([6], [3]), ([11], [76])]
[45, 153, 147, 224]
P:  68  :  [0.9812 0.0188]
N:  36  :  [0.0274 0.9726]
P:  76  :  [0.9924 0.0076]
N:  72  :  [0.1236 0.8764]
d_richardson_0  (u' idx):  {68, 76}
d_richardson_0  (U idx):  [164, 106]
d_richardson_1  (u' idx):  {72, 36}
d_richardson_1  (U idx):  [142, 179]
[([68], [76]), ([36], [72])]
[164, 106, 142, 179]
P:  74  :  [0.9885 0.0115]
N:  75  :  [0.0084 0.9916]
P:  74  :  [0.9989 0.0011]
N:  10  :  [0.1246 0.8754]

P:  24  :  [0.974 0.026]
N:  34  :  [0.0257 0.9743]
P:  74  :  [0.9897 0.0103]
N:  33  :  [0.1397 0.8603]
d_richardson_0  (u' idx):  {24, 74}
d_richardson_0  (U idx):  [75, 177]
d_richardson_1  (u' idx):  {33, 34}
d_richardson_1  (U idx):  [124, 192]
[([24], [74]), ([34], [33])]
[75, 177, 124, 192]
P:  73  :  [0.9756 0.0244]
N:  74  :  [0.0288 0.9712]
P:  5  :  [0.9831 0.0169]
N:  74  :  [0.1283 0.8717]
d_richardson_0  (u' idx):  {73, 5}
d_richardson_0  (U idx):  [84, 11]
d_richardson_1  (u' idx):  {74}
d_richardson_1  (U idx):  [178]
[([73], [5]), ([74], [74])]
[84, 11, 178]
P:  75  :  [0.9848 0.0152]
N:  51  :  [0.0276 0.9724]
P:  54  :  [0.9864 0.0136]
N:  43  :  [0.1262 0.8738]
d_richardson_0  (u' idx):  {75, 54}
d_richardson_0  (U idx):  [236, 234]
d_richardson_1  (u' idx):  {43, 51}
d_richardson_1  (U idx):  [330, 24]
[([75], [54]), ([51], [43])]
[236, 234, 330, 24]
P:  70  :  [0.9806 0.0194]
N:  71  :  [0.0289 0.9711]
P:  14  :  [0.9852 0.0148]
N:  61  :  [0.1186 0.8814]
d_richa

P:  62  :  [0.958 0.042]
N:  43  :  [0.0682 0.9318]
P:  72  :  [0.9786 0.0214]
N:  2  :  [0.1877 0.8123]
d_richardson_0  (u' idx):  {72, 62}
d_richardson_0  (U idx):  [202, 143]
d_richardson_1  (u' idx):  {2, 43}
d_richardson_1  (U idx):  [78, 24]
[([62], [72]), ([43], [2])]
[202, 143, 78, 24]
P:  31  :  [0.9594 0.0406]
N:  4  :  [0.0463 0.9537]
P:  74  :  [0.9886 0.0114]
N:  51  :  [0.1708 0.8292]
d_richardson_0  (u' idx):  {74, 31}
d_richardson_0  (U idx):  [252, 106]
d_richardson_1  (u' idx):  {51, 4}
d_richardson_1  (U idx):  [52, 187]
[([31], [74]), ([4], [51])]
[252, 106, 52, 187]
P:  70  :  [0.9573 0.0427]
N:  14  :  [0.0403 0.9597]
P:  5  :  [0.9792 0.0208]
N:  59  :  [0.1793 0.8207]
d_richardson_0  (u' idx):  {5, 70}
d_richardson_0  (U idx):  [226, 0]
d_richardson_1  (u' idx):  {59, 14}
d_richardson_1  (U idx):  [321, 346]
[([70], [5]), ([14], [59])]
[226, 0, 321, 346]
P:  29  :  [0.9621 0.0379]
N:  51  :  [0.0317 0.9683]
P:  29  :  [0.9795 0.0205]
N:  68  :  [0.1686 0.8314]
d

[-1, -1, 'd_richardson_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'd_richardson_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'd_richardson_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'd_richardson_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'd_richardson_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'd_richardson_1', 'd_richardson_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'd_richardson_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

P:  82  :  [0.9861 0.0139]
N:  74  :  [0.0116 0.9884]
P:  79  :  [0.9996 0.0004]
N:  72  :  [0.1159 0.8841]
d_richardson_0  (u' idx):  {82, 79}
d_richardson_0  (U idx):  [133, 119]
d_richardson_1  (u' idx):  {72, 74}
d_richardson_1  (U idx):  [336, 264]
[([82], [79]), ([74], [72])]
[133, 119, 336, 264]
Total Labeled number:  122  Still unlabeled number:  83
30
30
Self labeled sample index:  defaultdict(<class 'list'>, {'d_richardson_0': [[51, 217], [348, 280], [236, 226], [132, 313], [245, 240], [210, 232], [324, 40], [166, 290], [311], [32, 82], [128, 31], [124], [285, 238], [63, 11], [211, 143], [269, 29], [136, 254], [176, 222], [114, 21], [231, 162], [267, 45], [106, 111], [159, 276], [298, 249], [79], [26], [81, 0], [103, 334], [256, 234], [133, 119]], 'd_richardson_1': [[357, 224], [151, 185], [144, 297], [91, 201], [94, 14], [335], [310, 303], [265, 97], [326], [300, 27], [213, 243], [167, 66], [209, 299], [239, 284], [221, 25], [30, 43], [322, 155], [92], [241], [164, 355], [22

P:  73  :  [0.9886 0.0114]
N:  35  :  [0.018 0.982]
P:  42  :  [0.9884 0.0116]
N:  44  :  [0.1007 0.8993]
d_richardson_0  (u' idx):  {73, 42}
d_richardson_0  (U idx):  [318, 37]
d_richardson_1  (u' idx):  {35, 44}
d_richardson_1  (U idx):  [125, 181]
[([73], [42]), ([35], [44])]
[318, 37, 125, 181]
P:  73  :  [0.9818 0.0182]
N:  76  :  [0.018 0.982]
P:  73  :  [0.9965 0.0035]
N:  76  :  [0.0569 0.9431]
d_richardson_0  (u' idx):  {73}
d_richardson_0  (U idx):  [11]
d_richardson_1  (u' idx):  {76}
d_richardson_1  (U idx):  [189]
[([73], [73]), ([76], [76])]
[11, 189]
P:  73  :  [0.9821 0.0179]
N:  75  :  [0.0129 0.9871]
P:  6  :  [0.9883 0.0117]
N:  77  :  [0.0818 0.9182]
d_richardson_0  (u' idx):  {73, 6}
d_richardson_0  (U idx):  [143, 111]
d_richardson_1  (u' idx):  {75, 77}
d_richardson_1  (U idx):  [232, 322]
[([73], [6]), ([75], [77])]
[143, 111, 232, 322]
P:  12  :  [0.9805 0.0195]
N:  78  :  [0.0148 0.9852]
P:  27  :  [0.9896 0.0104]
N:  78  :  [0.0569 0.9431]
d_richardson_0  (u'

P:  19  :  [0.9795 0.0205]
N:  14  :  [0.025 0.975]
P:  0  :  [0.9755 0.0245]
N:  72  :  [0.0954 0.9046]
d_richardson_0  (u' idx):  {0, 19}
d_richardson_0  (U idx):  [31, 0]
d_richardson_1  (u' idx):  {72, 14}
d_richardson_1  (U idx):  [264, 127]
[([19], [0]), ([14], [72])]
[31, 0, 264, 127]
P:  62  :  [0.9833 0.0167]
N:  14  :  [0.0238 0.9762]
P:  36  :  [0.9771 0.0229]
N:  73  :  [0.1022 0.8978]
d_richardson_0  (u' idx):  {36, 62}
d_richardson_0  (U idx):  [191, 319]
d_richardson_1  (u' idx):  {73, 14}
d_richardson_1  (U idx):  [304, 76]
[([62], [36]), ([14], [73])]
[191, 319, 304, 76]
P:  18  :  [0.9794 0.0206]
N:  74  :  [0.016 0.984]
P:  20  :  [0.9756 0.0244]
N:  75  :  [0.0808 0.9192]
d_richardson_0  (u' idx):  {18, 20}
d_richardson_0  (U idx):  [250, 204]
d_richardson_1  (u' idx):  {74, 75}
d_richardson_1  (U idx):  [270, 14]
[([18], [20]), ([74], [75])]
[250, 204, 270, 14]
P:  73  :  [0.9847 0.0153]
N:  1  :  [0.0225 0.9775]
P:  73  :  [0.9814 0.0186]
N:  40  :  [0.1224 0.8776

P:  74  :  [0.9805 0.0195]
N:  7  :  [0.0449 0.9551]
P:  71  :  [0.9951 0.0049]
N:  52  :  [0.1379 0.8621]
d_richardson_0  (u' idx):  {74, 71}
d_richardson_0  (U idx):  [50, 79]
d_richardson_1  (u' idx):  {52, 7}
d_richardson_1  (U idx):  [165, 320]
[([74], [71]), ([7], [52])]
[50, 79, 165, 320]
P:  72  :  [0.9829 0.0171]
N:  12  :  [0.0381 0.9619]
P:  73  :  [0.9913 0.0087]
N:  4  :  [0.153 0.847]
d_richardson_0  (u' idx):  {72, 73}
d_richardson_0  (U idx):  [276, 349]
d_richardson_1  (u' idx):  {12, 4}
d_richardson_1  (U idx):  [326, 305]
[([72], [73]), ([12], [4])]
[276, 349, 326, 305]
P:  74  :  [0.9911 0.0089]
N:  59  :  [0.0329 0.9671]
P:  68  :  [0.9927 0.0073]
N:  71  :  [0.1215 0.8785]
d_richardson_0  (u' idx):  {74, 68}
d_richardson_0  (U idx):  [101, 262]
d_richardson_1  (u' idx):  {59, 71}
d_richardson_1  (U idx):  [122, 344]
[([74], [68]), ([59], [71])]
[101, 262, 122, 344]
P:  71  :  [0.9873 0.0127]
N:  62  :  [0.0308 0.9692]
P:  74  :  [0.9956 0.0044]
N:  11  :  [0.1331 

P:  47  :  [0.9644 0.0356]
N:  56  :  [0.0514 0.9486]
P:  54  :  [0.9741 0.0259]
N:  15  :  [0.171 0.829]
d_richardson_0  (u' idx):  {54, 47}
d_richardson_0  (U idx):  [262, 128]
d_richardson_1  (u' idx):  {56, 15}
d_richardson_1  (U idx):  [270, 277]
[([47], [54]), ([56], [15])]
[262, 128, 270, 277]
P:  69  :  [0.9667 0.0333]
N:  2  :  [0.0475 0.9525]
P:  37  :  [0.978 0.022]
N:  58  :  [0.2062 0.7938]
d_richardson_0  (u' idx):  {37, 69}
d_richardson_0  (U idx):  [357, 180]
d_richardson_1  (u' idx):  {2, 58}
d_richardson_1  (U idx):  [130, 189]
[([69], [37]), ([2], [58])]
[357, 180, 130, 189]
P:  2  :  [0.9681 0.0319]
N:  21  :  [0.0416 0.9584]
P:  73  :  [0.9909 0.0091]
N:  65  :  [0.1853 0.8147]
d_richardson_0  (u' idx):  {73, 2}
d_richardson_0  (U idx):  [79, 221]
d_richardson_1  (u' idx):  {65, 21}
d_richardson_1  (U idx):  [210, 224]
[([2], [73]), ([21], [65])]
[79, 221, 210, 224]
P:  11  :  [0.9719 0.0281]
N:  26  :  [0.0377 0.9623]
P:  27  :  [0.9819 0.0181]
N:  20  :  [0.1801 

                precision    recall  f1-score   support

d_richardson_0       0.97      1.00      0.99       231
d_richardson_1       1.00      0.96      0.98       167

     micro avg       0.98      0.98      0.98       398
     macro avg       0.99      0.98      0.98       398
  weighted avg       0.99      0.98      0.98       398

[231   0   6 161]
                precision    recall  f1-score   support

d_richardson_0       0.97      0.99      0.98       231
d_richardson_1       0.99      0.96      0.97       167

     micro avg       0.98      0.98      0.98       398
     macro avg       0.98      0.97      0.98       398
  weighted avg       0.98      0.98      0.98       398

[229   2   7 160]
For name:  j_moraes
(26, 2)
j_moraes  pass
For name:  e_moreno
(83, 2)
e_moreno  pass
For name:  r_little
(4, 2)
r_little  pass
For name:  t_kobayashi
(150, 2)
t_kobayashi  pass
For name:  a_lin
(46, 2)
a_lin  pass
For name:  a_miranda
(70, 2)
a_miranda  pass
For name:  h_vogel
(15, 2)

N:  73  :  [0.0991 0.9009]
P:  5  :  [0.9422 0.0578]
N:  12  :  [0.0757 0.9243]
y_wang_0  (u' idx):  {60, 5}
y_wang_0  (U idx):  [140, 145]
y_wang_1  (u' idx):  {73, 12}
y_wang_1  (U idx):  [40, 81]
[([60], [5]), ([73], [12])]
[140, 145, 40, 81]
P:  3  :  [0.8917 0.1083]
N:  52  :  [0.0827 0.9173]
P:  59  :  [0.9569 0.0431]
N:  10  :  [0.0568 0.9432]
y_wang_0  (u' idx):  {59, 3}
y_wang_0  (U idx):  [170, 157]
y_wang_1  (u' idx):  {10, 52}
y_wang_1  (U idx):  [205, 212]
[([3], [59]), ([52], [10])]
[170, 157, 205, 212]
P:  38  :  [0.9061 0.0939]
N:  71  :  [0.079 0.921]
P:  4  :  [0.9209 0.0791]
N:  73  :  [0.0367 0.9633]
y_wang_0  (u' idx):  {4, 38}
y_wang_0  (U idx):  [78, 84]
y_wang_1  (u' idx):  {73, 71}
y_wang_1  (U idx):  [114, 127]
[([38], [4]), ([71], [73])]
[78, 84, 114, 127]
P:  20  :  [0.9131 0.0869]
N:  65  :  [0.0606 0.9394]
P:  58  :  [0.9279 0.0721]
N:  74  :  [0.0256 0.9744]
y_wang_0  (u' idx):  {58, 20}
y_wang_0  (U idx):  [137, 46]
y_wang_1  (u' idx):  {65, 74}
y_wang_1

[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'y_wang_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'y_wang_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'y_wang_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'y_wang_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'y_wang_0', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'y_wang_0', 'y_wang_1', -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 'y_wang_0', -1, -1, 'y_wang_1', -1, -1, 'y_wang_0', -1, -1, -1]
P value:  1  N value:  1
Initial L size:  10
Initial U size:  2

KeyboardInterrupt: 

In [18]:
from statistics import mean 

# Dump the accumulated result containers verbatim (nested lists) for inspection.
# NOTE(review): both names are created by earlier notebook cells that are not
# visible here; judging by the names and the printed values they appear to hold
# per-name-group co-training F1 scores -- confirm against the defining cell.
print(threshold_change_all_co_lr_f1s)
print(co_lr_diff_embedding_result)

[[0.9761167624944714, 0.9923460612315101, 0.9794320560192242, 0.9579713216076853, 0.966326695807026, 0.9819839830205209, 0.9922126745435016, 0.9424861608596549, 0.9944401544401544, 0.9732348111658456, 0.939352110194132, 0.9924358974358973, 0.976890756302521, 0.9293797303430003, 0.9883887801696021]]
[[[0.9761167624944714, 0.9923460612315101, 0.9794320560192242, 0.9579713216076853, 0.966326695807026, 0.9819839830205209, 0.9922126745435016, 0.9424861608596549, 0.9944401544401544, 0.9732348111658456, 0.939352110194132, 0.9924358974358973, 0.976890756302521, 0.9293797303430003, 0.9883887801696021]]]


In [None]:
# Remove both view variables from the notebook namespace in a single statement
# (targets are deleted left to right; a missing name still raises NameError).
# To list the variables that remain alive, uncomment the IPython magic below.
# %whos
del viewtwo_citation_embedding, viewone_text_emb