# multi-class co-training

Apply co-training directly to the multi-class problem.

In [1]:
import os
import sys
import warnings

# warnings.filterwarnings('error')
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Put the parent directory on the import path so that `com_func` (imported
# right below) can be found; the order of these statements matters.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import com_func
# ----------------------------- parameters ----------------------------- #
# Threshold handed to com_func.select_productive_groups() when deciding
# whether a name group is selected.
threshold_select_name_group = 100
# Range of per-author sample thresholds that the experiment loop sweeps with
# range(threshold_lower, threshold_upper, 10); each value is handed to
# com_func.only_select_productive_authors().
threshold_lower = 100
threshold_upper = 110

# Textual embedding types the experiment loop iterates over.
pp_textual = ["pv_dbow"]
# pp_textual = ["lsa", "pv_dm", "pv_dbow"]
# Citation embedding type (only referenced from commented-out code in the
# visible cells).
pp_citation = "n2v"

# Dataset name; used to build the canopy directory path "../../Data/<Dataset>/canopies/".
Dataset = "pubmed"

In [156]:
import numpy as np
import warnings
import random
import sys
from collections import defaultdict
from sklearn import preprocessing
# create co training classifier
class Multi_Class_Co_training(object):
    """Co-training wrapper applied directly to a multi-class problem.

    Two base classifiers are trained on two feature views of the same
    samples. In every round each classifier nominates, for every class, the
    unlabeled samples it is most confident about (probability > 0.5); the
    nominated samples receive that class label, join the labeled set, and both
    classifiers are retrained.

    Unlabeled samples are identified by the string label ``"-1"``.

    Both base classifiers must expose ``fit``, ``predict``, ``predict_proba``
    and ``classes_``. Irrecoverable situations (the two views disagree on the
    class list, or on a prediction when no probabilities are available) end
    with ``sys.exit``, i.e. ``SystemExit``.
    """

    # label value that marks a sample as unlabeled
    _UNLABELED = "-1"

    def __init__(self, clf1, clf2=None, k=30, u=100):
        """
        clf1: classifier for the first view
        clf2: classifier for the second view; a deep copy of clf1 when omitted
        k:    maximum number of co-training rounds
        u:    minimal size of the pool of unlabeled samples (U')
        """
        self.clf1 = clf1
        # assume co-training on one kind of classifier when clf2 is not given
        self.clf2 = self._clone(clf1) if clf2 is None else clf2
        # number of samples each classifier may label per class and round
        self.self_labeled_base = 1
        # number of iterations
        self.k = k
        # minimal size of pool of unlabeled samples
        self.minimal_u = u
        # effective pool size; fit() raises it to the labeled-set size if larger
        self.u = u
        # bookkeeping exists from the start so the getters work before fit()
        self.new_labeled_idx = defaultdict(list)
        self.new_labeled_count = {}

    @staticmethod
    def _clone(estimator):
        """Return an independent deep copy of *estimator*."""
        # local import: the class stays usable from a single notebook cell
        import copy
        return copy.deepcopy(estimator)

    def init_U_prime(self, U):
        """Shuffle U in place and split off the initial pool U'.

        return:
            (remaining U, U_prime) where U_prime holds the last
            min(len(U), self.u) shuffled entries
        """
        random.shuffle(U)
        pool_size = max(0, min(len(U), self.u))
        # explicit split point: a negative slice would turn "-0" into "everything"
        split = len(U) - pool_size
        return U[:split], U[split:]

    def check_iter_label_mapping(self, iter_clf1, iter_clf2):
        '''
        In theory the two classifiers always see the same classes because they
        are trained on the same samples (different views). Verify it anyway and
        remember the class order, which is needed to turn probability columns
        back into labels.
        '''
        dv1_class_label = iter_clf1.classes_
        dv2_class_label = iter_clf2.classes_
        # array_equal also copes with class lists of different length
        if not np.array_equal(dv1_class_label, dv2_class_label):
            sys.exit("Two view classifier label not mapping")
        self.class_ = dv1_class_label

    def label_unlabeled_samples(self, proba, rank):
        """Pick, per class, the most confident samples of the pool.

        proba: class probabilities, one row per pool sample
        rank:  rank[i] lists pool positions ordered by confidence for class i

        return:
            list with one entry per class; each entry lists at most
            self.self_labeled_base pool positions whose probability for that
            class is above 0.5
        """
        self_trained_labels = []
        for label, conf_measure in enumerate(rank):
            new_label = []
            for sample_pos in conf_measure:
                if len(new_label) >= self.self_labeled_base:
                    break
                # ---- only accept if predicted proba is more than 50% ---- #
                if proba[sample_pos][label] > 0.5:
                    new_label.append(int(sample_pos))
            self_trained_labels.append(new_label)
        return self_trained_labels

    def fit(self, dataView1, dataView2, labels):
        """Run co-training and train the final classifiers.

        dataView1, dataView2: the two feature views; rows are selected with
            ``.iloc``
        labels: one label per row, ``"-1"`` for unlabeled rows; must support
            ``.copy()`` and indexing with a list of row positions (a pandas
            Series is expected to carry the default 0..n-1 index)

        The passed ``labels`` object is left untouched; self-assigned labels
        are written to an internal copy.
        """
        labels = labels.copy()
        # index of self labeled samples
        self.new_labeled_idx = defaultdict(list)
        # count of self labeled samples
        self.new_labeled_count = {}

        # index of the samples that are initially labeled / unlabeled
        L = [i for i, label_i in enumerate(labels) if label_i != self._UNLABELED]
        U = [i for i, label_i in enumerate(labels) if label_i == self._UNLABELED]
        # ---- our u prime will be max(user_input, number of labeled train) ---- #
        self.u = max(self.minimal_u, len(L))
        U, U_prime = self.init_U_prime(U)
        print("L: ", len(L), "U: ",len(U))
        num_class = len(set(labels[L]))
        print("init class count: ", num_class)
        # both classifiers may remove one sample per class from the pool
        replenish_size = 2 * num_class * self.self_labeled_base

        iterCount = 0
        # loop until the pool is empty or the iteration limit is hit
        while iterCount < self.k and U_prime:
            iter_train_label = labels[L]
            iter_clf1 = self._clone(self.clf1)
            iter_clf2 = self._clone(self.clf2)
            iter_clf1.fit(X=dataView1.iloc[L], y=iter_train_label)
            iter_clf2.fit(X=dataView2.iloc[L], y=iter_train_label)

            self.check_iter_label_mapping(iter_clf1, iter_clf2)

            # rank class probabilities of the pool as confidence measure;
            # *_rank[i] orders pool positions by confidence for class i
            dv1_proba = iter_clf1.predict_proba(dataView1.iloc[U_prime])
            dv2_proba = iter_clf2.predict_proba(dataView2.iloc[U_prime])
            dv1_proba_rank = [(-class_proba).argsort() for class_proba in dv1_proba.T]
            dv2_proba_rank = [(-class_proba).argsort() for class_proba in dv2_proba.T]

            newly_labeled_dv1 = self.label_unlabeled_samples(dv1_proba, dv1_proba_rank)
            newly_labeled_dv2 = self.label_unlabeled_samples(dv2_proba, dv2_proba_rank)
            picks_per_class = [set(picks1) | set(picks2)
                               for picks1, picks2 in zip(newly_labeled_dv1, newly_labeled_dv2)]

            # A pool position nominated for more than one class means the two
            # views contradict each other; such samples stay in the pool
            # instead of being added to L twice with an arbitrary label.
            claims = defaultdict(int)
            for picks in picks_per_class:
                for pos in picks:
                    claims[pos] += 1

            # auto label the samples
            round_auto_labeled = []
            for label, picks in enumerate(picks_per_class):
                auto_labeled = [U_prime[pos] for pos in sorted(picks) if claims[pos] == 1]
                round_auto_labeled.extend(auto_labeled)
                self.new_labeled_idx[self.class_[label]].append(auto_labeled)
                if auto_labeled:
                    labels[auto_labeled] = self.class_[label]
            print(round_auto_labeled)

            # extend the labeled set and remove the samples from the pool
            L.extend(round_auto_labeled)
            labeled_now = set(round_auto_labeled)
            U_prime = [x for x in U_prime if x not in labeled_now]

            # replenish the pool from the (already shuffled) tail of U
            split = max(0, len(U) - replenish_size)
            U_prime.extend(U[split:])
            U = U[:split]
            iterCount += 1

        print("Total Labeled number: ", len(L), " Still unlabeled number: ", len(U_prime))
        # final train on initial + self-labeled samples
        final_labels = labels[L]
        self.clf1.fit(dataView1.iloc[L], final_labels)
        self.clf2.fit(dataView2.iloc[L], final_labels)
        # store the class order of the final models; predict() needs it even
        # when the loop above never ran (no unlabeled samples, or k == 0)
        self.check_iter_label_mapping(self.clf1, self.clf2)

    def get_self_labeled_sample(self):
        '''
        return:
            self-labeled sample index: class label -> list (one entry per
            round) of lists of row positions
        '''
        return self.new_labeled_idx

    def get_self_labeled_sample_count(self):
        '''
        return:
            dict: class label -> total number of self-labeled samples
        '''
        for key, per_round in self.new_labeled_idx.items():
            self.new_labeled_count[key] = sum(len(round_idx) for round_idx in per_round)
        return self.new_labeled_count

    def supports_proba(self, clf, x):
        """Return True when ``clf.predict_proba([x])`` runs without error."""
        try:
            clf.predict_proba([x])
        except Exception:
            return False
        return True

    def predict(self, dataView1, dataView2):
        """Predict one label per row from both views.

        When both classifiers agree, that label is used. Otherwise the class
        probabilities of both views are multiplied and the class with the
        largest product wins. If the classifiers disagree and probabilities
        are not available the process exits via ``sys.exit``.

        return:
            numpy array of predicted labels (empty for empty input)
        """
        n_samples = dataView1.shape[0]
        if n_samples == 0:
            return np.asarray([])
        y1 = self.clf1.predict(dataView1)
        y2 = self.clf2.predict(dataView2)
        proba_supported = (self.supports_proba(self.clf1, dataView1.iloc[0])
                           and self.supports_proba(self.clf2, dataView2.iloc[0]))
        # placeholder entries; every position is overwritten below
        y_pred = [" "] * n_samples
        for i, (y1_i, y2_i) in enumerate(zip(y1, y2)):
            # if both clf agree on label
            if y1_i == y2_i:
                y_pred[i] = y1_i
            # on disagreement multiply the probabilities and take the best class
            elif proba_supported:
                y1_probas = self.clf1.predict_proba([dataView1.iloc[i]])[0]
                y2_probas = self.clf2.predict_proba([dataView2.iloc[i]])[0]
                print("y1 disagree on",i, " Proba: ",y1_probas)
                print("y2 not aggreed on ",i, "Proba: ", y2_probas)
                prod_y_probas = [proba_y1 * proba_y2 for (proba_y1, proba_y2) in zip(y1_probas, y2_probas)]
                print("product probas:",prod_y_probas)
                max_prob_idx = prod_y_probas.index(max(prod_y_probas))
                y_pred[i] = self.class_[max_prob_idx]
                print("result idx: ", max_prob_idx, " result: ",y_pred[i])
            else:
                # the classifiers disagree and don't support probability, exit
                sys.exit("classifiers disagree with label, result may not accurate")
        # convert final result to np array
        return np.asarray(y_pred)

    def predict_proba(self, dataView1, dataView2):
        """Return the element-wise product of both classifiers' probabilities.

        The product is not re-normalised, so rows need not sum to 1.
        """
        y1_probas = self.clf1.predict_proba(dataView1)
        y2_probas = self.clf2.predict_proba(dataView2)
        return y1_probas * y2_probas


In [157]:
import copy
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score
from collections import defaultdict


# cross validation
def k_fold_cv_co_train(dataview1, dataview2, unlabeled_dv1, unlabeled_dv2, label, clf, k=10):
    """Stratified k-fold cross validation of a two-view co-training classifier.

    For every fold the training part of the labeled data is stacked on top of
    the unlabeled data, a deep copy of ``clf`` is fitted on both views and the
    held-out labeled part is predicted. Predictions of all folds are pooled
    before the metrics are computed.

    dataview1, dataview2: labeled samples of both views; data frames that
        contain the columns "authorID" and "paperID" next to the features
    unlabeled_dv1, unlabeled_dv2: unlabeled samples of both views; data frames
        whose "label" column is appended to the training labels
    label: pandas Series with the true label of every labeled row
    clf: unfitted co-training classifier offering fit(view1, view2, labels),
        predict(view1, view2) and get_self_labeled_sample_count()
    k: number of folds

    return:
        (accuracy, macro f1, per-class number of self-labeled samples averaged
        over the folds and rounded)

    Relies on ``pd`` and ``np`` being available at module level.
    """
    kf = StratifiedKFold(n_splits=k)
    allTrueLabel = []
    allPredLabel = []
    all_generated_train = []
    for train_index, test_index in kf.split(dataview1, label):
        # ---------------split train and test -------------------- #
        dv1_train, dv1_test = dataview1.iloc[train_index], dataview1.iloc[test_index]
        dv2_train, dv2_test = dataview2.iloc[train_index], dataview2.iloc[test_index]
        # the fold indices are positions, so select positionally
        label_train, label_test = label.iloc[train_index], label.iloc[test_index]
        # -------------- add unlabeled to train ------------------ #
        final_dv1 = pd.concat([dv1_train, unlabeled_dv1], ignore_index=True)
        final_dv2 = pd.concat([dv2_train, unlabeled_dv2], ignore_index=True)
        final_dv1 = final_dv1.drop(["authorID", "paperID", "label"], axis=1)
        final_dv2 = final_dv2.drop(["authorID", "paperID", "label"], axis=1)
        # Series.append no longer exists in pandas >= 2.0; concat is equivalent
        final_labels = pd.concat([label_train, unlabeled_dv1["label"]], ignore_index=True)
        # -------------- train co-training multi class ------------------- #
        co_train_clf = copy.deepcopy(clf)
        co_train_clf.fit(final_dv1, final_dv2, final_labels)
        all_generated_train.append(co_train_clf.get_self_labeled_sample_count())

        # -------------- test co-training on the held-out fold ----------- #
        dv1_test = dv1_test.drop(["authorID", "paperID"], axis=1)
        dv2_test = dv2_test.drop(["authorID", "paperID"], axis=1)

        # get predicted label
        co_lr_label_predict = co_train_clf.predict(dv1_test, dv2_test)
        allTrueLabel.extend(label_test)
        allPredLabel.extend(co_lr_label_predict)

    # --------- average number of self labeled samples per class over folds --- #
    all_fold_averaged_generated_train = defaultdict(list)
    print(all_generated_train)
    for per_fold_count_dic in all_generated_train:
        for key, value in per_fold_count_dic.items():
            all_fold_averaged_generated_train[key].append(value)
    print(all_fold_averaged_generated_train)
    # keys are only re-assigned, never added, so iterating a snapshot of the
    # keys keeps the update loop obviously safe
    for key in list(all_fold_averaged_generated_train):
        per_fold_counts = all_fold_averaged_generated_train[key]
        all_fold_averaged_generated_train[key] = np.around(np.mean(per_fold_counts, axis=0))
    # ------------- accuracy and f1 ---------- #
    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel,average='macro')

    print(all_fold_averaged_generated_train)
    print(metrics.classification_report(allTrueLabel, allPredLabel))
    print(metrics.confusion_matrix(allTrueLabel, allPredLabel).ravel())

    return accuracy, f1, all_fold_averaged_generated_train

In [158]:
# load the file
import sys
import io
import os
import collections
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from statistics import mean 

# Fix numpy's random seed for reproducibility.
# NOTE(review): Multi_Class_Co_training.init_U_prime shuffles with the stdlib
# `random` module, which this seed does not cover - confirm whether
# random.seed(...) is wanted as well.
np.random.seed(1)

# Directory holding one canopy file per name group; every file name found
# there is processed by the loop below.
fileDir = "../../Data/"+Dataset+"/canopies/"
listfiles = os.listdir(fileDir)

# Collector for results per embedding type (only filled by commented-out code
# in the visible part of the notebook).
co_lr_diff_embedding_result = []

# # ------------ view two citation is fix, so move out to save time ------- #
# # read viewtwo embedding
# print("Load citation embedding: ", pp_citation)
# viewtwo_citation_embedding = com_func.read_all_citation_embedding_sorted(emb_type = pp_citation)

# Experiment loop: textual embedding type -> sample threshold -> name group.
# NOTE(review): `viewone_textual_emb` and `viewtwo_citation_embedding` are used
# below, but the statements that load them are commented out in the visible
# code; they presumably have to exist in the session already - confirm before
# running on a fresh kernel.
for select_emb in pp_textual:
#     #---------------- load embeddings for different view ---------------#
#     print("Load textual embedding: ", select_emb)
#     # read viewone embeddings
#     viewone_textual_emb = com_func.read_all_textual_embedding_sorted(emb_type=select_emb, training_size = "3m")

    threshold_change_all_co_lr_f1s = []
    threshold_change = []
    
    # -------------- different threshold (step by 10) -----------------------#
    for step_threshold in range(threshold_lower, threshold_upper, 10):
        threshold_change.append(step_threshold)
        # collect statistic to output
        allname, num_class, per_class_count, all_labeled_count, selected_labeled_count, unlabeled_count = ([] for i in range(6))

        all_co_LR_accuracy, all_co_LR_f1, co_train_generated_label_details= ([] for i in range(3))

        total_selected_group = 0

        # ------- different name group in all name group --------------------#
        for file in listfiles:
            # group name: second and last "_"-separated parts of the file name
            temp = file.split("_")
            name = temp[1]+"_"+temp[-1]
            print("For name: ",name)
            # read pid and aid from file
            data = com_func.read_pid_aid(fileDir+file)
            # rows whose authorID is "-1" are the unlabeled ones
            labeled_mask = data["authorID"] != "-1"
            labeled_data = data[labeled_mask]
            unlabeled_mask = data["authorID"] == "-1"
            ublabeled_data = data[unlabeled_mask]
            unlabeled_pid = ublabeled_data["paperID"]
            print(len(unlabeled_pid))
            print(labeled_data.shape)
            #----------- select name group contain productive author------------------------------------#
            #----------- (contain pair of author write more than 100 papers) ---------------------------#
            # count number of paper each author write based on author ID
            authorCounter = com_func.select_productive_groups(labeled_data, threshold_select_name_group)
            # if only have one class or no class pass the threshold, not applicable
            if(len(authorCounter)==0) or (len(authorCounter)==1):
                # the name group is skipped
                print(name, " pass")
            else:
                all_labeled_count.append(len(labeled_data))
                total_selected_group+= 1
                labeled_data, author_list, paperCounter= com_func.only_select_productive_authors(labeled_data, step_threshold)
                allname.append(name)
                num_class.append(len(paperCounter))
                per_class_count.append(paperCounter)

                # -------------- extract all samples for name group -------------- #
                # for each name group
                # read in labeled data
                labeled_viewone_textual = com_func.extract_sorted_embedding(viewone_textual_emb, labeled_data["paperID"])
                print(labeled_viewone_textual.shape)
                labeled_viewtwo_citation = com_func.extract_sorted_embedding(viewtwo_citation_embedding, labeled_data["paperID"])
                print(labeled_viewtwo_citation.shape)
                print("Labeled: ",len(labeled_viewone_textual), " : ", len(labeled_viewtwo_citation))

                # read in unlabeled data
                unlabeled_viewone_textual = com_func.extract_unlabeled_embedding(viewone_textual_emb, unlabeled_pid)
                print(unlabeled_viewone_textual.shape)
                unlabeled_viewtwo_citation = com_func.extract_unlabeled_embedding(viewtwo_citation_embedding, unlabeled_pid)
                print(unlabeled_viewtwo_citation.shape)
                print("Unlabeled: ",len(unlabeled_viewone_textual), " : ", len(unlabeled_viewtwo_citation))
                # remove samples that have no citation link from unlabeled data:
                # paper ids present in the textual view but absent from the citation view
                noCitationPids_unlabeled = set(unlabeled_viewone_textual['paperID'])-set(unlabeled_viewtwo_citation['paperID'])
                print("Unlabeled no citation link size: ", len(noCitationPids_unlabeled))
                # process unlabeled data
                unlabeled_dv1 = unlabeled_viewone_textual[~unlabeled_viewone_textual['paperID'].isin(noCitationPids_unlabeled)].reset_index(drop=True)
                unlabeled_dv2 = unlabeled_viewtwo_citation
                
                # ---------------- shuffle the data ----------------- #
                labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
                # ------------------ alignment ---------------------- #
                # left-join both views onto the shuffled labeled rows so that the
                # rows of both views follow the order of labeled_data
                # (presumably column 0 of the embedding frames holds the paper id - TODO confirm)
                labeled_viewone_textual = pd.merge(labeled_data, labeled_viewone_textual, left_on="paperID", right_on = [0], how = "left")
                labeled_viewtwo_citation = pd.merge(labeled_data, labeled_viewtwo_citation, left_on="paperID", right_on = [0], how = "left")
                # labeled rows without a match in the citation view get all-zero features
                labeled_viewtwo_citation.fillna(0, inplace=True)
                
                labeled_viewone_textual = labeled_viewone_textual.drop([0], axis=1)
                labeled_viewtwo_citation = labeled_viewtwo_citation.drop([0], axis=1)
                
                #print(labeled_viewone_textual.head())
                #print(unlabeled_dv1.head())
                
                print(labeled_viewone_textual.shape)
                print(labeled_viewtwo_citation.shape)
                print(unlabeled_dv1.shape)
                print(unlabeled_dv2.shape)
                unlabeled_count.append(unlabeled_dv1.shape[0])
                selected_labeled_count.append(labeled_viewone_textual.shape[0])
                # ---------------------- k fold cv -------------------------------------------------- #
                # NOTE(review): the original heading said "10 fold cv" but k=2 is passed below
                co_logistic_clf = Multi_Class_Co_training(clf1=LogisticRegression(multi_class='ovr'))
                co_lr_accuracy, co_lr_f1, all_fold_averaged_generated_train = k_fold_cv_co_train(labeled_viewone_textual, 
                                                                                                 labeled_viewtwo_citation,
                                                                                                 unlabeled_dv1, unlabeled_dv2,
                                                                                                 labeled_data["authorID"], 
                                                                                                 co_logistic_clf, k=2)
                print("lr macro f1: ",co_lr_f1)
                print("Co-training self-labeled samples: ", all_fold_averaged_generated_train)
                all_co_LR_accuracy.append(co_lr_accuracy)
                all_co_LR_f1.append(co_lr_f1)
                co_train_generated_label_details.append(all_fold_averaged_generated_train)
                # NOTE(review): stops after the first name group that reaches the
                # cross validation; remaining files are not processed
                break
                
#         # write evaluation result to excel
#         output = pd.DataFrame({'Name Group':allname,"Class number":num_class, "Per class size":per_class_count, 
#                                "Total labeled samples":all_labeled_count, "Total unlabeled samples":unlabeled_count, 
#                                "selected labeled samples": selected_labeled_count,
#                                "Co-training self-labeled samples": co_train_generated_label_details, 
#                                "co-train with lr accuracy":all_co_LR_accuracy, "co-train with lr f1": all_co_LR_f1})

#         savePath = "../../result/"+Dataset+"/co_train/"
#         filename = "(Global emb sample 3m) viewone_textual="+select_emb+"_viewtwo_citation="+pp_citation+"_threshold="+str(step_threshold)+"_namegroupcount="+str(total_selected_group)+".csv"
#         com_func.write_csv_df(savePath, filename, output)
#         print("Done")
        
#         threshold_change_all_co_lr_f1s.append(all_co_LR_f1)
#     co_lr_diff_embedding_result.append(threshold_change_all_co_lr_f1s)

For name:  j_read
956
(136, 2)
j_read  pass
For name:  f_esteves
120
(34, 2)
f_esteves  pass
For name:  c_miller
6564
(252, 2)
c_miller  pass
For name:  r_jha
362
(11, 2)
r_jha  pass
For name:  a_lowe
772
(102, 2)
a_lowe  pass
For name:  a_vega
559
(20, 2)
a_vega  pass
For name:  k_smith
8516
(338, 2)
k_smith  pass
For name:  j_gordon
3753
(19, 2)
j_gordon  pass
For name:  s_liao
2273
(104, 2)
s_liao  pass
For name:  j_qian
3452
(17, 2)
j_qian  pass
For name:  s_bernardi
160
(91, 2)
s_bernardi  pass
For name:  t_hill
1180
(15, 2)
t_hill  pass
For name:  s_schindler
177
(51, 2)
s_schindler  pass
For name:  j_williams
13317
(625, 2)
j_williams  pass
For name:  s_jacobson
1535
(28, 2)
s_jacobson  pass
For name:  e_andrade
333
(17, 2)
e_andrade  pass
For name:  t_santos
587
(45, 2)
t_santos  pass
For name:  k_kim
29577
(1111, 2)
Total sample size before apply threshold:  1111
Counter({'0000-0002-6929-5359': 211, '0000-0001-9498-284X': 154, '0000-0002-5878-8895': 139, '0000-0002-1864-3392':

L:  253 U:  22824
init class count:  3
[7465, 23203, 2411, 17915, 20690, 13841]
[20731, 17210, 15892, 7241, 22631, 10364]
[6653, 8565, 11402, 11512, 1989]
[2470, 3947, 15907, 15394, 10912, 9061]
[11548, 18733, 11755, 21313, 20061, 21038]
[15497, 18977, 9691, 8975, 20621, 13963]
[7983, 13922, 12004, 9225, 13042, 3649]
[15421, 13269, 14098, 13791, 20610, 16685]
[23064, 12603, 10157, 22073, 19238, 8180]
[13054, 6440, 13932, 1704, 7307, 10317]
[16153, 10147, 14174, 15979, 6481, 19591]
[15396, 18815, 10136, 3603, 3420, 15234]
[18491, 12546, 8758, 1653, 15783]
[13352, 5967, 22350, 13700, 6882, 8787]
[21658, 4582, 11214, 10094, 9591, 8176]
[6386, 5622, 13920, 10008, 21836, 16811]
[17579, 18809, 20762, 19139, 22799]
[4674, 20347, 5185, 17229, 19528, 17690]
[18534, 19644, 21240, 2219, 20546, 7797]
[10965, 19095, 13325, 10361, 14820, 21220]
[14784, 8992, 4699, 7634, 13581]
[19885, 17792, 5429, 16523, 11909]
[12027, 4752, 8360, 4515, 17162, 12906]
[18867, 5752, 21823, 4529, 8567, 22793]
[15861, 1

In [127]:
# Scratch check: regroup per-fold self-labeled counts into
# {author id: [count in fold 1, count in fold 2, ...]}.
test = [
    {'0000-0001-9498-284X': 59, '0000-0002-5878-8895': 59, '0000-0002-6929-5359': 55},
    {'0000-0001-9498-284X': 58, '0000-0002-5878-8895': 57, '0000-0002-6929-5359': 54},
]
all_fold_averaged_generated_train = defaultdict(list)
for fold_counts in test:
    for author_id in fold_counts:
        # one entry per fold, appended in fold order
        all_fold_averaged_generated_train[author_id].append(fold_counts[author_id])
    print(fold_counts)
print(all_fold_averaged_generated_train)

{'0000-0001-9498-284X': 59, '0000-0002-5878-8895': 59, '0000-0002-6929-5359': 55}
{'0000-0001-9498-284X': 58, '0000-0002-5878-8895': 57, '0000-0002-6929-5359': 54}
defaultdict(<class 'list'>, {'0000-0001-9498-284X': [59, 58], '0000-0002-5878-8895': [59, 57], '0000-0002-6929-5359': [55, 54]})


In [None]:
%reset

In [None]:
%whos

In [None]:
#               # ------------------- train test split ------------------------------ --------------#
#                 # ------------------- train test split 1:9 ratio -----------------------------------#
#                 dv1_train, dv1_test, dv_y_train, dv1_y_test = train_test_split(sorted_dv1, labeled_data["authorID"], 
#                                                                     test_size=0.1, stratify = labeled_data["authorID"])
#                 # get index of train and test
#                 train_index = dv_y_train.index.tolist()
#                 test_index = dv1_y_test.index.tolist()
#                 dv2_train, dv2_test = sorted_dv2.iloc[train_index], sorted_dv2.iloc[test_index]
#                 # ----------------------add ublabeled data to labeled to form final train set---------#
#                 # rename authorID as label
#                 print("labeled size: ", sorted_dv1.shape)
#                 print("unlabeled size: ", unlabeled_dv1.shape)
#                 print(dv1_train.head())
#                 final_dv1 = pd.concat([dv1_train,unlabeled_dv1], ignore_index=True)
#                 final_dv2 = pd.concat([dv2_train,unlabeled_dv2], ignore_index=True)
#                 print(final_dv1.head())
#                 print(final_dv1.shape)
#                 # get pid and labels for true labels
#                 test_true_label =labeled_data["authorID"].iloc[test_index]
#                 dv1_test.drop(["authorID", "paperID"], axis=1, inplace = True)
#                 dv2_test.drop(["authorID", "paperID"], axis=1, inplace = True)
#                 # ----------------------------------- ovr co-training --------------------------------#
#                 # co-training with logistic regression
#                 co_logistic_clf = Co_training_clf(clf1=LogisticRegression(),p=1,n=1)
#                 co_lr_clf_ovr = co_train_one_vs_rest().fit_one_vs_rest(final_dv1, final_dv2, co_logistic_clf)
#                 co_lr_label_predict = co_lr_clf_ovr.predict(dv1_test, dv2_test)
#                 co_lr_accuracy = accuracy_score(test_true_label, co_lr_label_predict)
#                 co_lr_f1 = f1_score(test_true_label, co_lr_label_predict,average='macro')
#                 print("lr macro f1: ",co_lr_f1)
#                 all_co_LR_accuracy.append(co_lr_accuracy)
#                 all_co_LR_f1.append(co_lr_f1)