In [1]:
import copy
import os
import random
import sys
import warnings

import numpy as np
import pandas as pd

# Uncomment to turn every warning into an exception while debugging.
# warnings.filterwarnings('error')
# Only FutureWarning is silenced; every other category is still reported.
warnings.simplefilter("ignore", FutureWarning)

# Put the parent directory on sys.path (once) before importing com_func
# (presumably com_func.py lives one level above this notebook -- confirm).
module_path = os.path.abspath("..")
if module_path not in sys.path:
    sys.path.append(module_path)

import com_func

# ----- threshold passed to com_func.select_productive_groups ----- #
threshold_select_name_group = 100
# ----- per-author sample thresholds, swept in the experiment cell as
# ----- range(threshold_lower, threshold_upper, 10) --------------- #
threshold_lower, threshold_upper = 100, 110

# True  -> keep only authors returned by com_func.only_select_productive_authors
# False -> keep every sample and use com_func.productive_authors_list
apply_threshold_to_sample = True

# embedding types: view one (textual, iterated over) and view two (citation)
pp_textual = ["lsa", "pv_dm", "pv_dbow"]
pp_citation = "n2v"

# sub-directory of ../../Data/ that holds the canopies
Dataset = "pubmed"

In [None]:
import numpy as np
import warnings
import matplotlib.pyplot as plt

# create co training classifier
class Co_training_clf(object):
    """Binary co-training over two feature views of the same samples.

    Label convention used by every method: 0 = positive, 1 = negative and
    -1 = unlabeled.  Two classifiers (one per view) are trained on the labeled
    set L; on every round each of them moves its most confident predictions
    from a small pool of unlabeled samples (U_prime) into L, and the pool is
    refilled from the remaining unlabeled samples (U).

    The classifiers only need the scikit-learn style methods used here:
    ``fit``, ``predict`` and ``predict_proba``.  ``fit`` and the plotting helper
    additionally rely on the notebook-level names ``f1_score``, ``PCA``, ``plt``
    and ``os`` being defined when they are called.
    """

    def __init__(self, clf1, clf2=None, p=1, n=1, k=30, u=75):
        """
        clf1: classifier for view one.
        clf2: classifier for view two; when omitted, an independent deep copy
              of clf1 is used.
        p:    number of most-confident positive samples each classifier may add
              per round.
        n:    number of most-confident negative samples each classifier may add
              per round.
        k:    maximum number of co-training rounds.
        u:    size of the pool of unlabeled samples (U_prime).
        """
        self.clf1 = clf1
        # `is None` instead of `== None`: never triggers a custom __eq__
        self.clf2 = copy.deepcopy(clf1) if clf2 is None else clf2
        self.p = p
        self.n = n
        self.k = k
        self.u = u
        # filled by fit(); initialised here so the getters work before fit()
        self.new_labeled_pos = []
        self.new_labeled_neg = []
        self.f1_on_test_dv1 = []
        self.f1_on_test_dv2 = []

    def init_L_U_U_prime(self, labels):
        """Split sample positions into L (labeled), U (unlabeled) and U_prime.

        U is shuffled with the stdlib ``random`` generator and at most
        ``self.u`` of its entries are moved into the pool U_prime.

        return:
            L, U, U_prime (lists of positions into ``labels``)
        """
        # position of the samples that are initially labeled
        L = [i for i, label_i in enumerate(labels) if label_i != -1]
        # position of the unlabeled samples
        U = [i for i, label_i in enumerate(labels) if label_i == -1]
        print("Initial L size: ", len(L))
        print("Initial U size: ", len(U))
        # random draw of the pool from U
        random.shuffle(U)
        # explicit split point: a negative-length slice (U[-0:]) would return
        # the whole list when the pool size is 0
        pool_size = max(0, min(len(U), self.u))
        split_at = len(U) - pool_size
        U_prime = U[split_at:]
        U = U[:split_at]
        return L, U, U_prime

    def _take_confident(self, proba, ranking, label, quota, tag):
        """Walk `ranking` and keep up to `quota` rows whose proba[row][label] > 0.5."""
        chosen = []
        for candidate in ranking:
            if len(chosen) >= quota:
                break
            # ---- only accept when the predicted proba is more than 50% ---- #
            if proba[candidate][label] > 0.5:
                print(tag, candidate, " : ", proba[candidate])
                chosen.append(candidate)
        return chosen

    def label_p_n_samples(self, proba, rank):
        """Pick the most confident positive and negative rows of ``proba``.

        proba: per-row class probabilities; column 0 is read for the positive
               class and column 1 for the negative class.
        rank:  rank[c] lists row positions in the order they should be tried
               for class c (fit() passes them sorted by decreasing proba).

        return:
            p, n -- row positions (into ``proba``) chosen as positive / negative;
            at most self.p and self.n entries respectively.
        """
        U_prime_size = len(proba)
        print(U_prime_size)
        p, n = [], []
        for label, conf_measure in enumerate(rank):
            if label == 0:
                # 0 positive sample
                p = self._take_confident(proba, conf_measure[:U_prime_size], label, self.p, 'P: ')
            elif label == 1:
                # 1 negative sample
                n = self._take_confident(proba, conf_measure[:U_prime_size], label, self.n, 'N: ')
            else:
                print("Class label error")
        return p, n

    def get_self_labeled_sample(self):
        '''
        return:
            self-labeled new positive, self-labeled new negative (Index);
            each is a list with one list of indices per co-training round
        '''
        return self.new_labeled_pos, self.new_labeled_neg

    def plot_co_training_process(self, iterCount, data, label, new_pos_idx, new_neg_idx, plotSavingPath, name):
        """Save a 2-D PCA scatter of the current training set for one round.

        Rows whose index label appears in new_pos_idx / new_neg_idx are circled
        as "self-labeled".  The figure is written to
        ``plotSavingPath + name + "_PCA_i-<iterCount>.png"``; the directory is
        created when missing.
        NOTE(review): matching is done on ``data.index`` labels, so it assumes
        those labels equal the positions stored in new_*_idx (true for a
        RangeIndex) -- confirm for other callers.
        """
        os.makedirs(plotSavingPath, exist_ok=True)
        # split self_labeled samples from labeled samples (set -> O(1) lookup)
        just_labeled = set(new_pos_idx) | set(new_neg_idx)
        self_labeled_sample_idx = [i for i, e in enumerate(data.index) if e in just_labeled]
        # apply PCA on input data
        pca_transformed = PCA(n_components=2).fit_transform(X=data)
        pca_one = pca_transformed[:, 0]
        pca_two = pca_transformed[:, 1]
        # plot the result
        label = np.asarray(label)
        fig, ax = plt.subplots(figsize=(9, 7))
        for author in np.unique(label):
            ix = np.where(label == author)
            # no cmap here: without `c` matplotlib ignores it and only warns
            ax.scatter(pca_one[ix], pca_two[ix], label=author, s=50, alpha=0.5)
        # mark self labeled result with a hollow black ring
        ax.scatter(pca_one[self_labeled_sample_idx], pca_two[self_labeled_sample_idx],
                   facecolors="none", edgecolors='black', linewidths=3, s=50,
                   label="self-labeled")
        legend = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        ax.set_title('Co-training iteration: ' + str(iterCount), fontsize=14)
        ax.set_xlabel("PCA one", fontsize=14)
        ax.set_ylabel("PCA two", fontsize=14)
        # str path (not bytes) so matplotlib can open it and infer the format
        fig.savefig(plotSavingPath + name + "_PCA_i-" + str(iterCount) + ".png",
                    dpi=100, bbox_extra_artists=(legend,), bbox_inches='tight')
        plt.close("all")

    def _plot_f1_per_iteration(self, plot_save_path, plot_save_name, iterCount):
        """Save the test-F1-vs-iteration curve of both views next to their round-0 baseline."""
        default_step = np.arange(0, iterCount)
        default_text_based = [self.f1_on_test_dv1[0]] * iterCount
        default_citation_based = [self.f1_on_test_dv2[0]] * iterCount
        co_training_step = np.arange(1, iterCount)
        co_train_text_based = self.f1_on_test_dv1[1:]
        co_train_citation_based = self.f1_on_test_dv2[1:]

        fig, ax = plt.subplots()
        ax.plot(default_step, default_text_based, linestyle='dashed', label="Text based default")
        ax.plot(default_step, default_citation_based, linestyle='dashdot', label="Citation based default")
        ax.plot(co_training_step, co_train_text_based, linestyle='solid', marker="*", label="Text based")
        ax.plot(co_training_step, co_train_citation_based, linestyle='dotted', marker="+", label="Citation based")
        ax.autoscale_view()
        legend = ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.2), ncol=2)
        ax.set_xlabel('Co-Training Iterations')
        ax.set_ylabel('F1 score')
        fig.savefig(plot_save_path + plot_save_name + "_diff_iter_f1.png", dpi=300,
                    bbox_extra_artists=(legend,), bbox_inches='tight')
        plt.close("all")

    def fit(self, dataView1, dataView2, labels, dv1_test, dv2_test, label_test, plot_save_name=None, plot_save_path=None):
        """Run co-training and then train clf1 / clf2 on the enlarged labeled set.

        dataView1, dataView2: DataFrames (rows are read with ``.iloc``) holding
            the two views of the same samples, labeled and unlabeled together.
        labels: one entry per row, 0 / 1 for labeled rows and -1 for unlabeled
            ones.  It is copied, so the caller's object is never modified.
        dv1_test, dv2_test, label_test: held-out data; the macro F1 of each
            per-round classifier on it is stored in ``f1_on_test_dv1`` /
            ``f1_on_test_dv2``.
        plot_save_name, plot_save_path: when plot_save_name is given, a PCA
            plot per round and view plus the F1 curve are written below
            plot_save_path.
        """
        # index of positive / negative self-labeled samples, one list per round
        self.new_labeled_pos = []
        self.new_labeled_neg = []
        # f1 on test samples wrt each iteration
        self.f1_on_test_dv1 = []
        self.f1_on_test_dv2 = []

        # np.array always copies; np.asarray could alias the caller's array and
        # the pseudo-labels written below would leak into it
        labels = np.array(labels, dtype='int32')
        print("P value: ", self.p, " N value: ", self.n)

        L, U, U_prime = self.init_L_U_U_prime(labels)
        # 2p+2n samples are moved from U to U_prime after every round
        refill_size = max(0, 2 * self.p + 2 * self.n)

        iterCount = 0
        # loop until the pool is exhausted or we hit the iteration limit
        while iterCount < self.k and U_prime:
            iter_train_d1 = dataView1.iloc[L]
            iter_train_d2 = dataView2.iloc[L]
            iter_train_label = labels[L]
            # fresh copies so self.clf1 / self.clf2 are only fitted once, at the end
            iter_clf1 = copy.deepcopy(self.clf1)
            iter_clf2 = copy.deepcopy(self.clf2)
            # ----------- plot the co-training process -------------- #
            if plot_save_name is not None:
                if iterCount == 0:
                    # nothing has been self-labeled before the first round
                    last_iter_labeled_pos_idx, last_iter_labeled_neg_idx = [], []
                else:
                    last_iter_labeled_pos_idx = self.new_labeled_pos[-1]
                    last_iter_labeled_neg_idx = self.new_labeled_neg[-1]
                self.plot_co_training_process(iterCount, iter_train_d1, iter_train_label,
                                              last_iter_labeled_pos_idx, last_iter_labeled_neg_idx,
                                              plot_save_path, plot_save_name + "_dv1")
                self.plot_co_training_process(iterCount, iter_train_d2, iter_train_label,
                                              last_iter_labeled_pos_idx, last_iter_labeled_neg_idx,
                                              plot_save_path, plot_save_name + "_dv2")

            iter_clf1.fit(iter_train_d1, iter_train_label)
            iter_clf2.fit(iter_train_d2, iter_train_label)
            # --------- f1 on test data for the current round ------- #
            y1 = iter_clf1.predict(dv1_test)
            y2 = iter_clf2.predict(dv2_test)
            self.f1_on_test_dv1.append(f1_score(label_test, y1, average='macro'))
            self.f1_on_test_dv2.append(f1_score(label_test, y2, average='macro'))
            # ---------- class probabilities of the pool ------------ #
            dv1_proba = iter_clf1.predict_proba(dataView1.iloc[U_prime])
            dv2_proba = iter_clf2.predict_proba(dataView2.iloc[U_prime])
            # *_rank[c] orders pool rows by decreasing probability of class c
            dv1_proba_rank = [(-class_proba).argsort() for class_proba in dv1_proba.T]
            dv2_proba_rank = [(-class_proba).argsort() for class_proba in dv2_proba.T]
            # h1 classifier
            p1, n1 = self.label_p_n_samples(dv1_proba, dv1_proba_rank)
            # h2 classifier
            p2, n2 = self.label_p_n_samples(dv2_proba, dv2_proba_rank)
            roundP = set(p1 + p2)
            roundN = set(n1 + n2)
            # a row one view calls positive and the other negative is not a
            # confident label: leave it in the pool instead of adding it to L
            # twice with whichever label happens to be written last
            conflicted = roundP & roundN
            if conflicted:
                roundP -= conflicted
                roundN -= conflicted
            print("P: ", len(roundP), " N: ", len(roundN))
            print(roundP, roundN)
            # translate pool positions back to dataset positions
            auto_labeled_pos_idx = [U_prime[x] for x in roundP]
            auto_labeled_neg_idx = [U_prime[x] for x in roundN]
            # ---------- collect index of auto_labeled_samples ------ #
            self.new_labeled_pos.append(auto_labeled_pos_idx)
            self.new_labeled_neg.append(auto_labeled_neg_idx)

            labels[auto_labeled_pos_idx] = 0
            labels[auto_labeled_neg_idx] = 1
            # extend the labeled sample
            L.extend(auto_labeled_pos_idx)
            L.extend(auto_labeled_neg_idx)
            # remove the labeled sample from U_prime
            newly_labeled = set(auto_labeled_pos_idx) | set(auto_labeled_neg_idx)
            U_prime = [x for x in U_prime if x not in newly_labeled]
            # take up to 2p+2n examples from the (shuffled) U to replenish U_prime
            take = min(refill_size, len(U))
            if take:
                U_prime.extend(U[len(U) - take:])
                U = U[:len(U) - take]
            iterCount += 1

        print("Total Labeled number: ", len(L), " Still unlabeled number: ", len(U_prime))
        # final train on everything that ended up labeled
        self.clf1.fit(dataView1.iloc[L], labels[L])
        self.clf2.fit(dataView2.iloc[L], labels[L])
        # ------ save f1 vs number of iteration plot ------- #
        # needs at least one round: the round-0 score is the plotted baseline
        if plot_save_name is not None and iterCount > 0:
            self._plot_f1_per_iteration(plot_save_path, plot_save_name, iterCount)

    def supports_proba(self, clf, x):
        """Return True when ``clf.predict_proba([x])`` runs without raising."""
        try:
            clf.predict_proba([x])
            return True
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt still propagates
            return False

    def predict(self, dataView1, dataView2):
        """Predict one label per row by combining both classifiers.

        When the classifiers agree their label is used.  When they disagree and
        both support ``predict_proba``, the per-class probabilities are
        multiplied and the position of the largest product is returned
        (NOTE(review): this is a column position, so it assumes the class labels
        are 0..n-1 in column order -- confirm).  Otherwise a warning is issued
        and 0 or 1 is drawn with the stdlib ``random`` generator.
        """
        y1 = self.clf1.predict(dataView1)
        y2 = self.clf2.predict(dataView2)
        n_samples = dataView1.shape[0]
        # fill pred with -1 so rows that were never decided can be detected
        y_pred = np.full(n_samples, -1)
        if n_samples == 0:
            # nothing to predict; also avoids probing iloc[0] of an empty frame
            return y_pred
        proba_supported = (self.supports_proba(self.clf1, dataView1.iloc[0])
                           and self.supports_proba(self.clf2, dataView2.iloc[0]))
        for i, (y1_i, y2_i) in enumerate(zip(y1, y2)):
            if y1_i == y2_i:
                # both agree on label
                y_pred[i] = y1_i
            elif proba_supported:
                # disagreement: multiply the probabilities, keep the likelier class
                y1_probas = self.clf1.predict_proba([dataView1.iloc[i]])[0]
                y2_probas = self.clf2.predict_proba([dataView2.iloc[i]])[0]
                print("y1 disagree on",i, " Proba: ",y1_probas)
                print("y2 not aggreed on ",i, "Proba: ", y2_probas)
                prod_y_probas = [proba_y1 * proba_y2 for (proba_y1, proba_y2) in zip(y1_probas, y2_probas)]
                print("product probas:",prod_y_probas)
                y_pred[i] = prod_y_probas.index(max(prod_y_probas))
                print("result",y_pred[i])
            else:
                # the classifiers disagree and don't support probability, so we guess
                warnings.warn("classifiers disagree with label, result may not accurate")
                print("sample at: ", i, " c1: ", y1_i, " c2: ", y2_i)
                y_pred[i] = random.randint(0, 1)
        # check that every row received a label
        assert not (-1 in y_pred)
        return y_pred

    def predict_proba(self, dataView1, dataView2):
        """Element-wise product of both classifiers' class probabilities (not re-normalised)."""
        y1_probas = self.clf1.predict_proba(dataView1)
        y2_probas = self.clf2.predict_proba(dataView2)
        return y1_probas * y2_probas


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score
import seaborn as sns

# cross validation
def k_fold_cv_co_train_binary(dataview1, dataview2, unlabeled_dv1, unlabeled_dv2, label,
                              clf, k=10, plot_save_name=None, plot_save_path=None):
    """Stratified k-fold evaluation of a binary co-training classifier.

    For every fold the labeled training rows are concatenated with the
    unlabeled rows, a deep copy of ``clf`` is fitted on them and evaluated on
    the held-out rows.

    dataview1, dataview2: labeled DataFrames of the two views; both must carry
        the columns "label" and "paperID", which are dropped before training.
    unlabeled_dv1, unlabeled_dv2: unlabeled rows of the two views, appended to
        every training fold (presumably they carry "paperID" and a "label"
        column set to -1 -- confirm against com_func).
    label: DataFrame with a "label" column aligned with dataview1; used for
        stratification and as ground truth of the test folds.
    clf: co-training classifier exposing fit / predict /
        get_self_labeled_sample as used below; it is deep-copied per fold.
    k: number of folds.
    plot_save_name, plot_save_path: when both are given, per-fold plots are
        written below ``plot_save_path + plot_save_name + "/"`` together with a
        box plot of the per-fold F1; when either is None no plot is produced.

    return:
        accuracy, macro F1 (both over the pooled predictions of all folds) and
        the list of per-fold statistic dicts.
    """
    # string concatenation with the None defaults used to raise TypeError
    make_plots = plot_save_name is not None and plot_save_path is not None

    kf = StratifiedKFold(n_splits=k)
    allTrueLabel = []
    allPredLabel = []
    all_fold_statistic = []

    for fold, (train_index, test_index) in enumerate(kf.split(dataview1, label), start=1):
        # --------------- split train and test -------------------- #
        dv1_train, dv1_test = dataview1.iloc[train_index], dataview1.iloc[test_index]
        dv2_train, dv2_test = dataview2.iloc[train_index], dataview2.iloc[test_index]
        label_test = label.iloc[test_index]
        # -------------- add unlabeled to train ------------------- #
        final_dv1 = pd.concat([dv1_train, unlabeled_dv1], ignore_index=True)
        final_dv2 = pd.concat([dv2_train, unlabeled_dv2], ignore_index=True)
        # --------------- extract label for training -------------- #
        label_train = final_dv1["label"]
        final_dv1 = final_dv1.drop(["label", "paperID"], axis=1)
        final_dv2 = final_dv2.drop(["label", "paperID"], axis=1)
        # ---------------- extract data for test ------------------ #
        dv1_test = dv1_test.drop(["label", "paperID"], axis=1)
        dv2_test = dv2_test.drop(["label", "paperID"], axis=1)
        # -------------- train binary co-training ----------------- #
        per_fold_clf = copy.deepcopy(clf)
        if make_plots:
            detailed_plot_path = plot_save_path+plot_save_name+"/fold"+str(fold)+"/"
            per_fold_clf.fit(final_dv1, final_dv2, label_train, dv1_test, dv2_test, label_test,
                             plot_save_name, detailed_plot_path)
        else:
            per_fold_clf.fit(final_dv1, final_dv2, label_train, dv1_test, dv2_test, label_test)
        # -------------- get self-labeled sample index ------------ #
        per_round_pos, per_round_neg = per_fold_clf.get_self_labeled_sample()
        self_labeled_pos_index = [j for i in per_round_pos for j in i]
        self_labeled_neg_index = [j for i in per_round_neg for j in i]
        self_labeled_sample_size = len(self_labeled_pos_index)+len(self_labeled_neg_index)
        print("Self labeled pos size: ", len(self_labeled_pos_index))
        print("Self labeled neg size: ", len(self_labeled_neg_index))
        # ------------------ predict the test fold ---------------- #
        co_lr_label_predict = per_fold_clf.predict(dv1_test, dv2_test)
        fold_true_label = label_test["label"].values.tolist()
        allTrueLabel.extend(fold_true_label)
        allPredLabel.extend(co_lr_label_predict)
        # collect per fold statistic
        all_fold_statistic.append({
            'author': plot_save_name, 'fold': fold, 'train_size': dv1_train.shape[0],
            'self_labeled_train': self_labeled_sample_size, 'test_size': dv1_test.shape[0],
            'f1': f1_score(fold_true_label, co_lr_label_predict, average='macro')})

    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel,average='macro')

    print(metrics.classification_report(allTrueLabel, allPredLabel))
    print(metrics.confusion_matrix(allTrueLabel, allPredLabel).ravel())
    # ------------ plot result variance for each author ----------------------- #
    if make_plots:
        all_fold_statistic_plot = pd.DataFrame(all_fold_statistic)
        variance_plot_dir = plot_save_path+plot_save_name+"/"
        # the fold directories only exist when at least one co-training round ran
        os.makedirs(variance_plot_dir, exist_ok=True)
        # own figure, closed after saving, so plots of consecutive calls never
        # pile up on the same implicit current axes
        fig, ax = plt.subplots()
        sns.boxplot(x="author", y="f1", data=all_fold_statistic_plot, ax=ax)
        sns.swarmplot(x="author", y="f1", data=all_fold_statistic_plot, color=".25", ax=ax)
        fig.savefig(variance_plot_dir+plot_save_name+"_result_variance.png", dpi=100)
        plt.close(fig)

    return accuracy, f1, all_fold_statistic

In [None]:
import collections
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

np.set_printoptions(precision=4, suppress=True)

# fix random seeds for reproducibility: numpy's global generator is used by
# DataFrame.sample below, the stdlib generator by random.shuffle / random.randint
# inside Co_training_clf
np.random.seed(1)
random.seed(1)

# every file in the canopies directory is one name group; sorted because
# os.listdir returns entries in arbitrary order
fileDir = "../../Data/"+Dataset+"/canopies/"
listfiles = sorted(os.listdir(fileDir))

co_lr_diff_embedding_result = []

# ------------ view two citation is fix, so move out to save time ------- #
# read viewtwo embedding
print("Load citation embedding: ", pp_citation)
viewtwo_citation_embedding = com_func.read_all_citation_embedding_sorted(emb_type = pp_citation)

#---------------- load different embeddings for view one ---------------#
for select_emb in pp_textual:
    print("Load textual embedding: ", select_emb)
    # read viewone embeddings
    viewone_textual_emb = com_func.read_all_textual_embedding_sorted(emb_type=select_emb, training_size = "3m")

    threshold_change_all_co_lr_f1s = []
    threshold_change = []

    # -------------- different threshold (step by 10) -----------------------#
    for step_threshold in range(threshold_lower, threshold_upper, 10):
        plot_save_path = "../../plot/co_train_detail_plots/binary_sample=3m/"+select_emb+"/threshold="+str(step_threshold)+"/"
        threshold_change.append(step_threshold)
        # collect statistic to output
        allname, positive_sample_size, negative_sample_size  = ([] for i in range(3))
        all_labeled_count, unlabeled_count = ([] for i in range(2))

        all_co_LR_accuracy, all_co_LR_f1 = ([] for i in range(2))
        all_per_fold_f1_score_variance = []

        total_selected_group = 0

        # ------- different name group in all name group --------------------#
        for file in listfiles:
            per_name_per_fold_f1_variance = []
            # group name is built from the 2nd and the last "_"-separated token
            name_parts = file.split("_")
            name = name_parts[1]+"_"+name_parts[-1]
            print("For name: ",name)
            # read pid and aid from file; authorID "-1" marks unlabeled papers
            data = com_func.read_pid_aid(fileDir+file)
            labeled_mask = data["authorID"] != "-1"
            labeled_data = data[labeled_mask]
            unlabeled_mask = data["authorID"] == "-1"
            unlabeled_data = data[unlabeled_mask]
            unlabeled_pid = unlabeled_data["paperID"].tolist()
            print(labeled_data.shape)
            # ---------------- collect all labeled sample -------------------- #
            all_labeled_samples = labeled_data["paperID"].tolist()
            authorCounter = com_func.select_productive_groups(labeled_data, threshold_select_name_group)
            # a binary task needs at least two classes passing the threshold
            if len(authorCounter) < 2:
                print(name," pass")
            else:
                total_selected_group+= 1
                # --------------for each name group---------------- #
                if apply_threshold_to_sample:
                    # ---------- only use sample pass threshold ------- #
                    #-------- only select authors in name group are very productive (more than threshold)---------#
                    labeled_data, author_list, _= com_func.only_select_productive_authors(labeled_data, step_threshold)
                else:
                    # ----------- use all sample in name group --------- #
                    author_list = com_func.productive_authors_list(labeled_data, step_threshold)
                    print(name, " name group sample size: ",labeled_data.shape)
                # pool the negative samples are drawn from: every paper kept in
                # labeled_data.  Assigned on both branches; it used to be set only
                # when apply_threshold_to_sample was True, leaving the name
                # undefined (or stale from a previous file) otherwise.
                filtered_all_labeled_samples = labeled_data["paperID"].tolist()

                # -------------- extract all samples for name group -------------- #
                # read in labeled data
                labeled_viewone_textual = com_func.extract_sorted_embedding(viewone_textual_emb, labeled_data["paperID"])
                print(labeled_viewone_textual.shape)
                labeled_viewtwo_citation = com_func.extract_sorted_embedding(viewtwo_citation_embedding, labeled_data["paperID"])
                print(labeled_viewtwo_citation.shape)
                print("Labeled: ",len(labeled_viewone_textual), " : ", len(labeled_viewtwo_citation))

                # read in unlabeled data
                unlabeled_viewone_textual = com_func.extract_unlabeled_embedding(viewone_textual_emb, unlabeled_pid)
                print(unlabeled_viewone_textual.shape)
                unlabeled_viewtwo_citation = com_func.extract_unlabeled_embedding(viewtwo_citation_embedding, unlabeled_pid)
                print(unlabeled_viewtwo_citation.shape)
                print("Unlabeled: ",len(unlabeled_viewone_textual), " : ", len(unlabeled_viewtwo_citation))

                # remove samples that have no citation link from unlabeled data
                noCitationPids_unlabeled = set(unlabeled_viewone_textual['paperID'])-set(unlabeled_viewtwo_citation['paperID'])
                print("Unlabeled no citation link size: ", len(noCitationPids_unlabeled))
                # process unlabeled data
                unlabeled_dv1 = unlabeled_viewone_textual[~unlabeled_viewone_textual['paperID'].isin(noCitationPids_unlabeled)].reset_index(drop=True)
                unlabeled_dv2 = unlabeled_viewtwo_citation

                # ---------------- shuffle the data ----------------- #
                labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
                # ------------------ alignment ---------------------- #
                # left merges keep the shuffled row order in both views; papers
                # without a citation embedding get an all-zero vector
                labeled_viewone_textual = pd.merge(labeled_data, labeled_viewone_textual, left_on="paperID", right_on = [0], how = "left")
                labeled_viewtwo_citation = pd.merge(labeled_data, labeled_viewtwo_citation, left_on="paperID", right_on = [0], how = "left")
                labeled_viewtwo_citation.fillna(0, inplace=True)

                print(labeled_viewone_textual.shape)
                print(labeled_viewtwo_citation.shape)
                print(unlabeled_dv1.shape)
                print(unlabeled_dv2.shape)
                # loop through each author: one-vs-rest binary task per author
                for counter, author in enumerate(author_list):
                    all_labeled_count.append(len(labeled_data))
                    unlabeled_count.append(len(unlabeled_dv1))
                    author_name = name+'_'+str(counter)
                    allname.append(author_name)
                    print(author_name, " : ", author)
                    author_rows = labeled_data[labeled_data["authorID"] == author]
                    positive_sample_pid = author_rows["paperID"].tolist()
                    negative_sample_pid = com_func.extractNegativeSample(positive_sample_pid, filtered_all_labeled_samples)

                    # save number of positive and negative samples
                    positive_sample_size.append(len(positive_sample_pid))
                    negative_sample_size.append(len(negative_sample_pid))

                    # ----------------- generate binary label ------------------ #
                    # label 0 = positive (this author), label 1 = negative (rest of the name group)
                    appended_data = []
                    for binary_label, pid in enumerate([positive_sample_pid, negative_sample_pid]):
                        authordf = pd.DataFrame({"paperID":pid})
                        authordf['label'] = binary_label
                        appended_data.append(authordf)
                    label_pid = pd.concat(appended_data, axis=0,ignore_index=True)
                    # ----------- alignment of label with input data ------------ #
                    label_pid = pd.merge(labeled_viewone_textual["paperID"].to_frame(), label_pid, on = "paperID")
                    #------------- process data for k-fold cv ------------------- #
                    # throw away some column for labeled data
                    labeled_dv1 = labeled_viewone_textual.drop(["authorID", 0], axis=1)
                    labeled_dv2 = labeled_viewtwo_citation.drop(["authorID", 0], axis=1)
                    # merge label into data
                    labeled_dv1 = pd.merge(labeled_dv1, label_pid, on = "paperID")
                    labeled_dv2 = pd.merge(labeled_dv2, label_pid, on = "paperID")
                    label = label_pid.drop(["paperID"], axis=1)
                    # ------------ fit co-training model with k-fold ------------------------ #
                    co_logistic_clf = Co_training_clf(clf1=LogisticRegression(solver= "liblinear"),p=1,n=1, k=30)
                    co_lr_accuracy, co_lr_f1, author_per_fold_f1_score= k_fold_cv_co_train_binary(labeled_dv1, labeled_dv2,
                                                                         unlabeled_dv1, unlabeled_dv2, label,
                                                                         co_logistic_clf, 10, author_name, plot_save_path)
                    # f1 variance on different fold for different author
                    all_co_LR_accuracy.append(co_lr_accuracy)
                    all_co_LR_f1.append(co_lr_f1)
                    per_name_per_fold_f1_variance.extend(author_per_fold_f1_score)
                    all_per_fold_f1_score_variance.extend(author_per_fold_f1_score)

                # ---------- plot per name group classifier f1 variance -------------- #
                # skipped when no author was evaluated (an empty frame has no
                # "author" / "f1" column to plot)
                if per_name_per_fold_f1_variance:
                    per_name_per_fold_f1_variance_plot = pd.DataFrame(per_name_per_fold_f1_variance)
                    os.makedirs(plot_save_path, exist_ok=True)
                    # own figure per name group, closed after saving, so the box
                    # plots of different groups are not drawn on top of each other
                    fig, ax = plt.subplots()
                    sns.boxplot(x="author", y="f1", data=per_name_per_fold_f1_variance_plot, ax=ax)
                    sns.swarmplot(x="author", y="f1", data=per_name_per_fold_f1_variance_plot, color=".25", ax=ax)
                    fig.savefig(plot_save_path+name+"_group_result_variance.png", dpi=300)
                    plt.close(fig)
        
#         # write evaluation result to excel
#         output = pd.DataFrame({'Author Name':allname, "positive sample size":positive_sample_size,"negative sample size":negative_sample_size, 
#                                "labeled sample size": all_labeled_count, "unlabeled sample size": unlabeled_count, 
#                                "co_logisticRegression Accuracy":all_co_LR_accuracy, "co_logisticRegression F1": all_co_LR_f1})
#         savePath = "../../result/"+Dataset+"/co_train_binary/"
#         filename = "(Global emb sample 3m) viewone_textual="+select_emb+"_viewtwo_citation="+pp_citation+"_threshold="+str(step_threshold)+"_namegroupcount="+str(total_selected_group)+".csv"
#         com_func.write_csv_df(savePath, filename, output)
#         print("Done")

        
#         threshold_change_all_co_lr_f1s.append(all_co_LR_f1)
        
#     co_lr_diff_embedding_result.append(threshold_change_all_co_lr_f1s)

In [None]:
from statistics import mean 

# Show the collected F1 scores, one list per line.
# NOTE(review): the appends that fill these two lists are commented out at the
# end of the experiment cell, so as written both print as they were initialised.
print(threshold_change_all_co_lr_f1s, co_lr_diff_embedding_result, sep="\n")

In [None]:
#         # --------------- plot overall result f1 variance --------------- #
#         all_per_fold_f1_score_variance_plot = pd.DataFrame(all_per_fold_f1_score_variance)
#         ax = sns.boxplot(x="author", y="f1", data=all_per_fold_f1_score_variance_plot)
#         ax = sns.swarmplot(x="author", y="f1", data=all_per_fold_f1_score_variance_plot, color=".25")
#         plt.savefig(plot_save_path+"all_result_variance.png", dpi=300)
#         # plt.show()

In [None]:
# %whos
# Drop the notebook-level references to both embedding tables (presumably to
# free kernel memory -- they are not used after the experiment cell).
# Re-running this cell without re-running the experiment cell raises NameError.
del viewtwo_citation_embedding, viewone_textual_emb