In [None]:
import os
import sys
import pandas as pd
import numpy as np

import warnings

# Silence FutureWarning messages for the whole notebook session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Make the parent directory importable so that `import com_func` below resolves.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import com_func

#----- threshold handed to com_func.select_productive_groups when picking name groups -----#
threshold_select_name_group = 100
#----- per-author sample threshold: the experiment iterates range(threshold_lower, threshold_upper, 10) ----#
# With 100 and 110 that range yields the single value 100.
threshold_lower = 100
threshold_upper = 110

# True: keep only samples of authors passing the threshold (com_func.only_select_productive_authors);
# False: keep every labeled sample of the name group and only filter the author list.
apply_threshold_to_sample = True

# Textual (view one) embedding types iterated by the experiment loop.
pp_textual = ["lsa", "pv_dm", "pv_dbow"]
# Citation (view two) embedding type.
pp_citation = "n2v"

# Dataset name; used to build the "../../Data/<Dataset>/canopies/" input path.
Dataset = "pubmed"

In [None]:
def read_file(infile):
    """Read (paperID, authorID) pairs from a tab-separated canopy file.

    Every line is split on tab characters. Lines with exactly 12 or 13 fields
    are accepted and their first two fields are kept as ``paperID`` and
    ``authorID`` (both as strings). For any other line the number of fields is
    printed and the line is skipped.

    Parameters
    ----------
    infile : str
        Path of the UTF-8 encoded file to read.

    Returns
    -------
    pandas.DataFrame
        One row per accepted line with columns ``paperID`` and ``authorID``.
        Both columns are present even when no line is accepted, so callers can
        always index ``data["authorID"]``.
    """
    records = []
    # The context manager closes the file; no explicit close() is needed.
    with open(infile, 'r', encoding='utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # get rid of badly formatted lines
            if len(read_data) in (12, 13):
                records.append({"paperID": read_data[0], "authorID": read_data[1]})
            else:
                print(len(read_data))
    return pd.DataFrame(records, columns=["paperID", "authorID"])

In [None]:
def extract_embedding(all_embedding, wanted_pid_list):
    """Pick the embedding rows of the wanted paper ids with a single merge-walk.

    Parameters
    ----------
    all_embedding : sequence of rows
        Each row starts with a paper id (converted with ``int``) followed by
        the embedding values. The walk assumes rows are ordered by ascending
        paper id -- TODO confirm against the loader that produces them.
    wanted_pid_list : pandas.Series
        Paper ids to extract; converted to int, de-duplicated and sorted.

    Returns
    -------
    pandas.DataFrame
        One row per extracted id in ascending id order. A wanted id that is
        smaller than the embedding id currently visited and has no embedding is
        emitted as ``[pid, 0, 0, ...]`` and counted as missing.

    Notes
    -----
    NOTE(review): wanted ids larger than the last embedding id are neither
    zero-filled nor counted; they are simply absent from the result. This
    mirrors the previous behaviour and is left unchanged here.
    """
    wanted = sorted({int(pid) for pid in wanted_pid_list.values.tolist()})
    extracted_emb = []
    total_missing_sample = 0
    # only if embedding exist
    if len(all_embedding) > 0:
        # width of the zero padding: every column except the leading paper id
        zero_tail = [0] * (len(all_embedding[0]) - 1)
        n_wanted = len(wanted)
        # A cursor replaces repeated list.remove() on the head of the list,
        # which made the walk quadratic in the number of wanted ids.
        cursor = 0
        for embedding in all_embedding:
            if cursor >= n_wanted:
                break
            emb_pid = int(embedding[0])
            while cursor < n_wanted and wanted[cursor] <= emb_pid:
                if wanted[cursor] == emb_pid:
                    extracted_emb.append(embedding)
                else:
                    # wanted id has no embedding: fill it up with 0's
                    total_missing_sample += 1
                    extracted_emb.append([wanted[cursor]] + zero_tail)
                cursor += 1
    print("Total missing sample: ", total_missing_sample)
    return pd.DataFrame(extracted_emb)

In [None]:
# collect unlabeled vectors
def extract_unlabeled_embedding(allembedding, unlabeled_pid):
    """Collect the embedding rows of unlabeled papers and tag them with label "-1".

    Parameters
    ----------
    allembedding : iterable of rows
        Each row starts with a paper id (converted with ``int``) followed by
        the embedding values. The walk assumes ascending paper-id order --
        TODO confirm against the loader that produces them.
    unlabeled_pid : iterable
        Paper ids of the unlabeled samples; converted to int, de-duplicated
        and sorted.

    Returns
    -------
    pandas.DataFrame
        Matching embedding rows with the id column renamed to ``paperID`` and
        an extra ``label`` column holding the string ``"-1"``. Ids without an
        embedding are dropped. When nothing matches, the frame is empty but
        still carries the ``paperID`` and ``label`` columns.
    """
    wanted = sorted({int(pid) for pid in unlabeled_pid})
    n_wanted = len(wanted)
    wanted_embedding = []
    # A cursor replaces repeated list.remove() on the head of the list.
    cursor = 0
    for embedding in allembedding:
        if cursor >= n_wanted:
            break
        emb_pid = int(embedding[0])
        # skip wanted ids that have no embedding (paper not in the full dataset)
        while cursor < n_wanted and wanted[cursor] < emb_pid:
            cursor += 1
        if cursor < n_wanted and wanted[cursor] == emb_pid:
            wanted_embedding.append(embedding)
            cursor += 1
    if wanted_embedding:
        unlabeled_data = pd.DataFrame(wanted_embedding)
    else:
        # keep the id column so downstream ['paperID'] lookups do not fail
        unlabeled_data = pd.DataFrame(columns=[0])
    unlabeled_data['label'] = "-1"
    return unlabeled_data.rename(columns={0: 'paperID'})

In [None]:
# some of the record doesn't have citation links, therefore we will have to remove those papers from train and test set
# synchronize data wrt pid
def synchro_views(labeled_dv1, labeled_dv2, unlabeled_data1, unlabeled_data2):
    """Drop from view one every sample whose paper id is absent from view two.

    Labeled frames are matched on column ``0``; unlabeled frames are matched on
    column ``paperID``. The view-two frames are returned untouched, and the
    filtered view-one frames get a fresh 0..n-1 index.

    Returns
    -------
    tuple
        ``(labeled_view1, labeled_view2, unlabeled_view1, unlabeled_view2)``.
    """
    # ids that exist only in view one (i.e. have no counterpart in view two)
    labeled_only_in_v1 = set(labeled_dv1[0]).difference(labeled_dv2[0])
    print("labeled no citation link: ", len(labeled_only_in_v1))
    unlabeled_only_in_v1 = set(unlabeled_data1['paperID']).difference(unlabeled_data2['paperID'])
    print("Unlabeled no citation link size: ", len(unlabeled_only_in_v1))

    keep_unlabeled = ~unlabeled_data1['paperID'].isin(unlabeled_only_in_v1)
    keep_labeled = ~labeled_dv1[0].isin(labeled_only_in_v1)

    # row order of the inputs is preserved by the boolean filtering
    return (
        labeled_dv1[keep_labeled].reset_index(drop=True),
        labeled_dv2,
        unlabeled_data1[keep_unlabeled].reset_index(drop=True),
        unlabeled_data2,
    )

In [None]:
# remove author(positive sample) from other(negative sample)
import random
def extractNegativeSample(positiveSample, allSample):
    """Return the items of ``allSample`` that do not occur in ``positiveSample``.

    Order and duplicates of ``allSample`` are preserved. Membership is tested
    against a set so the filter is linear in the input size; if any item is
    unhashable the function falls back to scanning ``positiveSample`` directly.
    """
    try:
        positive_lookup = set(positiveSample)
        return [x for x in allSample if x not in positive_lookup]
    except TypeError:
        # unhashable items: keep the plain containment test
        return [x for x in allSample if x not in positiveSample]

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score
# cross validation
def k_fold_cv_co_train_binary(dataview1, dataview2, unlabeled_dv1, unlabeled_dv2, label, clf, k=10, max_folds=1):
    """Evaluate a two-view co-training classifier on stratified folds.

    For each evaluated fold the labeled training rows of both views are
    concatenated with the unlabeled rows, the ``label`` and ``paperID`` columns
    are removed from the feature frames, ``clf`` is fitted on the two views and
    then asked to predict the held-out rows.

    Parameters
    ----------
    dataview1, dataview2 : pandas.DataFrame
        Labeled samples of the two views, row-aligned with each other and with
        ``label``. Both must contain ``label`` and ``paperID`` columns.
    unlabeled_dv1, unlabeled_dv2 : pandas.DataFrame
        Unlabeled samples of the two views, also with ``label`` and ``paperID``.
    label : pandas.DataFrame
        Frame with a ``label`` column; used for stratification and as truth.
    clf : object
        Exposes ``fit(view1, view2, labels)`` and ``predict(view1, view2)``.
    k : int
        Number of stratified folds to build.
    max_folds : int or None
        How many of the ``k`` folds are actually evaluated. The default of 1
        keeps the previous behaviour, where the loop stopped after the first
        fold; pass ``None`` to evaluate every fold. At least one fold is
        always evaluated.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1)`` over all evaluated held-out rows. A
        classification report and the flattened confusion matrix are printed.
    """
    kf = StratifiedKFold(n_splits=k)
    meta_columns = ["label", "paperID"]
    allTrueLabel = []
    allPredLabel = []
    for fold_number, (train_index, test_index) in enumerate(kf.split(dataview1, label), start=1):
        # --------------- split train and test -------------------- #
        dv1_train, dv1_test = dataview1.iloc[train_index], dataview1.iloc[test_index]
        dv2_train, dv2_test = dataview2.iloc[train_index], dataview2.iloc[test_index]
        label_test = label.iloc[test_index]
        # -------------- add unlabeled to train ------------------ #
        final_dv1 = pd.concat([dv1_train, unlabeled_dv1], ignore_index=True)
        final_dv2 = pd.concat([dv2_train, unlabeled_dv2], ignore_index=True)
        # ---------------- extract label for training ---------------- #
        label_train = final_dv1["label"]
        # -------------- train binary co-training ------------------- #
        # Non-mutating drops: the test frames are slices of the caller's data,
        # and dropping in place on them raises SettingWithCopyWarning.
        clf.fit(final_dv1.drop(columns=meta_columns),
                final_dv2.drop(columns=meta_columns),
                label_train)
        # get predicted label
        co_lr_label_predict = clf.predict(dv1_test.drop(columns=meta_columns),
                                          dv2_test.drop(columns=meta_columns))
        allTrueLabel.extend(label_test["label"].values.tolist())
        allPredLabel.extend(co_lr_label_predict)
        if max_folds is not None and fold_number >= max_folds:
            break

    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel, average='macro')

    print(metrics.classification_report(allTrueLabel, allPredLabel))
    print(metrics.confusion_matrix(allTrueLabel, allPredLabel).ravel())

    return accuracy, f1

In [None]:
import numpy as np
import warnings
# create co training classifier
class Co_training_clf(object):
    """Binary co-training over two feature views.

    Two classifiers are trained, one per view. In every iteration they score a
    small pool of unlabeled samples (``U_prime``); samples on which both
    classifiers agree with a joint confidence of at least 0.5 are added to the
    labeled set. Class ``0`` is treated as positive, class ``1`` as negative and
    ``-1`` marks an unlabeled sample.

    Both classifiers must provide ``fit(X, y)``, ``predict(X)`` and
    ``predict_proba(X)``.
    """

    def __init__(self, clf1, clf2=None, max_k=30, p=1, n=1, u = 75):
        """Store the classifiers and the co-training hyper-parameters.

        Parameters
        ----------
        clf1 : classifier for view one.
        clf2 : classifier for view two; when omitted a shallow copy of
            ``clf1`` is used.
        max_k : iteration bound of the labeling loop in ``fit``.
        p : number of confident positive samples to pick per ranking.
        n : number of confident negative samples to pick per ranking.
        u : size of the pool of unlabeled samples scored per iteration.
        """
        import copy  # only needed here; avoids exposing the module as a class attribute

        self.clf1 = clf1
        # assume co_training on one classifier
        if clf2 is None:
            # shallow copy: the second classifier is a separate object of the same type
            self.clf2 = copy.copy(clf1)
        else:
            self.clf2 = clf2
        # number of iteration
        self.max_k = max_k
        # take p example from most confidently positive labels to example
        self.p = p
        # take n example from most confidently negative label to example
        self.n = n
        # size of pool of unlabeled samples
        self.u = u
        # debug
        self.verbose = 0

    def _collect_confident(self, dv1_proba, dv2_proba, conf_measure, class_index, quota, verbose_tag):
        """Walk one confidence ranking and pick up to ``quota`` agreed samples.

        ``conf_measure`` holds pool positions ordered from most to least
        confident for ``class_index``. A position is accepted when both
        classifiers predict the same class and the product of their
        probabilities for ``class_index`` is at least 0.5.
        """
        chosen = []
        pool_size = len(dv1_proba)
        position = 0
        # the position guard only matters for an empty pool
        while len(chosen) < quota and position < pool_size:
            candidate = conf_measure[position]
            dv1_result = np.argmax(dv1_proba[candidate])
            dv2_result = np.argmax(dv2_proba[candidate])
            joint_confidence = dv1_proba[candidate][class_index] * dv2_proba[candidate][class_index]
            if (dv1_result == dv2_result) and (joint_confidence >= 0.5):
                if self.verbose:
                    print(joint_confidence)
                    print(dv1_proba[candidate])
                    print(dv2_proba[candidate])
                chosen.append(candidate)
            position += 1
            if position >= pool_size:
                break
            if self.verbose:
                print(verbose_tag, position)
        return chosen

    def label_p_n_samples(self, dv1_proba, dv2_proba, rank):
        '''
        Only label it if consistent with result from two classifier.

        ``rank[c]`` is the confidence ranking (pool positions, most confident
        first) for class ``c``. Returns ``(p, n)``: the pool positions accepted
        as positive (class 0) and as negative (class 1). Rankings for any
        further class index only print "Class label error".
        '''
        p, n = [], []
        for label, conf_measure in enumerate(rank):
            # 0 positive sample
            if label == 0:
                p = self._collect_confident(dv1_proba, dv2_proba, conf_measure, 0, self.p, "positive idx ")
            # 1 negative sample
            elif label == 1:
                n = self._collect_confident(dv1_proba, dv2_proba, conf_measure, 1, self.n, "negative idx ")
            else:
                print("Class label error")
        return p, n

    def init_L_U_U_prime(self, labels):
        """Split sample indices into labeled (L), unlabeled (U) and the scoring pool.

        ``U_prime`` is a random draw of at most ``self.u`` indices from the
        unlabeled ones; the drawn indices are removed from ``U``.
        """
        # index of the samples that are initially labeled
        L = [i for i, label_i in enumerate(labels) if label_i != -1]
        # index of unlabeled samples
        U = [i for i, label_i in enumerate(labels) if label_i == -1]
        print("Initial L size: ", len(L))
        print("Initial U size: ", len(U))
        # random drawing sample from U
        U_prime = random.sample(U, min(len(U), self.u))
        # remove the samples in U_prime from U
        drawn = set(U_prime)
        U = [x for x in U if x not in drawn]
        return L, U, U_prime

    def fit(self, dataView1, dataView2, labels):
        """Run co-training and fit both classifiers on the enlarged labeled set.

        Parameters
        ----------
        dataView1, dataView2 : pandas.DataFrame
            Row-aligned feature frames of the two views (indexed with ``iloc``).
        labels : array-like
            One entry per row, convertible to int32: 0 positive, 1 negative,
            -1 unlabeled. The caller's object is never modified.

        Returns
        -------
        self
        """
        # np.array always copies; np.asarray would alias an int32 input and the
        # auto-labeling below would then overwrite the caller's labels.
        labels = np.array(labels, dtype='int32')
        print("P: ", self.p, " N: ", self.n)
        L, U, U_prime = self.init_L_U_U_prime(labels)
        print("U: ", len(U))
        print("U_prime: ", len(U_prime))
        #----------- diagnostics on the class balance of the labeled samples -------- #
        pos_sample_num = np.count_nonzero(labels==0)
        neg_sample_num = np.count_nonzero(labels==1)
        # purely informational; guard the division so it cannot abort training
        n_p_ratio = int(neg_sample_num/pos_sample_num) if pos_sample_num else None
        print(pos_sample_num)
        print(neg_sample_num)
        print(n_p_ratio)

        iterCount = 0
        #loop until we have assigned labels to every sample in U and U_prime or we hit our iteration break condition
        # NOTE(review): `<=` allows max_k + 1 passes; confirm whether `<` was intended.
        while iterCount <= self.max_k and U_prime:
            iterCount +=1
            if self.verbose:
                print("step",iterCount, " L: ",L)
                print("step",iterCount, " U_prime: ",U_prime)
            iter_train_d1 = dataView1.iloc[L]
            iter_train_d2= dataView2.iloc[L]
            iter_train_label = labels[L]
            print(iter_train_label.shape)
            self.clf1.fit(iter_train_d1, iter_train_label)
            self.clf2.fit(iter_train_d2, iter_train_label)

            iter_labeling_d1 = dataView1.iloc[U_prime]
            iter_labeling_d2 = dataView2.iloc[U_prime]

            # ---------- 1. rank class probabilities for unlabeled sample for it's confidence measure ---- #
            dv1_proba = self.clf1.predict_proba(iter_labeling_d1)
            dv2_proba = self.clf2.predict_proba(iter_labeling_d2)
            # rank[i] lists pool positions by descending probability of class i
            dv1_proba_rank = [(-class_proba).argsort() for class_proba in dv1_proba.T]
            dv2_proba_rank = [(-class_proba).argsort() for class_proba in dv2_proba.T]
            if self.verbose:
                print(dv1_proba)
                print(dv1_proba_rank)
                print(dv2_proba)
                print(dv2_proba_rank)
            # ----2. use probability to have p, n new label samples (result must consistent with 2 classifier) --- #
            #h1 classifier
            p1,n1 = self.label_p_n_samples(dv1_proba, dv2_proba, dv1_proba_rank)
            # h2 classifier
            p2,n2 = self.label_p_n_samples(dv1_proba, dv2_proba, dv2_proba_rank)
            finalP = set(p1+p2)
            finalN = set(n1+n2)
            print("Final p: ", len(finalP), " Final n: ", len(finalN))

            # ------------ 4. if U_prime not produce new positive or negative sample, resample from U ------ #
            if (len(finalP) ==0) or (len(finalN) ==0):
                # random drawing sample from U (the current pool is discarded)
                U_prime = random.sample(U, min(len(U), self.u))
                # remove the samples in U_prime from U
                drawn = set(U_prime)
                U = [x for x in U if x not in drawn]
            # --------------------------- else add new sample to training set ----------------------------- #
            else:
                if self.verbose:
                    print("P: ", finalP, " N: ", finalN)
                # auto label the samples and remove it from U_prime
                auto_labeled_pos = [U_prime[x] for x in finalP]
                auto_labeled_neg = [U_prime[x] for x in finalN]
                auto_labeled_samples = set(auto_labeled_pos+auto_labeled_neg)
                labels[auto_labeled_pos] = 0
                labels[auto_labeled_neg] = 1
                # extend the labeled sample
                L.extend(auto_labeled_pos)
                L.extend(auto_labeled_neg)
                # remove the labeled sample from U_prime
                U_prime = [x for x in U_prime if x not in auto_labeled_samples]
                if self.verbose:
                    print(U_prime)
                # take the last 2p+2n examples of U to replenish u_prime
                replenishItem = U[-(2*self.p+2*self.n):]
                U_prime.extend(replenishItem)
                U = U[:len(U)-len(replenishItem)]
                print("U: ", len(U))
                print("U_prime: ", len(U_prime))
        print("Total Labeled number: ", len(L), " Still unlabeled number: ", len(U_prime))
        # final train
        newtrain_d1 = dataView1.iloc[L]
        newtrain_d2 = dataView2.iloc[L]
        self.clf1.fit(newtrain_d1, labels[L])
        self.clf2.fit(newtrain_d2, labels[L])
        return self

    def supports_proba(self, clf, x):
        """Return True when ``clf.predict_proba([x])`` runs without raising."""
        try:
            clf.predict_proba([x])
            return True
        except Exception:
            # narrower than a bare except: KeyboardInterrupt/SystemExit still propagate
            return False

    def predict(self, dataView1, dataView2):
        """Predict one class per row by combining both classifiers.

        When the classifiers agree their label is used. On disagreement the
        class with the highest product of the two probability vectors wins;
        if probabilities are unavailable a warning is issued and the label is
        drawn at random from {0, 1}.
        """
        y1 = self.clf1.predict(dataView1)
        y2 = self.clf2.predict(dataView2)
        proba_supported = self.supports_proba(self.clf1, dataView1.iloc[0]) and self.supports_proba(self.clf2, dataView2.iloc[0])
        #fill pred with -1 so we can identify the samples in which sample classifiers failed to agree
        y_pred = np.asarray([-1] * dataView1.shape[0])
        for i, (y1_i, y2_i) in enumerate(zip(y1, y2)):
            # if both agree on label
            if y1_i == y2_i:
                y_pred[i] = y1_i
            # if disagree on label, times probability together, choice the class have higher probabilities
            elif proba_supported:
                y1_probas = self.clf1.predict_proba([dataView1.iloc[i]])[0]
                y2_probas = self.clf2.predict_proba([dataView2.iloc[i]])[0]
                print("y1 disagree on",i, " Proba: ",y1_probas)
                print("y2 not aggreed on ",i, "Proba: ", y2_probas)
                prod_y_probas = [proba_y1 * proba_y2 for (proba_y1, proba_y2) in zip(y1_probas, y2_probas)]
                print("product probas:",prod_y_probas)
                y_pred[i] = prod_y_probas.index(max(prod_y_probas))
                print("result",y_pred[i])
            else:
                #the classifiers disagree and don't support probability, so we guess
                warnings.warn("classifiers disagree with label, result may not accurate")
                print("sample at: ", i, " c1: ", y1_i, " c2: ", y2_i)
                y_pred[i] = random.randint(0, 1)
        #check if predict works
        assert not (-1 in y_pred)
        return y_pred

    def predict_proba(self, dataView1, dataView2):
        """Return the element-wise product of both classifiers' class probabilities.

        The product is not re-normalised, so rows need not sum to one.
        """
        y1_probas = self.clf1.predict_proba(dataView1)
        y2_probas = self.clf2.predict_proba(dataView2)

        proba = (y1_probas*y2_probas)
        return proba

In [None]:
import collections
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Experiment driver: for every textual embedding type and every sample
# threshold, build per-author binary problems inside each name group and
# evaluate co-training (textual view + citation view) with logistic regression.

# fix random seeds for reproducibility: numpy's global state drives
# DataFrame.sample below, while Co_training_clf draws its unlabeled pool with
# the stdlib `random` module, so both generators are seeded
import random
np.random.seed(1)
random.seed(1)

# loop through all files in directory add name to name list
fileDir = "../../Data/"+Dataset+"/canopies/"
# os.listdir gives no ordering guarantee; sort so name groups are visited in a stable order
listfiles = sorted(os.listdir(fileDir))

co_lr_diff_embedding_result = []

# ------------ view two citation is fix, so move out to save time ------- #
# read viewtwo embedding (previously commented out, which left the name
# undefined on a fresh kernel)
print("Load citation embedding: ", pp_citation)
viewtwo_citation_embedding = com_func.read_all_citation_embedding_sorted(emb_type = pp_citation)

#---------------- load different embeddings for view one ---------------#
for select_emb in pp_textual:
    print("Load textual embedding: ", select_emb)
    # read viewone embeddings for this embedding type (previously commented
    # out, so every iteration silently reused whatever the kernel still held)
    viewone_textual_emb = com_func.read_all_textual_embedding_sorted(emb_type=select_emb, training_size = "3m")

    threshold_change_all_co_lr_f1s = []
    threshold_change = []

    # -------------- different threshold (step by 10) -----------------------#
    for step_threshold in range(threshold_lower, threshold_upper, 10):
        threshold_change.append(step_threshold)
        # collect statistic to output
        allname, positive_sample_size, negative_sample_size  = ([] for i in range(3))
        all_labeled_count, unlabeled_count = ([] for i in range(2))

        all_co_LR_accuracy, all_co_LR_f1 = ([] for i in range(2))

        total_selected_group = 0

        # ------- different name group in all name group --------------------#
        for file in listfiles:
            # group name: second and last "_"-separated parts of the file name
            name_parts = file.split("_")
            name = name_parts[1]+"_"+name_parts[-1]
            print("For name: ",name)
            # read pid and aid from file
            data = read_file(fileDir+file)
            # authorID "-1" marks an unlabeled paper
            labeled_mask = data["authorID"] != "-1"
            labeled_data = data[labeled_mask]
            unlabeled_mask = data["authorID"] == "-1"
            unlabeled_rows = data[unlabeled_mask]
            unlabeled_pid = unlabeled_rows["paperID"].tolist()
            print(labeled_data.shape)
            # ---------------- collect all labeled sample -------------------- #
            # ---------------- if use all samples as negative --------------- #
            all_labeled_samples = labeled_data["paperID"].tolist()
            authorCounter = com_func.select_productive_groups(labeled_data, threshold_select_name_group)
            # if only have one class or no class pass the threshold, not applicable
            if(len(authorCounter)==0) or (len(authorCounter)==1):
                print(name," pass")
            else:
                total_selected_group+= 1
                # --------------for each name group---------------- #
                if apply_threshold_to_sample == True:
                    # ---------- only use sample pass threshold ------- #
                    #-------- only select authors in name group are very productive (more than threshold)---------#
                    labeled_data, author_list, _= com_func.only_select_productive_authors(labeled_data, step_threshold)
                    # ----------------- if use filtered samples as negative  --------- #
                    filtered_all_labeled_samples = labeled_data["paperID"].tolist()
                else:
                    # ----------- use all sample in name group --------- #
                    author_list = com_func.productive_authors_list(labeled_data, step_threshold)
                    # nothing was filtered, so negatives come from every labeled sample
                    # (this name was previously undefined on this branch)
                    filtered_all_labeled_samples = all_labeled_samples
                    print(name, " name group sample size: ",labeled_data.shape)

                # -------------- extract all samples for name group -------------- #
                # for each name group
                # read in labeled data
                labeled_viewone_textual = extract_embedding(viewone_textual_emb, labeled_data["paperID"])
                print(labeled_viewone_textual.shape)
                labeled_viewtwo_citation = extract_embedding(viewtwo_citation_embedding, labeled_data["paperID"])
                print(labeled_viewtwo_citation.shape)
                print("Labeled: ",len(labeled_viewone_textual), " : ", len(labeled_viewtwo_citation))

                # read in unlabeled data
                unlabeled_viewone_textual = extract_unlabeled_embedding(viewone_textual_emb, unlabeled_pid)
                print(unlabeled_viewone_textual.shape)
                unlabeled_viewtwo_citation = extract_unlabeled_embedding(viewtwo_citation_embedding, unlabeled_pid)
                print(unlabeled_viewtwo_citation.shape)
                print("Unlabeled: ",len(unlabeled_viewone_textual), " : ", len(unlabeled_viewtwo_citation))

                # remove samples that have no citation link from unlabeled data
                noCitationPids_unlabeled = set(unlabeled_viewone_textual['paperID'])-set(unlabeled_viewtwo_citation['paperID'])
                print("Unlabeled no citation link size: ", len(noCitationPids_unlabeled))
                # process unlabeled data
                unlabeled_dv1 = unlabeled_viewone_textual[~unlabeled_viewone_textual['paperID'].isin(noCitationPids_unlabeled)].reset_index(drop=True)
                unlabeled_dv2 = unlabeled_viewtwo_citation

                # ---------------- shuffle the data ----------------- #
                labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
                # ------------------ alignment ---------------------- #
                # left-join the embeddings onto the shuffled labeled rows so both views share row order
                labeled_viewone_textual = pd.merge(labeled_data, labeled_viewone_textual, left_on="paperID", right_on = [0], how = "left")
                labeled_viewtwo_citation = pd.merge(labeled_data, labeled_viewtwo_citation, left_on="paperID", right_on = [0], how = "left")
                # papers without a citation embedding get an all-zero vector
                labeled_viewtwo_citation.fillna(0, inplace=True)

                print(labeled_viewone_textual.shape)
                print(labeled_viewtwo_citation.shape)
                print(unlabeled_dv1.shape)
                print(unlabeled_dv2.shape)
                counter = 0
                # loop through each author
                for author in author_list:
                    all_labeled_count.append(len(labeled_data))
                    unlabeled_count.append(len(unlabeled_dv1))
                    author_name = name+'_'+str(counter)
                    allname.append(author_name)
                    print(author_name, " : ", author)
                    mask = labeled_data["authorID"] == author
                    author_rows = labeled_data[mask]
                    positive_sample_pid = author_rows["paperID"].tolist()
                    negative_sample_pid = extractNegativeSample(positive_sample_pid, filtered_all_labeled_samples)

                    # save number of positive and negative samples
                    positive_sample_size.append(len(positive_sample_pid))
                    negative_sample_size.append(len(negative_sample_pid))

                    # ----------------- generate binary label ------------------ #
                    # form positive and negative (negative class come from similar name group)
                    # enumerate() below assigns label 0 to the positive ids and 1 to the negative ids
                    all_authors = []
                    all_authors.append(positive_sample_pid)
                    all_authors.append(negative_sample_pid)
                    appended_data = []
                    for label, pid in enumerate(all_authors):
                        # create df save one author data
                        authordf = pd.DataFrame({"paperID":pid})
                        authordf['label'] = label
                        appended_data.append(authordf)
                    label_pid = pd.concat(appended_data, axis=0,ignore_index=True)
                    # ----------- alignment of label with input data ------------ #
                    label_pid = pd.merge(labeled_viewone_textual["paperID"].to_frame(), label_pid, on = "paperID")
                    #------------- process data for k-fold cv ------------------- #
                    # throw away some column for labeled data
                    labeled_dv1 = labeled_viewone_textual.drop(["authorID", 0], axis=1)
                    labeled_dv2 = labeled_viewtwo_citation.drop(["authorID", 0], axis=1)
                    # merge label into data
                    labeled_dv1 = pd.merge(labeled_dv1, label_pid, on = "paperID")
                    labeled_dv2 = pd.merge(labeled_dv2, label_pid, on = "paperID")
                    label = label_pid.drop(["paperID"], axis=1)
                    # ------------ fit co-training model with k-fold ------------------------ #
                    co_logistic_clf = Co_training_clf(clf1=LogisticRegression(),p=1,n=1)
                    co_lr_accuracy, co_lr_f1 = k_fold_cv_co_train_binary(labeled_dv1, labeled_dv2, unlabeled_dv1, unlabeled_dv2,
                                                                  label, co_logistic_clf, 10)

                    all_co_LR_accuracy.append(co_lr_accuracy)
                    all_co_LR_f1.append(co_lr_f1)
                    counter+=1

                    # NOTE(review): stops after the first author of the name group;
                    # remove this break for a full run
                    break
                # NOTE(review): stops after the first name group that passes the
                # selection; remove this break for a full run
                break

        # NOTE(review): result export and accumulation are disabled below, so
        # threshold_change_all_co_lr_f1s and co_lr_diff_embedding_result stay empty.
#         # write evaluation result to excel
#         output = pd.DataFrame({'Author Name':allname, "positive sample size":positive_sample_size,"negative sample size":negative_sample_size, 
#                                "labeled sample size": all_labeled_count, "unlabeled sample size": unlabeled_count, 
#                                "co_logisticRegression Accuracy":all_co_LR_accuracy, "co_logisticRegression F1": all_co_LR_f1})
#         savePath = "../../result/"+Dataset+"/co_train_binary_advanced/"
#         filename = "(Global emb sample 3m) viewone_textual="+select_emb+"_viewtwo_citation="+pp_citation+"_threshold="+str(step_threshold)+"_namegroupcount="+str(total_selected_group)+".csv"
#         com_func.write_csv_df(savePath, filename, output)
#         print("Done")
        
#         threshold_change_all_co_lr_f1s.append(all_co_LR_f1)
        
#     co_lr_diff_embedding_result.append(threshold_change_all_co_lr_f1s)

In [None]:
# Number of positive papers of the last author processed by the experiment
# cell; relies on `positive_sample_pid` still being bound after that loop.
print(len(positive_sample_pid))

In [None]:
from statistics import mean 

# Show the accumulated F1 results. Both lists are created by the experiment
# cell; they are only filled by statements that are currently commented out there.
print(threshold_change_all_co_lr_f1s)
print(co_lr_diff_embedding_result)

In [None]:
# %whos
# Drop the large feature tables if they exist. Nothing in this notebook defines
# these names, so a plain `del` raised NameError on a fresh kernel; popping
# them from the namespace tolerates their absence.
for _stale_name in ("v1_all_features", "v2_all_features"):
    globals().pop(_stale_name, None)