In [1]:
import os
import sys
import warnings

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Make the parent directory importable so the project-local `com_func` module
# (imported just below) can be found when the notebook runs from this folder.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import com_func
# parameters
#----- threshold for selecting set of name group -----------#
# Passed to com_func.select_productive_groups in the experiment cell.
threshold_select_name_group = 100
#----- threshold for selecting min sample in name group ----#
# The experiment cell iterates range(threshold_lower, threshold_upper, 10) and
# hands each value to com_func.only_select_productive_authors.
threshold_lower = 10
threshold_upper = 50

# View-one (textual) embedding types to evaluate; the experiment runs once per entry.
pp_textual = ["pv_dbow"]
# pp_textual = ["lsa", "pv_dm", "pv_dbow"]
# View-two (citation) embedding type.
pp_citation = "n2v"

# Dataset folder name used when building the ../../Data/ and ../../result/ paths.
Dataset = "pubmed"

In [2]:
import pickle
import gensim
# read trained rec to rec textual graph
def read_textual_embedding(Dataset = "pubmed", emb_type = "off"):
    """Load the pre-trained textual (view-one) embedding for every paper.

    Parameters
    ----------
    Dataset : str
        Dataset folder name under ``../../Data/``.
    emb_type : str
        ``"lsa"``, ``"pv_dm"``, ``"pv_dbow"`` or ``"off"``. Any other value is
        reported and treated as ``"off"``.

    Returns
    -------
    list or numpy.ndarray
        ``"lsa"``: an array whose first column is the paper id followed by the
        LSA vector. ``"pv_dm"`` / ``"pv_dbow"``: a list with one list of
        space-separated string tokens per line of the vector file.
        ``"off"`` / unknown: an empty list.
    """
    # NOTE(review): the pv_dm entry points at a "dbow" file and the pv_dbow entry at
    # a "dmm" file. The paths are kept exactly as they were; confirm against the
    # files on disk that the two are not swapped.
    d2v_vector_files = {
        "pv_dm": "Doc2Vec(dbow,d100,n5,mc3,s0.001,t24).txt",
        "pv_dbow": "Doc2Vec(dmm,d100,n5,w5,mc3,s0.001,t24).txt",
    }
    textual_emb = []
    if emb_type == "lsa":
        modelSaveDir = "../../Data/"+Dataset+"/models/lsa/textual_sample=3m/"
        # pickle.load executes arbitrary code from the file: only load trusted artifacts.
        with open(modelSaveDir+'lsa_Matrix.pickle', "rb") as input_file:
            vec = pickle.load(input_file)
        with open(modelSaveDir+'feature_pid.pickle', "rb") as input_file:
            allPaperid = pickle.load(input_file)
        allPaperid = np.array(allPaperid)
        textual_emb = np.column_stack((allPaperid,vec))
    elif emb_type in d2v_vector_files:
        loadDir = "../../Data/"+Dataset+"/vectors/d2v/textual_sample=3m/"+d2v_vector_files[emb_type]
        # The with-block closes the file; no explicit close() is needed.
        with open(loadDir, 'r', encoding = 'utf8') as f:
            textual_emb = [line.split(" ") for line in f]
    elif emb_type != "off":
        print("Embedding type not available, selecting default setting")

    print("Total textual vector records:",len(textual_emb))
    # With emb_type "off" (the default) nothing is loaded, so there is no first row to measure.
    if len(textual_emb) > 0:
        print("Vector dimension: ", len(textual_emb[0]))
    return textual_emb

In [3]:
# read trained rec to rec node2vec citation graph
def read_citation_embedding(Dataset = "pubmed", emb_type = "off"):
    """Load the pre-trained citation (view-two) embedding for every paper.

    Parameters
    ----------
    Dataset : str
        Dataset folder name under ``../../Data/``.
    emb_type : str
        ``"n2v"`` or ``"off"``. Any other value is reported and treated as ``"off"``.

    Returns
    -------
    list
        One list of space-separated string tokens per accepted line of the
        vector file, or an empty list when the view is switched off.
    """
    citation_emb = []
    if emb_type == "n2v":
        citation_emb_dir = "../../Data/"+Dataset+"/vectors/"+emb_type+"/n2v.txt"
        # The with-block closes the file; no explicit close() is needed.
        with open(citation_emb_dir, 'r', encoding = 'utf8') as f:
            for line in f:
                read_data = line.split(" ")
                # Keep only lines with exactly 101 tokens
                # (presumably paper id + 100-d vector, which also skips a header line -- confirm).
                if len(read_data) == 101:
                    citation_emb.append(read_data)
    elif emb_type != "off":
        print("Embedding type not available, selecting default setting")

    print("Total citation vector records:",len(citation_emb))
    # With emb_type "off" (the default) nothing is loaded, so there is no first row to measure.
    if len(citation_emb) > 0:
        print("Vector dimension: ", len(citation_emb[0]))
    return citation_emb

In [4]:
def extract_embedding(all_embedding, wanted_pid_list):
    """Pick the embedding rows whose paper id is in ``wanted_pid_list``.

    Both sequences are walked once in ascending paper-id order, so
    ``all_embedding`` must already be sorted by the integer value of its
    first element.

    Parameters
    ----------
    all_embedding : sequence of rows
        Each row starts with the paper id, followed by the vector components.
    wanted_pid_list : pandas.Series
        Paper ids to extract; duplicates are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per wanted paper that has an embedding, in ascending paper-id
        order. Empty when nothing matched.
    """
    wanted_pids = sorted({int(pid) for pid in wanted_pid_list.values.tolist()})
    extracted_emb = []
    total_missing_sample = 0
    # only if embedding exist
    if len(all_embedding) > 0:
        # A cursor replaces the repeated list.remove() at the head, which made the walk quadratic.
        cursor = 0
        for embedding in all_embedding:
            if cursor >= len(wanted_pids):
                break
            emb_pid = int(embedding[0])
            # Wanted ids smaller than the current embedding id have no embedding at all.
            while cursor < len(wanted_pids) and wanted_pids[cursor] < emb_pid:
                total_missing_sample += 1
                cursor += 1
            if cursor < len(wanted_pids) and wanted_pids[cursor] == emb_pid:
                extracted_emb.append(embedding)
                cursor += 1
        # Wanted ids beyond the last embedding id are missing too; they were not counted before.
        total_missing_sample += len(wanted_pids) - cursor
    print("Total missing sample: ", total_missing_sample)
    extracted_emb = pd.DataFrame(extracted_emb)
    return extracted_emb

In [5]:
# collect unlabeled vectors
def extract_unlabeled_embedding(allembedding, unlabeled_pid):
    """Collect the embedding rows of the unlabeled papers.

    ``allembedding`` must be sorted by the integer value of its first element
    (the paper id); both sequences are walked once in ascending order.

    Parameters
    ----------
    allembedding : sequence of rows
        Each row starts with the paper id, followed by the vector components.
    unlabeled_pid : iterable
        Paper ids of the unlabeled samples; duplicates are ignored.

    Returns
    -------
    pandas.DataFrame
        Matching rows with the first column renamed to ``paperID`` and an
        ``authorID`` column filled with ``"-1"``. When nothing matches, an
        empty frame that still has the ``paperID`` and ``authorID`` columns.
    """
    wanted_pids = sorted({int(pid) for pid in unlabeled_pid})
    wanted_embedding = []
    # A cursor replaces the repeated list.remove() at the head, which made the walk quadratic.
    cursor = 0
    for embedding in allembedding:
        if cursor >= len(wanted_pids):
            break
        emb_pid = int(embedding[0])
        # skip wanted papers that are not in the embedding set
        while cursor < len(wanted_pids) and wanted_pids[cursor] < emb_pid:
            cursor += 1
        if cursor < len(wanted_pids) and wanted_pids[cursor] == emb_pid:
            wanted_embedding.append(embedding)
            cursor += 1
    if not wanted_embedding:
        # Without this the result had no paperID column and later column lookups failed.
        return pd.DataFrame(columns=["paperID", "authorID"])
    unlabeled_data = pd.DataFrame(wanted_embedding)
    unlabeled_data['authorID'] = "-1"
    unlabeled_data = unlabeled_data.rename(columns={0: 'paperID'})
    return unlabeled_data

In [6]:
def read_file(infile):
    """Read a tab-separated canopy file into a paperID / authorID frame.

    Lines that split into 12 or 13 tab-separated fields are kept; the first
    field becomes ``paperID`` and the second ``authorID``. For any other line
    the number of fields found is printed and the line is skipped.
    """
    records = []
    with open(infile, 'r', encoding = 'utf8') as handle:
        for raw_line in handle:
            fields = raw_line.split("\t")
            # report and skip badly formatted lines
            if len(fields) not in (12, 13):
                print(len(fields))
                continue
            records.append({"paperID": fields[0], "authorID": fields[1]})
    return pd.DataFrame(records)

In [7]:
# remove author(positive sample) from other(negative sample)
import random
def extractNegativeSample(positiveSample, allSample):
    """Return the items of ``allSample`` that are not in ``positiveSample``.

    Order and duplicates of ``allSample`` are preserved.
    """
    positives = list(positiveSample)
    try:
        # Set lookup is O(1) per item instead of scanning the positive list each time.
        positive_lookup = set(positives)
        return [sample for sample in allSample if sample not in positive_lookup]
    except TypeError:
        # Unhashable items (e.g. lists): fall back to the list scan.
        return [sample for sample in allSample if sample not in positives]

In [8]:
# Some records have no citation links, so those papers have to be removed from the train and test sets.
# synchronize data wrt pid
def synchro_views(labeled_dv1, labeled_dv2, unlabeled_data1, unlabeled_data2):
    """Restrict both views to the papers that have an embedding in each of them.

    Parameters
    ----------
    labeled_dv1, labeled_dv2 : pandas.DataFrame
        Labeled embeddings of view one / view two; column ``0`` holds the paper id.
    unlabeled_data1, unlabeled_data2 : pandas.DataFrame
        Unlabeled embeddings of view one / view two with a ``paperID`` column.

    Returns
    -------
    tuple of four pandas.DataFrame
        ``(labeled_dv1, labeled_dv2, unlabeled_dv1, unlabeled_dv2)``, each
        filtered to the paper ids shared by its pair and re-indexed from 0.
        Row order of the inputs is preserved.
    """
    labeled_pid1, labeled_pid2 = set(labeled_dv1[0]), set(labeled_dv2[0])
    unlabeled_pid1 = set(unlabeled_data1["paperID"])
    unlabeled_pid2 = set(unlabeled_data2["paperID"])

    noCitationPids_labeled = labeled_pid1 - labeled_pid2
    print("labeled no citation link: ", len(noCitationPids_labeled))
    noCitationPids_unlabeled = unlabeled_pid1 - unlabeled_pid2
    print("Unlabeled no citation link size: ", len(noCitationPids_unlabeled))

    # Filter BOTH views. Previously only view one was filtered, so a paper present
    # only in view two left the views with different rows, which breaks the
    # positional pairing used downstream.
    labeled_common = labeled_pid1 & labeled_pid2
    unlabeled_common = unlabeled_pid1 & unlabeled_pid2
    if len(labeled_pid2 - labeled_pid1) > 0:
        print("labeled rows only in view two dropped: ", len(labeled_pid2 - labeled_pid1))
    if len(unlabeled_pid2 - unlabeled_pid1) > 0:
        print("Unlabeled rows only in view two dropped: ", len(unlabeled_pid2 - unlabeled_pid1))

    # process unlabeled data
    unlabeled_dv1 = unlabeled_data1[unlabeled_data1["paperID"].isin(unlabeled_common)].reset_index(drop=True)
    unlabeled_dv2 = unlabeled_data2[unlabeled_data2["paperID"].isin(unlabeled_common)].reset_index(drop=True)
    # process labeled data
    labeled_dv1_final = labeled_dv1[labeled_dv1[0].isin(labeled_common)].reset_index(drop=True)
    labeled_dv2_final = labeled_dv2[labeled_dv2[0].isin(labeled_common)].reset_index(drop=True)
    # inputs are sorted by pid, so the filtered frames stay in pid order
    return labeled_dv1_final, labeled_dv2_final, unlabeled_dv1, unlabeled_dv2

In [9]:
import numpy as np
import warnings
# create co training classifier
class Co_training_clf(object):
    """Two-view co-training wrapper around a pair of probabilistic classifiers.

    Each classifier is trained on its own view of the same samples. In every
    round both classifiers score a pool of unlabeled samples, the most
    confident ones receive a label (0 = positive, 1 = negative) and are added
    to the training set for the next round.

    Parameters
    ----------
    clf1 : estimator with fit / predict / predict_proba
        Classifier for view one.
    clf2 : estimator, optional
        Classifier for view two; a deep copy of ``clf1`` when omitted.
    p, n : int
        Number of most-confident positive / negative samples each classifier
        nominates per round.
    k : int
        Maximum number of co-training rounds.
    u : int
        Size of the pool of unlabeled samples scored in each round.
    """

    def __init__(self, clf1, clf2=None, p=1, n=1, k=30, u = 75):
        import copy

        self.clf1 = clf1
        # assume co_training on one classifier type when clf2 is not given;
        # a deep copy makes sure the two views never share nested estimator state
        if clf2 is None:
            self.clf2 = copy.deepcopy(clf1)
        else:
            self.clf2 = clf2
        # take p example from most confidently positive labels to example
        self.p = p
        # take n example from most confidently negative label to example
        self.n = n
        # number of iteration
        self.k = k
        # size of pool of unlabeled samples
        self.u = u

    def label_p_n_samples(self, rank):
        """Pick the top-ranked pool positions for the positive and negative class.

        ``rank[i]`` holds the pool positions ordered by descending confidence
        for class ``i``. Returns ``(p, n)``: up to ``self.p`` positions for
        class 0 and up to ``self.n`` positions for class 1. Fewer are returned
        when the pool is smaller than the request.
        """
        p, n = [], []
        for label, conf_measure in enumerate(rank):
            # 0 positive sample
            if label == 0:
                p.extend(conf_measure[:self.p])
            # 1 negative sample
            elif label == 1:
                n.extend(conf_measure[:self.n])
            else:
                print("Class label error")
        return p, n

    def fit(self, dataView1, dataView2, labels):
        """Run co-training and fit both classifiers on the enlarged labeled set.

        Parameters
        ----------
        dataView1, dataView2 : pandas.DataFrame
            Row-aligned feature frames of the two views.
        labels : array-like
            0 (positive), 1 (negative) or -1 (unlabeled) per row. A single
            column is accepted. The caller's object is not modified.

        Returns
        -------
        Co_training_clf
            ``self``.
        """
        import random

        # Copy and flatten: pseudo-labels are written into this array, and the
        # estimators expect a 1-D target.
        labels = np.array(labels, dtype='int32').ravel()
        print("P: ", self.p, " N: ", self.n)
        assert(self.p > 0 and self.n > 0 and self.k > 0 and self.u > 0)

        # index of the samples that are initially labeled
        L = [i for i, label_i in enumerate(labels) if label_i != -1]
        # index of unlabeled samples
        U = [i for i, label_i in enumerate(labels) if label_i == -1]
        print("Initial L size: ", len(L))
        print("Initial U size: ", len(U))
        # random drawing sample from U
        random.shuffle(U)
        pool_size = min(len(U), self.u)
        U_prime = U[len(U)-pool_size:]
        # remove the samples in U_prime from U
        U = U[:len(U)-pool_size]
        iterCount = 0
        # loop until the pool is exhausted or k rounds were run
        # (the previous "<=" comparison ran k+1 rounds)
        while iterCount < self.k and U_prime:
            iterCount += 1
            iter_train_label = labels[L]
            self.clf1.fit(dataView1.iloc[L], iter_train_label)
            self.clf2.fit(dataView2.iloc[L], iter_train_label)

            # class probabilities of the pool are the confidence measure
            dv1_proba = self.clf1.predict_proba(dataView1.iloc[U_prime])
            dv2_proba = self.clf2.predict_proba(dataView2.iloc[U_prime])
            # *_rank[i] lists pool positions by descending confidence for class i
            dv1_proba_rank = [(-class_proba).argsort() for class_proba in dv1_proba.T]
            dv2_proba_rank = [(-class_proba).argsort() for class_proba in dv2_proba.T]
            # h1 classifier
            p1, n1 = self.label_p_n_samples(dv1_proba_rank)
            # h2 classifier
            p2, n2 = self.label_p_n_samples(dv2_proba_rank)
            finalN = set(n1+n2)
            # A sample nominated for both classes keeps the negative label, exactly
            # as the former assignment order did, but is now added to L only once.
            finalP = set(p1+p2) - finalN
            # auto label the samples and remove it from U_prime
            auto_labeled_pos = [U_prime[x] for x in finalP]
            auto_labeled_neg = [U_prime[x] for x in finalN]
            labels[auto_labeled_pos] = 0
            labels[auto_labeled_neg] = 1
            # extend the labeled sample
            L.extend(auto_labeled_pos)
            L.extend(auto_labeled_neg)
            # remove the labeled sample from U_prime
            newly_labeled = set(auto_labeled_pos+auto_labeled_neg)
            U_prime = [x for x in U_prime if x not in newly_labeled]
            # take up to 2p+2n examples from the shuffled U to replenish U_prime
            replenishItem = U[-(2*self.p+2*self.n):]
            U_prime.extend(replenishItem)
            U = U[:len(U)-len(replenishItem)]
        print("Total Labeled number: ", len(L), " Still unlabeled number: ", len(U_prime))
        # final train
        self.clf1.fit(dataView1.iloc[L], labels[L])
        self.clf2.fit(dataView2.iloc[L], labels[L])
        return self

    def supports_proba(self, clf, x):
        """Return True when ``clf.predict_proba`` works on the single sample ``x``."""
        try:
            clf.predict_proba([x])
            return True
        except Exception:
            return False

    def predict(self, dataView1, dataView2):
        """Predict one label per row by combining both classifiers.

        When the classifiers agree their label is used. When they disagree the
        class with the larger product of the two probability vectors wins.
        Without probability support the label is guessed at random.
        """
        import random

        y1 = self.clf1.predict(dataView1)
        y2 = self.clf2.predict(dataView2)
        proba_supported = self.supports_proba(self.clf1, dataView1.iloc[0]) and self.supports_proba(self.clf2, dataView2.iloc[0])
        # fill pred with -1 so we can identify the samples in which sample classifiers failed to agree
        y_pred = np.asarray([-1] * dataView1.shape[0])
        for i, (y1_i, y2_i) in enumerate(zip(y1, y2)):
            # if both agree on label
            if y1_i == y2_i:
                y_pred[i] = y1_i
            # if disagree on label, multiply the probabilities and take the more likely class
            elif proba_supported:
                y1_probas = self.clf1.predict_proba([dataView1.iloc[i]])[0]
                y2_probas = self.clf2.predict_proba([dataView2.iloc[i]])[0]
                print("y1 disagree on",i, " Proba: ",y1_probas)
                print("y2 not aggreed on ",i, "Proba: ", y2_probas)
                prod_y_probas = [proba_y1 * proba_y2 for (proba_y1, proba_y2) in zip(y1_probas, y2_probas)]
                print("product probas:",prod_y_probas)
                # NOTE(review): this stores the column index, which equals the class
                # label only while the classes are exactly 0 and 1.
                y_pred[i] = prod_y_probas.index(max(prod_y_probas))
                print("result",y_pred[i])
            else:
                # the classifiers disagree and don't support probability, so we guess
                warnings.warn("classifiers disagree with label, result may not accurate")
                print("sample at: ", i, " c1: ", y1_i, " c2: ", y2_i)
                y_pred[i] = random.randint(0, 1)
        # check if predict works
        assert not (-1 in y_pred)
        return y_pred

    def predict_proba(self, dataView1, dataView2):
        """Return the element-wise product of both classifiers' class probabilities.

        The rows are not re-normalised, so they need not sum to 1.
        """
        y1_probas = self.clf1.predict_proba(dataView1)
        y2_probas = self.clf2.predict_proba(dataView2)

        proba = (y1_probas*y2_probas)
        return proba

In [10]:
import copy
# self defined one vs rest
class co_train_one_vs_rest:
    """Hand-rolled one-vs-rest wrapper that trains one co-training classifier per author.

    After ``fit_one_vs_rest`` the instance exposes ``classes`` (author ids
    without the ``"-1"`` unlabeled marker), ``binary_clf`` (one fitted
    classifier per entry of ``classes``) and the per-author
    ``positive_sample_size`` / ``negative_sample_size`` lists.
    """

    def __init__ (self):
        self.binary_clf = []

    def fit_one_vs_rest(self, all_train_dv1, all_train_dv2, clf):
        """Fit one binary (author vs. rest) co-training classifier per author.

        Parameters
        ----------
        all_train_dv1, all_train_dv2 : pandas.DataFrame
            Row-aligned views with ``paperID`` and ``authorID`` columns plus
            the feature columns. Rows with ``authorID == "-1"`` are unlabeled.
        clf : object
            Template with ``fit(view1, view2, label)``; it is deep-copied for
            every author, so the template itself is never fitted.

        Returns
        -------
        co_train_one_vs_rest
            ``self``.
        """
        # start from a clean state so calling fit again does not pile up classifiers
        self.binary_clf = []
        # ----------- binary statistic collection --------------#
        self.positive_sample_size = []
        self.negative_sample_size = []
        # --------- split labeled/unlabeled from all data ----------------#
        labeled_mask = all_train_dv1["authorID"] != "-1"
        labeled_processed_dv1 = all_train_dv1[labeled_mask]
        labeled_processed_dv2 = all_train_dv2[labeled_mask]
        all_labeled_sample_pid = labeled_processed_dv1["paperID"].tolist()

        unlabeled_mask = all_train_dv1["authorID"] == "-1"
        # the "-1" author marker doubles as the unlabeled marker of the binary label column
        unlabeled_dv1 = all_train_dv1[unlabeled_mask].rename(columns={"authorID": 'label'})
        unlabeled_dv2 = all_train_dv2[unlabeled_mask].rename(columns={"authorID": 'label'})
        print("labled_samples: ", labeled_processed_dv1.shape)
        print("unlabled_samples: ", unlabeled_dv1.shape)
        # ---------------- generate binary labels -------------------- #

        self.classes = np.unique(all_train_dv1["authorID"]).tolist()
        for author in self.classes:
            # "-1" marks unlabeled rows; it is not an author class
            if author == "-1":
                continue
            print("Binary clf: ", author)
            mask = labeled_processed_dv1["authorID"] == author
            positive_sample_pid = labeled_processed_dv1[mask]["paperID"].tolist()
            positive_pid_lookup = set(positive_sample_pid)
            # negative class: every other labeled paper of the same name group
            negative_sample_pid = [pid for pid in all_labeled_sample_pid if pid not in positive_pid_lookup]
            # append to statistic collection
            self.positive_sample_size.append(len(positive_sample_pid))
            self.negative_sample_size.append(len(negative_sample_pid))
            # label 0 = papers of this author, label 1 = the rest
            appended_data = []
            for label, pid in enumerate([positive_sample_pid, negative_sample_pid]):
                authordf = pd.DataFrame({"paperID":pid})
                authordf['label'] = label
                appended_data.append(authordf)
            processed_data = pd.concat(appended_data, axis=0,ignore_index=True)

            # alignment
            processed_data = pd.merge(labeled_processed_dv1["paperID"].to_frame(), processed_data, on="paperID")
            # -------------construct binary labeleds dataset ----------------#
            dv1_with_binary_label = pd.merge(labeled_processed_dv1, processed_data, on="paperID", how = 'outer')
            dv2_with_binary_label = pd.merge(labeled_processed_dv2, processed_data, on="paperID", how = 'outer')
            dv1_with_binary_label = dv1_with_binary_label.drop(["authorID"], axis=1).reset_index(drop=True)
            dv2_with_binary_label = dv2_with_binary_label.drop(["authorID"], axis=1).reset_index(drop=True)
            # ------------- add unlabeled data to form final dataset ---------#
            final_dv1 = pd.concat([dv1_with_binary_label,unlabeled_dv1], ignore_index=True)
            final_dv2 = pd.concat([dv2_with_binary_label,unlabeled_dv2], ignore_index=True)
            # ---------------------- final data ------------------------------#
            label = final_dv1[["label"]]
            final_dv1 = final_dv1.drop(["paperID", "label"], axis=1)
            final_dv2 = final_dv2.drop(["paperID", "label"], axis=1)
            print(final_dv1.shape)
            print(final_dv2.shape)

            # train a fresh copy so every author gets an independent classifier
            traing_clf = copy.deepcopy(clf)
            traing_clf.fit(final_dv1, final_dv2, label)
            self.binary_clf.append(traing_clf)
        print(self.classes)
        print(self.positive_sample_size)
        print(self.negative_sample_size)
        # Drop the unlabeled marker if present. list.remove('-1') raised ValueError
        # whenever the training data contained no unlabeled rows.
        self.classes = [author for author in self.classes if author != "-1"]
        return self

    def predict(self, dataviewone, dataviewtwo):
        """Return, per row, the author whose binary classifier is most confident.

        The per-author probabilities of label 0 ("belongs to this author") are
        also stored as a DataFrame in ``self.predict_proba``.
        """
        # only look at probability of 0 (belongs to that author)
        author_proba = pd.DataFrame({
            author: author_clf.predict_proba(dataviewone, dataviewtwo)[:,0]
            for author, author_clf in zip(self.classes, self.binary_clf)
        })
        self.predict_proba = author_proba
        labels = author_proba.idxmax(axis=1).values
        return labels
    
    

In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score
# cross validation
def k_fold_cv_co_train_ovr(dataview1, dataview2, unlabeled_dv1, unlabeled_dv2, label, clf, k=10):
    """Stratified k-fold evaluation of the one-vs-rest co-training classifier.

    Parameters
    ----------
    dataview1, dataview2 : pandas.DataFrame
        Row-aligned labeled views with ``paperID`` and ``authorID`` columns
        plus the feature columns.
    unlabeled_dv1, unlabeled_dv2 : pandas.DataFrame
        Unlabeled rows of each view, appended to every training fold.
    label : array-like
        True author id per row of ``dataview1``; used for stratification and
        as ground truth.
    clf : object
        Co-training classifier template handed to ``co_train_one_vs_rest``.
    k : int
        Number of folds.

    Returns
    -------
    tuple of float
        ``(accuracy, macro F1)`` computed over the predictions of all folds.
    """
    # Index by position. Indexing a Series with the fold positions is label-based
    # and only worked when the Series index happened to be 0..n-1.
    label_values = np.asarray(label)
    # Training reads the authorID column of the views while scoring uses `label`;
    # when the two disagree the reported scores are meaningless.
    if "authorID" in dataview1.columns and not np.array_equal(label_values, dataview1["authorID"].values):
        warnings.warn("label does not match dataview1['authorID'] row by row; scores will be unreliable")

    kf = StratifiedKFold(n_splits=k)
    allTrueLabel = []
    allPredLabel = []
    for train_index, test_index in kf.split(dataview1, label_values):
        # ---------------split train and test -------------------- #
        dv1_train, dv1_test = dataview1.iloc[train_index], dataview1.iloc[test_index]
        dv2_train, dv2_test = dataview2.iloc[train_index], dataview2.iloc[test_index]
        label_test = label_values[test_index]
        # -------------- add unlabeled to train ------------------ #
        final_dv1 = pd.concat([dv1_train,unlabeled_dv1], ignore_index=True)
        final_dv2 = pd.concat([dv2_train,unlabeled_dv2], ignore_index=True)
        # -------------- train ovr co-training ------------------- #
        ovr_clf = co_train_one_vs_rest().fit_one_vs_rest(final_dv1, final_dv2, clf)

        # drop on a new frame: in-place drops on an iloc slice trigger SettingWithCopyWarning
        dv1_test = dv1_test.drop(["authorID", "paperID"], axis=1)
        dv2_test = dv2_test.drop(["authorID", "paperID"], axis=1)
        # get predicted label
        co_lr_label_predict = ovr_clf.predict(dv1_test, dv2_test)
        allTrueLabel.extend(label_test)
        allPredLabel.extend(co_lr_label_predict)

    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel,average='macro')

    print(metrics.classification_report(allTrueLabel, allPredLabel))
    print(metrics.confusion_matrix(allTrueLabel, allPredLabel).ravel())

    return accuracy, f1

In [None]:
# load the file
import sys
import io
import os
import collections
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from statistics import mean 

# fix random seeds for reproducibility
# NumPy's generator drives DataFrame.sample; the co-training classifier shuffles
# with the stdlib `random` module, which has to be seeded separately.
import random
np.random.seed(1)
random.seed(1)

# loop through all files in directory add name to name list
fileDir = "../../Data/"+Dataset+"/canopies/"
# os.listdir returns entries in arbitrary order; sort so every run visits the
# name groups in the same order
listfiles = sorted(os.listdir(fileDir))

co_lr_diff_embedding_result = []

# ------------ view two citation is fix, so move out to save time ------- #
# read viewtwo embedding
print("Load citation embedding: ", pp_citation)
viewtwo_citation_embedding = com_func.read_all_citation_embedding_sorted(emb_type = pp_citation)

# Experiment driver: for every textual embedding and every author-size threshold,
# run 10-fold co-training (logistic regression, one-vs-rest) on each selected name
# group and write one CSV of per-group scores.
for select_emb in pp_textual:
    #---------------- load embeddings for different view ---------------#
    print("Load textual embedding: ", select_emb)
    # read viewone embeddings
    viewone_textual_emb = com_func.read_all_textual_embedding_sorted(emb_type=select_emb, training_size = "3m")

    # one list of per-name-group F1 scores per threshold
    threshold_change_all_co_lr_f1s = []
    threshold_change = []

    # -------------- different threshold (step by 10) -----------------------#
    for step_threshold in range(threshold_lower, threshold_upper, 10):
        threshold_change.append(step_threshold)
        # collect statistic to output
        allname, num_class, per_class_count, all_labeled_count, selected_labeled_count, unlabeled_count = ([] for i in range(6))

        all_co_LR_accuracy, all_co_LR_f1, = ([] for i in range(2))

        total_selected_group = 0

        # ------- different name group in all name group --------------------#
        for file in listfiles:
            # group name
            temp = file.split("_")
            name = temp[1]+"_"+temp[-1]
            print("For name: ",name)
            # read pid and aid from file
            data = read_file(fileDir+file)
            labeled_mask = data["authorID"] != "-1"
            labeled_data = data[labeled_mask]
            unlabeled_mask = data["authorID"] == "-1"
            unlabeled_data = data[unlabeled_mask]
            unlabeled_pid = unlabeled_data["paperID"]
            print(len(unlabeled_pid))
            print(labeled_data.shape)
            #----------- select name group contain productive author------------------------------------#
            #----------- (contain pair of author write more than 100 papers) ---------------------------#
            # count number of paper each author write based on author ID
            authorCounter = com_func.select_productive_groups(labeled_data, threshold_select_name_group)
            # if only have one class or no class pass the threshold, not applicable
            if len(authorCounter) <= 1:
                print(name, " pass")
                continue

            total_selected_group+= 1
            labeled_data, author_list, paperCounter= com_func.only_select_productive_authors(labeled_data, step_threshold)
            allname.append(name)
            all_labeled_count.append(len(labeled_data))
            num_class.append(len(paperCounter))
            per_class_count.append(paperCounter)

            # -------------- extract all samples for name group -------------- #
            # read in labeled data
            labeled_viewone_textual = extract_embedding(viewone_textual_emb, labeled_data["paperID"])
            print(labeled_viewone_textual.shape)
            labeled_viewtwo_citation = extract_embedding(viewtwo_citation_embedding, labeled_data["paperID"])
            print(labeled_viewtwo_citation.shape)
            print("Labeled: ",len(labeled_viewone_textual), " : ", len(labeled_viewtwo_citation))

            # read in unlabeled data
            unlabeled_viewone_textual = extract_unlabeled_embedding(viewone_textual_emb, unlabeled_pid)
            print(unlabeled_viewone_textual.shape)
            unlabeled_viewtwo_citation = extract_unlabeled_embedding(viewtwo_citation_embedding, unlabeled_pid)
            print(unlabeled_viewtwo_citation.shape)
            print("Unlabeled: ",len(unlabeled_viewone_textual), " : ", len(unlabeled_viewtwo_citation))

            # synchronize different view based on pid
            sorted_dv1, sorted_dv2, unlabeled_dv1, unlabeled_dv2= synchro_views(labeled_viewone_textual, labeled_viewtwo_citation,
                                                                                unlabeled_viewone_textual, unlabeled_viewtwo_citation)
            print(sorted_dv1.shape)
            print(sorted_dv2.shape)
            print(unlabeled_dv1.shape)
            print(unlabeled_dv2.shape)
            unlabeled_count.append(unlabeled_dv1.shape[0])
            selected_labeled_count.append(sorted_dv1.shape[0])
            labeled_data = labeled_data[labeled_data.paperID.isin(sorted_dv1[0])]
            # ---------------------- alignment -------------------------------------------------#
            # shuffle the data
            labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
            # Attach the features to the shuffled labels. A left join keeps the row
            # order of labeled_data, while an outer join is documented to sort the
            # keys, which would put the features in a different order than the labels.
            sorted_dv1 = pd.merge(labeled_data, sorted_dv1, left_on="paperID", right_on = [0], how = "left")
            sorted_dv2 = pd.merge(labeled_data, sorted_dv2, left_on="paperID", right_on = [0], how = "left")
            sorted_dv1 = sorted_dv1.drop([0], axis=1)
            sorted_dv2 = sorted_dv2.drop([0], axis=1)
            # both views must describe the same paper on every row
            assert sorted_dv1["paperID"].tolist() == sorted_dv2["paperID"].tolist(), "views are not row-aligned"
            # (A single stratified 90/10 train/test split was used here during
            # development; it has been replaced by the 10-fold evaluation below.)
            # ---------------------- 10 fold cv ------------------------------------------------- #
            co_logistic_clf = Co_training_clf(clf1=LogisticRegression(),p=1,n=1)
            # take the labels from the merged frame so they are guaranteed to follow its row order
            co_lr_accuracy, co_lr_f1 = k_fold_cv_co_train_ovr(sorted_dv1, sorted_dv2, unlabeled_dv1, unlabeled_dv2,
                                                              sorted_dv1["authorID"], co_logistic_clf, 10)
            print("lr macro f1: ",co_lr_f1)
            all_co_LR_accuracy.append(co_lr_accuracy)
            all_co_LR_f1.append(co_lr_f1)

        # write evaluation result to excel
        output = pd.DataFrame({'Name Group':allname,"Class number":num_class, "Per class size":per_class_count, 
                               "Total labeled samples":all_labeled_count, "Total unlabeled samples":unlabeled_count, 
                               "selected labeled samples": selected_labeled_count, 
                               "co-train with lr accuracy":all_co_LR_accuracy, "co-train with lr f1": all_co_LR_f1})

        savePath = "../../result/"+Dataset+"/co_train_advanced/"
        filename = "(Global emb sample 3m) viewone_textual="+select_emb+"_viewtwo_citation="+pp_citation+"_threshold="+str(step_threshold)+"_namegroupcount="+str(total_selected_group)+".csv"
        com_func.write_csv_df(savePath, filename, output)
        print("Done")

        threshold_change_all_co_lr_f1s.append(all_co_LR_f1)

    co_lr_diff_embedding_result.append(threshold_change_all_co_lr_f1s)

Load citation embedding:  n2v


In [1]:
# Scratch inspection of the experiment results. Both names are created inside the
# experiment cell, so this only works after that cell has run in the same kernel
# session; the recorded NameError comes from running it first (prompt In [1]).
# all_co_LR_accuracy is the list of per-name-group accuracies of the last threshold.
print(all_co_LR_accuracy)
# co_lr_f1 is the macro F1 of the last name group only, not the full list.
print(co_lr_f1)

NameError: name 'all_co_LR_accuracy' is not defined

In [None]:
# ----------- inspect the collected results before plotting -------------- #
# All names below are set by the experiment cell, which must have run first.
print(pp_textual)
print(allname)
# 3d, d1 diff emb, d2 diff threshold, d3 result for different author
print(co_lr_diff_embedding_result)
# per-name-group F1 scores of the last threshold that was evaluated
print(all_co_LR_f1)

In [None]:
# ----------- plot f1 score w.r.t each name group on different embedding -------------- #
# -------------- extract result for plot --------------------- #
# co_lr_diff_embedding_result[i] holds the results of pp_textual[i], one list per
# threshold. Take the first threshold of every embedding that was actually run.
# The fixed lsa / pv_dm / pv_dbow indices raised IndexError whenever pp_textual
# listed fewer than three embeddings.
colr_per_author = [per_threshold_f1s[0] for per_threshold_f1s in co_lr_diff_embedding_result]


In [None]:
# ----------- plot f1 score w.r.t each name group on different embedding -------------- #
%matplotlib inline
import matplotlib.pyplot as plt
#--------------   logistic regression --------------------------#
# process result into np array: one row of per-name-group F1 scores per embedding
co_logistic_regression_result = np.array(colr_per_author)
name_group = np.array(allname)
fig, ax = plt.subplots()
for emb_type, result in zip(pp_textual, co_logistic_regression_result):
    # legend names both views; the citation view comes from the configuration
    ax.plot(result, label="(Viewone: "+emb_type+" Viewtwo: "+pp_citation+")")
# the x axis is the same for every curve, so label it once
ax.set_xticks(range(len(name_group)))
ax.set_xticklabels(name_group, rotation=45, horizontalalignment='right')
ax.autoscale_view()
ax.legend()
ax.set_title('F1 for different embedding method for co-trained logistic regression')
ax.set_xlabel('Name group')
ax.set_ylabel('macro f1 score')
# fig.savefig('diff_combined_embedding_sample=3m_clf=co_train_logistic_threshold=100.eps', format='eps', dpi=300)

In [None]:
# Number of entries found in the canopies directory (listfiles is set by the experiment cell).
print(len(listfiles))

In [None]:
# Scratch cell: clears the kernel namespace; every cell above must be re-run afterwards.
%reset

In [None]:
# Scratch cell: lists the variables currently defined in the kernel namespace.
%whos