In [1]:
import os
import sys
import warnings

# Suppress FutureWarning messages (library deprecation notices) in cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Minimum number of papers an author needs in a name group to be kept as a class;
# authors with fewer papers are removed before training.
threshold = 100
# NOTE(review): `cutoff` is not referenced by any cell visible here — confirm it is still needed.
cutoff = 3
# Dataset folder name under ../../Data/ used to locate the labeled canopy files.
Dataset = "pubmed"

# Textual embedding handed to read_textual_embedding: "pv_dm", "pv_dbow" or "off".
pp_textual = "pv_dbow"
# Citation embedding handed to read_citation_embedding: "n2v" or "off".
citation_emb = "n2v"


In [2]:
# read trained rec to rec textual graph
def read_textual_embedding(Dataset = "pubmed", emb_type = "off"):
    """Load the pre-trained textual (doc2vec) vectors of the labeled papers.

    Parameters
    ----------
    Dataset : str
        Dataset folder name under ``../../Data/``.
    emb_type : str
        ``"pv_dm"`` or ``"pv_dbow"`` selects which vector file is read.
        ``"off"`` disables the embedding. Any other value prints a notice and
        is treated like ``"off"``.

    Returns
    -------
    list of list of str
        One entry per non-blank line of the vector file: the line split on
        single spaces with the line terminator removed. Empty when the
        embedding is off.

    Raises
    ------
    OSError
        If the vector file cannot be opened.
    """
    if emb_type not in ("pv_dm", "pv_dbow"):
        if emb_type != "off":
            print("Embedding type not available, selecting default setting")
        return []

    # Both supported types live in the same folder and differ only in the file suffix.
    loadDir = ("../../Data/" + Dataset
               + "/vectors/d2v/textual_sample=3m/extracted_labeled_" + emb_type + ".txt")

    textual_emb = []
    with open(loadDir, 'r', encoding = 'utf8') as f:
        for line in f:
            # Drop the line terminator so the last component is not stored as "0.123\n".
            line = line.rstrip("\r\n")
            if not line:
                continue
            textual_emb.append(line.split(" "))

    print("Total textual vector records:",len(textual_emb))
    # Guard the preview: an empty file would otherwise raise IndexError.
    if textual_emb:
        print(textual_emb[0])
    return textual_emb

In [3]:
# read trained rec to rec node2vec citation graph
def read_citation_embedding(Dataset = "pubmed", emb_type = "off", expected_fields = 101):
    """Load the pre-trained node2vec citation vectors of the labeled papers.

    Parameters
    ----------
    Dataset : str
        Dataset folder name under ``../../Data/``.
    emb_type : str
        ``"n2v"`` reads the node2vec file; ``"off"`` disables the embedding.
        Any other value prints a notice and is treated like ``"off"``.
    expected_fields : int
        Number of space-separated fields a line must have to be kept
        (default 101). Lines with any other field count are skipped silently.

    Returns
    -------
    list of list of str
        One entry per accepted line, split on single spaces with the line
        terminator removed. Empty when the embedding is off.

    Raises
    ------
    OSError
        If the vector file cannot be opened.
    """
    if emb_type != "n2v":
        if emb_type != "off":
            print("Embedding type not available, selecting default setting")
        return []

    citation_emb_dir = "../../Data/"+Dataset+"/vectors/"+emb_type+"/extracted_labeled_n2v.txt"
    citation_emb = []
    with open(citation_emb_dir, 'r', encoding = 'utf8') as f:
        for line in f:
            # Removing the terminator does not change the field count; it only
            # keeps "\n" out of the last component.
            read_data = line.rstrip("\r\n").split(" ")
            if len(read_data) == expected_fields:
                citation_emb.append(read_data)

    print("Total citation vector records:",len(citation_emb))
    print(citation_emb[:3])
    return citation_emb

In [4]:
def read_labeled_file(infile):
    """Read one tab-separated labeled file into a DataFrame.

    Only lines with 12 or 13 tab-separated fields are kept; for any other
    line the field count is printed and the line is discarded.

    Parameters
    ----------
    infile : str
        Path of the labeled file (UTF-8 text).

    Returns
    -------
    pandas.DataFrame
        Columns ``paperID`` (field 0), ``authorID`` (field 1), ``co-author``
        (field 5) and ``venue_id`` (field 7), all as strings. The columns are
        present even when no line was accepted, so callers can always index
        them.
    """
    columns = ["paperID", "authorID", "co-author", "venue_id"]
    LabeledRecords_original = []
    with open(infile, 'r', encoding = 'utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # get rid of badly formatted lines
            if len(read_data) in (12, 13):
                LabeledRecords_original.append({"paperID": read_data[0],
                                                "authorID": read_data[1],
                                                "co-author": read_data[5],
                                                "venue_id": read_data[7]})
            else:
                print(len(read_data))
    # Explicit columns: without them an empty record list yields a frame with
    # no columns and a later labeled_data["authorID"] raises KeyError.
    return pd.DataFrame(LabeledRecords_original, columns=columns)

In [5]:
def extract_embedding(all_embedding, pid):
    """Build the feature rows for the given papers, in the order of ``pid``.

    Parameters
    ----------
    all_embedding : list of list
        Embedding records whose first element is the paper id and whose
        remaining elements are the vector components. An empty list means the
        embedding is switched off.
    pid : pandas.Series
        Paper ids of the current name group; the Series must be named
        ``paperID``.

    Returns
    -------
    pandas.DataFrame
        An empty frame when ``all_embedding`` is empty. Otherwise one row per
        entry of ``pid``, in the same order, holding the vector components
        only (id columns removed). Papers without an embedding get a row of
        zeros.
    """
    # Embedding switched off: an empty frame tells the caller to skip this part.
    if len(all_embedding) == 0:
        return pd.DataFrame([])

    # Set membership keeps the filter linear in the number of embeddings.
    wanted_pid = set(pid.values.tolist())
    matched = [paper_embedding for paper_embedding in all_embedding
               if paper_embedding[0] in wanted_pid]

    if not matched:
        # No paper of this group has an embedding: all rows are zero vectors.
        n_components = max(len(all_embedding[0]) - 1, 0)
        return pd.DataFrame(0, index=range(len(pid)), columns=range(1, n_components + 1))

    # Keep one embedding per paper id so the result has exactly one row per pid entry.
    extracted_emb = pd.DataFrame(matched).drop_duplicates(subset=[0])

    # Left join keeps the row order of `pid`, which the caller relies on to keep
    # features aligned with the label Series. An outer join sorts the join keys
    # and would silently break that alignment.
    merged = pd.merge(pid.to_frame(), extracted_emb, left_on='paperID', right_on=0, how='left')
    # Remove both id columns, then fill the vectors of papers without an embedding with 0.
    merged = merged.drop(['paperID', 0], axis=1)
    return merged.fillna(0)

In [6]:
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score

def normal_group_predict(X_train, y_train, X_test, y_test, clf):
    """Fit ``clf`` on the training split and evaluate it on the test split.

    Prints the classification report and the flattened confusion matrix.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Held-out features and labels.
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1, TP, TN, FP, FN)`` where the last four values
        are the one-vs-rest counts summed over all classes.
    """
    clf.fit(X_train, y_train)
    # get predicted label
    label_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, label_pred)
    f1 = f1_score(y_test, label_pred, average='macro')

    # Computed once via the `metrics` module imported in this cell, so the
    # function does not depend on names imported in a later cell.
    cnf_matrix = metrics.confusion_matrix(y_test, label_pred)

    print(metrics.classification_report(y_test, label_pred))
    print(cnf_matrix.ravel())

    # accumulate statistic for entire model f1
    # Rows of the matrix are true classes, columns are predicted classes.
    TP = cnf_matrix.diagonal()
    FP = cnf_matrix.sum(axis=0) - TP
    FN = cnf_matrix.sum(axis=1) - TP
    TN = cnf_matrix.sum() - (FP + FN + TP)
    return accuracy, f1, TP.sum(), TN.sum(), FP.sum(), FN.sum()

In [7]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score
# cross validation
def k_fold_cv(data, label, clf, k=10):
    """Evaluate ``clf`` with unshuffled k-fold cross validation.

    The predictions of all folds are pooled and scored once. Prints the
    classification report and the flattened confusion matrix.

    Parameters
    ----------
    data : array-like
        Feature matrix, one row per sample.
    label : array-like
        Class label of each sample (a pandas Series is accepted).
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    k : int
        Number of folds.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1, TP, TN, FP, FN)`` where the last four values
        are the one-vs-rest counts summed over all classes.
    """
    # KFold yields positional indices. Converting to arrays makes `x[idx]`
    # positional; a pandas Series with a filtered index would otherwise do
    # label-based lookup and pick wrong rows or raise KeyError.
    data = np.asarray(data)
    label = np.asarray(label)

    kf = KFold(n_splits=k, shuffle=False)
    allTrueLabel = []
    allPredLabel = []
    for train_index, test_index in kf.split(data):
        # fit on the training folds, predict the held-out fold
        clf.fit(data[train_index], label[train_index])
        label_pred = clf.predict(data[test_index])
        allTrueLabel.extend(label[test_index])
        allPredLabel.extend(label_pred)

    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel, average='macro')

    # Computed once via the `metrics` module imported in this cell.
    cnf_matrix = metrics.confusion_matrix(allTrueLabel, allPredLabel)

    print(metrics.classification_report(allTrueLabel, allPredLabel))
    print(cnf_matrix.ravel())

    # accumulate statistic for entire model f1
    # Rows of the matrix are true classes, columns are predicted classes.
    TP = cnf_matrix.diagonal()
    FP = cnf_matrix.sum(axis=0) - TP
    FN = cnf_matrix.sum(axis=1) - TP
    TN = cnf_matrix.sum() - (FP + FN + TP)

    return accuracy, f1, TP.sum(), TN.sum(), FP.sum(), FN.sum()

In [8]:
def write_csv_df(savePath, filename, df):
    """Save ``df`` as a UTF-8 CSV (no index column) at ``savePath/filename``.

    The target folder is created when missing. If the target file already
    exists the user is asked interactively whether to overwrite it ('y'),
    to supply another filename ('n'), or — for any other answer — the data
    is not saved.
    """
    if not os.path.exists(savePath):
        os.makedirs(savePath)

    target = os.path.normpath(os.path.join(savePath, filename))

    def _save():
        df.to_csv(target, encoding='utf-8', index=False)

    # Fresh target: write straight away.
    if not os.path.isfile(target):
        _save()
        return

    # Target exists: let the user decide what happens.
    answer = input("WARNING: " + target + " already exists! Do you want to overwrite <y/n>? \n ")
    if answer == 'y':
        _save()
    elif answer == 'n':
        alternative = input("Type new filename: \n ")
        write_csv_df(savePath, alternative, df)
    else:
        print("Not a valid input. Data is NOT saved!\n")

In [9]:
# Read the pretrained embeddings once; the evaluation loop slices them per name group.
# Dataset is passed explicitly so the embeddings come from the same dataset as the
# labeled files instead of silently falling back to the readers' "pubmed" default.
all_textual_embedding = read_textual_embedding(Dataset = Dataset, emb_type = pp_textual)
all_citation_embedding = read_citation_embedding(Dataset = Dataset, emb_type = citation_emb)

FileNotFoundError: [Errno 2] No such file or directory: '../../Data/pubmed/vectors/d2v/extracted_labeled_pv_dbow.txt'

In [10]:
# load the file
import io
import collections
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


def _micro_f1(tp, fp, fn):
    """Return 2*TP / (2*TP + FP + FN), or NaN when no prediction was counted."""
    denominator = 2*tp + fp + fn
    if denominator == 0:
        return float("nan")
    return 2*tp / denominator


fileDir = "../../Data/"+Dataset+"/canopies_labeled/"
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore which group is evaluated first) reproducible.
listfiles = sorted(os.listdir(fileDir))

# fix random seed for reproducibility
np.random.seed(1)

# collect statistic to output
allname, num_class, per_class_count = ([] for i in range(3))

all_svcLinear_accuracy, all_svcLinear_f1, all_LR_accuracy, all_LR_f1 = ([] for i in range(4))

# collect overall tp, tn, fp, fn
svcTP=svcTN=svcFP=svcFN = 0
lrTP=lrTN=lrFP=lrFN = 0

# read all file in labeled group
for file in listfiles:
    # Skip sub-directories and names without an underscore: the group name
    # below is built from the underscore-separated parts of the filename.
    if not os.path.isfile(fileDir+file) or "_" not in file:
        continue
    # group name
    name_parts = file.split("_")
    name = name_parts[1]+"_"+name_parts[-1]
    print("For name: ",name)
    # read needed content in labeled file
    labeled_data = read_labeled_file(fileDir+file)
    print("total sample size before apply threshold: ",len(labeled_data))
    # A file without any usable record has nothing to classify.
    if len(labeled_data) == 0:
        print(name, " pass")
        continue
    # count number of paper each author write based on author ID
    paperCounter = collections.Counter(labeled_data["authorID"])
    print(paperCounter)
    # drop authors with fewer papers than the threshold
    for author in [a for a, count in paperCounter.items() if count < threshold]:
        del paperCounter[author]
    kept_authors = list(paperCounter.keys())
    print(kept_authors)
    # remove samples that are smaller than threshold
    labeled_data = labeled_data[labeled_data.authorID.isin(kept_authors)]
    print("Total sample size after apply threshold: ",len(labeled_data))
    # if only have one class or no class pass the threshold, not applicable
    if(len(paperCounter)<2):
        print(name, " pass")
        continue

    allname.append(name)
    num_class.append(len(paperCounter))
    per_class_count.append(paperCounter)
    # extract true label and pid
    label = labeled_data["authorID"]
    pid = labeled_data["paperID"]
    # list of different data field
    part_collection = []
    # select feature wanted to fit to clustering/classification algorithm
    # data part extract textual embedding
    data_part_textual = extract_embedding(all_textual_embedding, pid)
    print(data_part_textual.shape)
    part_collection.append(data_part_textual)
    # data part read citation embedding
    data_part_citation = extract_embedding(all_citation_embedding, pid)
    print(data_part_citation.shape)
    part_collection.append(data_part_citation)
    # merge different part of data together by concatenating column-wise
    # remove empty emb (when emb set off)
    part_collection = [part for part in part_collection if len(part)!=0]
    print(len(part_collection))
    if len(part_collection)>1:
        combinedata = np.concatenate(part_collection,axis=1)
    elif len(part_collection)==1:
        if isinstance(part_collection[0], pd.DataFrame):
            combinedata = part_collection[0].values
        else:
            combinedata = part_collection[0]
    else:
        # Both embeddings are off, so no group can be evaluated: stop the loop.
        print("No data available")
        break
    print(combinedata.shape)
    # shuffle split train and test
    X_train, X_test, y_train, y_test = train_test_split(combinedata, label,
                                                        stratify=label,
                                                        test_size=0.20)
    print(X_train.shape)
    print(y_train.shape)
    print(X_test.shape)
    print(y_test.shape)
    # using converted feature vector to train classifier
    # using SVM with linear kernel
    clf = SVC(decision_function_shape='ovr', kernel='linear')
    # normal predict
    svcaccuracy, svcmarcof1, tp, tn, fp, fn = normal_group_predict(X_train, y_train, X_test, y_test, clf)
#     # use 10 fold cv
#     svcaccuracy, svcmarcof1, tp, tn, fp, fn = k_fold_cv(combinedata, label, clf, k=10)
    svcTP+=tp
    svcTN+=tn
    svcFP+=fp
    svcFN+=fn
    print(name, " in group svc Accuracy: ",svcaccuracy)
    print(name, " in group svc F1: ", svcmarcof1)
    all_svcLinear_accuracy.append(svcaccuracy)
    all_svcLinear_f1.append(svcmarcof1)
    # using logistic regression
    clf = LogisticRegression(multi_class='ovr')
    # normal predict
    LRaccuracy, LRmarcof1, tp, tn, fp, fn = normal_group_predict(X_train, y_train, X_test, y_test, clf)
#     # 10 fold
#     LRaccuracy, LRmarcof1, tp, tn, fp, fn = k_fold_cv(combinedata, label, clf, k=10)
    lrTP+=tp
    lrTN+=tn
    lrFP+=fp
    lrFN+=fn
    print("LR Accuracy: ",LRaccuracy)
    print("LR F1: ", LRmarcof1)
    all_LR_accuracy.append(LRaccuracy)
    all_LR_f1.append(LRmarcof1)
    # NOTE(review): this break stops after the FIRST name group that passes the
    # threshold, so the pooled statistics below cover a single group. It looks
    # like a debugging leftover — remove it to evaluate every group.
    break

# print f1 for entire model
print("svc: TP: ",svcTP, "TN: ",svcTN, "FP: ",svcFP,"FN: ",svcFN)
print("lr: TP: ",lrTP, "TN: ",lrTN, "FP: ",lrFP,"FN: ",lrFN)
# Guarded: when no group was evaluated all counters are 0 and a plain division
# would raise ZeroDivisionError.
svcF1 = _micro_f1(svcTP, svcFP, svcFN)
lrF1 = _micro_f1(lrTP, lrFP, lrFN)

# # write evaluation result to excel
# output = pd.DataFrame({'Name Group':allname,"Class number":num_class, "per_class_size":per_class_count,
#                        "svc(linear) accuracy":all_svcLinear_accuracy, "svc(linear) macro f1": all_svcLinear_f1, 
#                        "logistic regression accuracy":all_LR_accuracy, "logistic regression macro f1": all_LR_f1})

# savePath = "../../result/"+Dataset+"/skovr/"
# filename = "textual="+pp_textual+"_citation="+citation_emb+"_threshold="+str(threshold)+".csv"
# write_csv_df(savePath, filename, output)
# print("Done")

For name:  j_read
total sample size before apply threshold:  136
Counter({'0000-0002-5159-1192': 57, '0000-0002-9029-5185': 39, '0000-0002-9697-0962': 31, '0000-0002-4739-9245': 3, '0000-0003-0605-5259': 3, '0000-0003-4316-7006': 1, '0000-0002-0784-0091': 1, '0000-0002-3888-6631': 1})
[]
Total sample size after apply threshold:  0
j_read  pass
For name:  f_esteves
total sample size before apply threshold:  34
Counter({'0000-0002-3046-1313': 18, '0000-0002-5403-0091': 12, '0000-0003-0589-0746': 3, '0000-0003-3172-6253': 1})
[]
Total sample size after apply threshold:  0
f_esteves  pass
For name:  c_miller
total sample size before apply threshold:  252
Counter({'0000-0003-4341-1283': 51, '0000-0002-3989-7973': 40, '0000-0002-3813-1706': 39, '0000-0003-2772-9531': 27, '0000-0001-6082-9273': 22, '0000-0002-2601-4422': 22, '0000-0002-9448-8144': 19, '0000-0001-8628-4902': 15, '0000-0002-2936-7717': 6, '0000-0003-3898-9734': 6, '0000-0002-5074-6914': 2, '0000-0003-4266-6700': 1, '0000-0002-9

NameError: name 'all_textual_embedding' is not defined

In [None]:
# Report the embedding configuration together with the F1 values computed
# from the TP/FP/FN counts summed over the evaluated name groups.
print("textual =",pp_textual,"_citation =",citation_emb)
print("svc: ", svcF1)
print("lr:", lrF1)

In [None]:
# accuracy
from statistics import mean 
# keep only float entries of the per-group accuracy lists
cleaned_svcLinear_accuracy = [x for x in all_svcLinear_accuracy if isinstance(x, float)]
cleaned_lr_accuracy = [x for x in all_LR_accuracy if isinstance(x, float)]
print(len(cleaned_svcLinear_accuracy))
print(len(cleaned_lr_accuracy))
# statistics.mean raises StatisticsError on an empty list, which happens when
# no name group passed the threshold; report NaN instead of crashing.
print(mean(cleaned_svcLinear_accuracy) if cleaned_svcLinear_accuracy else float("nan"))
print(mean(cleaned_lr_accuracy) if cleaned_lr_accuracy else float("nan"))

In [None]:
# f1
from statistics import mean 
# remove string from result
# keep only float entries of the per-group macro-F1 lists
cleaned_svcLinear_f1 = [x for x in all_svcLinear_f1 if isinstance(x, float)]
cleaned_lr_f1 = [x for x in all_LR_f1 if isinstance(x, float)]
print(len(cleaned_svcLinear_f1))
print(len(cleaned_lr_f1))
# statistics.mean raises StatisticsError on an empty list, which happens when
# no name group passed the threshold; report NaN instead of crashing.
print(mean(cleaned_svcLinear_f1) if cleaned_svcLinear_f1 else float("nan"))
print(mean(cleaned_lr_f1) if cleaned_lr_f1 else float("nan"))