In [7]:
import os
import sys
import warnings

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Put the parent directory on the import path
# (presumably so the `com_func` import that follows resolves - confirm).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)


import com_func

def dummy(doc):
    """Identity helper: hand ``doc`` back exactly as it was received."""
    unchanged = doc
    return unchanged

# parameters
# threshold: authors with fewer labeled papers than this are dropped from a name group
# cutoff: not referenced by the cells visible in this notebook - TODO confirm it is still needed
threshold, cutoff = 30, 3

# textual representations evaluated by the main loop, in this order
pp_textual = [
    "tf",
    "tf_idf",
    "lsa",
]

# dataset folder name used to build every "../../Data/<Dataset>/..." path
Dataset = "pubmed"

In [8]:
import pandas as pd
# load text information
Dataset = "pubmed"
raw_filepath = "/".join(["../../Data", Dataset, "id_textual_combined_labeled.txt"])
all_text_content = []
with open(raw_filepath, 'r', encoding = 'utf8') as f:
    # tab-separated columns: [0] paper ID, [1] title, [2] keywords, [3] mesh, [4] abstract
    for line in f:
        items = line.split("\t")
        # textual information = lower-cased title + keywords + mesh + abstract,
        # joined with single spaces
        content = " ".join(items[column].lower() for column in (1, 2, 3, 4))
        all_text_content.append({"paperID": items[0], "combine_textual": content})
print("Total ", len(all_text_content), " labeled paper have text information")
# convert to dataframe so it's easy to process
all_text_content = pd.DataFrame(all_text_content)

Total  135796  labeled paper have text information


In [28]:
import pickle
# read trained rec to rec textual graph
def read_textual_embedding(Dataset = "pubmed", emb_type = "off"):
    """Load pre-trained doc2vec paper vectors as text records.

    Parameters
    ----------
    Dataset : str
        Dataset folder name under ``../../Data``.
    emb_type : str
        ``"pv_dm"`` or ``"pv_dbow"`` selects the saved Doc2Vec model to load;
        ``"off"`` loads nothing. Any other value prints a notice and is
        treated like ``"off"``.

    Returns
    -------
    list of str
        One ``"<tag> <v1> <v2> ..."`` string per document tag in the model,
        with every component formatted to 8 decimals. Empty when nothing
        is loaded.
    """
    # model file (relative to the doc2vec model directory) per embedding type
    model_files = {
        "pv_dm": "pv_dm/Doc2Vec(dmm,d100,n5,w5,mc3,s0.001,t24)",
        "pv_dbow": "pv_dbow/Doc2Vec(dbow,d100,n5,mc3,s0.001,t24)",
    }
    textual_emb = []
    if emb_type == "off":
        return textual_emb
    if emb_type not in model_files:
        print("Embedding type not available, selecting default setting")
        return textual_emb

    # imported here so the function does not depend on a later cell having
    # imported gensim before it is called
    import gensim

    modelSaveDir = "../../Data/"+Dataset+"/models/doc2v/textual_sample=140k/"
    model = gensim.models.Doc2Vec.load(modelSaveDir+model_files[emb_type])
    for pid in model.docvecs.offset2doctag:
        vectorRepresentation = ' '.join(format(i, '.8f') for i in model.docvecs[pid].tolist())
        textual_emb.append(pid+" "+vectorRepresentation)

    print("Total textual vector records:",len(textual_emb))
    print(textual_emb[:3])
    return textual_emb

In [15]:
# read trained rec to rec node2vec citation graph
def read_citation_embedding(Dataset = "pubmed", emb_type = "off"):
    """Load pre-trained citation vectors from a space-separated text file.

    Parameters
    ----------
    Dataset : str
        Dataset folder name under ``../../Data``.
    emb_type : str
        ``"n2v"`` reads ``vectors/n2v/extracted_labeled_n2v.txt``; ``"off"``
        loads nothing. Any other value prints a notice and is treated like
        ``"off"``.

    Returns
    -------
    list of list of str
        One record per accepted line, split on single spaces. Only lines
        with exactly 101 fields are kept (presumably a paper id followed by
        a 100-dimensional vector - confirm against the file producer).
    """
    citation_emb = []
    if emb_type == "n2v":
        citation_emb_dir = "../../Data/"+Dataset+"/vectors/"+emb_type+"/extracted_labeled_n2v.txt"
        with open(citation_emb_dir, 'r', encoding = 'utf8') as f:
            for line in f:
                # drop the line terminator so the last field is a clean number string
                read_data = line.rstrip("\n").split(" ")
                if len(read_data) == 101:
                    citation_emb.append(read_data)
        print("Total citation vector records:",len(citation_emb))
        print(citation_emb[:3])
    elif emb_type != "off":
        print("Embedding type not available, selecting default setting")
    return citation_emb

In [16]:
def dummy(doc):
    """Return ``doc`` untouched (identity function).

    NOTE(review): presumably kept at notebook level so pickled vectorizers
    that reference it can be loaded - confirm before removing.
    """
    same_doc = doc
    return same_doc
def read_labeled_file(infile):
    """Read one labeled tab-separated file into a DataFrame.

    Lines with 12 or 13 tab-separated fields are kept; for any other line
    its field count is printed and the line is skipped.

    Parameters
    ----------
    infile : str
        Path of the UTF-8 encoded file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``paperID`` (field 0), ``authorID`` (field 1), ``co-author``
        (field 5) and ``venue_id`` (field 7). The columns are present even
        when no line was accepted, so callers can always select them.
    """
    columns = ["paperID", "authorID", "co-author", "venue_id"]
    LabeledRecords_original = []
    with open(infile, 'r', encoding = 'utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # get rid of badly formatted lines
            if len(read_data) in (12, 13):
                LabeledRecords_original.append({
                    "paperID": read_data[0],
                    "authorID": read_data[1],
                    "co-author": read_data[5],
                    "venue_id": read_data[7],
                })
            else:
                print(len(read_data))
    # explicit columns keep the schema stable for an empty result
    return pd.DataFrame(LabeledRecords_original, columns=columns)

In [17]:
def LSA(cleaned_token, dim=100):
    """Project documents into a ``dim``-dimensional LSA space.

    The documents are transformed with the vectorizer stored in
    ``tf_idf_Vectorizer.pickle`` and then reduced with a TruncatedSVD that
    is fitted on these same documents. The summed explained-variance ratio
    is printed.

    Parameters
    ----------
    cleaned_token
        Documents in the form the pickled vectorizer's ``transform`` expects
        (presumably lists of cleaned tokens - confirm against com_func).
    dim : int
        Number of SVD components to keep.

    Returns
    -------
    numpy.ndarray
        Result of ``TruncatedSVD.fit_transform``, one row per document.
    """
    # imported here so the function does not rely on a later cell's import
    from sklearn.decomposition import TruncatedSVD

    # Tf-idf Transformation
    modelSaveDir = "../../Data/"+Dataset+"/models/count/textual_sample=140k/"
    # NOTE(security): pickle.load executes code embedded in the file;
    # only load model files from a trusted location.
    with open(modelSaveDir+'tf_idf_Vectorizer.pickle', "rb") as input_file:
        model = pickle.load(input_file)
    # keep the matrix sparse: TruncatedSVD accepts sparse input directly, so
    # the very wide document-term matrix never has to be densified
    tfidfMatrix = model.transform(cleaned_token)
    # tf-idf + svd
    svd = TruncatedSVD(n_components=dim)
    final_lsa_Matrix = svd.fit_transform(tfidfMatrix)
    print(svd.explained_variance_ratio_.sum())
    return final_lsa_Matrix

In [None]:
def extract_embedding(all_embedding, pid):
    """Pick the embedding rows for ``pid`` and align them with its order.

    Parameters
    ----------
    all_embedding : list
        Embedding records. Each record is either a list
        ``[paper_id, v1, v2, ...]`` or a single space-separated string
        ``"paper_id v1 v2 ..."``; both shapes are accepted.
    pid : pandas.Series
        Wanted paper ids; the Series must be named ``"paperID"``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``pid``, in the same order as ``pid``, holding
        the vector components as floats. Papers without an embedding get an
        all-zero row. An empty DataFrame is returned when ``all_embedding``
        is empty.
    """
    # only if embedding exist
    if len(all_embedding) == 0:
        return pd.DataFrame([])

    # set membership keeps the scan linear in the number of records
    wanted_pid = set(pid.values.tolist())
    extracted_emb = []
    for paper_embedding in all_embedding:
        if isinstance(paper_embedding, str):
            # peek at the id first; split the full record only on a match
            if paper_embedding.split(" ", 1)[0] not in wanted_pid:
                continue
            paper_embedding = paper_embedding.split(" ")
        elif paper_embedding[0] not in wanted_pid:
            continue
        extracted_emb.append([paper_embedding[0]] + [float(v) for v in paper_embedding[1:]])

    if not extracted_emb:
        # no wanted paper has an embedding: return zeros of the right width
        first = all_embedding[0]
        width = len(first.split(" ")) if isinstance(first, str) else len(first)
        return pd.DataFrame(0.0, index=range(len(pid)), columns=range(1, width))

    extracted_emb = pd.DataFrame(extracted_emb)
    # a left join preserves the order of pid; an outer join would sort the
    # rows by key and break the alignment with the caller's labels
    merged = pd.merge(pid.to_frame(), extracted_emb, left_on='paperID', right_on=0, how='left')
    # drop both key columns, then fill papers without an embedding with 0
    return merged.drop(['paperID', 0], axis=1).fillna(0)

In [18]:
import pickle
import gensim
# Document representation based on textual content:
# convert raw text into numerical feature vectors.
# Bag-of-words (BoW) features are used with a uni-gram setting.
def _clean_and_measure(raw_textual_content, stopword):
    """Clean the raw documents and report their size statistics."""
    cleaned_token, sample_size = com_func.clean_batch_of_raw(raw_textual_content, stopword=stopword)
    average_sample_size = sum(sample_size)/len(sample_size)
    print("Minimal sample size: ", min(sample_size))
    print("maximal sample size: ", max(sample_size))
    return cleaned_token, average_sample_size


def _load_count_model(filename):
    """Unpickle a fitted vectorizer from the count-model directory."""
    modelSaveDir = "../../Data/"+Dataset+"/models/count/textual_sample=140k/"
    # NOTE(security): pickle.load executes code embedded in the file;
    # only load model files from a trusted location.
    with open(modelSaveDir+filename, "rb") as input_file:
        return pickle.load(input_file)


def get_textual_embedding(raw_textual_content, pretrained_emb, pid, emb_type="off", stopword=True):
    """Build the textual feature matrix for a batch of documents.

    Parameters
    ----------
    raw_textual_content
        Raw documents; passed to ``com_func.clean_batch_of_raw`` for the
        ``"tf"``, ``"tf_idf"`` and ``"lsa"`` types.
    pretrained_emb : list
        Pre-trained embedding records; used only for ``"pv_dm"``/``"pv_dbow"``.
    pid : pandas.Series
        Paper ids of the documents; used only for ``"pv_dm"``/``"pv_dbow"``.
    emb_type : str
        ``"tf"``, ``"tf_idf"``, ``"lsa"``, ``"pv_dm"``, ``"pv_dbow"`` or
        ``"off"``. Any other value prints a notice and behaves like ``"off"``.
    stopword : bool
        Forwarded to ``com_func.clean_batch_of_raw``.

    Returns
    -------
    tuple
        ``(result_vector, average_sample_size)``. ``result_vector`` is the
        feature matrix (an empty DataFrame for ``"off"``).
        ``average_sample_size`` is the mean of the sizes reported by the
        cleaning step, or ``0`` for the types that do not clean raw text.
    """
    # default for the branches that never clean raw text
    average_sample_size = 0
    if emb_type in ("tf", "tf_idf", "lsa"):
        cleaned_token, average_sample_size = _clean_and_measure(raw_textual_content, stopword)
        if emb_type == "tf":
            # imported here so the function does not rely on a later cell's import
            from sklearn.preprocessing import normalize
            tf_vector = _load_count_model('CountVectorizer.pickle').transform(cleaned_token).toarray()
            print(tf_vector.shape)
            result_vector = normalize(tf_vector)
        elif emb_type == "tf_idf":
            # using tf-idf
            result_vector = _load_count_model('tf_idf_Vectorizer.pickle').transform(cleaned_token).toarray()
        else:
            # use lsa
            result_vector = LSA(cleaned_token, dim=100)
    elif emb_type == "pv_dm" or emb_type == "pv_dbow":
        result_vector = extract_embedding(pretrained_emb, pid)
    else:
        if emb_type != "off":
            print("Embedding type not available, return nothing")
        result_vector = pd.DataFrame()
    return result_vector, average_sample_size

In [19]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score
# cross validation
def k_fold_cv(data, label, clf, k=10):
    """Run unshuffled k-fold cross validation and pool the predictions.

    ``clf`` is re-fitted on each training split. The predictions of all
    test splits are pooled, and the metrics are computed once on the pooled
    labels. A classification report and the flattened confusion matrix are
    printed.

    Parameters
    ----------
    data
        Feature matrix that supports indexing with an integer index array.
    label
        Class labels, one per row of ``data`` (list, array or Series).
    clf
        Estimator exposing ``fit`` and ``predict``.
    k : int
        Number of folds.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1, TP, TN, FP, FN)`` where the four counts are
        the per-class one-vs-rest counts summed over all classes.
    """
    # imported here so the function does not rely on a later cell's import
    import numpy as np

    # positional indexing: a pandas Series would otherwise look the fold
    # indices up by index label, which is only correct for a 0..n-1 index
    label = np.asarray(label)

    kf = KFold(n_splits=k, shuffle=False)
    allTrueLabel = []
    allPredLabel = []
    for train_index, test_index in kf.split(data):
        # fit on the training split, predict the held-out split
        clf.fit(data[train_index], label[train_index])
        allPredLabel.extend(clf.predict(data[test_index]))
        allTrueLabel.extend(label[test_index])

    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel,average='macro')

    # computed once and reused for both the printout and the counts
    cnf_matrix = metrics.confusion_matrix(allTrueLabel, allPredLabel)

    print(metrics.classification_report(allTrueLabel, allPredLabel))
    print(cnf_matrix.ravel())

    # accumulate statistic for entire model f1
    TP = np.diag(cnf_matrix)
    FP = cnf_matrix.sum(axis=0) - TP
    FN = cnf_matrix.sum(axis=1) - TP
    TN = cnf_matrix.sum() - (FP + FN + TP)

    return accuracy, f1, TP.sum(), TN.sum(), FP.sum(), FN.sum()

In [20]:
def write_csv_df(savePath, filename, df):
    """Save ``df`` as CSV under ``savePath``, asking before overwriting.

    The directory is created when missing. If the target file already
    exists the user is prompted: ``y`` overwrites, ``n`` asks for another
    filename and tries again, anything else leaves the data unsaved.
    """
    # make sure the target directory exists
    if not os.path.exists(savePath):
        os.makedirs(savePath)
    pathfile = os.path.normpath(os.path.join(savePath,filename))

    # no name clash: write straight away
    if not os.path.isfile(pathfile):
        df.to_csv(pathfile, encoding='utf-8',index=False)
        return

    # name clash: let the user decide
    overwrite = input("WARNING: " + pathfile + " already exists! Do you want to overwrite <y/n>? \n ")
    if overwrite == 'y':
        df.to_csv(pathfile, encoding='utf-8',index=False)
        return
    if overwrite == 'n':
        new_filename = input("Type new filename: \n ")
        write_csv_df(savePath,new_filename,df)
        return
    print("Not a valid input. Data is NOT saved!\n")

In [27]:
# read pretrained embeddings
# (both readers print the record count and the first three records)
all_textual_embedding = read_textual_embedding("pubmed", "pv_dbow")
all_citation_embedding = read_citation_embedding("pubmed", "n2v")

Total textual vector records: 135796
['8077 -0.42665145 -0.20274664 0.08016871 0.28375724 0.03493189 -0.22356583 -0.37297058 0.08550507 -0.84635490 -0.53577036 -0.18753140 -0.15832068 0.03342307 0.19853322 0.21486974 -0.88039148 0.07680665 -0.45549583 0.25181597 0.68849790 -0.52579153 -0.15735805 0.69318497 -0.31203189 0.26918790 -0.69177866 0.31827661 -1.24258828 -0.03450382 -0.43042749 0.15029581 0.36190116 -0.03027276 -0.47853798 0.48261651 0.27281243 -0.00015305 -0.02307266 0.28314903 0.43221837 -0.08020838 0.63495243 -0.51042092 1.03504837 0.15637830 0.16105181 -0.16248947 0.57620406 0.12860727 0.20960683 0.15471290 0.27213791 -0.38468286 0.67985624 -0.59635264 -0.00133450 0.07793075 0.17300151 0.07857662 0.66823900 -0.27214140 -0.08212417 -0.43762782 -0.68239814 -0.06299266 0.59357041 0.05746553 -0.02640015 -0.13266836 0.39245602 -0.12700117 -0.46807149 0.15950061 -0.13681552 -0.06565703 -0.13609785 0.44973734 0.06173952 -0.39990264 0.62084496 -0.76443452 -0.12911695 0.70948893 0

In [16]:
# load the file
import io
import collections
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# fix random seed for reproducibility
np.random.seed(1)

fileDir = "../../Data/"+Dataset+"/canopies_labeled/"
listfiles = os.listdir(fileDir)

# entire model f1: one pooled F1 per textual representation and classifier
modelSVCf1, modelLRf1 = ([] for i in range(2))

for emb in pp_textual:
    # collect statistic to output
    allname, num_class, per_class_count, average_textual_size = ([] for i in range(4))

    all_svcLinear_accuracy, all_svcLinear_f1, all_LR_accuracy, all_LR_f1 = ([] for i in range(4))

    # collect overall tp, tn, fp, fn
    svcTP=svcTN=svcFP=svcFN = 0
    lrTP=lrTN=lrFP=lrFN = 0
    # read all file in labeled group
    for file in listfiles:
        # group name is built from the 2nd and the last "_"-separated part of the file name
        name_parts = file.split("_")
        name = name_parts[1]+"_"+name_parts[-1]
        print("For name: ",name)
        # read needed content in labeled file
        labeled_data_part = read_labeled_file(fileDir+file)
        print("total sample size before apply threshold: ",len(labeled_data_part))
        # count number of paper each author write based on author ID
        paperCounter = collections.Counter(labeled_data_part["authorID"])
        print(paperCounter)
        # drop authors with fewer papers than the threshold
        for k in list(paperCounter):
            if paperCounter[k] < threshold:
                del paperCounter[k]
        temp = list(paperCounter.keys())
        print(temp)
        # remove samples that are smaller than threshold
        # (.copy() so the label column below is added to an independent frame)
        labeled_data_part = labeled_data_part[labeled_data_part.authorID.isin(temp)].copy()
        print("Total sample size after apply threshold: ",len(labeled_data_part))
        # if only have one class or no class pass the threshold, not applicable
        if len(paperCounter) < 2:
            print(name, " pass")
        else:
            allname.append(name)
            num_class.append(len(paperCounter))
            per_class_count.append(paperCounter)
            # convert author id to label: position of the author in the kept-author list
            label_of_author = {author: idx for idx, author in enumerate(temp)}
            labeled_data_part["label"] = labeled_data_part["authorID"].map(label_of_author)
            # merge the combined textual content into the labeled dataset
            labeled_data = pd.merge(left=labeled_data_part,right=all_text_content, how='left', left_on='paperID', right_on='paperID')
            # shuffle the data
            labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
            # extract true label and pid
            label = labeled_data["label"]
            pid = labeled_data["paperID"]
            # list of different data field
            part_collection = []
            # select feature wanted to fit to clustering/classification algorithm
            # data part, textual information
            # (calls get_textual_embedding, the vectorizer defined in this notebook;
            #  the name previously used here is not defined anywhere in it)
            data_part_textual, avg_textual_size = get_textual_embedding(
                labeled_data['combine_textual'], all_textual_embedding, pid, emb_type=emb)
            average_textual_size.append(avg_textual_size)
            print(data_part_textual.shape)
            part_collection.append(data_part_textual)
            # merge different part of data data together by concatenate it all together
            # remove empty emb (when emb set off)
            part_collection = [part for part in part_collection if len(part)!=0]
            print(len(part_collection))
            if len(part_collection)>1:
                combinedata = np.concatenate(part_collection,axis=1)
            elif len(part_collection)==1:
                if isinstance(part_collection[0], pd.DataFrame):
                    combinedata = part_collection[0].values
                else:
                    combinedata = part_collection[0]
            else:
                print("No data available")
                break
            print(len(combinedata))
            # using converted feature vector to train classifier
            # using SVM with linear kernel
            clf = SVC(decision_function_shape='ovr', kernel='linear')
            svcaccuracy, svcmarcof1, tp, tn, fp, fn = k_fold_cv(combinedata, label, clf, k=10)
            svcTP+=tp
            svcTN+=tn
            svcFP+=fp
            svcFN+=fn
            print("svc Accuracy: ",svcaccuracy)
            print("svc F1: ", svcmarcof1)
            all_svcLinear_accuracy.append(svcaccuracy)
            all_svcLinear_f1.append(svcmarcof1)
            # using logistic regression
            clf = LogisticRegression(multi_class='ovr')
            LRaccuracy, LRmarcof1, tp, tn, fp, fn = k_fold_cv(combinedata, label, clf, k=10)
            lrTP+=tp
            lrTN+=tn
            lrFP+=fp
            lrFN+=fn
            print("LR Accuracy: ",LRaccuracy)
            print("LR F1: ", LRmarcof1)
            all_LR_accuracy.append(LRaccuracy)
            all_LR_f1.append(LRmarcof1)
        # NOTE(review): stops after the first file - looks like a debugging
        # limiter; remove to evaluate every name group
        break
    # print f1 for entire model
    print("svc: TP: ",svcTP, "TN: ",svcTN, "FP: ",svcFP,"FN: ",svcFN)
    print("lr: TP: ",lrTP, "TN: ",lrTN, "FP: ",lrFP,"FN: ",lrFN)
    # pooled F1 = 2TP / (2TP + FP + FN); 0.0 when no group was evaluated,
    # which would otherwise divide by zero
    svcDenominator = 2*svcTP + svcFP + svcFN
    lrDenominator = 2*lrTP + lrFP + lrFN
    svcF1 = 2*svcTP / svcDenominator if svcDenominator else 0.0
    lrF1 = 2*lrTP / lrDenominator if lrDenominator else 0.0
    modelSVCf1.append(svcF1)
    modelLRf1.append(lrF1)
    # NOTE(review): stops after the first textual representation - looks like
    # a debugging limiter; remove to evaluate every entry of pp_textual
    break
#     # write evaluation result to excel
#     output = pd.DataFrame({'Name Group':allname,"Class number":num_class,"per_class_size":per_class_count, 
#                            "svc(linear) accuracy":all_svcLinear_accuracy, "svc(linear) macro f1": all_svcLinear_f1, 
#                            "logistic regression accuracy":all_LR_accuracy, "logistic regression macro f1": all_LR_f1})

#     savePath = "../../result/"+Dataset+"/skovr/"
#     filename = "textual="+emb+"_threshold="+str(threshold)+".csv"
#     write_csv_df(savePath, filename, output)
#     print("Done")

For name:  j_read
total sample size before apply threshold:  136
Counter({'0000-0002-5159-1192': 57, '0000-0002-9029-5185': 39, '0000-0002-9697-0962': 31, '0000-0002-4739-9245': 3, '0000-0003-0605-5259': 3, '0000-0003-4316-7006': 1, '0000-0002-0784-0091': 1, '0000-0002-3888-6631': 1})
['0000-0002-9697-0962', '0000-0002-9029-5185', '0000-0002-5159-1192']
Total sample size after apply threshold:  127
Minimal sample size:  6
maximal sample size:  248




(127, 331350)
(127, 331350)
1
127
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        31
           1       1.00      0.95      0.97        39
           2       0.93      1.00      0.97        57

   micro avg       0.97      0.97      0.97       127
   macro avg       0.98      0.96      0.97       127
weighted avg       0.97      0.97      0.97       127

[29  0  2  0 37  2  0  0 57]
svc Accuracy:  0.968503937007874
svc F1:  0.9688175240360789
              precision    recall  f1-score   support

           0       1.00      0.74      0.85        31
           1       0.97      0.92      0.95        39
           2       0.85      1.00      0.92        57

   micro avg       0.91      0.91      0.91       127
   macro avg       0.94      0.89      0.91       127
weighted avg       0.92      0.91      0.91       127

[23  1  7  0 36  3  0  0 57]
LR Accuracy:  0.9133858267716536
LR F1:  0.906191703871387
svc: TP:  123 TN:  250 FP:

In [32]:
# pooled F1 per textual representation, listed in the order of pp_textual
print(pp_textual)
for classifier_tag, pooled_f1 in (("svc: ", modelSVCf1), ("lr: ", modelLRf1)):
    print(classifier_tag, pooled_f1)

['tf', 'tf_idf', 'lsa']
svc:  [0.9464991405128569, 0.9731343812578639, 0.9706356660582325]
lr:  [0.9699976962200287, 0.9314359637774903, 0.9415017101135941]


In [19]:
# accuracy
from statistics import mean 
# keep only float entries, then report how many there are and their mean
cleaned_svcLinear_accuracy = list(filter(lambda score: isinstance(score, float), all_svcLinear_accuracy))
cleaned_lr_accuracy = list(filter(lambda score: isinstance(score, float), all_LR_accuracy))
print(len(cleaned_svcLinear_accuracy), len(cleaned_lr_accuracy), sep="\n")
print(mean(cleaned_svcLinear_accuracy))
print(mean(cleaned_lr_accuracy))

578
578
0.9798807689357634
0.9496564529342066


In [20]:
# f1
from statistics import mean 
# remove non-float entries (e.g. strings) from the result
cleaned_svcLinear_f1 = list(filter(lambda score: isinstance(score, float), all_svcLinear_f1))
cleaned_lr_f1 = list(filter(lambda score: isinstance(score, float), all_LR_f1))
# number of kept entries per classifier, then the mean macro F1 of each
print(len(cleaned_svcLinear_f1), len(cleaned_lr_f1), sep="\n")
print(mean(cleaned_svcLinear_f1))
print(mean(cleaned_lr_f1))

578
578
0.9764067884953497
0.9288583090243283


In [None]:
# IPython line magic: clears the names defined in the interactive namespace.
%reset

In [None]:
# IPython line magic: lists the names currently in the interactive namespace.
%who