In [None]:
import os
import sys
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import com_func

def dummy(doc):
    """Return ``doc`` unchanged.

    This identity function is handed to the scikit-learn vectorizers in this
    notebook as both ``tokenizer`` and ``preprocessor``, so the vectorizers
    use their input exactly as it is given to them.
    """
    return doc

# parameters
# authors with fewer than `threshold` papers in a name group are dropped
# before classification
threshold = 30
# passed as min_df to the TF-IDF and count vectorizers
cutoff = 3

# embedding type handed to raw_text_to_vector as emb_type
pp_textual = "tf_idf"

# dataset folder name used to build the data and result paths
Dataset = "pubmed"

In [None]:
import pandas as pd
# load text information
# Reads the tab-separated text dump and builds one record per paper holding the
# individual fields plus the field combinations used as features further down.
Dataset = "pubmed"
raw_filepath = "../../Data"+"/"+Dataset+"/id_textual_combined.txt"
text_columns = ["paperID", "title", "keywords_mesh", "abstract", "title_abstract",
                "title_key_mesh", "abstract_key_mesh", "combine_textual"]
all_text_content = []
# lines without all five fields are skipped and counted instead of aborting the load
skipped_lines = 0
with open(raw_filepath, 'r', encoding = 'utf8') as f:
    # items[0] is paper ID, items[1] is title, items[2] is keywords,
    # items[3] is mesh, items[4] is abstract
    for line in f:
        # drop the line terminator so it does not end up inside the abstract
        items = line.rstrip("\n").split("\t")
        if len(items) < 5:
            skipped_lines += 1
            continue
        # lower case all character
        paperID = items[0]
        title = items[1].lower()
        keywords = items[2].lower()
        mesh = items[3].lower()
        abstract = items[4].lower()
        # keyword and mesh
        key_mesh = keywords+" "+mesh
        # title and abstract
        title_abstract = title+" "+abstract
        # title keywords mesh
        title_key_mesh = title+" "+key_mesh
        # abstract keywords mesh
        abstract_key_mesh = abstract+" "+key_mesh
        # all feature combined
        content = title+" "+keywords+" "+mesh+" "+abstract
        paper_text_content = {"paperID": paperID, "title":title, "keywords_mesh":key_mesh, "abstract": abstract,
                              "title_abstract":title_abstract,"title_key_mesh":title_key_mesh,
                              "abstract_key_mesh":abstract_key_mesh, "combine_textual":content}
        all_text_content.append(paper_text_content)
print("Total ", len(all_text_content), " paper have text information")
if skipped_lines:
    print("Skipped ", skipped_lines, " line(s) with fewer than 5 fields")
# convert to dataframe so it's easy to process; naming the columns keeps the
# schema (and the later merge on paperID) intact even when nothing was loaded
all_text_content = pd.DataFrame(all_text_content, columns=text_columns)

In [None]:
def read_labeled_file(infile):
    """Read one tab-separated labeled file into a DataFrame.

    Every line is split on tabs. Only lines with exactly 13 fields are kept;
    anything else is treated as badly formatted and dropped.

    Parameters
    ----------
    infile : str
        Path of the labeled file; it is read as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One row per kept line with the columns ``paperID`` (field 0),
        ``authorID`` (field 1), ``co-author`` (field 5) and ``venue_id``
        (field 7). The columns exist even when no line qualifies, so callers
        can always select them.
    """
    columns = ["paperID", "authorID", "co-author", "venue_id"]
    records = []
    # the with-block closes the file on exit, so no explicit close() is needed
    with open(infile, 'r', encoding='utf8') as f:
        for line in f:
            fields = line.split("\t")
            # get rid of badly formatted lines
            if len(fields) == 13:
                records.append({"paperID": fields[0], "authorID": fields[1],
                                "co-author": fields[5], "venue_id": fields[7]})
    return pd.DataFrame(records, columns=columns)

In [None]:
# Textual representation of the documents.
# Raw text is converted into numerical feature vectors; the bag-of-words
# variants below are used with the uni-gram setting.
def raw_text_to_vector(raw_textual_content, emb_type="off", stopword=True):
    """Clean a batch of raw texts and embed them with the selected scheme.

    Parameters
    ----------
    raw_textual_content
        Batch of raw texts, passed straight to ``com_func.clean_batch_of_raw``.
    emb_type : str
        One of ``"tf_idf"``, ``"tf"``, ``"lsa"`` or ``"off"``. Any other value
        prints a notice and is then handled as ``"off"``.
    stopword : bool
        Forwarded to ``com_func.clean_batch_of_raw``.

    Returns
    -------
    tuple
        ``(result_vector, average_sample_size)``. ``result_vector`` is a dense
        array for ``"tf_idf"`` and ``"tf"``, whatever ``LSA`` returns for
        ``"lsa"``, and an empty ``pandas.DataFrame`` for ``"off"``.
        ``average_sample_size`` is the mean of the per-sample sizes returned
        by the cleaning step.
    """
    cleaned_token, sample_size = com_func.clean_batch_of_raw(raw_textual_content, stopword=stopword)
    average_sample_size = sum(sample_size) / len(sample_size)
    print("Minimal sample size: ", min(sample_size))
    print("maximal sample size: ", max(sample_size))

    # anything that is not a known embedding name falls back to "off"
    if emb_type not in ("tf_idf", "tf", "lsa", "off"):
        print("Embedding type not available, selecting default setting")
        emb_type = "off"

    if emb_type == "tf_idf":
        # TF-IDF weights; input is used as given (dummy tokenizer/preprocessor)
        # and terms below the module-level `cutoff` document frequency are dropped
        vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True,
                                     tokenizer=dummy, preprocessor=dummy,
                                     stop_words=None, min_df=cutoff)
        return vectorizer.fit_transform(cleaned_token).toarray(), average_sample_size

    if emb_type == "tf":
        # document-term frequency matrix
        vectorizer = CountVectorizer(tokenizer=dummy, preprocessor=dummy, min_df=cutoff)
        return vectorizer.fit_transform(cleaned_token).toarray(), average_sample_size

    if emb_type == "lsa":
        # NOTE(review): `LSA` is not defined in the visible part of this
        # notebook - confirm it is provided elsewhere before using this option
        return LSA(cleaned_token, dim=100), average_sample_size

    # emb_type == "off": no textual embedding
    return pd.DataFrame(), average_sample_size

In [None]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score
# cross validation
def k_fold_cv(data, label, clf, k=10):
    """Run unshuffled k-fold cross validation and score the pooled predictions.

    The classifier is re-fitted on every training split; the predictions of
    all test splits are collected and scored together.

    Parameters
    ----------
    data
        Feature matrix. It must support row selection with an integer index
        array (``data[train_index]``), e.g. a NumPy array.
    label
        Class label per row of ``data`` (pandas Series, NumPy array or list).
    clf
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    k : int
        Number of folds.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1, TP, TN, FP, FN)``. The four counts are taken
        per class from the confusion matrix of the pooled predictions and
        then summed over all classes.
    """
    # KFold yields positional indices. On a pandas Series with an integer
    # index, ``label[train_index]`` looks those integers up as index labels,
    # which is only correct when the index happens to be 0..n-1. A plain array
    # makes the positions mean the same thing for data and labels.
    label_values = np.asarray(label)

    kf = KFold(n_splits=k, shuffle=False)
    allTrueLabel = []
    allPredLabel = []
    for train_index, test_index in kf.split(data):
        # split train and test
        data_train, data_test = data[train_index], data[test_index]
        label_train, label_test = label_values[train_index], label_values[test_index]
        # fit on the training fold, predict the held-out fold
        clf.fit(data_train, label_train)
        label_pred = clf.predict(data_test)
        allTrueLabel.extend(label_test)
        allPredLabel.extend(label_pred)

    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel, average='macro')

    # per-class counts from the confusion matrix (rows: true, columns: predicted)
    cnf_matrix = confusion_matrix(allTrueLabel, allPredLabel)
    TP = np.diag(cnf_matrix)
    FP = cnf_matrix.sum(axis=0) - TP
    FN = cnf_matrix.sum(axis=1) - TP
    TN = cnf_matrix.sum() - (FP + FN + TP)

    return accuracy, f1, TP.sum(), TN.sum(), FP.sum(), FN.sum()

In [None]:
def write_csv_df(savePath, filename, df):
    """Save ``df`` as a UTF-8 CSV (no index column) at ``savePath/filename``.

    ``savePath`` is created when it does not exist. When the target file is
    already there the user is asked on stdin what to do:

    * ``y`` - overwrite the existing file,
    * ``n`` - ask for another filename and try again with it,
    * anything else - print a notice and save nothing.
    """
    if not os.path.exists(savePath):
        os.makedirs(savePath)
    target = os.path.normpath(os.path.join(savePath, filename))

    # nothing in the way: write straight away
    if not os.path.isfile(target):
        df.to_csv(target, encoding='utf-8', index=False)
        return

    # a file with that name exists: let the user decide
    answer = input("WARNING: " + target + " already exists! Do you want to overwrite <y/n>? \n ")
    if answer == 'y':
        df.to_csv(target, encoding='utf-8', index=False)
        return
    if answer == 'n':
        replacement = input("Type new filename: \n ")
        write_csv_df(savePath, replacement, df)
        return
    print("Not a valid input. Data is NOT saved!\n")

In [None]:
# load the file
import io
import collections
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# fix random seed for reproducibility
np.random.seed(1)

# directory holding one labeled file per name group
fileDir = "../../Data/"+Dataset+"/canopies_labeled/"
# os.listdir returns entries in arbitrary order; sorting makes the groups be
# processed in the same order on every run, which the fixed seed relies on
listfiles = sorted(os.listdir(fileDir))

# model f1: one pooled F1 value per evaluated feature and classifier
modelMNBf1, modelSVCf1, modelLRf1 = ([] for i in range(3))

# list of features (columns of all_text_content) to evaluate
all_features = ["title_abstract", "combine_textual"]

# For every textual feature: go through all labeled name groups, keep the
# authors with at least `threshold` papers, vectorize the chosen text field and
# evaluate three classifiers with 10-fold cross validation. Per-group scores are
# written to one CSV per feature; the pooled F1 per classifier is appended to
# modelMNBf1 / modelSVCf1 / modelLRf1.
def _pooled_f1(tp, fp, fn):
    """F1 computed from summed counts; NaN when there is nothing to score."""
    denominator = 2*tp + fp + fn
    if denominator == 0:
        # no group passed the threshold, so no prediction was made at all
        return float("nan")
    return 2*tp / denominator


for feature in all_features:
    # collect statistic to output
    # NOTE: these lists are re-created for every feature, so after the loop
    # they only hold the results of the last feature in all_features
    allname, num_class, per_class_count, average_textual_size = ([] for i in range(4))

    all_mnb_accuracy, all_mnb_f1,all_svcLinear_accuracy = ([] for i in range(3))
    all_svcLinear_f1, all_LR_accuracy, all_LR_f1 = ([] for i in range(3))

    # collect overall tp, tn, fp, fn
    mnbTP=mnbTN=mnbFP=mnbFN = 0
    svcTP=svcTN=svcFP=svcFN = 0
    lrTP=lrTN=lrFP=lrFN = 0

    # read all file in labeled group
    for file in listfiles:
        # group name: second and last "_"-separated part of the file name
        name_parts = file.split("_")
        name = name_parts[1]+"_"+name_parts[-1]
        print("For name: ",name)
        # read needed content in labeled file
        labeled_data_part = read_labeled_file(fileDir+file)
        print("total sample size before apply threshold: ",len(labeled_data_part))
        # a file without any usable row cannot be evaluated
        if labeled_data_part.empty:
            print(name, " pass")
            continue
        # count number of paper each author write based on author ID
        paperCounter = collections.Counter(labeled_data_part["authorID"])
        print(paperCounter)
        # drop the authors that have fewer papers than the threshold
        for k in list(paperCounter):
            if paperCounter[k] < threshold:
                del paperCounter[k]
        kept_authors = list(paperCounter.keys())
        print(kept_authors)
        # remove samples that are smaller than threshold; take an explicit copy
        # so the label column added below is written to an independent frame
        labeled_data_part = labeled_data_part[labeled_data_part.authorID.isin(kept_authors)].copy()
        print("Total sample size after apply threshold: ",len(labeled_data_part))
        # if only have one class or no class pass the threshold, not applicable
        if len(paperCounter) < 2:
            print(name, " pass")
            continue

        allname.append(name)
        num_class.append(len(paperCounter))
        per_class_count.append(paperCounter)
        # convert author id to label: position of the author in kept_authors
        author_to_label = {author: idx for idx, author in enumerate(kept_authors)}
        labeled_data_part["label"] = labeled_data_part["authorID"].map(author_to_label)
        # merge title and abstract from all raw data to labeled dataset
        # NOTE(review): with how='left' a paperID that is missing from
        # all_text_content gets NaN text fields - confirm the cleaning step copes
        labeled_data = pd.merge(left=labeled_data_part,right=all_text_content, how='left', left_on='paperID', right_on='paperID')
        # shuffle the data
        labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
        # extract true label and pid
        label = labeled_data["label"]
        pid = labeled_data["paperID"]
        # list of different data field
        part_collection = []
        # select feature wanted to fit to clustering/classification algorithm
        # data part 3, textual information
        data_part_textual, avg_textual_size = raw_text_to_vector(labeled_data[feature], emb_type=pp_textual)
        average_textual_size.append(avg_textual_size)
        print(data_part_textual.shape)
        part_collection.append(data_part_textual)
        # merge different part of data data together by concatenate it all together
        # remove empty emb (when emb set off)
        part_collection = [part for part in part_collection if len(part)!=0]
        if len(part_collection)>1:
            combinedata = np.concatenate(part_collection,axis=1)
        else:
            combinedata = part_collection[0]
        print(combinedata.shape)
        # using converted feature vector to train classifier
        # using Multinomial naive bayes
        clf = MultinomialNB()
        # use 10 fold cv
        mnbaccuracy, mnbmarcof1, tp, tn, fp, fn = k_fold_cv(combinedata, label, clf, k=10)
        mnbTP+=tp
        mnbTN+=tn
        mnbFP+=fp
        mnbFN+=fn
        print("MNB Accuracy: ",mnbaccuracy)
        print("MNB F1: ", mnbmarcof1)
        all_mnb_accuracy.append(mnbaccuracy)
        all_mnb_f1.append(mnbmarcof1)
        # using SVM with linear kernal
        clf = SVC(decision_function_shape='ovr', kernel='linear')
        svcaccuracy, svcmarcof1, tp, tn, fp, fn = k_fold_cv(combinedata, label, clf, k=10)
        svcTP+=tp
        svcTN+=tn
        svcFP+=fp
        svcFN+=fn
        print("svc Accuracy: ",svcaccuracy)
        print("svc F1: ", svcmarcof1)
        all_svcLinear_accuracy.append(svcaccuracy)
        all_svcLinear_f1.append(svcmarcof1)
        # using logistic regression
        # NOTE(review): the multi_class argument is reported as deprecated by
        # recent scikit-learn releases - confirm against the installed version
        clf = LogisticRegression(multi_class='ovr')
        LRaccuracy, LRmarcof1, tp, tn, fp, fn = k_fold_cv(combinedata, label, clf, k=10)
        lrTP+=tp
        lrTN+=tn
        lrFP+=fp
        lrFN+=fn
        print("LR Accuracy: ",LRaccuracy)
        print("LR F1: ", LRmarcof1)
        all_LR_accuracy.append(LRaccuracy)
        all_LR_f1.append(LRmarcof1)
    # print f1 for entire model
    print("mnb: TP: ",mnbTP, "TN: ",mnbTN, "FP: ",mnbFP,"FN: ",mnbFN)
    print("svc: TP: ",svcTP, "TN: ",svcTN, "FP: ",svcFP,"FN: ",svcFN)
    print("lr: TP: ",lrTP, "TN: ",lrTN, "FP: ",lrFP,"FN: ",lrFN)
    # F1 over the counts pooled from all groups of this feature
    mnbF1 = _pooled_f1(mnbTP, mnbFP, mnbFN)
    svcF1 = _pooled_f1(svcTP, svcFP, svcFN)
    lrF1 = _pooled_f1(lrTP, lrFP, lrFN)
    modelMNBf1.append(mnbF1)
    modelSVCf1.append(svcF1)
    modelLRf1.append(lrF1)
    # write evaluation result to csv, one row per evaluated name group
    output = pd.DataFrame({'Name Group':allname,"Class number":num_class,"average term in sample":average_textual_size,
                           "per_class_size":per_class_count,"mnb accuracy":all_mnb_accuracy, "mnb macro f1": all_mnb_f1,
                           "svc(linear) accuracy":all_svcLinear_accuracy, "svc(linear) macro f1": all_svcLinear_f1,
                           "logistic regression accuracy":all_LR_accuracy, "logistic regression macro f1": all_LR_f1})

    savePath = "../../result/"+Dataset+"/skovr/"+feature+"/"
    filename = "feature="+feature+"_textual="+pp_textual+"_threshold="+str(threshold)+".csv"
    write_csv_df(savePath, filename, output)
    print(feature, " Done")

In [None]:
# pooled F1 per classifier, listed in the same order as all_features
print(all_features)
for tag, pooled_scores in (("mnb: ", modelMNBf1), ("svc: ", modelSVCf1), ("lr: ", modelLRf1)):
    print(tag, pooled_scores)

In [9]:
# accuracy: keep the float entries of each per-group list, then print how many
# there are and their mean (MNB, linear SVC, logistic regression)
from statistics import mean
cleaned_mnb_accuracy = list(filter(lambda score: isinstance(score, float), all_mnb_accuracy))
cleaned_svcLinear_accuracy = list(filter(lambda score: isinstance(score, float), all_svcLinear_accuracy))
cleaned_lr_accuracy = list(filter(lambda score: isinstance(score, float), all_LR_accuracy))
accuracy_columns = (cleaned_mnb_accuracy, cleaned_svcLinear_accuracy, cleaned_lr_accuracy)
# counts first, then the means, one value per line
for column in accuracy_columns:
    print(len(column))
for column in accuracy_columns:
    print(mean(column))

289
289
289
0.9778910513144647
0.9572108961110675
0.977951254354412


In [10]:
# f1: same summary as for accuracy, on the per-group macro F1 lists
from statistics import mean
# remove string from result (only float entries are kept)
cleaned_mnb_f1 = [score for score in all_mnb_f1 if isinstance(score, float)]
cleaned_svcLinear_f1 = [score for score in all_svcLinear_f1 if isinstance(score, float)]
cleaned_lr_f1 = [score for score in all_LR_f1 if isinstance(score, float)]
f1_columns = [cleaned_mnb_f1, cleaned_svcLinear_f1, cleaned_lr_f1]
# number of kept values per classifier
for kept in f1_columns:
    print(len(kept))
# mean macro F1 per classifier
for kept in f1_columns:
    print(mean(kept))

289
289
289
0.9747506103432456
0.9517279039263197
0.9747488300982716


In [None]:
# scratch check: number of per-group logistic regression F1 values collected
# for the last feature that was evaluated
print(len(all_LR_f1))

In [None]:
# scratch check: float entries of the MNB F1 list (same filter as cleaned_mnb_f1)
test = [x for x in all_mnb_f1 if isinstance(x, float)]


In [None]:
# IPython line magic: list the names currently defined in the interactive namespace
%who