# Evaluation of feature importance
1. Use only Term Frequency (TF) vectors, evaluating one feature at a time ("title", "keywords_mesh", "abstract", "text")
2. Apply a filter to remove less productive authors

Example: the k-kim name group has 1111 samples from 57 authors.

With a filter of 100:

The k-kim name group now has 504 papers from 3 authors, each of whom wrote more than 100 papers. The other papers are omitted.

This script evaluates the different features using TF vectorization.

In [None]:
import os
import sys
import warnings

# Suppress FutureWarning messages for everything that runs after this point.
warnings.simplefilter('ignore', FutureWarning)

# Put the parent directory on the import path (once) so that modules stored
# there can be imported -- presumably this is where com_func lives.
module_path = os.path.abspath(os.pardir)
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)
    
import com_func

# parameters
#----- filter for selecting set of name group -----------#
# A name group is evaluated only if at least two of its authors have this many
# papers or more; groups with fewer such authors are skipped.
filter_select_name_group = 100
#----- filter for selecting min paper write for authors in name group ----#
# The per-author minimum-paper threshold is swept over
# range(filter_lower, filter_upper, 10), so filter_upper itself is excluded.
filter_lower = 10
filter_upper = 100

# When True, samples of authors below the current threshold are dropped from
# each selected name group before training.
apply_filter_to_sample = True

# Dataset name; it is inserted into the data directory path that is read.
Dataset = "pubmed"

In [None]:
import pickle
import gensim
# read tf embedding with different feature
def read_text_embedding_feature_wise(Dataset = "pubmed", feature = "off"):
    """Load the pre-computed TF vectors and paper ids for one text feature.

    Args:
        Dataset: dataset name; selects the sub-directory under ``../models/``.
        feature: feature whose TF model is loaded from the directory
            ``../models/<Dataset>/tf/<feature>_sample=140k/``. The value
            ``"off"`` disables loading.

    Returns:
        A ``(text_emb, emb_pid)`` tuple: the dense array obtained from
        ``toarray()`` on the pickled TF matrix, and the pickled paper-id
        collection. Both are empty lists when ``feature`` is ``"off"``.

    Raises:
        OSError: if one of the pickle files cannot be opened.
    """
    # Guard clause instead of a ``while True`` block that never looped.
    if feature == "off":
        return [], []

    modelSaveDir = "../models/"+Dataset+"/tf/"+feature+"_sample=140k/"
    # NOTE: pickle.load can execute arbitrary code; only point this at model
    # files produced by this project.
    with open(modelSaveDir+'tf_features.pickle', "rb") as input_file:
        vec = pickle.load(input_file)
    with open(modelSaveDir+'feature_pid.pickle', "rb") as input_file:
        allPaperid = pickle.load(input_file)

    text_emb = vec.toarray()
    emb_pid = allPaperid
    print("Total text embedding size: ",len(text_emb))
    # An empty matrix has no first row; report 0 instead of raising IndexError.
    print("Vector dimension: ", len(text_emb[0]) if len(text_emb) else 0)
    return text_emb, emb_pid

In [None]:
# load the file
import sys
import io
import os
import collections
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from statistics import mean 

# ---------------------------------------------------------------------------
# Feature-wise evaluation.
# For every text feature and every per-author paper threshold, each selected
# name group is classified with multinomial naive Bayes, logistic regression
# and a linear SVM (10-fold CV through com_func), and the mean F1 over all
# evaluated cases is collected per classifier:
#   <clf>_diff_feature_average_f1_result[feature index][threshold index]
# ---------------------------------------------------------------------------

# fix random seed for reproducibility
np.random.seed(1)

fileDir = "../Data/"+Dataset+"/canopies_labeled/"
# os.listdir returns entries in arbitrary order; sort so that the sequence of
# shuffles and cross-validation runs is the same on every machine.
listfiles = sorted(os.listdir(fileDir))


def _evaluate_classifiers(features, labels, collectors, k=10):
    """Cross-validate MNB, LR and a linear SVM on one labelling and record the scores.

    Args:
        features: feature matrix handed unchanged to the cross-validation helper.
        labels: class labels, assumed to be aligned row by row with ``features``.
        collectors: maps the classifier tag used in the log output ("MNB",
            "LR", "svc") to the ``(accuracy_list, f1_list, details_list)``
            triple that receives this run's result.
        k: number of folds forwarded to
            ``com_func.k_fold_cv_with_accumulate_statistic``.
    """
    classifiers = (
        ("MNB", MultinomialNB()),
        ("LR", LogisticRegression()),
        ("svc", SVC(kernel='linear')),
    )
    # The order (MNB, LR, SVM) is kept fixed so the runs happen in a stable sequence.
    for tag, clf in classifiers:
        accuracy, macro_f1, tp, tn, fp, fn = com_func.k_fold_cv_with_accumulate_statistic(features, labels, clf, k=k)
        print(tag+" Accuracy: ", accuracy)
        print(tag+" F1: ", macro_f1)
        accuracy_list, f1_list, details_list = collectors[tag]
        accuracy_list.append(accuracy)
        f1_list.append(macro_f1)
        details_list.append({"TP": tp, "TN": tn, "FP": fp, "FN": fn})


def _mean_float_scores(scores):
    """Return the mean of the float entries of ``scores``, or NaN if there are none.

    Non-float entries are ignored, as before. NaN (instead of the
    StatisticsError raised by ``mean([])``) keeps the per-threshold result
    lists the same length as ``filter_change`` when nothing was evaluated.
    """
    float_scores = [x for x in scores if isinstance(x, float)]
    return mean(float_scores) if float_scores else float("nan")


mnb_diff_feature_average_f1_result = []
lr_diff_feature_average_f1_result = []
svm_diff_freature_average_f1_result = []

# ----------------------- different feature ---------------------------- #
train_feature = ["title", "keywords_mesh", "abstract", "text"]
# train_feature = ["text"]
for select_feature in train_feature:
    print("Load feature: ", select_feature)
    # read pretrained embeddings (for the configured dataset, not the default one)
    tf_all_sample, all_tf_pid = read_text_embedding_feature_wise(Dataset=Dataset, feature=select_feature)

    filter_change_all_average_mnb_f1s = []
    filter_change_all_average_lr_f1s = []
    filter_change_all_average_svm_f1s = []
    filter_change = []

    # -------------- different filter (step by 10) -----------------------#
    for step_filter in range(filter_lower, filter_upper, 10):
        filter_change.append(step_filter)
        # collect statistic to output
        allname, all_author_id, positive_sample_size, negative_sample_size = [], [], [], []
        orginal_sample_size, selected_sample_size = [], []

        all_mnb_accuracy, all_mnb_f1, all_mnb_details = [], [], []
        all_LR_accuracy, all_LR_f1, all_LR_details = [], [], []
        all_svcLinear_accuracy, all_svcLinear_f1, all_svc_details = [], [], []
        collectors = {
            "MNB": (all_mnb_accuracy, all_mnb_f1, all_mnb_details),
            "LR": (all_LR_accuracy, all_LR_f1, all_LR_details),
            "svc": (all_svcLinear_accuracy, all_svcLinear_f1, all_svc_details),
        }

        total_selected_group = 0

        # read all file in labeled group
        for file in listfiles:
            # group name
            name_parts = file.split("_")
            name = name_parts[1]+"_"+name_parts[-1]
            print("For name: ",name)
            # read needed content in labeled file
            labeled_data = com_func.read_pid_aid(fileDir+file)
            # collect all labeled sample
            all_labeled_sample = labeled_data["paperID"].tolist()

            # ----------- select name groups that contain productive authors ----------- #
            # count number of paper each author write based on author ID
            paperCounter = collections.Counter(labeled_data["authorID"])
            productive_authors = [
                author for author, n_paper in paperCounter.items()
                if n_paper >= filter_select_name_group
            ]
            # a group needs at least two such authors, otherwise it is not applicable
            if len(productive_authors) < 2:
                print(name, " pass")
                continue
            total_selected_group += 1

            # -------------- for each selected name group ---------------- #
            if apply_filter_to_sample:
                # only keep samples of authors with at least step_filter papers
                print("Total sample size before apply filter: ",len(labeled_data))
                print(paperCounter)
                print("Total author before apply threshoid: ", len(paperCounter))
                # Counter order (first appearance in the file) is kept instead of
                # a set, so the author order does not depend on hash ordering.
                author_list = [
                    author for author, n_paper in paperCounter.items()
                    if n_paper >= step_filter
                ]
                print(author_list)
                print("Total author after apply threshoid: ", len(author_list))
                # remove samples that are smaller than filter
                labeled_data = labeled_data[labeled_data.authorID.isin(author_list)]
                print("Total sample size after apply filter: ",len(labeled_data))
            else:
                # use all samples: every author of the group takes part.
                # (Previously author_list was never set on this path.)
                author_list = list(paperCounter)
            print(name, " name group sample size: ",labeled_data.shape)

            # shuffle the data
            labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
            # list of different data field
            part_collection = []
            # select feature wanted to fit to clustering/classification algorithm
            data_text = com_func.extract_embedding(tf_all_sample, all_tf_pid, labeled_data["paperID"])
            print(data_text.shape)
            part_collection.append(data_text)
            # merge the different parts by concatenating them;
            # empty parts (embedding switched off) are removed first
            part_collection = [part for part in part_collection if len(part)!=0]
            print(len(part_collection))
            if len(part_collection)>1:
                combinedata = np.concatenate(part_collection,axis=1)
            elif len(part_collection)==1:
                if isinstance(part_collection[0], pd.DataFrame):
                    combinedata = part_collection[0].values
                else:
                    combinedata = part_collection[0]
            else:
                print("No data available")
                # NOTE(review): this stops processing ALL remaining name groups
                # for the current threshold, not just this one -- confirm that
                # `continue` is not what was intended.
                break
            print(combinedata.shape)
            print(combinedata[0])

            if len(author_list)==2:
                print("Binary case", name)
                label = labeled_data["authorID"].values
                # append to statistic collection
                allname.append(name)
                all_author_id.append(author_list)
                orginal_sample_size.append(len(all_labeled_sample))
                selected_sample_size.append(len(labeled_data))
                positive_sample_size.append(np.count_nonzero(label == author_list[0]))
                negative_sample_size.append(np.count_nonzero(label == author_list[1]))

                # using converted feature vector to train the classifiers
                _evaluate_classifiers(combinedata, label, collectors)
            else:
                print("Muti-class case")
                group_pid = labeled_data["paperID"].to_frame()
                selected_labeled_samples = labeled_data["paperID"].tolist()
                # loop through each author have label, one vs rest
                for counter, author in enumerate(author_list):
                    author_name = name+'_'+str(counter)
                    print(author_name)
                    is_author = labeled_data["authorID"] == author
                    positive_sample_pid = labeled_data[is_author]["paperID"].tolist()
                    negative_sample_pid = com_func.extractNegativeSample(positive_sample_pid, selected_labeled_samples)
                    # append to statistic collection
                    allname.append(author_name)
                    all_author_id.append(author)
                    orginal_sample_size.append(len(all_labeled_sample))
                    selected_sample_size.append(len(labeled_data))
                    positive_sample_size.append(len(positive_sample_pid))
                    negative_sample_size.append(len(negative_sample_pid))
                    # form positive and negative (negative class come from similar name group);
                    # the positive papers get label 0 and the negative papers label 1
                    appended_data = []
                    for class_label, class_pids in enumerate((positive_sample_pid, negative_sample_pid)):
                        authordf = pd.DataFrame({"paperID": class_pids})
                        authordf['label'] = class_label
                        appended_data.append(authordf)
                    processed_data = pd.concat(appended_data, axis=0,ignore_index=True)

                    # alignment: joining on the shuffled paper ids puts the labels
                    # in the row order of labeled_data (and so of combinedata)
                    processed_data = pd.merge(group_pid, processed_data, on="paperID")
                    label = processed_data["label"]

                    # using converted feature vector to train the classifiers
                    _evaluate_classifiers(combinedata, label, collectors)

        # write evaluation result to excel
        output = pd.DataFrame({'Author':allname, "Author ID":all_author_id, "positive sample size":positive_sample_size, "negative sample size":negative_sample_size, 
                               "Orginal Name group sample size": orginal_sample_size, "Selected Name group sample size": selected_sample_size,
                               "MNB Accuracy":all_mnb_accuracy, "MNB F1": all_mnb_f1, "MNB details": all_mnb_details,
                               "SVM(linear) accuracy":all_svcLinear_accuracy, "SVM f1": all_svcLinear_f1, "SVM details": all_svc_details,
                               "LR accuracy":all_LR_accuracy, "LR f1": all_LR_f1,"LR details": all_LR_details})
        #savePath = "../../result/"+Dataset+"/binary_global_emb_sample=140k/"
        #filename = "(Global emb sample 140k) textEmb=tf_feature="+select_feature+"_filter="+str(step_filter)+"_namegroupcount="+str(total_selected_group)+".csv"
        #com_func.write_csv_df(savePath, filename, output)
        print("Done")

        # --------------------------- mean f1 score over all groups ------------------ #
        average_mnb_f1 = _mean_float_scores(all_mnb_f1)
        average_lr_f1 = _mean_float_scores(all_LR_f1)
        average_svcLinear_f1 = _mean_float_scores(all_svcLinear_f1)
        filter_change_all_average_mnb_f1s.append(average_mnb_f1)
        filter_change_all_average_lr_f1s.append(average_lr_f1)
        filter_change_all_average_svm_f1s.append(average_svcLinear_f1)

    mnb_diff_feature_average_f1_result.append(filter_change_all_average_mnb_f1s)
    lr_diff_feature_average_f1_result.append(filter_change_all_average_lr_f1s)
    svm_diff_freature_average_f1_result.append(filter_change_all_average_svm_f1s)