# Train one binary classifier per author (comparing different textual embeddings)
1. Either apply the threshold to the samples in each name group, so that each author classifier is trained only on samples belonging to authors who pass the threshold.

2. Or do not apply the threshold to the samples, so that each author classifier is trained on all samples in the name group (including samples from authors who do not pass the threshold).

Example: the k-kim name group has 1111 samples from 57 authors.

With threshold of 100:

Method 1 uses 504 samples to train the 3 author classifiers.

Method 2 uses all 1111 samples to train the 3 author classifiers.

In [1]:
import os
import sys
import warnings

# Notebook setup: silence FutureWarning noise, make the parent directory
# importable, and define the experiment parameters read by the later cells.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Add the parent directory to sys.path before importing com_func
# (presumably that is where the project-local module lives -- confirm).
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import com_func

# parameters
#----- threshold for selecting set of name group -----------#
# Passed to com_func.select_productive_groups; a name group is skipped when
# fewer than two authors come back from that call.
threshold_select_name_group = 100
#----- threshold for selecting min sample in name group ----#
# Swept as range(threshold_lower, threshold_upper, 10); with 100 and 110 the
# sweep contains the single value 100.
threshold_lower = 100
threshold_upper = 110

# Embedding types to compare; each one is passed as emb_type to
# com_func.read_textual_embedding.
pp_textual = ["lsa", "pv_dm", "pv_dbow"]
# True  -> the name group is first reduced with
#          com_func.only_select_productive_authors (method 1 in the intro).
# False -> the whole name group is kept (method 2 in the intro).
apply_threshold_to_sample = True

# Dataset folder name; used to build "../../Data/<Dataset>/canopies_labeled/".
Dataset = "pubmed"

In [3]:
# load the file
import sys
import io
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from statistics import mean 

# Experiment cell: for every embedding type and every threshold step, walk the
# labeled name-group files and train one-vs-rest author classifiers
# (logistic regression and linear SVM), scored via com_func.k_fold_cv with k=10.

# fix random seed for reproducibility
np.random.seed(1)

# directory holding the labeled name-group files (one file per name group)
fileDir = "../../Data/"+Dataset+"/canopies_labeled/"
listfiles = os.listdir(fileDir)

# per-embedding F1 collections; currently only filled by the commented-out
# aggregation code at the end of this cell
lr_diff_emb_f1_result = []
svm_diff_emb_f1_result = []


for select_emb in pp_textual:
    print("Load embedding: ", select_emb)
    # read pretrained embeddings
    emb_all_sample, emb_all_pid = com_func.read_textual_embedding(emb_type=select_emb, training_size = "3m")

    threshold_change_all_lr_f1s = []
    threshold_change_all_svm_f1s = []
    threshold_change = []

    # -------------- different threshold (step by 10) -----------------------#
    for step_threshold in range(threshold_lower, threshold_upper, 10):
        threshold_change.append(step_threshold)
        # collect statistic to output
        allname, positive_sample_size, negative_sample_size, total_sample_size = ([] for _ in range(4))
        all_LR_accuracy, all_LR_f1, all_svcLinear_accuracy, all_svcLinear_f1 = ([] for _ in range(4))

        total_selected_group = 0

        # read all file in labeled group
        for file in listfiles:
            # group name: second and last "_"-separated pieces of the file name
            name_parts = file.split("_")
            name = name_parts[1]+"_"+name_parts[-1]
            print("For name: ",name)
            # read needed content in labeled file
            labeled_data = com_func.read_pid_aid(fileDir+file)
            # every paper id of the name group, captured BEFORE any filtering;
            # this is the negative pool when the threshold is not applied
            all_labeled_samples = labeled_data["paperID"].tolist()
            #----------- select name group contain productive author------------------------------------#
            #----------- (contain pair of author write more than 100 papers) ---------------------------#
            authorCounter = com_func.select_productive_groups(labeled_data, threshold_select_name_group)
            # with zero or one qualifying author there is nothing to separate
            if len(authorCounter) < 2:
                print(name, " pass")
                continue

            total_selected_group += 1
            # --------------for each name group---------------- #
            if apply_threshold_to_sample:
                # ---------- only use sample pass threshold ------- #
                #-------- only select authors in name group are very productive (more than threshold)---------#
                labeled_data, author_list, _ = com_func.only_select_productive_authors(labeled_data, step_threshold)
                # negatives are drawn from the filtered samples only
                filtered_all_labeled_samples = labeled_data["paperID"].tolist()
                negative_candidate_pids = filtered_all_labeled_samples
            else:
                # ----------- use all sample in name group --------- #
                author_list = com_func.productive_authors_list(labeled_data, step_threshold)
                print(name, " name group sample size: ",labeled_data.shape)
                # FIX: this branch previously fell through to
                # filtered_all_labeled_samples, which is never assigned here
                # (NameError, or a stale list left over from an earlier run).
                # Negatives must come from the whole, unfiltered name group.
                negative_candidate_pids = all_labeled_samples

            # shuffle the data
            labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
            # list of different data field
            part_collection = []
            # select feature wanted to fit to clustering/classification algorithm
            data_textual = com_func.extract_embedding(emb_all_sample, emb_all_pid, labeled_data["paperID"])
            print(data_textual.shape)
            part_collection.append(data_textual)
            # merge different part of data together by concatenating them
            combinedata = com_func.merge_data_parts(part_collection)
            print(combinedata.shape)
            # ------------- index tracker -------------------- #
            # paper ids in shuffled order; used below to line labels up with
            # the feature rows (assumes combinedata keeps this row order -- confirm)
            group_pid = labeled_data["paperID"].to_frame()

            # loop through each author have label, one vs rest
            for counter, author in enumerate(author_list):
                total_sample_size.append(len(labeled_data))
                author_name = name+'_'+str(counter)
                allname.append(author_name)
                print(author_name, " : ", author)
                author_rows = labeled_data[labeled_data["authorID"] == author]
                positive_sample_pid = author_rows["paperID"].tolist()
                negative_sample_pid = com_func.extractNegativeSample(positive_sample_pid, negative_candidate_pids)
                # append to statistic collection
                positive_sample_size.append(len(positive_sample_pid))
                negative_sample_size.append(len(negative_sample_pid))
                # form positive and negative (negative class come from similar name group).
                # enumerate() assigns label 0 to the positive papers and
                # label 1 to the negative papers.
                appended_data = []
                for class_label, class_pids in enumerate([positive_sample_pid, negative_sample_pid]):
                    # create df save one author data
                    authordf = pd.DataFrame({"paperID": class_pids})
                    authordf['label'] = class_label
                    appended_data.append(authordf)
                processed_data = pd.concat(appended_data, axis=0, ignore_index=True)

                # alignment: join on paperID with the shuffled ids on the left
                # so the labels follow the order of group_pid
                processed_data = pd.merge(group_pid, processed_data, on="paperID")

                # true labels, in the aligned order
                label = processed_data["label"]

                # using converted feature vector to train classifier
                # using logistic regression
                clf = LogisticRegression()
                LRaccuracy, LRmarcof1 = com_func.k_fold_cv(combinedata, label, clf, k=10)
                print("LR Accuracy: ",LRaccuracy)
                print("LR F1: ", LRmarcof1)
                all_LR_accuracy.append(LRaccuracy)
                all_LR_f1.append(LRmarcof1)
                # using SVM with linear kernel
                clf = SVC(kernel='linear')
                svcaccuracy, svcmarcof1 = com_func.k_fold_cv(combinedata, label, clf, k=10)
                print("svc Accuracy: ",svcaccuracy)
                print("svc F1: ", svcmarcof1)
                all_svcLinear_accuracy.append(svcaccuracy)
                all_svcLinear_f1.append(svcmarcof1)
            # NOTE(review): this stops after the FIRST name group that passes
            # the selection, so at most one group is evaluated per threshold.
            # It looks like a debugging leftover -- remove it for a full run.
            break
#         # write evaluation result to excel
#         output = pd.DataFrame({'Author ':allname, "positive sample size":positive_sample_size,
#                                "negative sample size":negative_sample_size, "Name group sample size": total_sample_size,
#                                 "logistic regression accuracy":all_LR_accuracy, "logistic regression f1": all_LR_f1,
#                                "svc(linear) accuracy":all_svcLinear_accuracy, "svc(linear) f1": all_svcLinear_f1})
#         savePath = "../../result/"+Dataset+"/binary_global_emb_sample=3m/"
#         filename = "(Global emb sample 3m) textual="+select_emb+"_threshold="+str(step_threshold)+"_namegroupcount="+str(total_selected_group)+".csv"
#         com_func.write_csv_df(savePath, filename, output)
#         print("Done")

#         threshold_change_all_lr_f1s.append(all_LR_f1)
#         threshold_change_all_svm_f1s.append(all_svcLinear_f1)

#     lr_diff_emb_f1_result.append(threshold_change_all_lr_f1s)
#     (corrected: the SVM collection must receive the SVM list, not the LR one)
#     svm_diff_emb_f1_result.append(threshold_change_all_svm_f1s)

Load embedding:  pv_dbow
Total textual vector records: 135796
Vector dimension:  100
For name:  j_read
j_read  pass
For name:  f_esteves
f_esteves  pass
For name:  c_miller
c_miller  pass
For name:  r_jha
r_jha  pass
For name:  a_lowe
a_lowe  pass
For name:  a_vega
a_vega  pass
For name:  k_smith
k_smith  pass
For name:  j_gordon
j_gordon  pass
For name:  s_liao
s_liao  pass
For name:  j_qian
j_qian  pass
For name:  s_bernardi
s_bernardi  pass
For name:  t_hill
t_hill  pass
For name:  s_schindler
s_schindler  pass
For name:  j_williams
j_williams  pass
For name:  s_jacobson
s_jacobson  pass
For name:  e_andrade
e_andrade  pass
For name:  t_santos
t_santos  pass
For name:  k_kim
Total sample size before apply threshold:  1111
Counter({'0000-0002-6929-5359': 211, '0000-0001-9498-284X': 154, '0000-0002-5878-8895': 139, '0000-0002-1864-3392': 92, '0000-0002-7045-8004': 57, '0000-0001-7896-6751': 57, '0000-0002-7991-9428': 55, '0000-0002-4010-1063': 45, '0000-0002-2186-3484': 28, '0000-0002

In [None]:
# Dump what the experiment cell collected.
# The previous version printed train_feature, mnb_diff_feature_average_f1_result,
# lr_diff_feature_average_f1_result and svm_diff_freature_average_f1_result,
# none of which is defined in this notebook (no naive Bayes model is trained
# here), so the cell could only raise NameError. The names below are the ones
# this notebook actually defines.
# NOTE(review): confirm these are the intended replacements.
print(pp_textual)
print(lr_diff_emb_f1_result)
print(svm_diff_emb_f1_result)
print(threshold_change)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
def _plot_f1_by_threshold(thresholds, scores_per_embedding, embedding_names, title):
    """Draw one F1-vs-threshold line per embedding type on a new figure.

    Returns whatever plt.ylabel returns, so that calling this as the last
    statement of the cell echoes the same object the inline code did.
    """
    plt.figure()
    plt.axes()
    for embedding_name, scores in zip(embedding_names, scores_per_embedding):
        plt.plot(thresholds, scores, label=embedding_name)
    plt.legend()
    plt.title(title)
    plt.xlabel('Threshold')
    # NOTE(review): 'marco' is presumably a typo for 'macro'; left unchanged
    # here because it is the text that ends up in the figure.
    return plt.ylabel('marco f1 score')


# NOTE(review): combined_emb, combined_lr_result and combined_svm_result are
# not defined in the cells shown in this notebook; they must exist before
# this cell runs.
threshold_change = np.array(threshold_change)

#--------------   logistic regression --------------------------#
# one row per embedding type
logistic_regression_result = np.array(combined_lr_result)
_plot_f1_by_threshold(
    threshold_change,
    logistic_regression_result,
    combined_emb,
    'Average f1 for different feature in logistic regression',
)

# plt.savefig('diff_embedding_sample=3m_clf=logistic regression.eps', format='eps', dpi=300)


# -------------------- svm -------------------------------------#
svm_result = np.array(combined_svm_result)
_plot_f1_by_threshold(
    threshold_change,
    svm_result,
    combined_emb,
    'Average f1 for different feature in SVM',
)

# plt.savefig('diff_embedding_sample=3m_clf=SVM.eps', format='eps', dpi=300)