In [1]:
import os
import sys
import warnings

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)

# Put the parent directory on the import path (once) -- presumably this is
# where the project-local `com_func` module imported right below lives.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
import com_func

# ------------------------------ parameters ------------------------------ #
# Threshold handed to com_func.select_productive_groups when deciding
# whether a name group is evaluated at all.
threshold_select_name_group = 100

# Bounds of the minimum-sample threshold sweep; the experiment cell iterates
# range(threshold_lower, threshold_upper, 10).
threshold_lower, threshold_upper = 100, 110

# Textual embedding types compared by the experiment, in plotting order.
pp_textual = [
    "lsa",
    "pv_dm",
    "pv_dbow",
]

# Dataset folder name used to build the input path.
Dataset = "pubmed"


In [2]:
# load the file
import io
import collections
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from statistics import mean 

# fix random seed for reproducibility
np.random.seed(1)

fileDir = "../../Data/"+Dataset+"/canopies_labeled/"
listfiles = os.listdir(fileDir)

# Results read by the plotting cells further down, nested as
#   [embedding, in pp_textual order][threshold, in threshold_change order][selected name group, in allname order]
# They are re-created on every run of this cell, so re-running it does not
# append duplicate entries.
lr_diff_embedding_result = []
svm_diff_embedding_result = []

# ----------------------- different textual embedding -----------------------#
for select_emb in pp_textual:

    # read pretrained embeddings
    print("Load textual embedding: ", select_emb)
    all_textual_embedding, all_textual_emb_pid = com_func.read_textual_embedding(emb_type=select_emb, training_size = "3m")

    # one entry per threshold; each entry is the list of per-name-group macro F1
    threshold_change_all_lr_f1s = []
    threshold_change_all_svm_f1s = []
    threshold_change = []

    # -------------- different threshold (step by 10) -----------------------#
    for step_threshold in range(threshold_lower, threshold_upper, 10):
        threshold_change.append(step_threshold)
        # collect statistic to output
        allname, num_class, per_class_count, all_sample_count = ([] for i in range(4))

        all_mnb_accuracy, all_mnb_f1, all_LR_accuracy = ([] for i in range(3))
        all_LR_f1, all_svcLinear_accuracy, all_svcLinear_f1 = ([] for i in range(3))

        total_selected_group = 0

        # ------- different name group in all name group --------------------#
        for file in listfiles:
            # group name: second and last "_"-separated pieces of the file name
            temp = file.split("_")
            name = temp[1]+"_"+temp[-1]
            print("For name: ",name)
            # read needed content in labeled file
            labeled_data = com_func.read_pid_aid(fileDir+file)
            #----------- select name group contain productive author------------------------------------#
            #----------- (contain pair of author write more than 100 papers) ---------------------------#
            # count number of paper each author write based on author ID
            authorCounter = com_func.select_productive_groups(labeled_data, threshold_select_name_group)
            # with zero or one class passing the threshold there is nothing to classify
            if len(authorCounter) < 2:
                print(name, " pass")
            else:
                total_selected_group+= 1
                #--------select authors in name group are very productive (more than threshold)---------#
                labeled_data, author_list, paperCounter= com_func.only_select_productive_authors(labeled_data, step_threshold)
                all_sample_count.append(len(labeled_data))
                allname.append(name)
                num_class.append(len(paperCounter))
                per_class_count.append(paperCounter)
                #------------ extract paper representation -------------------------------------------#
                # shuffle the data
                labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
                # extract true label and pid
                label = labeled_data["authorID"]
                pid = labeled_data["paperID"]
                # list of different data field
                part_collection = []
                # select feature wanted to fit to clustering/classification algorithm
                # data part, textual information
                data_part_textual = com_func.extract_embedding(all_textual_embedding, all_textual_emb_pid, pid)
                print(data_part_textual.shape)
                part_collection.append(data_part_textual)
                # merge the different parts of the data by concatenating them
                combinedata = com_func.merge_data_parts(part_collection)
                print(combinedata.shape)
                # -------------- using converted feature vector to train classifier-------------------#
                print(label)
                # only reached when "tf" is added to pp_textual
                if select_emb == "tf":
                    # using multinomial naive bayes
                    clf = MultinomialNB()
                    mnbaccuracy, mnbmarcof1, tp, tn, fp, fn = com_func.k_fold_cv_with_accumulate_statistic(combinedata, label, clf, k=10)
                    print("MNB Accuracy: ",mnbaccuracy)
                    print("MNB F1: ", mnbmarcof1)
                    all_mnb_accuracy.append(mnbaccuracy)
                    all_mnb_f1.append(mnbmarcof1)
                # using logistic regression
                clf = LogisticRegression(multi_class='ovr')
                LRaccuracy, LRmarcof1, tp, tn, fp, fn = com_func.k_fold_cv_with_accumulate_statistic(combinedata, label, clf, k=10)
                print("LR Accuracy: ",LRaccuracy)
                print("LR F1: ", LRmarcof1)
                all_LR_accuracy.append(LRaccuracy)
                all_LR_f1.append(LRmarcof1)
                # using SVM with linear kernel
                clf = SVC(decision_function_shape='ovr', kernel='linear')
                svcaccuracy, svcmarcof1, tp, tn, fp, fn = com_func.k_fold_cv_with_accumulate_statistic(combinedata, label, clf, k=10)
                print("svc Accuracy: ",svcaccuracy)
                print("svc F1: ", svcmarcof1)
                all_svcLinear_accuracy.append(svcaccuracy)
                all_svcLinear_f1.append(svcmarcof1)
                # NOTE(review): this stops after the first selected name group,
                # which looks like a debugging shortcut -- remove it to evaluate
                # every name group.
                break

        # CSV export of the per-name-group statistics, left disabled as in the original cell.
        # if select_emb == "tf":
        #     # write evaluation result to excel
        #     output = pd.DataFrame({'Name Group':allname,"Class number":num_class,
        #                            "Per class size":per_class_count, "Total samples":all_sample_count,
        #                            "MNB Accuracy":all_mnb_accuracy, "MNB macro F1": all_mnb_f1,
        #                            "svc(linear) accuracy":all_svcLinear_accuracy, "svc(linear) macro f1": all_svcLinear_f1,
        #                            "logistic regression accuracy":all_LR_accuracy, "logistic regression macro f1": all_LR_f1})
        # else:
        #     # write evaluation result to excel
        #     output = pd.DataFrame({'Name Group':allname, "Class number":num_class,
        #                            "Per class size":per_class_count, "Total samples":all_sample_count,
        #                            "svc(linear) accuracy":all_svcLinear_accuracy, "svc(linear) macro f1": all_svcLinear_f1,
        #                            "logistic regression accuracy":all_LR_accuracy, "logistic regression macro f1": all_LR_f1})
        #
        # savePath = "../../result/"+Dataset+"/OCEN_global_emb_sample=3m/"
        # filename = "(Global emb sample 3m) textual="+select_emb+"_threshold="+str(step_threshold)+"_namegroupcount="+str(total_selected_group)+".csv"
        # com_func.write_csv_df(savePath, filename, output)
        # print("Done")

        # Record this threshold's per-name-group F1 scores. The disabled version
        # of these two lines stored the SVC scores in the LR list and vice versa.
        threshold_change_all_lr_f1s.append(all_LR_f1)
        threshold_change_all_svm_f1s.append(all_svcLinear_f1)

    # One entry per embedding; without this the plotting cells index into
    # empty lists.
    lr_diff_embedding_result.append(threshold_change_all_lr_f1s)
    svm_diff_embedding_result.append(threshold_change_all_svm_f1s)

Load textual embedding:  lsa
Total textual vector records: 3151504
Vector dimension:  100
For name:  j_read
j_read  pass
For name:  f_esteves
f_esteves  pass
For name:  c_miller
c_miller  pass
For name:  r_jha
r_jha  pass
For name:  a_lowe
a_lowe  pass
For name:  a_vega
a_vega  pass
For name:  k_smith
k_smith  pass
For name:  j_gordon
j_gordon  pass
For name:  s_liao
s_liao  pass
For name:  j_qian
j_qian  pass
For name:  s_bernardi
s_bernardi  pass
For name:  t_hill
t_hill  pass
For name:  s_schindler
s_schindler  pass
For name:  j_williams
j_williams  pass
For name:  s_jacobson
s_jacobson  pass
For name:  e_andrade
e_andrade  pass
For name:  t_santos
t_santos  pass
For name:  k_kim
Total sample size before apply threshold:  1111
Counter({'0000-0002-6929-5359': 211, '0000-0001-9498-284X': 154, '0000-0002-5878-8895': 139, '0000-0002-1864-3392': 92, '0000-0002-7045-8004': 57, '0000-0001-7896-6751': 57, '0000-0002-7991-9428': 55, '0000-0002-4010-1063': 45, '0000-0002-2186-3484': 28, '0000

0      0000-0002-6929-5359
1      0000-0002-6929-5359
2      0000-0001-9498-284X
3      0000-0001-9498-284X
4      0000-0002-6929-5359
5      0000-0001-9498-284X
6      0000-0002-6929-5359
7      0000-0001-9498-284X
8      0000-0002-6929-5359
9      0000-0001-9498-284X
10     0000-0002-5878-8895
11     0000-0002-6929-5359
12     0000-0001-9498-284X
13     0000-0001-9498-284X
14     0000-0002-6929-5359
15     0000-0002-5878-8895
16     0000-0002-5878-8895
17     0000-0002-6929-5359
18     0000-0002-6929-5359
19     0000-0001-9498-284X
20     0000-0002-6929-5359
21     0000-0001-9498-284X
22     0000-0002-6929-5359
23     0000-0002-6929-5359
24     0000-0001-9498-284X
25     0000-0002-5878-8895
26     0000-0001-9498-284X
27     0000-0002-6929-5359
28     0000-0002-6929-5359
29     0000-0002-5878-8895
              ...         
474    0000-0001-9498-284X
475    0000-0001-9498-284X
476    0000-0002-5878-8895
477    0000-0002-5878-8895
478    0000-0002-6929-5359
479    0000-0002-6929-5359
4

svc F1:  0.9496159024312979
Load textual embedding:  pv_dm
Total textual vector records: 135796
Vector dimension:  100
For name:  j_read
j_read  pass
For name:  f_esteves
f_esteves  pass
For name:  c_miller
c_miller  pass
For name:  r_jha
r_jha  pass
For name:  a_lowe
a_lowe  pass
For name:  a_vega
a_vega  pass
For name:  k_smith
k_smith  pass
For name:  j_gordon
j_gordon  pass
For name:  s_liao
s_liao  pass
For name:  j_qian
j_qian  pass
For name:  s_bernardi
s_bernardi  pass
For name:  t_hill
t_hill  pass
For name:  s_schindler
s_schindler  pass
For name:  j_williams
j_williams  pass
For name:  s_jacobson
s_jacobson  pass
For name:  e_andrade
e_andrade  pass
For name:  t_santos
t_santos  pass
For name:  k_kim
Total sample size before apply threshold:  1111
Counter({'0000-0002-6929-5359': 211, '0000-0001-9498-284X': 154, '0000-0002-5878-8895': 139, '0000-0002-1864-3392': 92, '0000-0002-7045-8004': 57, '0000-0001-7896-6751': 57, '0000-0002-7991-9428': 55, '0000-0002-4010-1063': 45, '00

0      0000-0002-6929-5359
1      0000-0001-9498-284X
2      0000-0002-6929-5359
3      0000-0002-5878-8895
4      0000-0002-5878-8895
5      0000-0002-6929-5359
6      0000-0002-5878-8895
7      0000-0002-5878-8895
8      0000-0002-6929-5359
9      0000-0001-9498-284X
10     0000-0002-5878-8895
11     0000-0001-9498-284X
12     0000-0002-5878-8895
13     0000-0002-6929-5359
14     0000-0002-6929-5359
15     0000-0001-9498-284X
16     0000-0001-9498-284X
17     0000-0001-9498-284X
18     0000-0002-6929-5359
19     0000-0002-5878-8895
20     0000-0002-5878-8895
21     0000-0002-5878-8895
22     0000-0002-5878-8895
23     0000-0001-9498-284X
24     0000-0002-5878-8895
25     0000-0001-9498-284X
26     0000-0001-9498-284X
27     0000-0002-6929-5359
28     0000-0002-6929-5359
29     0000-0002-5878-8895
              ...         
474    0000-0001-9498-284X
475    0000-0001-9498-284X
476    0000-0002-6929-5359
477    0000-0002-5878-8895
478    0000-0002-5878-8895
479    0000-0001-9498-284X
4

0      0000-0002-6929-5359
1      0000-0001-9498-284X
2      0000-0002-6929-5359
3      0000-0002-5878-8895
4      0000-0002-5878-8895
5      0000-0002-6929-5359
6      0000-0002-5878-8895
7      0000-0002-5878-8895
8      0000-0002-6929-5359
9      0000-0001-9498-284X
10     0000-0002-5878-8895
11     0000-0001-9498-284X
12     0000-0002-5878-8895
13     0000-0002-6929-5359
14     0000-0002-6929-5359
15     0000-0001-9498-284X
16     0000-0001-9498-284X
17     0000-0001-9498-284X
18     0000-0002-6929-5359
19     0000-0002-5878-8895
20     0000-0002-5878-8895
21     0000-0002-5878-8895
22     0000-0002-5878-8895
23     0000-0001-9498-284X
24     0000-0002-5878-8895
25     0000-0001-9498-284X
26     0000-0001-9498-284X
27     0000-0002-6929-5359
28     0000-0002-6929-5359
29     0000-0002-5878-8895
              ...         
474    0000-0001-9498-284X
475    0000-0001-9498-284X
476    0000-0002-6929-5359
477    0000-0002-5878-8895
478    0000-0002-5878-8895
479    0000-0001-9498-284X
4

Total textual vector records: 135796
Vector dimension:  100
For name:  j_read
j_read  pass
For name:  f_esteves
f_esteves  pass
For name:  c_miller
c_miller  pass
For name:  r_jha
r_jha  pass
For name:  a_lowe
a_lowe  pass
For name:  a_vega
a_vega  pass
For name:  k_smith
k_smith  pass
For name:  j_gordon
j_gordon  pass
For name:  s_liao
s_liao  pass
For name:  j_qian
j_qian  pass
For name:  s_bernardi
s_bernardi  pass
For name:  t_hill
t_hill  pass
For name:  s_schindler
s_schindler  pass
For name:  j_williams
j_williams  pass
For name:  s_jacobson
s_jacobson  pass
For name:  e_andrade
e_andrade  pass
For name:  t_santos
t_santos  pass
For name:  k_kim
Total sample size before apply threshold:  1111
Counter({'0000-0002-6929-5359': 211, '0000-0001-9498-284X': 154, '0000-0002-5878-8895': 139, '0000-0002-1864-3392': 92, '0000-0002-7045-8004': 57, '0000-0001-7896-6751': 57, '0000-0002-7991-9428': 55, '0000-0002-4010-1063': 45, '0000-0002-2186-3484': 28, '0000-0002-4899-1929': 25, '0000-00

<class 'pandas.core.series.Series'>
0      0000-0001-9498-284X
1      0000-0002-6929-5359
2      0000-0002-6929-5359
3      0000-0002-6929-5359
4      0000-0001-9498-284X
5      0000-0001-9498-284X
6      0000-0002-6929-5359
7      0000-0002-5878-8895
8      0000-0002-6929-5359
9      0000-0001-9498-284X
10     0000-0001-9498-284X
11     0000-0002-6929-5359
12     0000-0002-6929-5359
13     0000-0001-9498-284X
14     0000-0001-9498-284X
15     0000-0002-6929-5359
16     0000-0001-9498-284X
17     0000-0002-6929-5359
18     0000-0002-6929-5359
19     0000-0002-6929-5359
20     0000-0002-5878-8895
21     0000-0002-5878-8895
22     0000-0002-6929-5359
23     0000-0002-6929-5359
24     0000-0002-6929-5359
25     0000-0002-5878-8895
26     0000-0002-5878-8895
27     0000-0002-5878-8895
28     0000-0001-9498-284X
29     0000-0002-6929-5359
              ...         
474    0000-0002-6929-5359
475    0000-0001-9498-284X
476    0000-0001-9498-284X
477    0000-0001-9498-284X
478    0000-0002-69

0      0000-0001-9498-284X
1      0000-0002-6929-5359
2      0000-0002-6929-5359
3      0000-0002-6929-5359
4      0000-0001-9498-284X
5      0000-0001-9498-284X
6      0000-0002-6929-5359
7      0000-0002-5878-8895
8      0000-0002-6929-5359
9      0000-0001-9498-284X
10     0000-0001-9498-284X
11     0000-0002-6929-5359
12     0000-0002-6929-5359
13     0000-0001-9498-284X
14     0000-0001-9498-284X
15     0000-0002-6929-5359
16     0000-0001-9498-284X
17     0000-0002-6929-5359
18     0000-0002-6929-5359
19     0000-0002-6929-5359
20     0000-0002-5878-8895
21     0000-0002-5878-8895
22     0000-0002-6929-5359
23     0000-0002-6929-5359
24     0000-0002-6929-5359
25     0000-0002-5878-8895
26     0000-0002-5878-8895
27     0000-0002-5878-8895
28     0000-0001-9498-284X
29     0000-0002-6929-5359
              ...         
474    0000-0002-6929-5359
475    0000-0001-9498-284X
476    0000-0001-9498-284X
477    0000-0001-9498-284X
478    0000-0002-6929-5359
479    0000-0002-6929-5359
4

In [None]:
# ----------- inputs of the per-name-group F1 plots, printed one per line -------------- #
# The two *_diff_embedding_result lists are 3-level nested:
# level 1 = embedding, level 2 = threshold, level 3 = result for each author group.
for inspected in (
    pp_textual,
    allname,
    lr_diff_embedding_result,
    svm_diff_embedding_result,
    threshold_change,
):
    print(inspected)


In [None]:
# ----------- plot f1 score w.r.t each name group on different embedding -------------- #
# -------------- extract result for plot --------------------- #
# Both result lists are nested as [embedding][threshold][author group]; the
# second index 0 selects the first threshold. The first index is assumed to
# follow the order lsa, pv_dm, pv_dbow -- confirm against pp_textual if that
# list is changed.
lr_lsa_per_author_result = lr_diff_embedding_result[0][0]
lr_pv_dm_per_author_result = lr_diff_embedding_result[1][0]
lr_pv_dbow_per_author_result = lr_diff_embedding_result[2][0]
lr_per_author = [
    lr_lsa_per_author_result,
    lr_pv_dm_per_author_result,
    lr_pv_dbow_per_author_result,
]

# The LSA entry must come from the SVM results; it was previously read from
# lr_diff_embedding_result, which put the logistic-regression scores on the
# SVM plot.
svm_lsa_per_author_result = svm_diff_embedding_result[0][0]
svm_pv_dm_per_author_result = svm_diff_embedding_result[1][0]
svm_pv_dbow_per_author_result = svm_diff_embedding_result[2][0]
svm_per_author = [
    svm_lsa_per_author_result,
    svm_pv_dm_per_author_result,
    svm_pv_dbow_per_author_result,
]

In [None]:
# ----------- plot f1 score w.r.t each name group on different embedding -------------- #
%matplotlib inline
import matplotlib.pyplot as plt
def _plot_f1_per_name_group(per_embedding_f1, group_names, embedding_names, title):
    """Draw one F1 line per embedding with one x position per name group.

    Parameters
    ----------
    per_embedding_f1 : sequence of sequences
        One sequence of F1 scores per embedding; paired with
        `embedding_names` by position (extra items on either side are ignored
        by `zip`).
    group_names : sequence of str
        Tick labels of the x axis, one per name group.
    embedding_names : sequence of str
        Legend label of each line.
    title : str
        Figure title.

    Returns
    -------
    (fig, ax) : the created matplotlib figure and axes.
    """
    fig, ax = plt.subplots()
    for emb_type, result in zip(embedding_names, per_embedding_f1):
        ax.plot(result, label=emb_type)
    # The ticks do not depend on the embedding, so they are set once instead
    # of on every loop iteration.
    ax.set_xticks(range(len(group_names)))
    ax.set_xticklabels(group_names, rotation=45, horizontalalignment='right')
    ax.autoscale_view()
    ax.legend()
    ax.set_title(title)
    ax.set_xlabel('Name group')
    ax.set_ylabel('macro f1 score')
    return fig, ax


name_group = np.array(allname)

#--------------   logistic regression --------------------------#
# process result into np array
logistic_regression_result = np.array(lr_per_author)
fig, ax = _plot_f1_per_name_group(
    logistic_regression_result, name_group, pp_textual,
    'F1 for different embedding method in logistic regression')
# fig.savefig('diff_textual_embedding_sample=3m_clf=logistic regression_threshold=100.eps', format='eps', dpi=300)

#--------------   svm  -----------------------------------------#
# process result into np array
svm_result = np.array(svm_per_author)
fig, ax = _plot_f1_per_name_group(
    svm_result, name_group, pp_textual,
    'F1 for different embedding method in svm')
# fig.savefig('diff_textual_embedding_sample=3m_clf=svm_threshold=100.eps', format='eps', dpi=300)

In [None]:
# ----------- inputs of the F1-vs-threshold plots, printed one per line ------------- #
for shown in (pp_textual, lr_diff_embedding_result, svm_diff_embedding_result, threshold_change):
    print(shown)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
def _mean_f1_per_threshold(per_threshold_f1):
    """Average the per-name-group F1 scores recorded for each threshold.

    Each item of `per_threshold_f1` is the collection of scores of one
    threshold; a bare scalar is treated as a one-element collection. Entries
    that are not floats are skipped, the same `isinstance(x, float)` filter
    the summary cells of this notebook apply. A threshold without any float
    score yields NaN.
    """
    averaged = []
    for group_scores in per_threshold_f1:
        if not isinstance(group_scores, (list, tuple, np.ndarray)):
            group_scores = [group_scores]
        numeric = [score for score in group_scores if isinstance(score, float)]
        averaged.append(np.mean(numeric) if numeric else np.nan)
    return np.array(averaged, dtype=float)


def _plot_mean_f1_by_threshold(per_embedding_f1, thresholds, embedding_names, title):
    """Plot the mean F1 over name groups against the threshold, one line per embedding.

    `per_embedding_f1` is nested as [embedding][threshold][name group] and is
    paired with `embedding_names` by position. Returns the created
    (fig, ax) pair.
    """
    fig, ax = plt.subplots()
    for emb_type, per_threshold_f1 in zip(embedding_names, per_embedding_f1):
        # Averaging first gives exactly one labelled line per embedding.
        # Passing the raw 2-D scores to plot() draws one line per name group,
        # all sharing the same legend label. The marker keeps a sweep with a
        # single threshold visible.
        ax.plot(thresholds, _mean_f1_per_threshold(per_threshold_f1), marker='o', label=emb_type)
    ax.legend()
    ax.set_title(title)
    ax.set_xlabel('Threshold')
    ax.set_ylabel('macro f1 score')
    return fig, ax


threshold_change = np.array(threshold_change)

#--------------   logistic regression --------------------------#
# array form kept under its original name; the plot reads the nested lists directly
logistic_regression_result = np.array(lr_diff_embedding_result)
fig, ax = _plot_mean_f1_by_threshold(
    lr_diff_embedding_result, threshold_change, pp_textual,
    'Average f1 for different embedding in logistic regression')

# fig.savefig('diff_textual_emb_smaple=3m_clf=logistic regression.eps', format='eps', dpi=300)


# -------------------- svm -------------------------------------#
svm_result = np.array(svm_diff_embedding_result)
fig, ax = _plot_mean_f1_by_threshold(
    svm_diff_embedding_result, threshold_change, pp_textual,
    'Average f1 for different embedding in SVM')

# fig.savefig('diff_textual_emb_smaple=3m_clf=svm.eps', format='eps', dpi=300)

In [None]:
# Print the embedding configuration followed by the SVC and LR F1 values.
# NOTE(review): `citation_emb`, `svcF1` and `lrF1` are not assigned in any cell
# visible in this notebook, so on a fresh kernel this cell would presumably
# raise NameError -- confirm where these names are meant to come from.
print("textual =",pp_textual,"_citation =",citation_emb)
print("svc: ", svcF1)
print("lr:", lrF1)

In [None]:
# accuracy
from statistics import mean 
# Keep only the float accuracies; every other recorded entry is dropped.
cleaned_svcLinear_accuracy = list(filter(lambda value: isinstance(value, float), all_svcLinear_accuracy))
cleaned_lr_accuracy = list(filter(lambda value: isinstance(value, float), all_LR_accuracy))

# First the number of retained scores, then their mean -- SVC before LR each time.
for summarise in (len, mean):
    for kept_scores in (cleaned_svcLinear_accuracy, cleaned_lr_accuracy):
        print(summarise(kept_scores))

In [None]:
# f1
from statistics import mean 
# Discard non-float entries (e.g. strings) from the recorded F1 scores.
cleaned_svcLinear_f1 = [score for score in all_svcLinear_f1 if type(score) is float or isinstance(score, float)]
cleaned_lr_f1 = [score for score in all_LR_f1 if type(score) is float or isinstance(score, float)]

# Counts first, then means, each in SVC -> LR order.
f1_groups = (cleaned_svcLinear_f1, cleaned_lr_f1)
for kept_f1 in f1_groups:
    print(len(kept_f1))
for kept_f1 in f1_groups:
    print(mean(kept_f1))