In [1]:
import os
import sys
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import com_func

def dummy(doc):
    return doc

# parameters
threshold = 30
cutoff = 3

pp_textual = "tf"

Dataset = "pubmed"

In [2]:
import pandas as pd
# load text information
Dataset = "pubmed"
raw_filepath = "../../Data"+"/"+Dataset+"/id_textual_combined.txt"
all_text_content = []
with open(raw_filepath, 'r', encoding = 'utf8') as f:
    # items[0] is paper ID, items[1] is title, items[2] is abstract
    for line in f:
        items = line.split("\t")
        # lower case all character
        paperID = items[0]
        title = items[1].lower()
        keywords = items[2].lower()
        mesh = items[3].lower()
        abstract = items[4].lower()
        # keyword and mesh
        key_mesh = keywords+" "+mesh
        # title and abstract
        title_abstract = title+" "+abstract
        # title keywords mesh
        title_key_mesh = title+" "+key_mesh
        # abstract keywords mesh
        abstract_key_mesh = abstract+" "+key_mesh
        # all feature combined
        content = title+" "+keywords+" "+mesh+" "+abstract
        paper_text_content = {"paperID": paperID, "title":title, "keywords_mesh":key_mesh, "abstract": abstract,
                              "title_abstract":title_abstract,"title_key_mesh":title_key_mesh, 
                              "abstract_key_mesh":abstract_key_mesh, "combine_textual":content}
        all_text_content.append(paper_text_content)
print("Total ", len(all_text_content), " paper have text information")
# convert to dataframe so it's easy to process
all_text_content = pd.DataFrame(all_text_content)

Total  3151504  paper have text information


In [3]:
def read_labeled_file(infile):
    LabeledRecords_original = []
    with open(infile, 'r', encoding = 'utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # get ride of bad formated lines
            if(len(read_data)==13):
                paper_detail = {"paperID": read_data[0], "authorID":read_data[1], 
                                "co-author": read_data[5], "venue_id": read_data[7]}
                LabeledRecords_original.append(paper_detail)
        f.close()
    return pd.DataFrame(LabeledRecords_original)

In [4]:
# document relation wrt textual content
# convert raw text to numerical feature vectors
# bow(Bags of words) are used with uni-gram setting
def raw_text_to_vector(raw_textual_content, emb_type="off", stopword=True):
    cleaned_token, sample_size= com_func.clean_batch_of_raw(raw_textual_content, stopword=stopword)
    average_sample_size = sum(sample_size)/len(sample_size)
    print("Minimal sample size: ", min(sample_size))
    print("maximal sample size: ", max(sample_size))
    while True:
        if emb_type == "tf_idf":
            # using tf-idf
            tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True, tokenizer=dummy,
                                               preprocessor=dummy, stop_words = None,min_df=cutoff)
            result_vector = tfidf_vectorizer.fit_transform(cleaned_token).toarray()
            #print(len(tfidf_vectorizer.vocabulary_))
            #print(tfidf_vectorizer.get_feature_names())
            break
        elif emb_type == "tf":
            # Document-Term frequence Matrix
            count_vectorizer = CountVectorizer(tokenizer=dummy,preprocessor=dummy, min_df=cutoff)
            result_vector = count_vectorizer.fit_transform(cleaned_token).toarray()
            break
        elif emb_type == "lsa":
            # use lsa
            result_vector = LSA(cleaned_token, dim=100)
            break
        elif emb_type == "off":
            result_vector = pd.DataFrame()
            break
        else:
            print("Embedding type not available, selecting default setting")
            emb_type="off"
    return result_vector, average_sample_size

In [5]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score
# cross validation
def k_fold_cv(data, label, clf, k=10):
    kf = KFold(n_splits=k, shuffle=False)
    allTrueLabel = []
    allPredLabel = []
    for train_index, test_index in kf.split(data):
        # print("TRAIN:", train_index, " \n TEST:", test_index)
        # split train and test
        data_train, data_test = data[train_index], data[test_index]
        label_train, label_test = label[train_index], label[test_index]
        # fit data to clf
        clf.fit(data_train, label_train)
        # get predicted label
        label_pred = clf.predict(data_test)
        allTrueLabel.extend(label_test)
        allPredLabel.extend(label_pred)
        # print("True positive: {tp}, False positive: {fp}, False negative: {fn}, True negative: {tn}"
        # .format(tp=round_tp, fp=round_fp, fn=round_fn, tn=round_tn))

    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel,average='macro')
    
    # accumulate statistic for entire model f1
    cnf_matrix = confusion_matrix(allTrueLabel, allPredLabel)
    TP = np.diag(cnf_matrix)
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)  
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)
#     print(cnf_matrix)
#     print("TP: ",TP, "TN: ",TN, "FP: ",FP,"FN: ",FN)

    return accuracy, f1, TP.sum(), TN.sum(), FP.sum(), FN.sum()

In [6]:
def write_csv_df(savePath, filename, df):
    if not os.path.exists(savePath):
        os.makedirs(savePath)
    # Give the filename you wish to save the file to
    pathfile = os.path.normpath(os.path.join(savePath,filename))

    # Use this function to search for any files which match your filename
    files_present = os.path.isfile(pathfile) 
    # if no matching files, write to csv, if there are matching files, print statement
    if not files_present:
        df.to_csv(pathfile, encoding='utf-8',index=False)
    else:
        overwrite = input("WARNING: " + pathfile + " already exists! Do you want to overwrite <y/n>? \n ")
        if overwrite == 'y':
            df.to_csv(pathfile, encoding='utf-8',index=False)
        elif overwrite == 'n':
            new_filename = input("Type new filename: \n ")
            write_csv_df(savePath,new_filename,df)
        else:
            print("Not a valid input. Data is NOT saved!\n")

In [7]:
# load the file
import io
import collections
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# fix random seed for reproducibility
np.random.seed(1)

fileDir = "../../Data/"+Dataset+"/canopies_labeled/"
listfiles = os.listdir(fileDir)

# model f1
modelMNBf1, modelSVCf1, modelLRf1 = ([] for i in range(3))

# list of features
all_features = ["title_abstract", "combine_textual"]

for feature in all_features:
    # collect statistic to output
    allname, num_class, per_class_count, average_textual_size = ([] for i in range(4))

    all_mnb_accuracy, all_mnb_f1,all_svcLinear_accuracy = ([] for i in range(3))
    all_svcLinear_f1, all_LR_accuracy, all_LR_f1 = ([] for i in range(3))
    
    # collect overall tp, tn, fp, fn
    mnbTP=mnbTN=mnbFP=mnbFN = 0
    svcTP=svcTN=svcFP=svcFN = 0
    lrTP=lrTN=lrFP=lrFN = 0
    
    # read all file in labeled group
    for file in listfiles:
        # group name
        temp = file.split("_")
        name = temp[1]+"_"+temp[-1]
        print("For name: ",name)
        # read needed content in labeled file
        labeled_data_part = read_labeled_file(fileDir+file)
        print("total sample size before apply threshold: ",len(labeled_data_part))
        # count number of paper each author write based on author ID
        paperCounter = collections.Counter(labeled_data_part["authorID"])
        print(paperCounter)
        # collect per class statistic
        for k in list(paperCounter):
            if paperCounter[k] < threshold:
                del paperCounter[k]
        temp =list(paperCounter.keys())
        print(temp)
        # remove samples that are smaller than threshold
        labeled_data_part = labeled_data_part[labeled_data_part.authorID.isin(temp)]
        print("Total sample size after apply threshold: ",len(labeled_data_part))
        # if only have one class or no class pass the threshold, not applicable
        if(len(paperCounter)==0) or (len(paperCounter)==1):
            print(name, " pass")
        else:
            allname.append(name)
            num_class.append(len(paperCounter))
            per_class_count.append(paperCounter)
            # convert author id to label
            gather_label = []
            for index, record in labeled_data_part.iterrows():
                gather_label.append(temp.index(record["authorID"]))
            labeled_data_part["label"] = gather_label
            # merge title and abstract from all raw data to labeled dataset
            labeled_data = pd.merge(left=labeled_data_part,right=all_text_content, how='left', left_on='paperID', right_on='paperID')
            # shuffle the data
            labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
            # extract true label and pid
            label = labeled_data["label"]
            pid = labeled_data["paperID"]
            # list of different data field
            part_collection = []
            # select feature wanted to fit to clustering/classification algorithm
            # data part 3, textual information
            data_part_textual, avg_textual_size = raw_text_to_vector(labeled_data[feature], emb_type=pp_textual)
            average_textual_size.append(avg_textual_size)
            print(data_part_textual.shape)
            part_collection.append(data_part_textual)
            # merge different part of data data together by concatenate it all together
            # remove empty emb (when emb set off)
            part_collection = [part for part in part_collection if len(part)!=0]
            if len(part_collection)>1:
                combinedata = np.concatenate(part_collection,axis=1)
            else:
                combinedata = part_collection[0]
            print(combinedata.shape)
            # using converted feature vector to train classifier
            # using Multinomial naive bayes
            clf = MultinomialNB()
            # use 10 fold cv
            mnbaccuracy, mnbmarcof1, tp, tn, fp, fn = k_fold_cv(combinedata, label, clf, k=10)
            mnbTP+=tp
            mnbTN+=tn
            mnbFP+=fp
            mnbFN+=fn
            print("MNB Accuracy: ",mnbaccuracy)
            print("MNB F1: ", mnbmarcof1)
            all_mnb_accuracy.append(mnbaccuracy)
            all_mnb_f1.append(mnbmarcof1)
            # using SVM with linear kernal
            clf = SVC(decision_function_shape='ovr', kernel='linear')
            svcaccuracy, svcmarcof1, tp, tn, fp, fn = k_fold_cv(combinedata, label, clf, k=10)
            svcTP+=tp
            svcTN+=tn
            svcFP+=fp
            svcFN+=fn
            print("svc Accuracy: ",svcaccuracy)
            print("svc F1: ", svcmarcof1)
            all_svcLinear_accuracy.append(svcaccuracy)
            all_svcLinear_f1.append(svcmarcof1)
            # using logistic regression
            clf = LogisticRegression(multi_class='ovr')
            LRaccuracy, LRmarcof1, tp, tn, fp, fn = k_fold_cv(combinedata, label, clf, k=10)
            lrTP+=tp
            lrTN+=tn
            lrFP+=fp
            lrFN+=fn
            print("LR Accuracy: ",LRaccuracy)
            print("LR F1: ", LRmarcof1)
            all_LR_accuracy.append(LRaccuracy)
            all_LR_f1.append(LRmarcof1)
    # print f1 for entire model
    print("mnb: TP: ",mnbTP, "TN: ",mnbTN, "FP: ",mnbFP,"FN: ",mnbFN)
    print("svc: TP: ",svcTP, "TN: ",svcTN, "FP: ",svcFP,"FN: ",svcFN)
    print("lr: TP: ",lrTP, "TN: ",lrTN, "FP: ",lrFP,"FN: ",lrFN)
    mnbF1 = 2*mnbTP / (2*mnbTP + mnbFP + mnbFN)
    svcF1 = 2*svcTP / (2*svcTP + svcFP + svcFN)
    lrF1 = 2*lrTP / (2*lrTP + lrFP + lrFN)
    modelMNBf1.append(mnbF1)
    modelSVCf1.append(svcF1)
    modelLRf1.append(lrF1)
    # write evaluation result to excel
    output = pd.DataFrame({'Name Group':allname,"Class number":num_class,"average term in sample":average_textual_size,
                           "per_class_size":per_class_count,"mnb accuracy":all_mnb_accuracy, "mnb macro f1": all_mnb_f1,
                           "svc(linear) accuracy":all_svcLinear_accuracy, "svc(linear) macro f1": all_svcLinear_f1, 
                           "logistic regression accuracy":all_LR_accuracy, "logistic regression macro f1": all_LR_f1})

    savePath = "../../result/"+Dataset+"/skovr/"+feature+"/"
    filename = "feature="+feature+"_textual="+pp_textual+"_threshold="+str(threshold)+".csv"
    write_csv_df(savePath, filename, output)
    print(feature, " Done")

For name:  j_read
total sample size before apply threshold:  136
Counter({'0000-0002-5159-1192': 57, '0000-0002-9029-5185': 39, '0000-0002-9697-0962': 31, '0000-0002-4739-9245': 3, '0000-0003-0605-5259': 3, '0000-0003-4316-7006': 1, '0000-0002-0784-0091': 1, '0000-0002-3888-6631': 1})
['0000-0002-9697-0962', '0000-0002-9029-5185', '0000-0002-5159-1192']
Total sample size after apply threshold:  127
Minimal sample size:  3
maximal sample size:  223
(127, 1072)
(127, 1072)
MNB Accuracy:  0.9763779527559056
MNB F1:  0.9735304266112633
svc Accuracy:  0.8582677165354331
svc F1:  0.8544985971964464
LR Accuracy:  0.968503937007874
LR F1:  0.9662876190054792
For name:  f_esteves
total sample size before apply threshold:  34
Counter({'0000-0002-3046-1313': 18, '0000-0002-5403-0091': 12, '0000-0003-0589-0746': 3, '0000-0003-3172-6253': 1})
[]
Total sample size after apply threshold:  0
f_esteves  pass
For name:  c_miller
total sample size before apply threshold:  252
Counter({'0000-0003-4341-128

Minimal sample size:  2
maximal sample size:  314
(810, 3847)
(810, 3847)
MNB Accuracy:  0.9320987654320988
MNB F1:  0.9133440238410291
svc Accuracy:  0.8518518518518519
svc F1:  0.8166115896300603
LR Accuracy:  0.928395061728395
LR F1:  0.9085008570831677
For name:  d_ricci
total sample size before apply threshold:  40
Counter({'0000-0003-0015-6374': 26, '0000-0003-2853-4816': 12, '0000-0001-9678-904X': 1, '0000-0002-9790-0552': 1})
[]
Total sample size after apply threshold:  0
d_ricci  pass
For name:  s_cameron
total sample size before apply threshold:  66
Counter({'0000-0002-6694-4130': 41, '0000-0002-3050-7262': 16, '0000-0001-9570-135X': 7, '0000-0001-5680-2641': 2})
['0000-0002-6694-4130']
Total sample size after apply threshold:  41
s_cameron  pass
For name:  t_wright
total sample size before apply threshold:  31
Counter({'0000-0001-5071-9978': 19, '0000-0002-5813-9991': 6, '0000-0001-8338-5935': 5, '0000-0001-7836-6705': 1})
[]
Total sample size after apply threshold:  0
t_wri

Minimal sample size:  2
maximal sample size:  297
(329, 1941)
(329, 1941)
MNB Accuracy:  0.9848024316109423
MNB F1:  0.9837484675305079
svc Accuracy:  0.9544072948328267
svc F1:  0.9477948213421373
LR Accuracy:  0.9848024316109423
LR F1:  0.9836029833620196
For name:  r_morris
total sample size before apply threshold:  409
Counter({'0000-0001-7240-4563': 107, '0000-0001-7809-0315': 73, '0000-0001-8661-1520': 59, '0000-0002-7574-9388': 51, '0000-0003-3080-2613': 44, '0000-0002-5018-1239': 21, '0000-0001-7431-6401': 20, '0000-0001-7450-5923': 14, '0000-0001-5511-3457': 10, '0000-0003-4764-3639': 7, '0000-0001-7443-7406': 2, '0000-0002-9193-3417': 1})
['0000-0003-3080-2613', '0000-0001-7809-0315', '0000-0001-7240-4563', '0000-0002-7574-9388', '0000-0001-8661-1520']
Total sample size after apply threshold:  334
Minimal sample size:  0
maximal sample size:  587
(334, 2336)
(334, 2336)
MNB Accuracy:  0.9700598802395209
MNB F1:  0.9655355899573544
svc Accuracy:  0.8892215568862275
svc F1:  0.

Minimal sample size:  5
maximal sample size:  235
(170, 1343)
(170, 1343)
MNB Accuracy:  0.9764705882352941
MNB F1:  0.9717658512839235
svc Accuracy:  0.9235294117647059
svc F1:  0.9163676394696608
LR Accuracy:  0.9470588235294117
LR F1:  0.9407268814048475
For name:  c_cardoso
total sample size before apply threshold:  52
Counter({'0000-0001-6239-6651': 15, '0000-0003-3645-5368': 12, '0000-0001-7273-0676': 10, '0000-0002-9339-8075': 8, '0000-0003-3323-4447': 4, '0000-0002-7527-3973': 2, '0000-0003-1914-9553': 1})
[]
Total sample size after apply threshold:  0
c_cardoso  pass
For name:  j_matthews
total sample size before apply threshold:  65
Counter({'0000-0002-9815-8636': 46, '0000-0001-6184-1813': 7, '0000-0002-5993-7610': 5, '0000-0002-1832-4420': 4, '0000-0002-7282-8929': 1, '0000-0002-6888-9438': 1, '0000-0002-3968-8282': 1})
['0000-0002-9815-8636']
Total sample size after apply threshold:  46
j_matthews  pass
For name:  g_lee
total sample size before apply threshold:  202
Counte

Minimal sample size:  2
maximal sample size:  250
(167, 1452)
(167, 1452)
MNB Accuracy:  0.9760479041916168
MNB F1:  0.9703900709219858
svc Accuracy:  0.9640718562874252
svc F1:  0.9522857142857143
LR Accuracy:  0.9820359281437125
LR F1:  0.9770236184361386
For name:  m_soares
total sample size before apply threshold:  247
Counter({'0000-0001-9701-836X': 75, '0000-0002-9314-4833': 68, '0000-0001-6071-0272': 44, '0000-0003-1579-8513': 32, '0000-0002-5213-2377': 10, '0000-0001-8860-0470': 7, '0000-0003-4227-4141': 4, '0000-0002-7181-1906': 3, '0000-0002-4614-8209': 2, '0000-0002-8059-7067': 1, '0000-0002-9013-2570': 1})
['0000-0001-6071-0272', '0000-0003-1579-8513', '0000-0002-9314-4833', '0000-0001-9701-836X']
Total sample size after apply threshold:  219
Minimal sample size:  2
maximal sample size:  457
(219, 1846)
(219, 1846)
MNB Accuracy:  0.9863013698630136
MNB F1:  0.9871125107093972
svc Accuracy:  0.9223744292237442
svc F1:  0.9176132923097066
LR Accuracy:  0.9817351598173516
LR F

Minimal sample size:  9
maximal sample size:  246
(94, 784)
(94, 784)
MNB Accuracy:  1.0
MNB F1:  1.0
svc Accuracy:  1.0
svc F1:  1.0
LR Accuracy:  1.0
LR F1:  1.0
For name:  m_viana
total sample size before apply threshold:  139
Counter({'0000-0002-0464-4845': 34, '0000-0003-4356-8109': 31, '0000-0002-4073-3802': 29, '0000-0001-9665-2115': 26, '0000-0001-9288-2108': 13, '0000-0002-3074-767X': 5, '0000-0002-5657-5570': 1})
['0000-0003-4356-8109', '0000-0002-0464-4845']
Total sample size after apply threshold:  65
Minimal sample size:  5
maximal sample size:  270
(65, 618)
(65, 618)
MNB Accuracy:  0.9692307692307692
MNB F1:  0.9691650853889944
svc Accuracy:  0.9538461538461539
svc F1:  0.9538461538461539
LR Accuracy:  0.9846153846153847
LR F1:  0.9846008054963278
For name:  t_inoue
total sample size before apply threshold:  70
Counter({'0000-0002-2728-0060': 52, '0000-0003-3289-4478': 9, '0000-0002-7710-1526': 8, '0000-0003-0582-0908': 1})
['0000-0002-2728-0060']
Total sample size after

Minimal sample size:  3
maximal sample size:  326
(312, 2044)
(312, 2044)
MNB Accuracy:  0.9871794871794872
MNB F1:  0.986026645768025
svc Accuracy:  0.9006410256410257
svc F1:  0.8907428392635348
LR Accuracy:  0.9647435897435898
LR F1:  0.9614239793707025
For name:  r_coleman
total sample size before apply threshold:  34
Counter({'0000-0003-4136-5914': 15, '0000-0002-5194-8550': 13, '0000-0001-7118-524X': 3, '0000-0002-9731-7498': 3})
[]
Total sample size after apply threshold:  0
r_coleman  pass
For name:  b_kang
total sample size before apply threshold:  20
Counter({'0000-0001-5902-0549': 10, '0000-0001-6946-2279': 5, '0000-0003-2637-4695': 2, '0000-0003-0901-4903': 1, '0000-0002-4299-2170': 1, '0000-0002-1690-7753': 1})
[]
Total sample size after apply threshold:  0
b_kang  pass
For name:  s_carter
total sample size before apply threshold:  205
Counter({'0000-0002-3585-9400': 124, '0000-0003-2617-8694': 44, '0000-0002-9080-519X': 15, '0000-0002-4670-0884': 12, '0000-0002-9817-0029'

svc Accuracy:  0.9285714285714286
svc F1:  0.9278028904310179
LR Accuracy:  0.9642857142857143
LR F1:  0.9641224392944712
For name:  c_zou
total sample size before apply threshold:  32
Counter({'0000-0003-2484-7292': 22, '0000-0001-8569-3747': 8, '0000-0003-4305-5055': 1, '0000-0002-9712-4282': 1})
[]
Total sample size after apply threshold:  0
c_zou  pass
For name:  s_rana
total sample size before apply threshold:  42
Counter({'0000-0002-8039-1149': 30, '0000-0001-9197-8378': 9, '0000-0003-0628-7076': 2, '0000-0002-6604-997X': 1})
['0000-0002-8039-1149']
Total sample size after apply threshold:  30
s_rana  pass
For name:  a_nunes
total sample size before apply threshold:  61
Counter({'0000-0003-2760-3277': 18, '0000-0001-9102-3600': 11, '0000-0001-8893-9247': 9, '0000-0001-8844-8333': 5, '0000-0002-3296-0183': 5, '0000-0002-0595-5821': 4, '0000-0002-5001-3534': 2, '0000-0002-4789-0253': 2, '0000-0003-4440-0391': 2, '0000-0001-6847-5764': 2, '0000-0001-8665-4459': 1})
[]
Total sample s

Minimal sample size:  2
maximal sample size:  323
(398, 2179)
(398, 2179)
MNB Accuracy:  0.992462311557789
MNB F1:  0.9922557546747611
svc Accuracy:  0.9798994974874372
svc F1:  0.979219423051821
LR Accuracy:  0.992462311557789
LR F1:  0.9922424714940064
For name:  j_moraes
total sample size before apply threshold:  26
Counter({'0000-0002-5766-6802': 13, '0000-0002-8563-6432': 7, '0000-0002-4490-8307': 4, '0000-0002-3067-5194': 2})
[]
Total sample size after apply threshold:  0
j_moraes  pass
For name:  e_moreno
total sample size before apply threshold:  83
Counter({'0000-0002-2309-4826': 26, '0000-0001-5040-452X': 21, '0000-0001-9490-7030': 14, '0000-0002-8434-2483': 8, '0000-0003-0491-7951': 5, '0000-0002-2301-4558': 4, '0000-0002-7197-5679': 3, '0000-0001-8520-8086': 1, '0000-0002-2733-0267': 1})
[]
Total sample size after apply threshold:  0
e_moreno  pass
For name:  r_little
total sample size before apply threshold:  4
Counter({'0000-0002-4000-946X': 2, '0000-0002-7732-157X': 1, '

Counter({'0000-0003-1424-7568': 9, '0000-0002-5653-0145': 4, '0000-0002-4148-3526': 2, '0000-0002-4334-1900': 1})
[]
Total sample size after apply threshold:  0
e_shaw  pass
For name:  m_cameron
total sample size before apply threshold:  28
Counter({'0000-0001-5788-8790': 17, '0000-0002-2277-7035': 9, '0000-0001-9464-8796': 1, '0000-0002-2508-7718': 1})
[]
Total sample size after apply threshold:  0
m_cameron  pass
For name:  a_reid
total sample size before apply threshold:  44
Counter({'0000-0002-0523-926X': 18, '0000-0003-1752-3302': 18, '0000-0003-4713-2951': 6, '0000-0002-2500-2980': 2})
[]
Total sample size after apply threshold:  0
a_reid  pass
For name:  d_gil
total sample size before apply threshold:  60
Counter({'0000-0003-3179-1987': 23, '0000-0002-2770-4767': 16, '0000-0003-4241-1302': 16, '0000-0001-8910-2780': 4, '0000-0003-0791-8298': 1})
[]
Total sample size after apply threshold:  0
d_gil  pass
For name:  s_morgan
total sample size before apply threshold:  83
Counter({'

Minimal sample size:  0
maximal sample size:  289
(840, 4318)
(840, 4318)
MNB Accuracy:  0.9238095238095239
MNB F1:  0.9144030163588228
svc Accuracy:  0.7845238095238095
svc F1:  0.7842497311341794
LR Accuracy:  0.8952380952380953
LR F1:  0.8939815654768555
For name:  j_gao
total sample size before apply threshold:  222
Counter({'0000-0003-3215-7013': 44, '0000-0001-9341-1287': 36, '0000-0001-9778-4312': 26, '0000-0002-6200-4141': 24, '0000-0001-9803-0256': 20, '0000-0001-5732-9905': 14, '0000-0002-4545-1126': 12, '0000-0002-9943-4786': 12, '0000-0002-5739-1781': 11, '0000-0002-3952-208X': 8, '0000-0003-2059-0290': 7, '0000-0002-9959-5600': 2, '0000-0001-6659-5770': 1, '0000-0002-1181-4531': 1, '0000-0003-1160-6553': 1, '0000-0003-2668-6672': 1, '0000-0003-4024-4694': 1, '0000-0002-5977-0021': 1})
['0000-0001-9341-1287', '0000-0003-3215-7013']
Total sample size after apply threshold:  80
Minimal sample size:  5
maximal sample size:  190
(80, 610)
(80, 610)
MNB Accuracy:  1.0
MNB F1:  1

Minimal sample size:  3
maximal sample size:  203
(120, 998)
(120, 998)
MNB Accuracy:  0.9666666666666667
MNB F1:  0.9589603283173734
svc Accuracy:  0.9583333333333334
svc F1:  0.9499958329860823
LR Accuracy:  0.9833333333333333
LR F1:  0.9798319327731092
For name:  s_alavi
total sample size before apply threshold:  38
Counter({'0000-0003-4328-4747': 23, '0000-0003-4009-4921': 14, '0000-0003-1130-3165': 1})
[]
Total sample size after apply threshold:  0
s_alavi  pass
For name:  r_marques
total sample size before apply threshold:  41
Counter({'0000-0002-6949-0947': 11, '0000-0002-4749-7523': 11, '0000-0002-3125-3911': 8, '0000-0001-6239-5456': 3, '0000-0002-9416-1299': 2, '0000-0001-8261-4409': 1, '0000-0001-6925-041X': 1, '0000-0002-9197-9845': 1, '0000-0002-0672-9260': 1, '0000-0001-8622-9786': 1, '0000-0003-0314-3675': 1})
[]
Total sample size after apply threshold:  0
r_marques  pass
For name:  m_wheeler
total sample size before apply threshold:  163
Counter({'0000-0002-7480-7267': 

Minimal sample size:  6
maximal sample size:  243
(254, 1773)
(254, 1773)
MNB Accuracy:  0.952755905511811
MNB F1:  0.9488399283032466
svc Accuracy:  0.889763779527559
svc F1:  0.8842855977152183
LR Accuracy:  0.9566929133858267
LR F1:  0.9512212550376903
For name:  s_bae
total sample size before apply threshold:  83
Counter({'0000-0003-0551-7618': 19, '0000-0002-3019-0584': 17, '0000-0002-4995-6543': 17, '0000-0002-8993-8884': 9, '0000-0003-0098-8816': 8, '0000-0003-1926-5466': 6, '0000-0001-7603-7676': 6, '0000-0003-0637-4110': 1})
[]
Total sample size after apply threshold:  0
s_bae  pass
For name:  s_fernandes
total sample size before apply threshold:  38
Counter({'0000-0003-1128-833X': 20, '0000-0002-1295-5010': 6, '0000-0002-9035-793X': 5, '0000-0002-7871-6717': 5, '0000-0002-0790-303X': 2})
[]
Total sample size after apply threshold:  0
s_fernandes  pass
For name:  a_miller
total sample size before apply threshold:  109
Counter({'0000-0002-7056-8502': 33, '0000-0001-8474-5090': 

KeyboardInterrupt: 

In [8]:
print(all_features)
print("mnb: ", modelMNBf1)
print("svc: ", modelSVCf1)
print("lr: ", modelLRf1)

['title_abstract', 'combine_textual']
mnb:  []
svc:  []
lr:  []


In [9]:
# accuracy
from statistics import mean 
cleaned_mnb_accuracy = [x for x in all_mnb_accuracy if isinstance(x, float)]
cleaned_svcLinear_accuracy = [x for x in all_svcLinear_accuracy if isinstance(x, float)]
cleaned_lr_accuracy = [x for x in all_LR_accuracy if isinstance(x, float)]
print(len(cleaned_mnb_accuracy))
print(len(cleaned_svcLinear_accuracy))
print(len(cleaned_lr_accuracy))
print(mean(cleaned_mnb_accuracy))
print(mean(cleaned_svcLinear_accuracy))
print(mean(cleaned_lr_accuracy))

289
289
289
0.9778910513144647
0.9572108961110675
0.977951254354412


In [10]:
# f1
from statistics import mean 
# remove string from result
cleaned_mnb_f1 = [x for x in all_mnb_f1 if isinstance(x, float)]
cleaned_svcLinear_f1 = [x for x in all_svcLinear_f1 if isinstance(x, float)]
cleaned_lr_f1 = [x for x in all_LR_f1 if isinstance(x, float)]
print(len(cleaned_mnb_f1))
print(len(cleaned_svcLinear_f1))
print(len(cleaned_lr_f1))
print(mean(cleaned_mnb_f1))
print(mean(cleaned_svcLinear_f1))
print(mean(cleaned_lr_f1))

289
289
289
0.9747506103432456
0.9517279039263197
0.9747488300982716


In [None]:
print(len(all_LR_f1))

In [None]:
test = [x for x in all_mnb_f1 if isinstance(x, float)]


In [None]:
%who