In [1]:
import os
import sys
import warnings

# Silence FutureWarning messages for the rest of the notebook.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Authors with fewer than `threshold` papers in a name group are dropped
# before classification (see the evaluation loop).
threshold = 100
# NOTE(review): `cutoff` is not referenced anywhere in the visible notebook -- confirm it is still needed.
cutoff = 3
# Dataset folder name under ../../Data/ used to build the labeled-file directory path.
Dataset = "pubmed"

# Embedding selectors passed as `emb_type` to the reader functions:
# textual accepts "pv_dm", "pv_dbow" or "off"; citation accepts "n2v" or "off".
pp_textual = "pv_dbow"
citation_emb = "n2v"


In [11]:
# read trained rec to rec textual graph
def read_textual_embedding(Dataset = "pubmed", emb_type = "off"):
    """Load pre-trained textual (doc2vec) embedding records from disk.

    Parameters
    ----------
    Dataset : str
        Dataset folder name under ``../../Data/``.
    emb_type : str
        "pv_dm" or "pv_dbow" selects ``extracted_labeled_<emb_type>.txt`` in
        ``<Dataset>/vectors/d2v/textual_sample=3m/``; "off" loads nothing.
        Any other value prints a notice and is treated as "off".

    Returns
    -------
    list of list of str
        One record per line of the file, split on single spaces. The tokens
        are kept exactly as read, so the last token of a record still carries
        the line's trailing newline. Empty list when nothing is loaded.

    Raises
    ------
    OSError
        Propagated from ``open`` when the selected file cannot be read.
    """
    # Both supported types share one code path; only the file name differs.
    file_by_type = {
        "pv_dm": "extracted_labeled_pv_dm.txt",
        "pv_dbow": "extracted_labeled_pv_dbow.txt",
    }
    textual_emb = []
    if emb_type not in file_by_type:
        if emb_type != "off":
            print("Embedding type not available, selecting default setting")
        return textual_emb

    loadDir = "../../Data/"+Dataset+"/vectors/d2v/textual_sample=3m/"+file_by_type[emb_type]
    # The with-block closes the file; no explicit close() is needed.
    with open(loadDir, 'r', encoding = 'utf8') as f:
        for line in f:
            textual_emb.append(line.split(" "))

    print("Total textual vector records:",len(textual_emb))
    # Preview the first record only when there is one (an empty file used to
    # raise IndexError here).
    if textual_emb:
        print(textual_emb[0])
    return textual_emb

In [3]:
# read trained rec to rec node2vec citation graph
def read_citation_embedding(Dataset = "pubmed", emb_type = "off"):
    """Load pre-trained citation-graph embedding records from disk.

    With ``emb_type == "n2v"`` the file
    ``../../Data/<Dataset>/vectors/n2v/extracted_labeled_n2v.txt`` is read and
    every line that splits into exactly 101 space-separated fields is kept.
    ``"off"`` returns an empty list; any other value prints a notice and also
    returns an empty list.

    Returns a list of records, each a list of the raw string fields of a line.
    """
    vectors = []
    if emb_type == "n2v":
        source_path = "../../Data/"+Dataset+"/vectors/"+emb_type+"/extracted_labeled_n2v.txt"
        with open(source_path, 'r', encoding = 'utf8') as handle:
            # keep only the rows with exactly 101 fields
            vectors = [fields
                       for fields in (row.split(" ") for row in handle)
                       if len(fields)==101]
        print("Total citation vector records:",len(vectors))
        print(vectors[:3])
    elif emb_type != "off":
        print("Embedding type not available, selecting default setting")
    return vectors

In [4]:
def read_labeled_file(infile):
    """Read a tab-separated labeled file into a DataFrame.

    Lines that split into 12 or 13 tab-separated fields contribute one row
    with the columns ``paperID`` (field 0), ``authorID`` (field 1),
    ``co-author`` (field 5) and ``venue_id`` (field 7). For any other line
    only its field count is printed and the line is skipped.
    """
    records = []
    with open(infile, 'r', encoding = 'utf8') as handle:
        for raw_line in handle:
            fields = raw_line.split("\t")
            n_fields = len(fields)
            # get rid of badly formatted lines, reporting their field count
            if n_fields != 13 and n_fields != 12:
                print(n_fields)
                continue
            records.append({"paperID": fields[0], "authorID": fields[1],
                            "co-author": fields[5], "venue_id": fields[7]})
    return pd.DataFrame(records)

In [5]:
def extract_embedding(all_embedding, pid):
    """Return the embedding rows for the requested paper ids, aligned to `pid`.

    Parameters
    ----------
    all_embedding : list of list
        Embedding records; element 0 of each record is matched against the
        paper ids, the remaining elements are the vector components.
    pid : pandas.Series
        Paper ids. The series must be named ``paperID`` because that name is
        used as the merge key.

    Returns
    -------
    pandas.DataFrame
        An empty frame when `all_embedding` is empty. Otherwise one row per
        entry of `pid`, in `pid` order, holding the vector components; ids
        without an embedding get a row of zeros.
    """
    # No embedding loaded at all: signal it with an empty frame.
    if len(all_embedding) == 0:
        return pd.DataFrame([])

    # A set makes the membership test O(1) per record instead of O(len(pid)).
    wanted_pid = set(pid.values.tolist())
    matched = [paper_embedding for paper_embedding in all_embedding
               if paper_embedding[0] in wanted_pid]

    if not matched:
        # Nothing to merge (an empty right frame has no key column 0):
        # return all-zero vectors directly.
        # Assumes every record has the same length as the first one.
        n_dims = len(all_embedding[0]) - 1
        return pd.DataFrame(0, index=range(len(pid)), columns=range(1, n_dims + 1))

    # A left join keeps the row order of `pid`, so rows stay aligned with the
    # labels taken from the same frame. An outer join may sort the keys.
    merged = pd.merge(pid.to_frame(), pd.DataFrame(matched),
                      left_on='paperID', right_on=0, how='left')
    # Drop both key columns, then fill papers without an embedding with 0.
    return merged.drop(['paperID', 0], axis=1).fillna(0)

In [6]:
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score

def normal_group_predict(X_train, y_train, X_test, y_test, clf):
    """Fit `clf` on the training split and evaluate it on the test split.

    Prints the classification report and the flattened confusion matrix.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1, TP, TN, FP, FN)`` where the four counts are the
        per-class one-vs-rest counts derived from the confusion matrix, summed
        over all classes.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    f1 = f1_score(y_test, predicted, average='macro')

    print(metrics.classification_report(y_test, predicted))
    print(metrics.confusion_matrix(y_test, predicted).ravel())

    # per-class counts: the diagonal holds the hits, column/row sums minus the
    # diagonal give false positives / false negatives
    matrix = confusion_matrix(y_test, predicted)
    hits = np.diag(matrix)
    false_pos = matrix.sum(axis=0) - hits
    false_neg = matrix.sum(axis=1) - hits
    true_neg = matrix.sum() - (false_pos + false_neg + hits)
    return accuracy, f1, hits.sum(), true_neg.sum(), false_pos.sum(), false_neg.sum()

In [7]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score
# cross validation
def k_fold_cv(data, label, clf, k=10):
    """Evaluate `clf` with k-fold cross validation (no shuffling).

    Parameters
    ----------
    data : array-like
        Feature matrix, one row per sample.
    label : array-like
        True label of each sample, in the same order as `data`.
    clf : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is refitted on every fold.
    k : int
        Number of folds.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1, TP, TN, FP, FN)`` computed over the pooled
        predictions of all folds; the four counts are the per-class
        one-vs-rest counts from the confusion matrix, summed over all classes.
    """
    # KFold yields positional indices. Converting to arrays makes
    # `x[indices]` positional even when a pandas Series with a non-default
    # index or a DataFrame is passed (both would be looked up by label).
    data = np.asarray(data)
    label = np.asarray(label)

    kf = KFold(n_splits=k, shuffle=False)
    allTrueLabel = []
    allPredLabel = []
    for train_index, test_index in kf.split(data):
        # split train and test
        data_train, data_test = data[train_index], data[test_index]
        label_train, label_test = label[train_index], label[test_index]
        # fit data to clf
        clf.fit(data_train, label_train)
        # pool the predictions of this fold
        label_pred = clf.predict(data_test)
        allTrueLabel.extend(label_test)
        allPredLabel.extend(label_pred)

    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel,average='macro')

    print(metrics.classification_report(allTrueLabel, allPredLabel))
    # compute the confusion matrix once and reuse it for the counts below
    cnf_matrix = metrics.confusion_matrix(allTrueLabel, allPredLabel)
    print(cnf_matrix.ravel())

    # accumulate statistic for entire model f1
    TP = np.diag(cnf_matrix)
    FP = cnf_matrix.sum(axis=0) - TP
    FN = cnf_matrix.sum(axis=1) - TP
    TN = cnf_matrix.sum() - (FP + FN + TP)

    return accuracy, f1, TP.sum(), TN.sum(), FP.sum(), FN.sum()

In [8]:
def write_csv_df(savePath, filename, df):
    """Write `df` as a UTF-8 CSV (without the index) to ``savePath/filename``.

    The directory is created when it does not exist. If the target file is
    already present the user is asked interactively: ``y`` overwrites it,
    ``n`` asks for a new file name and retries, anything else leaves the
    data unsaved.
    """
    if not os.path.exists(savePath):
        os.makedirs(savePath)
    target = os.path.normpath(os.path.join(savePath, filename))

    # nothing at the target path yet: write straight away
    if not os.path.isfile(target):
        df.to_csv(target, encoding='utf-8',index=False)
        return

    answer = input("WARNING: " + target + " already exists! Do you want to overwrite <y/n>? \n ")
    if answer == 'y':
        df.to_csv(target, encoding='utf-8',index=False)
    elif answer == 'n':
        replacement = input("Type new filename: \n ")
        write_csv_df(savePath, replacement, df)
    else:
        print("Not a valid input. Data is NOT saved!\n")

In [12]:
# read pretrained embeddings for the configured dataset
# (pass Dataset explicitly: the readers otherwise fall back to their own
# "pubmed" default and would ignore a change of the global setting)
all_textual_embedding = read_textual_embedding(Dataset = Dataset, emb_type = pp_textual)
all_citation_embedding = read_citation_embedding(Dataset = Dataset, emb_type = citation_emb)

Total textual vector records: 135796
['8077', '-0.14659140', '-0.16460477', '-0.50664663', '-0.17956261', '0.21054362', '0.26002276', '0.15514752', '0.13244890', '-0.27113414', '0.47227725', '0.07357255', '-0.08964530', '0.35950011', '0.37851566', '-0.04907404', '0.56523114', '-0.60256726', '-0.21556917', '-0.09287039', '-0.18874674', '0.59881312', '-0.32156968', '0.39462098', '0.35133442', '0.08628392', '-0.04479222', '0.25453219', '0.23234852', '-0.10687385', '-0.00707190', '-0.11578006', '0.06657255', '0.19292782', '0.09975667', '-0.04673584', '0.47342294', '0.50503510', '-0.13644342', '0.35020310', '0.27452260', '0.45986831', '0.72157681', '-0.08654509', '-0.36922029', '-0.28984016', '0.26503867', '-0.14659104', '0.19001262', '-0.24055083', '0.10608102', '-0.21904105', '-0.02745518', '-0.27935785', '0.67074525', '0.57324684', '0.16567072', '-0.12955795', '-0.73991919', '0.20633785', '-0.13949864', '0.07348444', '-0.45851952', '-0.28298637', '0.65005982', '-0.16004808', '-0.33634639

In [14]:
# load the file
import io
import collections
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


# directory of labeled files; every file in it is processed as one name group
fileDir = "../../Data/"+Dataset+"/canopies_labeled/"
listfiles = os.listdir(fileDir)

# fix random seed for reproducibility
np.random.seed(1)

# statistics collected per evaluated group for the output table:
# group name, number of author classes, per-author paper counts
allname, num_class, per_class_count = ([] for i in range(3))

# per-group accuracy and macro F1 of the linear SVC and of logistic regression
all_svcLinear_accuracy, all_svcLinear_f1, all_LR_accuracy, all_LR_f1 = ([] for i in range(4))

# running totals of tp, tn, fp, fn over all groups, one set per classifier
svcTP=svcTN=svcFP=svcFN = 0
lrTP=lrTN=lrFP=lrFN = 0

# read all files in the labeled group directory
for file in listfiles:
    # group name: 2nd and last "_"-separated parts of the file name
    temp = file.split("_")
    name = temp[1]+"_"+temp[-1]
    print("For name: ",name)
    # read needed content in labeled file
    labeled_data = read_labeled_file(fileDir+file)
    print("total sample size before apply threshold: ",len(labeled_data))
    # count the number of papers each author wrote, keyed by author ID
    paperCounter = collections.Counter(labeled_data["authorID"])
    print(paperCounter)
    # drop authors with fewer than `threshold` papers
    # (iterate over a copy of the keys because entries are deleted in the loop)
    for k in list(paperCounter):
        if paperCounter[k] < threshold:
            del paperCounter[k]
    # `temp` is reused here for the list of remaining author IDs
    temp =list(paperCounter.keys())
    print(temp)
    # keep only the samples of authors that passed the threshold
    labeled_data = labeled_data[labeled_data.authorID.isin(temp)]
    print("Total sample size after apply threshold: ",len(labeled_data))
    # if only one class or no class passes the threshold, the group is not applicable
    if(len(paperCounter)<2):
        print(name, " pass")
    else:
        allname.append(name)
        num_class.append(len(paperCounter))
        per_class_count.append(paperCounter)
        # extract true label and pid
        label = labeled_data["authorID"]
        pid = labeled_data["paperID"]
        # list of the different feature parts
        part_collection = []
        # select the features to fit to the classification algorithm
        # data part: textual embedding
        data_part_textual = extract_embedding(all_textual_embedding, pid)
        print(data_part_textual.shape)
        part_collection.append(data_part_textual)
        # data part: citation embedding
        data_part_citation = extract_embedding(all_citation_embedding, pid)
        print(data_part_citation.shape)
        part_collection.append(data_part_citation)
        # merge the different parts by concatenating them column-wise
        # remove empty parts (extract_embedding returns an empty frame when an embedding is off)
        part_collection = [part for part in part_collection if len(part)!=0]
        print(len(part_collection))
        if len(part_collection)>1:
            combinedata = np.concatenate(part_collection,axis=1)
        elif len(part_collection)==1:
            if isinstance(part_collection[0], pd.DataFrame):
                combinedata = part_collection[0].values
            else:
                combinedata = part_collection[0]
        else:
            # no feature part at all: this leaves the loop over files entirely
            print("No data available")
            break
        print(combinedata.shape)
        # shuffled, stratified 80/20 split into train and test
        # NOTE(review): no random_state is passed; presumably the split draws from
        # the NumPy global RNG seeded above -- confirm
        X_train, X_test, y_train, y_test = train_test_split(combinedata, label,
                                                            stratify=label, 
                                                            test_size=0.20)
        print(X_train.shape)
        print(y_train.shape)
        print(X_test.shape)
        print(y_test.shape)
        # use the feature vectors to train the classifiers
        # SVM with linear kernel
        clf = SVC(decision_function_shape='ovr', kernel='linear')
        # normal predict (single train/test split)
        svcaccuracy, svcmarcof1, tp, tn, fp, fn = normal_group_predict(X_train, y_train, X_test, y_test, clf)
#         # use 10 fold cv
#         svcaccuracy, svcmarcof1, tp, tn, fp, fn = k_fold_cv(combinedata, label, clf, k=10)
        # add this group's counts to the SVC totals
        svcTP+=tp
        svcTN+=tn
        svcFP+=fp
        svcFN+=fn
        print(name, " in group svc Accuracy: ",svcaccuracy)
        print(name, " in group svc F1: ", svcmarcof1)
        all_svcLinear_accuracy.append(svcaccuracy)
        all_svcLinear_f1.append(svcmarcof1)
        # logistic regression on the same split
        clf = LogisticRegression(multi_class='ovr')
        # normal predict (single train/test split)
        LRaccuracy, LRmarcof1, tp, tn, fp, fn = normal_group_predict(X_train, y_train, X_test, y_test, clf)
#         # 10 fold
#         LRaccuracy, LRmarcof1, tp, tn, fp, fn = k_fold_cv(combinedata, label, clf, k=10)
        # add this group's counts to the logistic regression totals
        lrTP+=tp
        lrTN+=tn
        lrFP+=fp
        lrFN+=fn
        print("LR Accuracy: ",LRaccuracy)
        print("LR F1: ", LRmarcof1)
        all_LR_accuracy.append(LRaccuracy)
        all_LR_f1.append(LRmarcof1)

def _micro_f1(tp, fp, fn):
    """Return 2*tp / (2*tp + fp + fn), or NaN when all three counts are zero."""
    denominator = 2*tp + fp + fn
    return 2*tp / denominator if denominator else float("nan")

# print the pooled counts and compute the F1 for the entire model
print("svc: TP: ",svcTP, "TN: ",svcTN, "FP: ",svcFP,"FN: ",svcFN)
print("lr: TP: ",lrTP, "TN: ",lrTN, "FP: ",lrFP,"FN: ",lrFN)
# When no group was evaluated all counters are still 0; report NaN instead of
# failing with ZeroDivisionError.
svcF1 = _micro_f1(svcTP, svcFP, svcFN)
lrF1 = _micro_f1(lrTP, lrFP, lrFN)
        
# # write evaluation result to excel
# output = pd.DataFrame({'Name Group':allname,"Class number":num_class, "per_class_size":per_class_count,
#                        "svc(linear) accuracy":all_svcLinear_accuracy, "svc(linear) macro f1": all_svcLinear_f1, 
#                        "logistic regression accuracy":all_LR_accuracy, "logistic regression macro f1": all_LR_f1})

# savePath = "../../result/"+Dataset+"/skovr/"
# filename = "textual="+pp_textual+"_citation="+citation_emb+"_threshold="+str(threshold)+".csv"
# write_csv_df(savePath, filename, output)
# print("Done")

For name:  j_read
total sample size before apply threshold:  136
Counter({'0000-0002-5159-1192': 57, '0000-0002-9029-5185': 39, '0000-0002-9697-0962': 31, '0000-0002-4739-9245': 3, '0000-0003-0605-5259': 3, '0000-0003-4316-7006': 1, '0000-0002-0784-0091': 1, '0000-0002-3888-6631': 1})
[]
Total sample size after apply threshold:  0
j_read  pass
For name:  f_esteves
total sample size before apply threshold:  34
Counter({'0000-0002-3046-1313': 18, '0000-0002-5403-0091': 12, '0000-0003-0589-0746': 3, '0000-0003-3172-6253': 1})
[]
Total sample size after apply threshold:  0
f_esteves  pass
For name:  c_miller
total sample size before apply threshold:  252
Counter({'0000-0003-4341-1283': 51, '0000-0002-3989-7973': 40, '0000-0002-3813-1706': 39, '0000-0003-2772-9531': 27, '0000-0001-6082-9273': 22, '0000-0002-2601-4422': 22, '0000-0002-9448-8144': 19, '0000-0001-8628-4902': 15, '0000-0002-2936-7717': 6, '0000-0003-3898-9734': 6, '0000-0002-5074-6914': 2, '0000-0003-4266-6700': 1, '0000-0002-9

total sample size before apply threshold:  43
Counter({'0000-0002-8431-8256': 31, '0000-0002-9580-3716': 6, '0000-0002-2449-3749': 3, '0000-0002-9724-898X': 3})
[]
Total sample size after apply threshold:  0
w_fang  pass
For name:  m_amaral
total sample size before apply threshold:  134
Counter({'0000-0002-0828-8630': 101, '0000-0002-3209-3366': 21, '0000-0003-4966-2614': 6, '0000-0002-4301-2760': 4, '0000-0001-5607-6475': 1, '0000-0001-9686-1312': 1})
['0000-0002-0828-8630']
Total sample size after apply threshold:  101
m_amaral  pass
For name:  h_song
total sample size before apply threshold:  210
Counter({'0000-0001-5684-4059': 88, '0000-0001-5553-2539': 30, '0000-0002-3134-782X': 29, '0000-0003-3845-8079': 20, '0000-0002-7844-2293': 14, '0000-0001-5486-2560': 8, '0000-0002-8720-6436': 6, '0000-0002-2721-3626': 2, '0000-0002-3563-9504': 2, '0000-0003-2197-1562': 2, '0000-0002-9849-8091': 2, '0000-0002-2164-8813': 2, '0000-0001-6000-1572': 1, '0000-0001-5747-8847': 1, '0000-0002-2791

total sample size before apply threshold:  94
Counter({'0000-0002-4175-5982': 17, '0000-0002-7665-2182': 12, '0000-0003-0779-6438': 11, '0000-0003-4280-0068': 8, '0000-0001-9295-4992': 7, '0000-0001-9508-8209': 7, '0000-0001-6930-5994': 6, '0000-0001-9478-5344': 6, '0000-0001-5809-0027': 5, '0000-0002-4149-4938': 4, '0000-0002-1581-2357': 4, '0000-0001-5956-4618': 2, '0000-0001-7063-7742': 2, '0000-0002-2541-837X': 1, '0000-0001-6259-7082': 1, '0000-0002-4515-2070': 1})
[]
Total sample size after apply threshold:  0
d_zhang  pass
For name:  b_huang
total sample size before apply threshold:  48
Counter({'0000-0001-9082-2216': 16, '0000-0002-1981-5838': 12, '0000-0002-1246-7447': 9, '0000-0001-6189-814X': 5, '0000-0001-5009-3928': 3, '0000-0003-2838-6315': 3})
[]
Total sample size after apply threshold:  0
b_huang  pass
For name:  m_chong
total sample size before apply threshold:  43
Counter({'0000-0001-9324-5901': 20, '0000-0002-9586-6303': 20, '0000-0003-0587-2505': 1, '0000-0002-5507-

['0000-0002-0440-2387']
Total sample size after apply threshold:  110
j_richard  pass
For name:  p_robinson
total sample size before apply threshold:  275
Counter({'0000-0002-7878-0313': 133, '0000-0002-0736-9199': 119, '0000-0002-3156-3418': 19, '0000-0002-0577-3147': 4})
['0000-0002-0736-9199', '0000-0002-7878-0313']
Total sample size after apply threshold:  252
(252, 100)
(252, 100)
2
(252, 200)
(201, 200)
(201,)
(51, 200)
(51,)
                     precision    recall  f1-score   support

0000-0002-0736-9199       0.92      1.00      0.96        24
0000-0002-7878-0313       1.00      0.93      0.96        27

          micro avg       0.96      0.96      0.96        51
          macro avg       0.96      0.96      0.96        51
       weighted avg       0.96      0.96      0.96        51

[24  0  2 25]
p_robinson  in group svc Accuracy:  0.9607843137254902
p_robinson  in group svc F1:  0.9607692307692308
                     precision    recall  f1-score   support

0000-0002-0736-

(398, 100)
(398, 100)
2
(398, 200)
(318, 200)
(318,)
(80, 200)
(80,)
                     precision    recall  f1-score   support

0000-0002-7751-1058       1.00      1.00      1.00        34
0000-0003-0960-6415       1.00      1.00      1.00        46

          micro avg       1.00      1.00      1.00        80
          macro avg       1.00      1.00      1.00        80
       weighted avg       1.00      1.00      1.00        80

[34  0  0 46]
d_richardson  in group svc Accuracy:  1.0
d_richardson  in group svc F1:  1.0
                     precision    recall  f1-score   support

0000-0002-7751-1058       1.00      0.97      0.99        34
0000-0003-0960-6415       0.98      1.00      0.99        46

          micro avg       0.99      0.99      0.99        80
          macro avg       0.99      0.99      0.99        80
       weighted avg       0.99      0.99      0.99        80

[33  1  0 46]
LR Accuracy:  0.9875
LR F1:  0.9871609693468143
For name:  j_moraes
total sample size b

Total sample size after apply threshold:  238
(238, 100)
(238, 100)
2
(238, 200)
(190, 200)
(190,)
(48, 200)
(48,)
                     precision    recall  f1-score   support

0000-0001-8592-0698       1.00      0.92      0.96        24
0000-0003-0852-0767       0.92      1.00      0.96        24

          micro avg       0.96      0.96      0.96        48
          macro avg       0.96      0.96      0.96        48
       weighted avg       0.96      0.96      0.96        48

[22  2  0 24]
y_wang  in group svc Accuracy:  0.9583333333333334
y_wang  in group svc F1:  0.9582608695652175
                     precision    recall  f1-score   support

0000-0001-8592-0698       1.00      0.92      0.96        24
0000-0003-0852-0767       0.92      1.00      0.96        24

          micro avg       0.96      0.96      0.96        48
          macro avg       0.96      0.96      0.96        48
       weighted avg       0.96      0.96      0.96        48

[22  2  0 24]
LR Accuracy:  0.9583333

(208, 100)
(208, 100)
2
(208, 200)
(166, 200)
(166,)
(42, 200)
(42,)
                     precision    recall  f1-score   support

0000-0001-5833-989X       1.00      1.00      1.00        20
0000-0003-3171-7672       1.00      1.00      1.00        22

          micro avg       1.00      1.00      1.00        42
          macro avg       1.00      1.00      1.00        42
       weighted avg       1.00      1.00      1.00        42

[20  0  0 22]
w_lee  in group svc Accuracy:  1.0
w_lee  in group svc F1:  1.0
                     precision    recall  f1-score   support

0000-0001-5833-989X       1.00      1.00      1.00        20
0000-0003-3171-7672       1.00      1.00      1.00        22

          micro avg       1.00      1.00      1.00        42
          macro avg       1.00      1.00      1.00        42
       weighted avg       1.00      1.00      1.00        42

[20  0  0 22]
LR Accuracy:  1.0
LR F1:  1.0
For name:  j_cheng
total sample size before apply threshold:  66
Counte

[]
Total sample size after apply threshold:  0
m_ruiz  pass
For name:  a_levy
total sample size before apply threshold:  23
Counter({'0000-0003-4770-1886': 13, '0000-0002-6709-4190': 6, '0000-0002-5856-8294': 3, '0000-0002-1521-658X': 1})
[]
Total sample size after apply threshold:  0
a_levy  pass
For name:  j_murray
total sample size before apply threshold:  213
Counter({'0000-0002-2282-3839': 78, '0000-0002-8897-0161': 32, '0000-0002-8992-7317': 23, '0000-0002-6928-2347': 23, '0000-0001-9314-2283': 18, '0000-0001-8224-679X': 13, '0000-0003-1941-9090': 11, '0000-0002-8577-7964': 8, '0000-0003-2994-4155': 3, '0000-0002-8741-4964': 1, '0000-0003-4390-1039': 1, '0000-0001-9721-992X': 1, '0000-0003-3000-9199': 1})
[]
Total sample size after apply threshold:  0
j_murray  pass
For name:  y_hou
total sample size before apply threshold:  162
Counter({'0000-0001-6546-2597': 97, '0000-0002-3995-7219': 29, '0000-0002-0420-0726': 14, '0000-0002-8114-166X': 12, '0000-0002-7360-5751': 5, '0000-0002

total sample size before apply threshold:  154
Counter({'0000-0001-8451-9421': 63, '0000-0002-5486-0407': 22, '0000-0003-4750-1550': 19, '0000-0003-4059-0538': 13, '0000-0002-6028-2084': 13, '0000-0002-0053-3347': 6, '0000-0003-4028-811X': 6, '0000-0002-5496-752X': 5, '0000-0001-8245-9306': 2, '0000-0002-7273-974X': 1, '0000-0001-8503-4880': 1, '0000-0001-9039-1014': 1, '0000-0002-8698-6143': 1, '0000-0001-5682-6897': 1})
[]
Total sample size after apply threshold:  0
j_moore  pass
For name:  a_gray
total sample size before apply threshold:  121
Counter({'0000-0003-4299-2194': 107, '0000-0003-1062-7942': 5, '0000-0002-6273-0637': 5, '0000-0002-5711-4872': 3, '0000-0003-0239-7278': 1})
['0000-0003-4299-2194']
Total sample size after apply threshold:  107
a_gray  pass
For name:  v_martins
total sample size before apply threshold:  104
Counter({'0000-0002-2909-8502': 71, '0000-0001-7611-861X': 18, '0000-0001-7565-9641': 6, '0000-0003-2465-5880': 5, '0000-0002-8824-7328': 3, '0000-0002-032

total sample size before apply threshold:  65
Counter({'0000-0002-8527-8145': 57, '0000-0002-8152-1400': 4, '0000-0002-9294-314X': 3, '0000-0002-3571-4658': 1})
[]
Total sample size after apply threshold:  0
c_gu  pass
For name:  a_soto
total sample size before apply threshold:  32
Counter({'0000-0002-0144-1399': 17, '0000-0001-9672-9004': 5, '0000-0002-7265-0956': 4, '0000-0002-2641-9032': 3, '0000-0001-8648-8032': 3})
[]
Total sample size after apply threshold:  0
a_soto  pass
For name:  h_hsieh
total sample size before apply threshold:  70
Counter({'0000-0001-8302-2472': 53, '0000-0003-3201-3677': 13, '0000-0002-2583-7670': 3, '0000-0002-4483-1768': 1})
[]
Total sample size after apply threshold:  0
h_hsieh  pass
For name:  m_crespo
total sample size before apply threshold:  49
Counter({'0000-0002-7732-7808': 20, '0000-0002-1852-2259': 12, '0000-0001-8762-7874': 9, '0000-0002-7086-9751': 8})
[]
Total sample size after apply threshold:  0
m_crespo  pass
For name:  s_phillips
total sa

['0000-0003-3791-7587', '0000-0001-8153-1441']
Total sample size after apply threshold:  261


KeyboardInterrupt: 

In [None]:
# Report the embedding configuration together with the pooled F1 of each
# classifier (svcF1 / lrF1 are assigned at the end of the evaluation cell).
print("textual =",pp_textual,"_citation =",citation_emb)
print("svc: ", svcF1)
print("lr:", lrF1)

In [None]:
# accuracy
from statistics import mean 
# keep only the float scores before averaging
cleaned_svcLinear_accuracy = [x for x in all_svcLinear_accuracy if isinstance(x, float)]
cleaned_lr_accuracy = [x for x in all_LR_accuracy if isinstance(x, float)]
print(len(cleaned_svcLinear_accuracy))
print(len(cleaned_lr_accuracy))
# statistics.mean raises StatisticsError on an empty list (no group was
# evaluated); print nan in that case instead of failing
print(mean(cleaned_svcLinear_accuracy) if cleaned_svcLinear_accuracy else float("nan"))
print(mean(cleaned_lr_accuracy) if cleaned_lr_accuracy else float("nan"))

In [None]:
# f1
from statistics import mean 
# keep only the float scores (drops any non-float placeholder) before averaging
cleaned_svcLinear_f1 = [x for x in all_svcLinear_f1 if isinstance(x, float)]
cleaned_lr_f1 = [x for x in all_LR_f1 if isinstance(x, float)]
print(len(cleaned_svcLinear_f1))
print(len(cleaned_lr_f1))
# statistics.mean raises StatisticsError on an empty list (no group was
# evaluated); print nan in that case instead of failing
print(mean(cleaned_svcLinear_f1) if cleaned_svcLinear_f1 else float("nan"))
print(mean(cleaned_lr_f1) if cleaned_lr_f1 else float("nan"))