In [22]:
import os
import sys
import warnings

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(category=FutureWarning, action='ignore')

# Put the parent directory on the import path (once), so modules that live
# there can be imported -- presumably where com_func lives; verify.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)


import com_func

def dummy(doc):
    """Identity function: return ``doc`` exactly as it was passed in.

    Used in this notebook as both the ``tokenizer`` and the ``preprocessor``
    of the Count/Tf-idf vectorizers, so that input which is already a list of
    tokens is passed through untouched.
    """
    return doc

# parameters
# Minimum number of papers an author must have to be kept as a class; authors
# whose paper count is below this are dropped in the evaluation loop.
threshold = 30
# Passed as min_df to the Count/Tf-idf vectorizers.
cutoff = 3

# Embedding types evaluated one after another by the evaluation loop; each
# value is forwarded to raw_text_to_vector as emb_type.
pp_textual = ["tf","tf_idf","lsa"]

# Dataset folder name, used to build the ../../Data/... and ../../result/... paths.
Dataset = "pubmed"

In [2]:
import pandas as pd
# load text information
Dataset = "pubmed"
raw_filepath = "../../Data"+"/"+Dataset+"/id_textual_combined.txt"
# Every line is tab-separated. The columns read below are, in order:
#   items[0] paper ID, items[1] title, items[2] keywords,
#   items[3] mesh, items[4] abstract
# (column meanings are taken from the variable names used here).
expected_fields = 5
all_text_content = []
skipped_lines = 0
with open(raw_filepath, 'r', encoding = 'utf8') as f:
    for line in f:
        items = line.split("\t")
        # A short line would raise IndexError below and abort the whole load,
        # so count it and move on instead.
        if len(items) < expected_fields:
            skipped_lines += 1
            continue
        paperID = items[0]
        # lower case all textual fields
        title, keywords, mesh, abstract = (field.lower() for field in items[1:expected_fields])
        # textual information is defined as title + keywords + mesh + abstract
        content = title+" "+keywords+" "+mesh+" "+abstract
        all_text_content.append({"paperID": paperID, "combine_textual": content})
if skipped_lines:
    print("Skipped ", skipped_lines, " malformed line(s) with fewer than ", expected_fields, " fields")
print("Total ", len(all_text_content), " paper have text information")
# convert to dataframe so it's easy to process
all_text_content = pd.DataFrame(all_text_content)

Total  3151504  paper have text information


In [3]:
def dummy(doc):
    """Return ``doc`` unchanged.

    Same identity helper that the first cell of this notebook also defines;
    it is handed to the vectorizers as ``tokenizer`` and ``preprocessor``.
    """
    return doc
def read_labeled_file(infile):
    """Load the columns needed from one labeled file into a DataFrame.

    Each line is split on tabs. A line with 12 or 13 fields is accepted and
    contributes fields 0, 1, 5 and 7 as ``paperID``, ``authorID``,
    ``co-author`` and ``venue_id``. Any other line is treated as badly
    formatted: its field count is printed and the line is skipped.

    Parameters
    ----------
    infile : str
        Path of the tab-separated file, read as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One row per accepted line with the four columns listed above. The
        columns exist even when no line is accepted, so callers can index
        ``["authorID"]`` on an empty result without a KeyError.
    """
    columns = ["paperID", "authorID", "co-author", "venue_id"]
    LabeledRecords_original = []
    # The with-block closes the file; no explicit close() is needed.
    with open(infile, 'r', encoding = 'utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # get rid of badly formatted lines
            if len(read_data) in (12, 13):
                LabeledRecords_original.append({"paperID": read_data[0],
                                                "authorID": read_data[1],
                                                "co-author": read_data[5],
                                                "venue_id": read_data[7]})
            else:
                print(len(read_data))
    return pd.DataFrame(LabeledRecords_original, columns=columns)

In [4]:
def LSA(cleaned_token, dim=100):
    """Project tokenized documents into an LSA space (tf-idf followed by truncated SVD).

    Parameters
    ----------
    cleaned_token : iterable
        Documents handed straight to ``TfidfVectorizer.fit_transform``. The
        vectorizer uses ``dummy`` as tokenizer and preprocessor, so each
        document is presumably already a list of tokens -- confirm with caller.
    dim : int, default 100
        Requested number of SVD components. When the tf-idf vocabulary has
        ``dim`` or fewer terms, it is reduced to one less than the vocabulary
        size (but never below 1).

    Returns
    -------
    numpy.ndarray
        Output of ``TruncatedSVD.fit_transform`` on the dense tf-idf matrix.

    Side effects
    ------------
    Prints the summed explained-variance ratio of the fitted SVD.
    """
    # Tf-idf Transformation
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True, tokenizer=dummy,
                                       preprocessor=dummy, stop_words = None,min_df=cutoff)
    tfidfMatrix = tfidf_vectorizer.fit_transform(cleaned_token).toarray()
    n_features = tfidfMatrix.shape[1]
    # Use <= rather than <: n_components == n_features is rejected by
    # scikit-learn releases that require strictly fewer components than
    # features, so the equal case must be reduced as well.
    if n_features <= dim:
        dim = max(1, n_features - 1)
    # tf-idf + svd
    svd = TruncatedSVD(n_components=dim)
    final_lsa_Matrix = svd.fit_transform(tfidfMatrix)
    print(svd.explained_variance_ratio_.sum())
    return final_lsa_Matrix

In [5]:
# document relation wrt textual content
# convert raw text to numerical feature vectors
# bow(Bags of words) are used with uni-gram setting
def raw_text_to_vector(raw_textual_content, emb_type="off", stopword=True):
    """Convert raw documents into a feature matrix of the requested type.

    The documents are first cleaned with ``com_func.clean_batch_of_raw``,
    which returns the cleaned tokens and a sequence of per-item sizes whose
    minimum and maximum are printed.

    Parameters
    ----------
    raw_textual_content : iterable
        Raw documents, forwarded to ``com_func.clean_batch_of_raw``.
    emb_type : str, default "off"
        ``"tf_idf"`` -> dense tf-idf matrix (the vectorizer is printed),
        ``"tf"``     -> row-normalized dense term-count matrix,
        ``"lsa"``    -> ``LSA(cleaned_token, dim=100)``,
        ``"off"``    -> empty ``pandas.DataFrame``.
        Any other value prints a notice and is treated as ``"off"``.
    stopword : bool, default True
        Forwarded to ``com_func.clean_batch_of_raw``.

    Returns
    -------
    tuple
        ``(result_vector, average_sample_size)`` where the second item is the
        mean of the sizes returned by the cleaning step.
    """
    cleaned_token, sample_size = com_func.clean_batch_of_raw(raw_textual_content, stopword=stopword)
    average_sample_size = sum(sample_size)/len(sample_size)
    print("Minimal sample size: ", min(sample_size))
    print("maximal sample size: ", max(sample_size))

    # Unknown embedding names fall back to the default after a notice.
    if emb_type not in ("tf_idf", "tf", "lsa", "off"):
        print("Embedding type not available, selecting default setting")
        emb_type = "off"

    if emb_type == "off":
        return pd.DataFrame(), average_sample_size

    if emb_type == "lsa":
        # use lsa
        return LSA(cleaned_token, dim=100), average_sample_size

    if emb_type == "tf":
        # Document-Term frequency matrix, normalized per row
        term_counter = CountVectorizer(tokenizer=dummy,preprocessor=dummy, min_df=cutoff)
        term_counts = term_counter.fit_transform(cleaned_token).toarray()
        return normalize(term_counts), average_sample_size

    # Only "tf_idf" is left at this point.
    weighting = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True, tokenizer=dummy,
                                preprocessor=dummy, stop_words = None,min_df=cutoff)
    print(weighting)
    weighted_terms = weighting.fit_transform(cleaned_token).toarray()
    return weighted_terms, average_sample_size

In [30]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score
# cross validation
def k_fold_cv(data, label, clf, k=10):
    """Evaluate ``clf`` with unshuffled k-fold cross validation.

    For every fold the classifier is refit on the training part and used to
    predict the held-out part. The true and predicted labels of all folds are
    pooled, and all metrics are computed once on the pooled labels.

    Parameters
    ----------
    data : array-like, DataFrame or Series
        Feature matrix, one row per sample. pandas objects are unwrapped to
        their underlying array so that rows are selected by position.
    label : array-like or Series
        Class label of each sample, aligned with ``data`` by position.
    clf : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is refit in place
        for every fold.
    k : int, default 10
        Number of folds.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1, TP, TN, FP, FN)`` where the last four are the
        per-class one-vs-rest counts from the pooled confusion matrix, summed
        over all classes.

    Side effects
    ------------
    Prints the classification report and the flattened confusion matrix.
    """
    # Select rows by position: indexing a Series with an integer array is
    # label-based and indexing a DataFrame that way selects columns, so the
    # fold indices would be misread unless the index happens to be 0..n-1.
    if isinstance(data, (pd.DataFrame, pd.Series)):
        data = data.values
    label = np.asarray(label)

    kf = KFold(n_splits=k, shuffle=False)
    allTrueLabel = []
    allPredLabel = []
    for train_index, test_index in kf.split(data):
        # fit on the training part, predict the held-out part
        clf.fit(data[train_index], label[train_index])
        allTrueLabel.extend(label[test_index])
        allPredLabel.extend(clf.predict(data[test_index]))

    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel,average='macro')

    print(metrics.classification_report(allTrueLabel, allPredLabel))
    # Build the confusion matrix once and reuse it for printing and counting.
    cnf_matrix = metrics.confusion_matrix(allTrueLabel, allPredLabel)
    print(cnf_matrix.ravel())

    # accumulate statistic for entire model f1
    TP = np.diag(cnf_matrix)
    FP = cnf_matrix.sum(axis=0) - TP
    FN = cnf_matrix.sum(axis=1) - TP
    TN = cnf_matrix.sum() - (FP + FN + TP)

    return accuracy, f1, TP.sum(), TN.sum(), FP.sum(), FN.sum()

In [None]:
def write_csv_df(savePath, filename, df):
    """Save ``df`` as a UTF-8 CSV (without the index) at ``savePath/filename``.

    The target directory is created when it does not exist. If the target
    file is already present, the user is asked interactively what to do:
    ``y`` overwrites it, ``n`` asks for another filename and retries with it,
    and any other answer leaves the data unsaved after printing a notice.

    Parameters
    ----------
    savePath : str
        Directory to write into.
    filename : str
        Name of the CSV file inside ``savePath``.
    df : pandas.DataFrame
        Data to write.
    """
    if not os.path.exists(savePath):
        os.makedirs(savePath)
    target = os.path.normpath(os.path.join(savePath, filename))

    # Nothing in the way: write straight away.
    if not os.path.isfile(target):
        df.to_csv(target, encoding='utf-8',index=False)
        return

    # A file with this name exists; let the user decide.
    answer = input("WARNING: " + target + " already exists! Do you want to overwrite <y/n>? \n ")
    if answer == 'y':
        df.to_csv(target, encoding='utf-8',index=False)
    elif answer == 'n':
        replacement_name = input("Type new filename: \n ")
        write_csv_df(savePath, replacement_name, df)
    else:
        print("Not a valid input. Data is NOT saved!\n")

In [31]:
# load the file
import io
import collections
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# fix random seed for reproducibility
np.random.seed(1)

fileDir = "../../Data/"+Dataset+"/canopies_labeled/"
listfiles = os.listdir(fileDir)

# entire model f1, one entry per embedding type in pp_textual
modelSVCf1, modelLRf1 = ([] for i in range(2))

for emb in pp_textual:
    # collect statistic to output
    allname, num_class, per_class_count, average_textual_size = ([] for i in range(4))

    all_svcLinear_accuracy, all_svcLinear_f1, all_LR_accuracy, all_LR_f1 = ([] for i in range(4))

    # collect overall tp, tn, fp, fn
    svcTP=svcTN=svcFP=svcFN = 0
    lrTP=lrTN=lrFP=lrFN = 0
    # read all file in labeled group
    for file in listfiles:
        # group name: second and last "_"-separated pieces of the file name
        name_parts = file.split("_")
        name = name_parts[1]+"_"+name_parts[-1]
        print("For name: ",name)
        # read needed content in labeled file
        labeled_data_part = read_labeled_file(fileDir+file)
        print("total sample size before apply threshold: ",len(labeled_data_part))
        # A file with no usable line may come back without an authorID column;
        # there is nothing to evaluate, so skip it instead of raising KeyError.
        if labeled_data_part.empty:
            print(name, " pass")
            continue
        # count number of paper each author write based on author ID
        paperCounter = collections.Counter(labeled_data_part["authorID"])
        print(paperCounter)
        # drop authors with fewer papers than the threshold
        for k in list(paperCounter):
            if paperCounter[k] < threshold:
                del paperCounter[k]
        kept_authors = list(paperCounter.keys())
        print(kept_authors)
        # remove samples of authors below the threshold; .copy() so the label
        # column added below is written to an independent frame and not to a
        # slice of the original (avoids SettingWithCopyWarning)
        labeled_data_part = labeled_data_part[labeled_data_part.authorID.isin(kept_authors)].copy()
        print("Total sample size after apply threshold: ",len(labeled_data_part))
        # if only have one class or no class pass the threshold, not applicable
        if len(paperCounter) < 2:
            print(name, " pass")
            continue

        # convert author id to label: position of the author in kept_authors
        label_of_author = {author: position for position, author in enumerate(kept_authors)}
        labeled_data_part["label"] = labeled_data_part["authorID"].map(label_of_author)
        # merge combined textual content from all raw data to labeled dataset
        labeled_data = pd.merge(left=labeled_data_part,right=all_text_content, how='left', left_on='paperID', right_on='paperID')
        # shuffle the data
        labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
        # extract true label and pid
        label = labeled_data["label"]
        pid = labeled_data["paperID"]
        # list of different data field
        part_collection = []
        # select feature wanted to fit to clustering/classification algorithm
        # data part, textual information
        data_part_textual, avg_textual_size = raw_text_to_vector(labeled_data['combine_textual'], emb_type=emb)
        print(data_part_textual.shape)
        part_collection.append(data_part_textual)
        # merge different part of data together by concatenating them
        # remove empty emb (when emb set off)
        part_collection = [part for part in part_collection if len(part)!=0]
        print(len(part_collection))
        if len(part_collection)>1:
            combinedata = np.concatenate(part_collection,axis=1)
        elif len(part_collection)==1:
            if isinstance(part_collection[0], pd.DataFrame):
                combinedata = part_collection[0].values
            else:
                combinedata = part_collection[0]
        else:
            # No features for this group: skip only this group. (A break
            # here would silently drop every remaining name group.)
            print("No data available")
            continue
        print(len(combinedata))

        # Record per-group statistics only once the group is really going to
        # be evaluated, so these lists stay aligned with the score lists.
        allname.append(name)
        num_class.append(len(paperCounter))
        per_class_count.append(paperCounter)
        average_textual_size.append(avg_textual_size)

        # using converted feature vector to train classifier
        # using SVM with linear kernel
        clf = SVC(decision_function_shape='ovr', kernel='linear')
        svcaccuracy, svcmarcof1, tp, tn, fp, fn = k_fold_cv(combinedata, label, clf, k=10)
        svcTP+=tp
        svcTN+=tn
        svcFP+=fp
        svcFN+=fn
        print("svc Accuracy: ",svcaccuracy)
        print("svc F1: ", svcmarcof1)
        all_svcLinear_accuracy.append(svcaccuracy)
        all_svcLinear_f1.append(svcmarcof1)
        # using logistic regression
        clf = LogisticRegression(multi_class='ovr')
        LRaccuracy, LRmarcof1, tp, tn, fp, fn = k_fold_cv(combinedata, label, clf, k=10)
        lrTP+=tp
        lrTN+=tn
        lrFP+=fp
        lrFN+=fn
        print("LR Accuracy: ",LRaccuracy)
        print("LR F1: ", LRmarcof1)
        all_LR_accuracy.append(LRaccuracy)
        all_LR_f1.append(LRmarcof1)
    # print f1 for entire model
    print("svc: TP: ",svcTP, "TN: ",svcTN, "FP: ",svcFP,"FN: ",svcFN)
    print("lr: TP: ",lrTP, "TN: ",lrTN, "FP: ",lrFP,"FN: ",lrFN)
    # F1 from the pooled counts; NaN when no group was evaluated, so the
    # division cannot raise ZeroDivisionError.
    svc_denominator = 2*svcTP + svcFP + svcFN
    lr_denominator = 2*lrTP + lrFP + lrFN
    svcF1 = 2*svcTP / svc_denominator if svc_denominator else float("nan")
    lrF1 = 2*lrTP / lr_denominator if lr_denominator else float("nan")
    modelSVCf1.append(svcF1)
    modelLRf1.append(lrF1)
    # write evaluation result to csv
    output = pd.DataFrame({'Name Group':allname,"Class number":num_class,"per_class_size":per_class_count,
                           "svc(linear) accuracy":all_svcLinear_accuracy, "svc(linear) macro f1": all_svcLinear_f1,
                           "logistic regression accuracy":all_LR_accuracy, "logistic regression macro f1": all_LR_f1})

    savePath = "../../result/"+Dataset+"/skovr/"
    filename = "textual="+emb+"_threshold="+str(threshold)+".csv"
    write_csv_df(savePath, filename, output)
    print("Done")

For name:  j_read
total sample size before apply threshold:  136
Counter({'0000-0002-5159-1192': 57, '0000-0002-9029-5185': 39, '0000-0002-9697-0962': 31, '0000-0002-4739-9245': 3, '0000-0003-0605-5259': 3, '0000-0003-4316-7006': 1, '0000-0002-0784-0091': 1, '0000-0002-3888-6631': 1})
['0000-0002-9697-0962', '0000-0002-9029-5185', '0000-0002-5159-1192']
Total sample size after apply threshold:  127
Minimal sample size:  6
maximal sample size:  248
(127, 1166)
1
127
              precision    recall  f1-score   support

           0       0.91      0.94      0.92        31
           1       1.00      0.90      0.95        39
           2       0.92      0.96      0.94        57

   micro avg       0.94      0.94      0.94       127
   macro avg       0.94      0.93      0.94       127
weighted avg       0.94      0.94      0.94       127

[29  0  2  1 35  3  2  0 55]
svc Accuracy:  0.937007874015748
svc F1:  0.9355839355839356
              precision    recall  f1-score   support

    

Minimal sample size:  8
maximal sample size:  330
(419, 2913)
1
419
              precision    recall  f1-score   support

           0       0.98      0.90      0.94        51
           1       0.89      0.97      0.93        79
           2       0.94      0.93      0.93        82
           3       0.99      0.99      0.99       141
           4       1.00      0.95      0.98        66

   micro avg       0.96      0.96      0.96       419
   macro avg       0.96      0.95      0.95       419
weighted avg       0.96      0.96      0.96       419

[ 46   4   1   0   0   1  77   1   0   0   0   4  76   2   0   0   0   2
 139   0   0   2   1   0  63]
svc Accuracy:  0.9570405727923628
svc F1:  0.9523122959768558
              precision    recall  f1-score   support

           0       0.98      0.96      0.97        51
           1       0.97      0.97      0.97        79
           2       0.98      0.96      0.97        82
           3       0.98      1.00      0.99       141
       

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       123
           1       1.00      0.86      0.93        37

   micro avg       0.97      0.97      0.97       160
   macro avg       0.98      0.93      0.95       160
weighted avg       0.97      0.97      0.97       160

[123   0   5  32]
svc Accuracy:  0.96875
svc F1:  0.9538079565794793
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       123
           1       0.97      0.95      0.96        37

   micro avg       0.98      0.98      0.98       160
   macro avg       0.98      0.97      0.97       160
weighted avg       0.98      0.98      0.98       160

[122   1   2  35]
LR Accuracy:  0.98125
LR F1:  0.9733791803005934
For name:  k_xu
total sample size before apply threshold:  37
Counter({'0000-0002-2788-194X': 19, '0000-0003-2036-3469': 14, '0000-0002-3985-739X': 3, '0000-0001-7851-2629': 1})
[]
Total sample size after apply

Minimal sample size:  17
maximal sample size:  316
(119, 1174)
1
119
              precision    recall  f1-score   support

           0       0.91      0.94      0.93        34
           1       1.00      0.98      0.99        43
           2       0.95      0.95      0.95        42

   micro avg       0.96      0.96      0.96       119
   macro avg       0.96      0.96      0.96       119
weighted avg       0.96      0.96      0.96       119

[32  0  2  1 42  0  2  0 40]
svc Accuracy:  0.957983193277311
svc F1:  0.9560508261275524
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        34
           1       1.00      1.00      1.00        43
           2       1.00      0.95      0.98        42

   micro avg       0.98      0.98      0.98       119
   macro avg       0.98      0.98      0.98       119
weighted avg       0.98      0.98      0.98       119

[34  0  0  0 43  0  2  0 40]
LR Accuracy:  0.9831932773109243
LR F1:  0.9823461

Minimal sample size:  8
maximal sample size:  351
(283, 2248)
1
283
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       149
           1       0.97      0.87      0.91        98
           2       0.85      0.94      0.89        36

   micro avg       0.92      0.92      0.92       283
   macro avg       0.91      0.92      0.91       283
weighted avg       0.92      0.92      0.92       283

[142   3   4  11  85   2   2   0  34]
svc Accuracy:  0.9222614840989399
svc F1:  0.9143086210149028
              precision    recall  f1-score   support

           0       0.94      0.97      0.95       149
           1       0.96      0.92      0.94        98
           2       0.97      0.97      0.97        36

   micro avg       0.95      0.95      0.95       283
   macro avg       0.96      0.95      0.95       283
weighted avg       0.95      0.95      0.95       283

[144   4   1   8  90   0   1   0  35]
LR Accuracy:  0.950530035335689


Counter({'0000-0002-9723-4924': 163, '0000-0001-8633-2417': 9, '0000-0002-6434-9290': 4, '0000-0003-3506-0401': 2, '0000-0002-5615-2893': 2, '0000-0002-7115-0001': 1})
['0000-0002-9723-4924']
Total sample size after apply threshold:  163
For name:  r_gomes
total sample size before apply threshold:  52
Counter({'0000-0001-7155-0059': 15, '0000-0002-9197-8279': 10, '0000-0003-0278-4876': 10, '0000-0002-7242-6540': 6, '0000-0002-9012-3287': 6, '0000-0002-5984-0712': 4, '0000-0002-6375-7014': 1})
[]
Total sample size after apply threshold:  0
For name:  r_bennett
total sample size before apply threshold:  93
Counter({'0000-0002-7526-3425': 74, '0000-0002-7227-4831': 11, '0000-0002-5780-8786': 3, '0000-0002-3746-367X': 3, '0000-0002-5210-1386': 1, '0000-0002-1200-2068': 1})
['0000-0002-7526-3425']
Total sample size after apply threshold:  74
For name:  m_collins
total sample size before apply threshold:  57
Counter({'0000-0002-7656-4975': 20, '0000-0003-3785-6008': 14, '0000-0003-3969-5797'

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        45
           1       0.99      0.99      0.99       122

   micro avg       0.99      0.99      0.99       167
   macro avg       0.98      0.98      0.98       167
weighted avg       0.99      0.99      0.99       167

[ 44   1   1 121]
LR Accuracy:  0.9880239520958084
LR F1:  0.9847905282331512
For name:  m_soares
total sample size before apply threshold:  247
Counter({'0000-0001-9701-836X': 75, '0000-0002-9314-4833': 68, '0000-0001-6071-0272': 44, '0000-0003-1579-8513': 32, '0000-0002-5213-2377': 10, '0000-0001-8860-0470': 7, '0000-0003-4227-4141': 4, '0000-0002-7181-1906': 3, '0000-0002-4614-8209': 2, '0000-0002-8059-7067': 1, '0000-0002-9013-2570': 1})
['0000-0001-6071-0272', '0000-0003-1579-8513', '0000-0002-9314-4833', '0000-0001-9701-836X']
Total sample size after apply threshold:  219
Minimal sample size:  12
maximal sample size:  500
(219, 2019)
1
219
              preci

Minimal sample size:  18
maximal sample size:  333
(148, 1242)
1
148
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       102
           1       0.95      0.91      0.93        46

   micro avg       0.96      0.96      0.96       148
   macro avg       0.96      0.95      0.95       148
weighted avg       0.96      0.96      0.96       148

[100   2   4  42]
svc Accuracy:  0.9594594594594594
svc F1:  0.9521035598705501
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       102
           1       0.95      0.91      0.93        46

   micro avg       0.96      0.96      0.96       148
   macro avg       0.96      0.95      0.95       148
weighted avg       0.96      0.96      0.96       148

[100   2   4  42]
LR Accuracy:  0.9594594594594594
LR F1:  0.9521035598705501
For name:  j_marques
total sample size before apply threshold:  183
Counter({'0000-0001-8865-8189': 30, '0000-0001-8157

Minimal sample size:  13
maximal sample size:  317
(269, 2025)
1
269
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       238
           1       1.00      0.94      0.97        31

   micro avg       0.99      0.99      0.99       269
   macro avg       1.00      0.97      0.98       269
weighted avg       0.99      0.99      0.99       269

[238   0   2  29]
svc Accuracy:  0.9925650557620818
svc F1:  0.9812412831241283
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       238
           1       1.00      1.00      1.00        31

   micro avg       1.00      1.00      1.00       269
   macro avg       1.00      1.00      1.00       269
weighted avg       1.00      1.00      1.00       269

[238   0   0  31]
LR Accuracy:  1.0
LR F1:  1.0
For name:  k_jacobsen
total sample size before apply threshold:  113
Counter({'0000-0002-4198-6246': 93, '0000-0002-1121-2979': 17, '0000-0002-3450-0

Minimal sample size:  21
maximal sample size:  366
(113, 1178)
1
113
              precision    recall  f1-score   support

           0       0.99      0.97      0.98        74
           1       0.95      0.97      0.96        39

   micro avg       0.97      0.97      0.97       113
   macro avg       0.97      0.97      0.97       113
weighted avg       0.97      0.97      0.97       113

[72  2  1 38]
svc Accuracy:  0.9734513274336283
svc F1:  0.970808576595195
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        74
           1       0.97      0.95      0.96        39

   micro avg       0.97      0.97      0.97       113
   macro avg       0.97      0.97      0.97       113
weighted avg       0.97      0.97      0.97       113

[73  1  2 37]
LR Accuracy:  0.9734513274336283
LR F1:  0.9704523664255209
For name:  a_nielsen
total sample size before apply threshold:  132
Counter({'0000-0003-4372-9961': 70, '0000-0001-6616-0187': 2

Minimal sample size:  2
maximal sample size:  269
(202, 1487)
1
202
              precision    recall  f1-score   support

           0       0.97      0.80      0.88        35
           1       0.96      0.99      0.98       167

   micro avg       0.96      0.96      0.96       202
   macro avg       0.96      0.90      0.93       202
weighted avg       0.96      0.96      0.96       202

[ 28   7   1 166]
svc Accuracy:  0.9603960396039604
svc F1:  0.9257352941176471
              precision    recall  f1-score   support

           0       1.00      0.91      0.96        35
           1       0.98      1.00      0.99       167

   micro avg       0.99      0.99      0.99       202
   macro avg       0.99      0.96      0.97       202
weighted avg       0.99      0.99      0.98       202

[ 32   3   0 167]
LR Accuracy:  0.9851485148514851
LR F1:  0.9731609017228398
For name:  m_reilly
total sample size before apply threshold:  20
Counter({'0000-0001-8029-0084': 17, '0000-0002-5526-82

Minimal sample size:  3
maximal sample size:  624
(533, 3790)
1
533
              precision    recall  f1-score   support

           0       0.92      0.75      0.83        32
           1       0.94      0.94      0.94        65
           2       0.95      0.92      0.93        84
           3       0.98      0.96      0.97       113
           4       0.91      0.99      0.95       154
           5       1.00      0.95      0.98        85

   micro avg       0.95      0.95      0.95       533
   macro avg       0.95      0.92      0.93       533
weighted avg       0.95      0.95      0.95       533

[ 24   2   0   0   6   0   2  61   1   0   1   0   0   2  77   1   4   0
   0   0   2 109   2   0   0   0   1   0 153   0   0   0   0   1   3  81]
svc Accuracy:  0.9474671669793621
svc F1:  0.9326445666526952
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        32
           1       0.97      0.97      0.97        65
           2     

Minimal sample size:  9
maximal sample size:  309
(287, 2074)
1
287
              precision    recall  f1-score   support

           0       1.00      0.90      0.95        77
           1       0.96      1.00      0.98       210

   micro avg       0.97      0.97      0.97       287
   macro avg       0.98      0.95      0.96       287
weighted avg       0.97      0.97      0.97       287

[ 69   8   0 210]
svc Accuracy:  0.9721254355400697
svc F1:  0.963256945333504
              precision    recall  f1-score   support

           0       1.00      0.91      0.95        77
           1       0.97      1.00      0.98       210

   micro avg       0.98      0.98      0.98       287
   macro avg       0.98      0.95      0.97       287
weighted avg       0.98      0.98      0.98       287

[ 70   7   0 210]
LR Accuracy:  0.975609756097561
LR F1:  0.9679937548790007
For name:  j_abrantes
total sample size before apply threshold:  57
Counter({'0000-0002-8391-7134': 42, '0000-0003-1902-90

Total sample size after apply threshold:  101
Minimal sample size:  3
maximal sample size:  331
(101, 786)
1
101
              precision    recall  f1-score   support

           0       0.87      0.90      0.89        30
           1       0.79      0.89      0.84        38
           2       0.89      0.73      0.80        33

   micro avg       0.84      0.84      0.84       101
   macro avg       0.85      0.84      0.84       101
weighted avg       0.85      0.84      0.84       101

[27  2  1  2 34  2  2  7 24]
svc Accuracy:  0.8415841584158416
svc F1:  0.8415840248262835
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        30
           1       0.86      0.95      0.90        38
           2       0.93      0.82      0.87        33

   micro avg       0.90      0.90      0.90       101
   macro avg       0.91      0.90      0.90       101
weighted avg       0.90      0.90      0.90       101

[28  1  1  1 36  1  1  5 27]
LR Ac

Minimal sample size:  12
maximal sample size:  205
(80, 687)
1
80
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        36
           1       1.00      0.95      0.98        44

   micro avg       0.97      0.97      0.97        80
   macro avg       0.97      0.98      0.97        80
weighted avg       0.98      0.97      0.98        80

[36  0  2 42]
svc Accuracy:  0.975
svc F1:  0.9748585795097424
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        36
           1       1.00      1.00      1.00        44

   micro avg       1.00      1.00      1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80

[36  0  0 44]
LR Accuracy:  1.0
LR F1:  1.0
For name:  d_fernandes
total sample size before apply threshold:  40
Counter({'0000-0003-0599-3200': 20, '0000-0002-5056-5734': 9, '0000-0001-5263-2737': 5, '0000-0001-6155-

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        90
           1       0.96      0.98      0.97       166
           2       0.95      0.91      0.93        44

   micro avg       0.97      0.97      0.97       300
   macro avg       0.96      0.95      0.96       300
weighted avg       0.97      0.97      0.97       300

[ 88   2   0   2 162   2   0   4  40]
LR Accuracy:  0.9666666666666667
LR F1:  0.9593567387189444
For name:  y_jia
total sample size before apply threshold:  46
Counter({'0000-0002-2784-1905': 24, '0000-0003-3852-7302': 10, '0000-0002-8852-7557': 3, '0000-0001-9657-0806': 3, '0000-0001-7978-9312': 3, '0000-0001-9395-2139': 2, '0000-0003-4972-1004': 1})
[]
Total sample size after apply threshold:  0
For name:  p_gaspar
total sample size before apply threshold:  93
Counter({'0000-0003-4217-5717': 87, '0000-0001-5967-0584': 3, '0000-0002-4832-8537': 2, '0000-0003-3388-1724': 1})
['0000-0003-4217-5717']
Total sample

Minimal sample size:  11
maximal sample size:  311
(74, 886)
1
74
              precision    recall  f1-score   support

           0       0.94      0.89      0.91        36
           1       0.90      0.95      0.92        38

   micro avg       0.92      0.92      0.92        74
   macro avg       0.92      0.92      0.92        74
weighted avg       0.92      0.92      0.92        74

[32  4  2 36]
svc Accuracy:  0.918918918918919
svc F1:  0.9186813186813187
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        36
           1       0.97      0.97      0.97        38

   micro avg       0.97      0.97      0.97        74
   macro avg       0.97      0.97      0.97        74
weighted avg       0.97      0.97      0.97        74

[35  1  1 37]
LR Accuracy:  0.972972972972973
LR F1:  0.972953216374269
For name:  s_woo
total sample size before apply threshold:  25
Counter({'0000-0003-3692-7169': 22, '0000-0001-8788-2875': 1, '0000-00

              precision    recall  f1-score   support

           0       0.95      0.92      0.94        39
           1       0.96      1.00      0.98       100
           2       1.00      1.00      1.00        45
           3       0.95      0.92      0.94        39
           4       1.00      0.94      0.97        31

   micro avg       0.97      0.97      0.97       254
   macro avg       0.97      0.96      0.96       254
weighted avg       0.97      0.97      0.97       254

[ 36   1   0   2   0   0 100   0   0   0   0   0  45   0   0   2   1   0
  36   0   0   2   0   0  29]
LR Accuracy:  0.968503937007874
LR F1:  0.9634377387318563
For name:  s_bae
total sample size before apply threshold:  83
Counter({'0000-0003-0551-7618': 19, '0000-0002-3019-0584': 17, '0000-0002-4995-6543': 17, '0000-0002-8993-8884': 9, '0000-0003-0098-8816': 8, '0000-0003-1926-5466': 6, '0000-0001-7603-7676': 6, '0000-0003-0637-4110': 1})
[]
Total sample size after apply threshold:  0
For name:  s_ferna

['0000-0002-4861-8649', '0000-0002-7769-4712']
Total sample size after apply threshold:  100
Minimal sample size:  23
maximal sample size:  408
(100, 1136)
1
100
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        57
           1       0.98      0.98      0.98        43

   micro avg       0.98      0.98      0.98       100
   macro avg       0.98      0.98      0.98       100
weighted avg       0.98      0.98      0.98       100

[56  1  1 42]
svc Accuracy:  0.98
svc F1:  0.9796001631986944
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        57
           1       0.98      1.00      0.99        43

   micro avg       0.99      0.99      0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100

[56  1  0 43]
LR Accuracy:  0.99
LR F1:  0.9898280948021565
For name:  y_tseng
total sample size before apply threshold:  

Minimal sample size:  20
maximal sample size:  266
(98, 992)
1
98
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        40
           1       0.98      1.00      0.99        58

   micro avg       0.99      0.99      0.99        98
   macro avg       0.99      0.99      0.99        98
weighted avg       0.99      0.99      0.99        98

[39  1  0 58]
svc Accuracy:  0.9897959183673469
svc F1:  0.9893973818024451
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        40
           1       0.98      1.00      0.99        58

   micro avg       0.99      0.99      0.99        98
   macro avg       0.99      0.99      0.99        98
weighted avg       0.99      0.99      0.99        98

[39  1  0 58]
LR Accuracy:  0.9897959183673469
LR F1:  0.9893973818024451
For name:  h_brown
total sample size before apply threshold:  48
Counter({'0000-0001-8578-5510': 17, '0000-0002-0067-991X': 9, '00

Minimal sample size:  10
maximal sample size:  388
(295, 2077)
1
295
              precision    recall  f1-score   support

           0       0.87      0.94      0.90       155
           1       0.93      0.93      0.93        44
           2       0.81      0.80      0.81        60
           3       0.88      0.58      0.70        36

   micro avg       0.87      0.87      0.87       295
   macro avg       0.87      0.81      0.84       295
weighted avg       0.87      0.87      0.86       295

[146   1   7   1   1  41   2   0   9   1  48   2  12   1   2  21]
svc Accuracy:  0.8677966101694915
svc F1:  0.8356414096739175
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       155
           1       0.98      0.91      0.94        44
           2       0.88      0.82      0.84        60
           3       0.92      0.61      0.73        36

   micro avg       0.89      0.89      0.89       295
   macro avg       0.91      0.83      0.8

Minimal sample size:  3
maximal sample size:  259
(120, 1124)
1
120
              precision    recall  f1-score   support

           0       0.98      0.95      0.97        64
           1       0.95      0.98      0.96        56

   micro avg       0.97      0.97      0.97       120
   macro avg       0.97      0.97      0.97       120
weighted avg       0.97      0.97      0.97       120

[61  3  1 55]
svc Accuracy:  0.9666666666666667
svc F1:  0.9665831244778613
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        64
           1       0.98      0.98      0.98        56

   micro avg       0.98      0.98      0.98       120
   macro avg       0.98      0.98      0.98       120
weighted avg       0.98      0.98      0.98       120

[63  1  1 55]
LR Accuracy:  0.9833333333333333
LR F1:  0.9832589285714286
For name:  r_luz
total sample size before apply threshold:  20
Counter({'0000-0002-3999-4298': 9, '0000-0002-1021-5772': 6, '000

Minimal sample size:  5
maximal sample size:  312
(651, 4117)
1
651
              precision    recall  f1-score   support

           0       0.93      0.90      0.92        31
           1       0.96      0.99      0.97        73
           2       0.82      0.85      0.84        33
           3       0.73      0.84      0.78        55
           4       0.80      0.97      0.88        92
           5       0.96      0.90      0.93        87
           6       0.98      0.94      0.96        49
           7       0.86      0.72      0.78        43
           8       1.00      0.97      0.98        61
           9       0.87      0.83      0.85        41
          10       0.91      0.82      0.87        51
          11       0.83      0.71      0.77        35

   micro avg       0.89      0.89      0.89       651
   macro avg       0.89      0.87      0.88       651
weighted avg       0.89      0.89      0.89       651

[28  0  1  0  2  0  0  0  0  0  0  0  0 72  0  1  0  0  0  0  0  

Minimal sample size:  2
maximal sample size:  307
(303, 2211)
1
303
              precision    recall  f1-score   support

           0       0.80      0.91      0.85        57
           1       0.97      0.95      0.96        81
           2       1.00      0.92      0.96        48
           3       0.97      0.96      0.97       117

   micro avg       0.94      0.94      0.94       303
   macro avg       0.94      0.93      0.93       303
weighted avg       0.95      0.94      0.94       303

[ 52   2   0   3   4  77   0   0   4   0  44   0   5   0   0 112]
svc Accuracy:  0.9405940594059405
svc F1:  0.934249499225797
              precision    recall  f1-score   support

           0       0.93      0.95      0.94        57
           1       0.99      0.99      0.99        81
           2       1.00      0.98      0.99        48
           3       0.98      0.98      0.98       117

   micro avg       0.98      0.98      0.98       303
   macro avg       0.98      0.97      0.97 

Minimal sample size:  16
maximal sample size:  293
(94, 942)
1
94
              precision    recall  f1-score   support

           0       0.97      0.95      0.96        39
           1       0.96      0.98      0.97        55

   micro avg       0.97      0.97      0.97        94
   macro avg       0.97      0.97      0.97        94
weighted avg       0.97      0.97      0.97        94

[37  2  1 54]
svc Accuracy:  0.9680851063829787
svc F1:  0.9670059670059671
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        39
           1       0.98      1.00      0.99        55

   micro avg       0.99      0.99      0.99        94
   macro avg       0.99      0.99      0.99        94
weighted avg       0.99      0.99      0.99        94

[38  1  0 55]
LR Accuracy:  0.9893617021276596
LR F1:  0.9890019890019889
For name:  a_lombardi
total sample size before apply threshold:  90
Counter({'0000-0002-2013-3009': 49, '0000-0001-5421-9970': 21,

Minimal sample size:  6
maximal sample size:  276
(115, 998)
1
115
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        36
           1       1.00      1.00      1.00        79

   micro avg       1.00      1.00      1.00       115
   macro avg       1.00      1.00      1.00       115
weighted avg       1.00      1.00      1.00       115

[36  0  0 79]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        36
           1       1.00      1.00      1.00        79

   micro avg       1.00      1.00      1.00       115
   macro avg       1.00      1.00      1.00       115
weighted avg       1.00      1.00      1.00       115

[36  0  0 79]
LR Accuracy:  1.0
LR F1:  1.0
For name:  w_zheng
total sample size before apply threshold:  93
Counter({'0000-0002-6236-9765': 48, '0000-0003-1034-0757': 24, '0000-0003-0021-6672': 9, '0000-0003-0799-3474': 7, '0000-000

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       0.93      0.93      0.93        69
           2       0.91      0.95      0.93       106
           3       0.95      0.95      0.95        40
           4       0.97      0.87      0.92        45

   micro avg       0.94      0.94      0.94       295
   macro avg       0.95      0.94      0.95       295
weighted avg       0.94      0.94      0.94       295

[ 35   0   0   0   0   0  64   3   2   0   0   4 101   0   1   0   1   1
  38   0   0   0   6   0  39]
LR Accuracy:  0.9389830508474576
LR F1:  0.9452117733488908
For name:  r_thomas
total sample size before apply threshold:  368
Counter({'0000-0002-0518-8386': 95, '0000-0002-2340-0301': 95, '0000-0003-1448-7182': 74, '0000-0003-2062-8623': 46, '0000-0001-9251-5543': 13, '0000-0002-2970-6352': 10, '0000-0002-2165-5917': 8, '0000-0003-1282-7825': 5, '0000-0003-3588-2317': 5, '0000-0002-7286-2764': 4, '0000-

Total sample size after apply threshold:  372
Minimal sample size:  12
maximal sample size:  285
(372, 2690)
1
372
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       112
           1       0.97      0.89      0.93        80
           2       0.94      0.98      0.96       180

   micro avg       0.96      0.96      0.96       372
   macro avg       0.96      0.95      0.95       372
weighted avg       0.96      0.96      0.96       372

[108   0   4   1  71   8   1   2 177]
svc Accuracy:  0.956989247311828
svc F1:  0.9534757138774356
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       112
           1       1.00      0.91      0.95        80
           2       0.95      1.00      0.98       180

   micro avg       0.98      0.98      0.98       372
   macro avg       0.98      0.96      0.97       372
weighted avg       0.98      0.98      0.98       372

[110   0   2   0  73   7

Total sample size after apply threshold:  446
Minimal sample size:  8
maximal sample size:  334
(446, 3056)
1
446
              precision    recall  f1-score   support

           0       0.84      0.69      0.76        39
           1       0.97      0.88      0.92        33
           2       0.96      0.88      0.91        49
           3       0.92      0.99      0.95       146
           4       0.88      0.96      0.92       115
           5       0.95      0.86      0.90        64

   micro avg       0.91      0.91      0.91       446
   macro avg       0.92      0.88      0.89       446
weighted avg       0.92      0.91      0.91       446

[ 27   0   1   2   6   3   0  29   0   4   0   0   2   0  43   1   3   0
   0   1   0 144   1   0   1   0   1   3 110   0   2   0   0   2   5  55]
svc Accuracy:  0.9147982062780269
svc F1:  0.8946733854954684
              precision    recall  f1-score   support

           0       0.89      0.82      0.85        39
           1       1.00  

Total sample size after apply threshold:  0
For name:  a_schmid
total sample size before apply threshold:  61
Counter({'0000-0002-5196-151X': 28, '0000-0001-7759-0211': 19, '0000-0001-6483-8759': 10, '0000-0002-0141-0971': 4})
[]
Total sample size after apply threshold:  0
For name:  k_cheung
total sample size before apply threshold:  16
Counter({'0000-0002-6759-4961': 9, '0000-0002-8348-1561': 4, '0000-0003-4107-7840': 2, '0000-0001-7648-4556': 1})
[]
Total sample size after apply threshold:  0
For name:  s_ma
total sample size before apply threshold:  136
Counter({'0000-0002-1897-7069': 69, '0000-0002-2029-7943': 42, '0000-0002-1810-8357': 9, '0000-0002-0232-8590': 6, '0000-0001-8581-2216': 3, '0000-0002-2704-3540': 2, '0000-0001-8087-0249': 1, '0000-0001-6361-9706': 1, '0000-0002-7995-2041': 1, '0000-0003-4846-9513': 1, '0000-0002-8992-1177': 1})
['0000-0002-2029-7943', '0000-0002-1897-7069']
Total sample size after apply threshold:  111
Minimal sample size:  8
maximal sample size: 

Total sample size after apply threshold:  0
For name:  r_day
total sample size before apply threshold:  202
Counter({'0000-0002-6045-6937': 149, '0000-0003-3442-2298': 39, '0000-0003-1766-4068': 6, '0000-0001-5913-2292': 5, '0000-0002-6155-5910': 2, '0000-0003-1467-3196': 1})
['0000-0002-6045-6937', '0000-0003-3442-2298']
Total sample size after apply threshold:  188
Minimal sample size:  8
maximal sample size:  404
(188, 1574)
1
188
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       149
           1       0.97      0.85      0.90        39

   micro avg       0.96      0.96      0.96       188
   macro avg       0.97      0.92      0.94       188
weighted avg       0.96      0.96      0.96       188

[148   1   6  33]
svc Accuracy:  0.9627659574468085
svc F1:  0.9405036394050363
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       149
           1       0.97      0.87      0.92   

Minimal sample size:  13
maximal sample size:  296
(184, 1512)
1
184
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       112
           1       1.00      0.99      0.99        72

   micro avg       0.99      0.99      0.99       184
   macro avg       1.00      0.99      0.99       184
weighted avg       0.99      0.99      0.99       184

[112   0   1  71]
svc Accuracy:  0.9945652173913043
svc F1:  0.9942812742812743
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       112
           1       1.00      1.00      1.00        72

   micro avg       1.00      1.00      1.00       184
   macro avg       1.00      1.00      1.00       184
weighted avg       1.00      1.00      1.00       184

[112   0   0  72]
LR Accuracy:  1.0
LR F1:  1.0
For name:  j_nguyen
total sample size before apply threshold:  27
Counter({'0000-0002-8578-7396': 20, '0000-0002-4747-5383': 2, '0000-0003-3574-6278'

Minimal sample size:  9
maximal sample size:  282
(139, 1195)
1
139
              precision    recall  f1-score   support

           0       0.95      0.98      0.97        57
           1       1.00      0.97      0.98        30
           2       0.98      0.96      0.97        52

   micro avg       0.97      0.97      0.97       139
   macro avg       0.98      0.97      0.97       139
weighted avg       0.97      0.97      0.97       139

[56  0  1  1 29  0  2  0 50]
svc Accuracy:  0.9712230215827338
svc F1:  0.9731472917482348
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        57
           1       1.00      0.97      0.98        30
           2       0.98      1.00      0.99        52

   micro avg       0.99      0.99      0.99       139
   macro avg       0.99      0.98      0.99       139
weighted avg       0.99      0.99      0.99       139

[56  0  1  1 29  0  0  0 52]
LR Accuracy:  0.9856115107913669
LR F1:  0.9853277

Total sample size after apply threshold:  439
Minimal sample size:  11
maximal sample size:  358
(439, 2681)
1
439
              precision    recall  f1-score   support

           0       0.84      0.84      0.84        67
           1       0.97      0.97      0.97        35
           2       0.91      0.88      0.90        78
           3       0.97      0.91      0.94        32
           4       0.97      0.99      0.98       171
           5       0.91      0.91      0.91        56

   micro avg       0.93      0.93      0.93       439
   macro avg       0.93      0.92      0.92       439
weighted avg       0.93      0.93      0.93       439

[ 56   0   5   1   1   4   0  34   0   0   1   0   6   0  69   0   2   1
   1   0   1  29   1   0   0   0   1   0 170   0   4   1   0   0   0  51]
svc Accuracy:  0.9316628701594533
svc F1:  0.9220350798790758
              precision    recall  f1-score   support

           0       0.87      0.81      0.84        67
           1       0.97 

Minimal sample size:  15
maximal sample size:  259
(100, 1028)
1
100
              precision    recall  f1-score   support

           0       0.95      1.00      0.98        42
           1       1.00      0.97      0.98        58

   micro avg       0.98      0.98      0.98       100
   macro avg       0.98      0.98      0.98       100
weighted avg       0.98      0.98      0.98       100

[42  0  2 56]
svc Accuracy:  0.98
svc F1:  0.9796001631986944
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        42
           1       1.00      0.98      0.99        58

   micro avg       0.99      0.99      0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100

[42  0  1 57]
LR Accuracy:  0.99
LR F1:  0.9897698209718669
For name:  h_tanaka
total sample size before apply threshold:  28
Counter({'0000-0002-4378-5747': 21, '0000-0003-1511-8557': 4, '0000-0002-3153-8802': 1, '

Minimal sample size:  11
maximal sample size:  321
(149, 1405)
1
149
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       115
           1       1.00      0.97      0.99        34

   micro avg       0.99      0.99      0.99       149
   macro avg       1.00      0.99      0.99       149
weighted avg       0.99      0.99      0.99       149

[115   0   1  33]
svc Accuracy:  0.9932885906040269
svc F1:  0.9903728112683337
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       115
           1       1.00      1.00      1.00        34

   micro avg       1.00      1.00      1.00       149
   macro avg       1.00      1.00      1.00       149
weighted avg       1.00      1.00      1.00       149

[115   0   0  34]
LR Accuracy:  1.0
LR F1:  1.0
For name:  c_west
total sample size before apply threshold:  181
Counter({'0000-0002-0839-3449': 155, '0000-0001-7595-6777': 20, '0000-0001-7649-9600

Minimal sample size:  10
maximal sample size:  301
(401, 2739)
1
401
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       175
           1       0.97      0.98      0.98       185
           2       1.00      0.88      0.94        41

   micro avg       0.98      0.98      0.98       401
   macro avg       0.98      0.95      0.96       401
weighted avg       0.98      0.98      0.97       401

[173   2   0   3 182   0   1   4  36]
svc Accuracy:  0.9750623441396509
svc F1:  0.9646302647308009
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       175
           1       0.98      1.00      0.99       185
           2       1.00      0.93      0.96        41

   micro avg       0.99      0.99      0.99       401
   macro avg       0.99      0.98      0.98       401
weighted avg       0.99      0.99      0.99       401

[175   0   0   0 185   0   0   3  38]
LR Accuracy:  0.992518703241895

Minimal sample size:  7
maximal sample size:  296
(420, 2611)
1
420
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       322
           1       0.94      0.92      0.93        37
           2       0.96      0.90      0.93        61

   micro avg       0.97      0.97      0.97       420
   macro avg       0.96      0.94      0.95       420
weighted avg       0.97      0.97      0.97       420

[319   2   1   2  34   1   6   0  55]
svc Accuracy:  0.9714285714285714
svc F1:  0.9489203622010679
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       322
           1       0.97      0.92      0.94        37
           2       0.98      0.97      0.98        61

   micro avg       0.98      0.98      0.98       420
   macro avg       0.98      0.96      0.97       420
weighted avg       0.98      0.98      0.98       420

[320   1   1   3  34   0   2   0  59]
LR Accuracy:  0.9833333333333333

Minimal sample size:  20
maximal sample size:  288
(248, 2034)
1
248
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        48
           1       1.00      0.98      0.99        51
           2       1.00      1.00      1.00       149

   micro avg       1.00      1.00      1.00       248
   macro avg       0.99      0.99      0.99       248
weighted avg       1.00      1.00      1.00       248

[ 48   0   0   1  50   0   0   0 149]
svc Accuracy:  0.9959677419354839
svc F1:  0.9932632438501582
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        48
           1       1.00      1.00      1.00        51
           2       1.00      1.00      1.00       149

   micro avg       1.00      1.00      1.00       248
   macro avg       1.00      1.00      1.00       248
weighted avg       1.00      1.00      1.00       248

[ 48   0   0   0  51   0   0   0 149]
LR Accuracy:  1.0
LR F1:  1.0
F

Minimal sample size:  5
maximal sample size:  303
(401, 2673)
1
401
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       188
           1       1.00      0.98      0.99        58
           2       0.87      0.96      0.92        57
           3       0.89      0.84      0.87        58
           4       0.84      0.80      0.82        40

   micro avg       0.95      0.95      0.95       401
   macro avg       0.92      0.92      0.92       401
weighted avg       0.95      0.95      0.95       401

[188   0   0   0   0   0  57   0   0   1   0   0  55   2   0   0   0   4
  49   5   0   0   4   4  32]
svc Accuracy:  0.9501246882793017
svc F1:  0.9191480944347431
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       188
           1       1.00      0.98      0.99        58
           2       0.93      0.96      0.95        57
           3       0.87      0.90      0.88        58
       

Minimal sample size:  6
maximal sample size:  329
(140, 1176)
1
140
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        42
           1       0.99      1.00      0.99        98

   micro avg       0.99      0.99      0.99       140
   macro avg       0.99      0.99      0.99       140
weighted avg       0.99      0.99      0.99       140

[41  1  0 98]
svc Accuracy:  0.9928571428571429
svc F1:  0.991437832548468
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        42
           1       1.00      1.00      1.00        98

   micro avg       1.00      1.00      1.00       140
   macro avg       1.00      1.00      1.00       140
weighted avg       1.00      1.00      1.00       140

[42  0  0 98]
LR Accuracy:  1.0
LR F1:  1.0
For name:  m_ahmed
total sample size before apply threshold:  27
Counter({'0000-0002-4729-9068': 12, '0000-0002-1921-0724': 3, '0000-0002-4863-0402': 3, '0000-

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       104
           1       0.97      0.87      0.92        39
           2       0.96      1.00      0.98       487
           3       0.93      0.74      0.83        35
           4       1.00      0.89      0.94        57
           5       1.00      0.98      0.99       105

   micro avg       0.97      0.97      0.97       827
   macro avg       0.98      0.91      0.94       827
weighted avg       0.97      0.97      0.97       827

[104   0   0   0   0   0   0  34   4   1   0   0   1   0 485   1   0   0
   0   1   8  26   0   0   0   0   6   0  51   0   0   0   2   0   0 103]
LR Accuracy:  0.9709794437726723
LR F1:  0.9420304492991249
For name:  w_choi
total sample size before apply threshold:  118
Counter({'0000-0003-1801-9386': 79, '0000-0002-7896-7655': 16, '0000-0002-6623-3806': 7, '0000-0002-4203-0457': 6, '0000-0001-8038-5876': 3, '0000-0002-7183-3400': 3, '0000-0003-4233-01

Minimal sample size:  14
maximal sample size:  271
(114, 1029)
1
114
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        81
           1       1.00      0.94      0.97        33

   micro avg       0.98      0.98      0.98       114
   macro avg       0.99      0.97      0.98       114
weighted avg       0.98      0.98      0.98       114

[81  0  2 31]
svc Accuracy:  0.9824561403508771
svc F1:  0.9782774390243902
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        81
           1       1.00      1.00      1.00        33

   micro avg       1.00      1.00      1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114

[81  0  0 33]
LR Accuracy:  1.0
LR F1:  1.0
For name:  b_moreno
total sample size before apply threshold:  8
Counter({'0000-0001-5799-9802': 6, '0000-0002-8881-4329': 1, '0000-0002-1530-4977': 1})
[]
T

Minimal sample size:  7
maximal sample size:  290
(131, 1205)
1
131
              precision    recall  f1-score   support

           0       0.93      0.98      0.95        63
           1       1.00      0.93      0.97        30
           2       0.97      0.92      0.95        38

   micro avg       0.95      0.95      0.95       131
   macro avg       0.97      0.95      0.96       131
weighted avg       0.96      0.95      0.95       131

[62  0  1  2 28  0  3  0 35]
svc Accuracy:  0.9541984732824428
svc F1:  0.9551031137238034
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        63
           1       1.00      1.00      1.00        30
           2       0.97      1.00      0.99        38

   micro avg       0.99      0.99      0.99       131
   macro avg       0.99      0.99      0.99       131
weighted avg       0.99      0.99      0.99       131

[62  0  1  0 30  0  0  0 38]
LR Accuracy:  0.9923664122137404
LR F1:  0.9930043

Minimal sample size:  6
maximal sample size:  271
(78, 775)
1
78
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        37
           1       1.00      0.93      0.96        41

   micro avg       0.96      0.96      0.96        78
   macro avg       0.96      0.96      0.96        78
weighted avg       0.96      0.96      0.96        78

[37  0  3 38]
svc Accuracy:  0.9615384615384616
svc F1:  0.9615321387473287
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        37
           1       1.00      0.98      0.99        41

   micro avg       0.99      0.99      0.99        78
   macro avg       0.99      0.99      0.99        78
weighted avg       0.99      0.99      0.99        78

[37  0  1 40]
LR Accuracy:  0.9871794871794872
LR F1:  0.9871604938271604
For name:  c_meyer
total sample size before apply threshold:  136
Counter({'0000-0001-7599-3973': 34, '0000-0002-9877-1393': 29, '0

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       125
           1       1.00      1.00      1.00        41

   micro avg       1.00      1.00      1.00       166
   macro avg       1.00      1.00      1.00       166
weighted avg       1.00      1.00      1.00       166

[125   0   0  41]
LR Accuracy:  1.0
LR F1:  1.0
For name:  c_guo
total sample size before apply threshold:  6
Counter({'0000-0001-9253-3469': 2, '0000-0002-0432-8121': 2, '0000-0002-4000-8141': 1, '0000-0003-2182-3287': 1})
[]
Total sample size after apply threshold:  0
For name:  m_hansen
total sample size before apply threshold:  252
Counter({'0000-0001-5372-4828': 55, '0000-0002-8087-8731': 40, '0000-0002-4663-8742': 29, '0000-0001-7114-8051': 27, '0000-0003-3333-2856': 24, '0000-0002-8619-1519': 17, '0000-0002-2607-461X': 16, '0000-0002-5695-6728': 11, '0000-0002-1582-7866': 6, '0000-0003-1684-8578': 6, '0000-0002-1940-0616': 5, '0000-0003-3083-4850': 4, '0000-

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       244
           1       0.96      0.87      0.91        30
           2       0.98      0.97      0.98        67
           3       1.00      0.93      0.97        30
           4       0.96      0.94      0.95        71
           5       0.89      1.00      0.94        51

   micro avg       0.97      0.97      0.97       493
   macro avg       0.97      0.95      0.96       493
weighted avg       0.97      0.97      0.97       493

[243   0   0   0   0   1   0  26   0   0   3   1   0   0  65   0   0   2
   0   0   0  28   0   2   2   1   1   0  67   0   0   0   0   0   0  51]
LR Accuracy:  0.973630831643002
LR F1:  0.9573176062007703
For name:  j_burton
total sample size before apply threshold:  46
Counter({'0000-0003-1176-7592': 34, '0000-0003-2817-7353': 6, '0000-0001-5267-1277': 4, '0000-0002-3205-8819': 2})
['0000-0003-1176-7592']
Total sample size after apply threshold:  34
F

Minimal sample size:  35
maximal sample size:  251
(86, 755)
1
86
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        43

   micro avg       1.00      1.00      1.00        86
   macro avg       1.00      1.00      1.00        86
weighted avg       1.00      1.00      1.00        86

[43  0  0 43]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        43

   micro avg       1.00      1.00      1.00        86
   macro avg       1.00      1.00      1.00        86
weighted avg       1.00      1.00      1.00        86

[43  0  0 43]
LR Accuracy:  1.0
LR F1:  1.0
For name:  j_lynch
total sample size before apply threshold:  20
Counter({'0000-0002-1227-2252': 7, '0000-0003-0889-2616': 6, '0000-0003-3624-2741': 4, '0000-0003-0108-2127': 2, '0000-0002-4

Total sample size after apply threshold:  83
Minimal sample size:  16
maximal sample size:  281
(83, 796)
1
83
              precision    recall  f1-score   support

           0       0.91      0.97      0.94        31
           1       0.98      0.94      0.96        52

   micro avg       0.95      0.95      0.95        83
   macro avg       0.94      0.96      0.95        83
weighted avg       0.95      0.95      0.95        83

[30  1  3 49]
svc Accuracy:  0.9518072289156626
svc F1:  0.9491421568627451
              precision    recall  f1-score   support

           0       0.94      0.97      0.95        31
           1       0.98      0.96      0.97        52

   micro avg       0.96      0.96      0.96        83
   macro avg       0.96      0.96      0.96        83
weighted avg       0.96      0.96      0.96        83

[30  1  2 50]
LR Accuracy:  0.963855421686747
LR F1:  0.9616273693943597
For name:  j_huber
total sample size before apply threshold:  96
Counter({'0000-0001-7

Minimal sample size:  9
maximal sample size:  350
(280, 2380)
1
280
              precision    recall  f1-score   support

           0       0.98      0.96      0.97        53
           1       1.00      0.98      0.99        44
           2       1.00      0.98      0.99        92
           3       0.95      0.99      0.97        91

   micro avg       0.98      0.98      0.98       280
   macro avg       0.98      0.98      0.98       280
weighted avg       0.98      0.98      0.98       280

[51  0  0  2  0 43  0  1  0  0 90  2  1  0  0 90]
svc Accuracy:  0.9785714285714285
svc F1:  0.9791718107624671
              precision    recall  f1-score   support

           0       0.98      0.96      0.97        53
           1       1.00      1.00      1.00        44
           2       1.00      1.00      1.00        92
           3       0.98      0.99      0.98        91

   micro avg       0.99      0.99      0.99       280
   macro avg       0.99      0.99      0.99       280
weigh

Minimal sample size:  22
maximal sample size:  367
(124, 1163)
1
124
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        47
           1       0.93      0.98      0.95        41
           2       0.97      0.94      0.96        36

   micro avg       0.97      0.97      0.97       124
   macro avg       0.97      0.97      0.97       124
weighted avg       0.97      0.97      0.97       124

[46  1  0  0 40  1  0  2 34]
svc Accuracy:  0.967741935483871
svc F1:  0.9664582476940496
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        47
           1       1.00      1.00      1.00        41
           2       0.97      1.00      0.99        36

   micro avg       0.99      0.99      0.99       124
   macro avg       0.99      0.99      0.99       124
weighted avg       0.99      0.99      0.99       124

[46  0  1  0 41  0  0  0 36]
LR Accuracy:  0.9919354838709677
LR F1:  0.9918495

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       110
           1       1.00      1.00      1.00        90

   micro avg       1.00      1.00      1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

[110   0   0  90]
LR Accuracy:  1.0
LR F1:  1.0
For name:  a_sharma
total sample size before apply threshold:  223
Counter({'0000-0002-2653-0806': 85, '0000-0003-3349-4417': 23, '0000-0002-7668-3501': 14, '0000-0002-0172-5033': 12, '0000-0003-2264-2007': 10, '0000-0003-0553-4039': 9, '0000-0001-6906-190X': 9, '0000-0002-7029-9867': 8, '0000-0002-7442-8494': 8, '0000-0003-3281-2081': 6, '0000-0001-6539-9970': 6, '0000-0001-5061-9731': 5, '0000-0002-6201-7639': 5, '0000-0002-4117-8775': 4, '0000-0002-8458-9216': 3, '0000-0002-5251-9045': 3, '0000-0001-7570-852X': 2, '0000-0002-1655-5997': 2, '0000-0002-4374-4259': 2, '0000-0003-4841-0108': 2, '0000-0002-6862-136X': 2

Minimal sample size:  10
maximal sample size:  356
(287, 2396)
1
287
              precision    recall  f1-score   support

           0       0.95      0.84      0.89        50
           1       1.00      0.92      0.96        71
           2       0.86      0.93      0.89        71
           3       0.81      1.00      0.89        55
           4       0.94      0.78      0.85        40

   micro avg       0.90      0.90      0.90       287
   macro avg       0.91      0.89      0.90       287
weighted avg       0.91      0.90      0.90       287

[42  0  4  3  1  0 65  5  1  0  1  0 66  3  1  0  0  0 55  0  1  0  2  6
 31]
svc Accuracy:  0.9024390243902439
svc F1:  0.897003055538449
              precision    recall  f1-score   support

           0       0.92      0.90      0.91        50
           1       0.97      0.92      0.94        71
           2       0.88      0.96      0.92        71
           3       0.95      1.00      0.97        55
           4       0.94      0.8

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        54
           1       1.00      0.94      0.97        31
           2       0.99      1.00      0.99       138
           3       1.00      1.00      1.00        78
           4       0.95      0.95      0.95        38

   micro avg       0.99      0.99      0.99       339
   macro avg       0.99      0.98      0.98       339
weighted avg       0.99      0.99      0.99       339

[ 54   0   0   0   0   0  29   0   0   2   0   0 138   0   0   0   0   0
  78   0   0   0   2   0  36]
LR Accuracy:  0.9882005899705014
LR F1:  0.9813681686229965
For name:  m_richardson
total sample size before apply threshold:  175
Counter({'0000-0001-5672-9552': 166, '0000-0002-1650-0064': 5, '0000-0002-7390-9480': 3, '0000-0003-2694-5486': 1})
['0000-0001-5672-9552']
Total sample size after apply threshold:  166
For name:  c_ryan
total sample size before apply threshold:  159
Counter({'0000-0003-2158-9

Minimal sample size:  7
maximal sample size:  345
(97, 891)
1
97
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        51
           1       0.96      1.00      0.98        46

   micro avg       0.98      0.98      0.98        97
   macro avg       0.98      0.98      0.98        97
weighted avg       0.98      0.98      0.98        97

[49  2  0 46]
svc Accuracy:  0.979381443298969
svc F1:  0.9793617021276595
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        51
           1       1.00      1.00      1.00        46

   micro avg       1.00      1.00      1.00        97
   macro avg       1.00      1.00      1.00        97
weighted avg       1.00      1.00      1.00        97

[51  0  0 46]
LR Accuracy:  1.0
LR F1:  1.0
For name:  m_acosta
total sample size before apply threshold:  47
Counter({'0000-0002-5018-339X': 24, '0000-0003-4827-7271': 17, '0000-0003-0611-6672': 4, '0000-0

['0000-0002-7748-4440', '0000-0001-6883-3752', '0000-0001-5051-9896', '0000-0003-0708-6073', '0000-0003-0676-4610', '0000-0002-5799-6705', '0000-0001-5108-8338', '0000-0001-6758-1995']
Total sample size after apply threshold:  517
Minimal sample size:  10
maximal sample size:  440
(517, 3515)
1
517
              precision    recall  f1-score   support

           0       0.77      0.87      0.82        39
           1       0.97      0.92      0.94        36
           2       0.81      0.65      0.72        40
           3       0.96      0.95      0.95        73
           4       0.95      0.88      0.91        40
           5       0.85      0.91      0.88        93
           6       0.97      0.98      0.97       147
           7       0.94      0.94      0.94        49

   micro avg       0.91      0.91      0.91       517
   macro avg       0.90      0.89      0.89       517
weighted avg       0.91      0.91      0.91       517

[ 34   0   1   1   0   3   0   0   0  33   0   0 

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        33
           1       0.93      0.88      0.90        42
           2       0.91      0.75      0.82        56
           3       0.91      0.99      0.95       219
           4       0.98      0.97      0.97       194

   micro avg       0.94      0.94      0.94       544
   macro avg       0.95      0.89      0.91       544
weighted avg       0.94      0.94      0.94       544

[ 28   0   0   4   1   0  37   0   5   0   0   2  42   9   3   0   0   2
 217   0   0   1   2   3 188]
LR Accuracy:  0.9411764705882353
LR F1:  0.9135532519435567
For name:  e_lee
total sample size before apply threshold:  300
Counter({'0000-0003-0232-7704': 81, '0000-0003-0418-1454': 48, '0000-0001-7494-1776': 48, '0000-0003-1255-9808': 40, '0000-0001-7188-3857': 29, '0000-0002-6369-7429': 16, '0000-0001-9670-3242': 10, '0000-0001-8131-6872': 8, '0000-0001-5144-2552': 3, '0000-0003-4725-4959': 3, '0000-00

Minimal sample size:  2
maximal sample size:  300
(225, 1796)
1
225
              precision    recall  f1-score   support

           0       0.91      0.97      0.94        31
           1       0.99      0.98      0.99       194

   micro avg       0.98      0.98      0.98       225
   macro avg       0.95      0.98      0.96       225
weighted avg       0.98      0.98      0.98       225

[ 30   1   3 191]
svc Accuracy:  0.9822222222222222
svc F1:  0.9635686528497409
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        31
           1       1.00      1.00      1.00       194

   micro avg       1.00      1.00      1.00       225
   macro avg       1.00      1.00      1.00       225
weighted avg       1.00      1.00      1.00       225

[ 31   0   0 194]
LR Accuracy:  1.0
LR F1:  1.0
For name:  k_hong
total sample size before apply threshold:  127
Counter({'0000-0002-4684-6111': 44, '0000-0002-2852-5111': 29, '0000-0001-7325-1036':

Minimal sample size:  5
maximal sample size:  265
(85, 893)
1
85
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        42
           1       0.96      1.00      0.98        43

   micro avg       0.98      0.98      0.98        85
   macro avg       0.98      0.98      0.98        85
weighted avg       0.98      0.98      0.98        85

[40  2  0 43]
svc Accuracy:  0.9764705882352941
svc F1:  0.9764412416851441
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        42
           1       1.00      1.00      1.00        43

   micro avg       1.00      1.00      1.00        85
   macro avg       1.00      1.00      1.00        85
weighted avg       1.00      1.00      1.00        85

[42  0  0 43]
LR Accuracy:  1.0
LR F1:  1.0
For name:  j_regan
total sample size before apply threshold:  27
Counter({'0000-0003-2164-9151': 10, '0000-0001-5816-4516': 9, '0000-0001-9987-7942': 7, '0000-00

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       194
           1       0.88      0.90      0.89        39
           2       0.84      0.87      0.85        30
           3       0.91      0.91      0.91       101

   micro avg       0.95      0.95      0.95       364
   macro avg       0.91      0.92      0.91       364
weighted avg       0.95      0.95      0.95       364

[192   0   0   2   0  35   1   3   0   0  26   4   0   5   4  92]
svc Accuracy:  0.9478021978021978
svc F1:  0.9110611769297957
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       194
           1       0.90      0.92      0.91        39
           2       0.87      0.90      0.89        30
           3       0.94      0.93      0.94       101

   micro avg       0.96      0.96      0.96       364
   macro avg       0.93      0.94      0.93       364
weighted avg       0.96      0.96      0.96       364

[1

total sample size before apply threshold:  17
Counter({'0000-0002-1247-6896': 9, '0000-0003-3580-022X': 4, '0000-0002-9680-4518': 1, '0000-0003-0534-9749': 1, '0000-0002-9686-2769': 1, '0000-0001-8985-9679': 1})
[]
Total sample size after apply threshold:  0
For name:  f_martins
total sample size before apply threshold:  65
Counter({'0000-0003-0960-4620': 19, '0000-0003-4189-1228': 10, '0000-0003-2668-2401': 9, '0000-0002-1812-2300': 8, '0000-0003-2161-459X': 6, '0000-0002-9863-6255': 5, '0000-0002-3277-1809': 4, '0000-0002-0680-3643': 3, '0000-0003-4997-3973': 1})
[]
Total sample size after apply threshold:  0
For name:  s_wolf
total sample size before apply threshold:  363
Counter({'0000-0003-2972-3440': 173, '0000-0002-7467-7028': 102, '0000-0002-5337-5063': 46, '0000-0003-0832-6315': 15, '0000-0002-3747-8097': 12, '0000-0003-1752-6175': 9, '0000-0003-3921-6629': 3, '0000-0001-7717-6993': 2, '0000-0002-6748-3911': 1})
['0000-0002-7467-7028', '0000-0002-5337-5063', '0000-0003-2972-34

Minimal sample size:  19
maximal sample size:  332
(72, 843)
1
72
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        35
           1       0.95      0.95      0.95        37

   micro avg       0.94      0.94      0.94        72
   macro avg       0.94      0.94      0.94        72
weighted avg       0.94      0.94      0.94        72

[33  2  2 35]
svc Accuracy:  0.9444444444444444
svc F1:  0.9444015444015443
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        35
           1       1.00      0.97      0.99        37

   micro avg       0.99      0.99      0.99        72
   macro avg       0.99      0.99      0.99        72
weighted avg       0.99      0.99      0.99        72

[35  0  1 36]
LR Accuracy:  0.9861111111111112
LR F1:  0.9861084314103801
For name:  a_moura
total sample size before apply threshold:  36
Counter({'0000-0003-0339-1230': 15, '0000-0002-2105-7319': 14, '0

Minimal sample size:  10
maximal sample size:  262
(172, 1308)
1
172
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       120
           1       0.98      1.00      0.99        52

   micro avg       0.99      0.99      0.99       172
   macro avg       0.99      1.00      0.99       172
weighted avg       0.99      0.99      0.99       172

[119   1   0  52]
svc Accuracy:  0.9941860465116279
svc F1:  0.9931460450288903
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       120
           1       0.96      0.98      0.97        52

   micro avg       0.98      0.98      0.98       172
   macro avg       0.98      0.98      0.98       172
weighted avg       0.98      0.98      0.98       172

[118   2   1  51]
LR Accuracy:  0.9825581395348837
LR F1:  0.9794381350866705
For name:  h_shin
total sample size before apply threshold:  114
Counter({'0000-0001-7615-9809': 34, '0000-0001-7080-60

Minimal sample size:  4
maximal sample size:  607
(144, 1281)
1
144
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       102
           1       0.95      0.98      0.96        42

   micro avg       0.98      0.98      0.98       144
   macro avg       0.97      0.98      0.97       144
weighted avg       0.98      0.98      0.98       144

[100   2   1  41]
svc Accuracy:  0.9791666666666666
svc F1:  0.9749637786148941
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       102
           1       1.00      1.00      1.00        42

   micro avg       1.00      1.00      1.00       144
   macro avg       1.00      1.00      1.00       144
weighted avg       1.00      1.00      1.00       144

[102   0   0  42]
LR Accuracy:  1.0
LR F1:  1.0
For name:  m_kang
total sample size before apply threshold:  131
Counter({'0000-0003-1595-1717': 38, '0000-0003-3245-144X': 19, '0000-0002-2039-4866':

Minimal sample size:  10
maximal sample size:  313
(346, 2526)
1
346
              precision    recall  f1-score   support

           0       0.76      0.78      0.77        41
           1       0.97      0.84      0.90        38
           2       0.92      0.86      0.89        42
           3       0.97      0.92      0.94        37
           4       0.98      0.96      0.97        50
           5       1.00      0.89      0.94        38
           6       1.00      0.96      0.98        46
           7       0.73      0.94      0.82        54

   micro avg       0.90      0.90      0.90       346
   macro avg       0.92      0.89      0.90       346
weighted avg       0.91      0.90      0.90       346

[32  0  3  1  0  0  0  5  1 32  0  0  0  0  0  5  4  0 36  0  0  0  0  2
  1  0  0 34  1  0  0  1  0  0  0  0 48  0  0  2  1  0  0  0  0 34  0  3
  1  0  0  0  0  0 44  1  2  1  0  0  0  0  0 51]
svc Accuracy:  0.8988439306358381
svc F1:  0.9025407448084298
              precisio

For name:  k_zhu
total sample size before apply threshold:  6
Counter({'0000-0001-7664-7204': 3, '0000-0003-4361-1138': 1, '0000-0003-2784-3190': 1, '0000-0003-2293-3568': 1})
[]
Total sample size after apply threshold:  0
For name:  a_machado
total sample size before apply threshold:  150
Counter({'0000-0002-8132-5610': 54, '0000-0002-5677-7332': 30, '0000-0003-4380-3711': 25, '0000-0001-6200-3686': 16, '0000-0003-0732-1571': 14, '0000-0003-1999-1206': 4, '0000-0003-1947-8605': 4, '0000-0001-8957-661X': 2, '0000-0001-9341-5827': 1})
['0000-0002-8132-5610', '0000-0002-5677-7332']
Total sample size after apply threshold:  84
Minimal sample size:  2
maximal sample size:  312
(84, 747)
1
84
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        54
           1       1.00      0.93      0.97        30

   micro avg       0.98      0.98      0.98        84
   macro avg       0.98      0.97      0.97        84
weighted avg       0.98      0.

Minimal sample size:  3
maximal sample size:  365
(118, 1033)
1
118
              precision    recall  f1-score   support

           0       0.90      0.92      0.91        61
           1       0.91      0.89      0.90        57

   micro avg       0.91      0.91      0.91       118
   macro avg       0.91      0.91      0.91       118
weighted avg       0.91      0.91      0.91       118

[56  5  6 51]
svc Accuracy:  0.9067796610169492
svc F1:  0.906611986473847
              precision    recall  f1-score   support

           0       1.00      0.93      0.97        61
           1       0.93      1.00      0.97        57

   micro avg       0.97      0.97      0.97       118
   macro avg       0.97      0.97      0.97       118
weighted avg       0.97      0.97      0.97       118

[57  4  0 57]
LR Accuracy:  0.9661016949152542
LR F1:  0.9661016949152543
For name:  a_sinclair
total sample size before apply threshold:  109
Counter({'0000-0003-2741-7992': 64, '0000-0001-8510-8691': 3

['0000-0003-1378-4273']
Total sample size after apply threshold:  47
For name:  d_patel
total sample size before apply threshold:  33
Counter({'0000-0002-1154-3444': 9, '0000-0002-5744-568X': 8, '0000-0002-2236-7757': 5, '0000-0002-1110-0125': 3, '0000-0002-7198-1163': 2, '0000-0002-3746-8171': 2, '0000-0002-0375-2318': 2, '0000-0002-9592-1990': 2})
[]
Total sample size after apply threshold:  0
For name:  a_james
total sample size before apply threshold:  154
Counter({'0000-0002-4125-4053': 64, '0000-0002-1411-9307': 37, '0000-0002-0873-3714': 29, '0000-0001-8523-0857': 9, '0000-0002-6174-6696': 4, '0000-0001-8454-6219': 3, '0000-0003-4573-932X': 2, '0000-0001-5655-1213': 2, '0000-0002-0023-4363': 2, '0000-0001-9274-7803': 1, '0000-0002-2002-622X': 1})
['0000-0002-4125-4053', '0000-0002-1411-9307']
Total sample size after apply threshold:  101
Minimal sample size:  12
maximal sample size:  339
(101, 1046)
1
101
              precision    recall  f1-score   support

           0       

Minimal sample size:  0
maximal sample size:  283
(325, 1987)
1
325
              precision    recall  f1-score   support

           0       0.93      0.82      0.87        34
           1       0.94      0.95      0.94        95
           2       0.97      0.94      0.95       109
           3       0.76      0.94      0.84        48
           4       0.86      0.77      0.81        39

   micro avg       0.91      0.91      0.91       325
   macro avg       0.89      0.88      0.88       325
weighted avg       0.91      0.91      0.91       325

[ 28   1   3   1   1   0  90   0   3   2   2   1 102   4   0   0   1   0
  45   2   0   3   0   6  30]
svc Accuracy:  0.9076923076923077
svc F1:  0.8845223422277295
              precision    recall  f1-score   support

           0       0.94      0.91      0.93        34
           1       0.96      0.98      0.97        95
           2       0.99      0.94      0.97       109
           3       0.81      0.98      0.89        48
       

Minimal sample size:  7
maximal sample size:  299
(661, 3576)
1
661
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       587
           1       0.93      0.95      0.94        43
           2       0.96      0.84      0.90        31

   micro avg       0.98      0.98      0.98       661
   macro avg       0.96      0.93      0.94       661
weighted avg       0.98      0.98      0.98       661

[583   3   1   2  41   0   5   0  26]
svc Accuracy:  0.983358547655068
svc F1:  0.9432448884591972
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       587
           1       0.98      0.93      0.95        43
           2       0.94      0.94      0.94        31

   micro avg       0.99      0.99      0.99       661
   macro avg       0.97      0.95      0.96       661
weighted avg       0.99      0.99      0.99       661

[584   1   2   3  40   0   2   0  29]
LR Accuracy:  0.9878971255673222


Total sample size after apply threshold:  0
For name:  m_longo
total sample size before apply threshold:  44
Counter({'0000-0003-1117-1772': 33, '0000-0001-5062-6245': 5, '0000-0002-6364-8184': 3, '0000-0002-2450-4903': 2, '0000-0001-8325-4003': 1})
['0000-0003-1117-1772']
Total sample size after apply threshold:  33
For name:  h_chiang
total sample size before apply threshold:  44
Counter({'0000-0002-2979-6108': 18, '0000-0001-8781-5146': 14, '0000-0002-2333-9117': 7, '0000-0001-5041-9705': 5})
[]
Total sample size after apply threshold:  0
For name:  m_o'brien
total sample size before apply threshold:  34
Counter({'0000-0002-8509-3650': 20, '0000-0002-1721-0464': 9, '0000-0003-1096-1991': 4, '0000-0003-4990-3289': 1})
[]
Total sample size after apply threshold:  0
For name:  s_ray
total sample size before apply threshold:  123
Counter({'0000-0002-1051-7260': 75, '0000-0001-5675-1258': 30, '0000-0001-8034-7706': 9, '0000-0002-2414-2930': 5, '0000-0002-4640-708X': 2, '0000-0003-2566-71

              precision    recall  f1-score   support

           0       0.95      0.91      0.93        46
           1       0.92      0.98      0.95        57
           2       0.96      0.90      0.93        30

   micro avg       0.94      0.94      0.94       133
   macro avg       0.95      0.93      0.94       133
weighted avg       0.94      0.94      0.94       133

[42  3  1  1 56  0  1  2 27]
svc Accuracy:  0.9398496240601504
svc F1:  0.9378401194882784
              precision    recall  f1-score   support

           0       0.96      0.98      0.97        46
           1       0.98      0.98      0.98        57
           2       0.97      0.93      0.95        30

   micro avg       0.97      0.97      0.97       133
   macro avg       0.97      0.96      0.97       133
weighted avg       0.97      0.97      0.97       133

[45  0  1  1 56  0  1  1 28]
LR Accuracy:  0.9699248120300752
LR F1:  0.9664502060692098
For name:  w_xu
total sample size before apply threshold: 

              precision    recall  f1-score   support

           0       0.90      0.95      0.92        56
           1       0.83      0.94      0.88       124
           2       0.83      0.77      0.80        31
           3       0.94      0.91      0.92        32
           4       0.68      0.70      0.69        73
           5       0.85      0.91      0.88        32
           6       0.89      0.89      0.89        46
           7       0.91      0.82      0.86        38
           8       0.94      0.97      0.95        30
           9       0.63      0.84      0.72        38
          10       0.70      0.65      0.67        51
          11       0.95      0.90      0.92        41
          12       0.89      0.69      0.78        36
          13       0.76      0.65      0.70        78
          14       0.71      0.45      0.56        33
          15       0.97      0.98      0.98       200
          16       0.86      0.83      0.84        52
          17       0.95    

Total sample size after apply threshold:  288
Minimal sample size:  10
maximal sample size:  330
(288, 2062)
1
288
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       119
           1       1.00      0.85      0.92        48
           2       0.94      0.96      0.95        85
           3       0.97      0.97      0.97        36

   micro avg       0.96      0.96      0.96       288
   macro avg       0.97      0.95      0.95       288
weighted avg       0.96      0.96      0.96       288

[118   0   1   0   4  41   3   0   2   0  82   1   0   0   1  35]
svc Accuracy:  0.9583333333333334
svc F1:  0.9545630811399618
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       119
           1       1.00      0.92      0.96        48
           2       0.94      0.99      0.97        85
           3       1.00      0.94      0.97        36

   micro avg       0.97      0.97      0.97       

Minimal sample size:  7
maximal sample size:  295
(157, 1432)
1
157
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        60
           1       1.00      0.96      0.98        97

   micro avg       0.97      0.97      0.97       157
   macro avg       0.97      0.98      0.97       157
weighted avg       0.98      0.97      0.97       157

[60  0  4 93]
svc Accuracy:  0.9745222929936306
svc F1:  0.9733446519524618
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        60
           1       1.00      0.99      0.99        97

   micro avg       0.99      0.99      0.99       157
   macro avg       0.99      0.99      0.99       157
weighted avg       0.99      0.99      0.99       157

[60  0  1 96]
LR Accuracy:  0.9936305732484076
LR F1:  0.9932770950199118
For name:  m_aguilar
total sample size before apply threshold:  108
Counter({'0000-0002-1935-6619': 59, '0000-0001-7395-5754': 1

Minimal sample size:  7
maximal sample size:  344
(514, 3321)
1
514
              precision    recall  f1-score   support

           0       0.97      0.95      0.96        40
           1       0.91      0.98      0.95        53
           2       0.97      0.94      0.95        64
           3       0.96      0.87      0.92        31
           4       0.84      0.88      0.86        56
           5       0.95      0.97      0.96        98
           6       0.65      0.67      0.66        30
           7       0.87      0.98      0.93        63
           8       1.00      0.74      0.85        31
           9       1.00      0.94      0.97        48

   micro avg       0.92      0.92      0.92       514
   macro avg       0.91      0.89      0.90       514
weighted avg       0.92      0.92      0.92       514

[38  0  0  0  0  0  0  2  0  0  0 52  0  0  0  1  0  0  0  0  0  0 60  1
  0  1  1  1  0  0  0  0  2 27  0  1  1  0  0  0  0  0  0  0 49  0  6  1
  0  0  0  0  0  0  0 95  3

Minimal sample size:  14
maximal sample size:  310
(148, 1422)
1
148
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        89
           1       0.98      0.95      0.97        59

   micro avg       0.97      0.97      0.97       148
   macro avg       0.97      0.97      0.97       148
weighted avg       0.97      0.97      0.97       148

[88  1  3 56]
svc Accuracy:  0.972972972972973
svc F1:  0.9716475095785441
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        89
           1       1.00      0.97      0.98        59

   micro avg       0.99      0.99      0.99       148
   macro avg       0.99      0.98      0.99       148
weighted avg       0.99      0.99      0.99       148

[89  0  2 57]
LR Accuracy:  0.9864864864864865
LR F1:  0.985823754789272
For name:  m_sousa
total sample size before apply threshold:  211
Counter({'0000-0002-3009-3290': 117, '0000-0001-9424-4150': 28,

Minimal sample size:  19
maximal sample size:  305
(180, 1691)
1
180
              precision    recall  f1-score   support

           0       0.88      0.92      0.90        48
           1       0.94      0.97      0.95        61
           2       1.00      0.94      0.97        36
           3       0.97      0.91      0.94        35

   micro avg       0.94      0.94      0.94       180
   macro avg       0.95      0.94      0.94       180
weighted avg       0.94      0.94      0.94       180

[44  3  0  1  2 59  0  0  2  0 34  0  2  1  0 32]
svc Accuracy:  0.9388888888888889
svc F1:  0.9405442822290206
              precision    recall  f1-score   support

           0       0.94      0.96      0.95        48
           1       0.98      1.00      0.99        61
           2       1.00      1.00      1.00        36
           3       0.97      0.91      0.94        35

   micro avg       0.97      0.97      0.97       180
   macro avg       0.97      0.97      0.97       180
weig

Minimal sample size:  9
maximal sample size:  351
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(208, 1646)
1
208
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       133
           1       0.99      0.99      0.99        75

   micro avg       0.99      0.99      0.99       208
   macro avg       0.99      0.99      0.99       208
weighted avg       0.99      0.99      0.99       208

[132   1   1  74]
svc Accuracy:  0.9903846153846154
svc F1:  0.9895739348370928
   

Minimal sample size:  2
maximal sample size:  319
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(810, 4121)
1
810
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       154
           1       0.96      1.00      0.98       211
           2       0.98      0.84      0.91        57
           3       0.95      0.96      0.95       139
           4       1.00      0.91      0.95        45
           5       0.96      0.88      0.92        57
           6       0.96      0.

Minimal sample size:  15
maximal sample size:  292
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(93, 869)
1
93
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        61
           1       1.00      0.72      0.84        32

   micro avg       0.90      0.90      0.90        93
   macro avg       0.94      0.86      0.88        93
weighted avg       0.92      0.90      0.90        93

[61  0  9 23]
svc Accuracy:  0.9032258064516129
svc F1:  0.8838306731436503
         

Minimal sample size:  9
maximal sample size:  297
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(329, 2058)
1
329
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       205
           1       1.00      0.97      0.99        38
           2       1.00      0.95      0.98        86

   micro avg       0.98      0.98      0.98       329
   macro avg       0.99      0.98      0.98       329
weighted avg       0.99      0.98      0.98       329

[205   0   0   1  37   0   4 

Minimal sample size:  8
maximal sample size:  351
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(283, 2248)
1
283
              precision    recall  f1-score   support

           0       0.92      0.97      0.94       149
           1       0.96      0.90      0.93        98
           2       0.97      0.94      0.96        36

   micro avg       0.94      0.94      0.94       283
   macro avg       0.95      0.94      0.94       283
weighted avg       0.94      0.94      0.94       283

[144   4   1  10  88   0   2 

              precision    recall  f1-score   support

           0       0.95      0.99      0.97        85
           1       0.97      1.00      0.98        31
           2       1.00      0.93      0.96        54

   micro avg       0.97      0.97      0.97       170
   macro avg       0.97      0.97      0.97       170
weighted avg       0.97      0.97      0.97       170

[84  1  0  0 31  0  4  0 50]
svc Accuracy:  0.9705882352941176
svc F1:  0.9722545705204665
              precision    recall  f1-score   support

           0       0.84      1.00      0.91        85
           1       1.00      0.65      0.78        31
           2       1.00      0.91      0.95        54

   micro avg       0.91      0.91      0.91       170
   macro avg       0.95      0.85      0.88       170
weighted avg       0.92      0.91      0.90       170

[85  0  0 11 20  0  5  0 49]
LR Accuracy:  0.9058823529411765
LR F1:  0.8832495102644878
For name:  c_cardoso
total sample size before apply thresh

Minimal sample size:  4
maximal sample size:  289
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(189, 1677)
1
189
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        65
           1       1.00      1.00      1.00       124

   micro avg       1.00      1.00      1.00       189
   macro avg       1.00      1.00      1.00       189
weighted avg       1.00      1.00      1.00       189

[ 65   0   0 124]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall

Minimal sample size:  8
maximal sample size:  248
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(103, 801)
1
103
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        61
           1       1.00      1.00      1.00        42

   micro avg       1.00      1.00      1.00       103
   macro avg       1.00      1.00      1.00       103
weighted avg       1.00      1.00      1.00       103

[61  0  0 42]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-

Minimal sample size:  29
maximal sample size:  277
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(94, 889)
1
94
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        53

   micro avg       1.00      1.00      1.00        94
   macro avg       1.00      1.00      1.00        94
weighted avg       1.00      1.00      1.00        94

[41  0  0 53]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-s

Counter({'0000-0002-3765-3318': 238, '0000-0002-4450-3772': 31, '0000-0001-9862-6578': 5, '0000-0002-3494-9658': 3, '0000-0003-0684-0740': 2})
['0000-0002-3765-3318', '0000-0002-4450-3772']
Total sample size after apply threshold:  269
Minimal sample size:  13
maximal sample size:  317
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(269, 2025)
1
269
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       238
           1       1.00      1.00      1.00        31

   micro 

Minimal sample size:  9
maximal sample size:  292
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(168, 1429)
1
168
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       124
           1       1.00      0.98      0.99        44

   micro avg       0.99      0.99      0.99       168
   macro avg       1.00      0.99      0.99       168
weighted avg       0.99      0.99      0.99       168

[124   0   1  43]
svc Accuracy:  0.9940476190476191
svc F1:  0.9922448414347043
   

Minimal sample size:  4
maximal sample size:  340
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(167, 1263)
1
167
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       110
           1       1.00      1.00      1.00        57

   micro avg       1.00      1.00      1.00       167
   macro avg       1.00      1.00      1.00       167
weighted avg       1.00      1.00      1.00       167

[110   0   0  57]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall

Minimal sample size:  8
maximal sample size:  330
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(168, 1484)
1
168
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        44
           1       0.97      0.98      0.97        59
           2       0.97      0.97      0.97        65

   micro avg       0.98      0.98      0.98       168
   macro avg       0.98      0.98      0.98       168
weighted avg       0.98      0.98      0.98       168

[43  0  1  0 58  1  0  2 63]


Minimal sample size:  5
maximal sample size:  352
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(398, 2348)
1
398
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       231
           1       1.00      1.00      1.00       167

   micro avg       1.00      1.00      1.00       398
   macro avg       1.00      1.00      1.00       398
weighted avg       1.00      1.00      1.00       398

[231   0   0 167]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall

Minimal sample size:  1
maximal sample size:  288
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(351, 2437)
1
351
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       188
           1       0.98      1.00      0.99        65
           2       0.99      0.96      0.97        98

   micro avg       0.99      0.99      0.99       351
   macro avg       0.99      0.98      0.99       351
weighted avg       0.99      0.99      0.99       351

[187   0   1   0  65   0   3 

Minimal sample size:  3
maximal sample size:  331
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(101, 786)
1
101
              precision    recall  f1-score   support

           0       1.00      0.93      0.97        30
           1       0.95      0.97      0.96        38
           2       0.91      0.94      0.93        33

   micro avg       0.95      0.95      0.95       101
   macro avg       0.95      0.95      0.95       101
weighted avg       0.95      0.95      0.95       101

[28  0  2  0 37  1  0  2 31]
s

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        35
           1       0.92      0.77      0.84        31
           2       0.90      0.87      0.88        30
           3       0.96      0.84      0.90        57
           4       0.98      0.89      0.93        55
           5       0.93      0.81      0.86        31
           6       0.87      0.96      0.91       121
           7       0.71      0.98      0.83       117
           8       0.90      0.93      0.92        30
           9       0.98      0.98      0.98        60
          10       0.91      0.96      0.94        54
          11       0.97      0.86      0.91        37
          12       0.84      0.53      0.65        40
          13       0.97      0.74      0.84        42
          14       0.94      0.55      0.69        31
          15       0.85      0.90      0.87        69

   micro avg       0.88      0.88      0.88       840
   macro avg       0.91   

Minimal sample size:  6
maximal sample size:  277
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(118, 1027)
1
118
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        44
           1       1.00      1.00      1.00        32
           2       0.98      1.00      0.99        42

   micro avg       0.99      0.99      0.99       118
   macro avg       0.99      0.99      0.99       118
weighted avg       0.99      0.99      0.99       118

[43  0  1  0 32  0  0  0 42]


Minimal sample size:  6
maximal sample size:  320
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(120, 1150)
1
120
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        34
           1       0.98      1.00      0.99        86

   micro avg       0.98      0.98      0.98       120
   macro avg       0.99      0.97      0.98       120
weighted avg       0.98      0.98      0.98       120

[32  2  0 86]
svc Accuracy:  0.9833333333333333
svc F1:  0.9791013584117032
       

Minimal sample size:  3
maximal sample size:  354
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(428, 2713)
1
428
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        36
           1       0.92      0.88      0.90        40
           2       0.93      0.99      0.96       108
           3       0.98      0.99      0.98        82
           4       0.99      0.97      0.98       100
           5       0.98      0.95      0.97        62

   micro avg       0.96      0

Minimal sample size:  14
maximal sample size:  313
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(113, 1085)
1
113
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        73

   micro avg       1.00      1.00      1.00       113
   macro avg       1.00      1.00      1.00       113
weighted avg       1.00      1.00      1.00       113

[40  0  0 73]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f

Minimal sample size:  23
maximal sample size:  408
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(100, 1136)
1
100
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        57
           1       0.98      1.00      0.99        43

   micro avg       0.99      0.99      0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100

[56  1  0 43]
svc Accuracy:  0.99
svc F1:  0.9898280948021565
              precis

Minimal sample size:  3
maximal sample size:  292
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(81, 789)
1
81
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        38

   micro avg       1.00      1.00      1.00        81
   macro avg       1.00      1.00      1.00        81
weighted avg       1.00      1.00      1.00        81

[43  0  0 38]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-sc

For name:  c_feng
total sample size before apply threshold:  88
Counter({'0000-0002-1854-356X': 30, '0000-0002-2130-8851': 26, '0000-0003-3267-0968': 12, '0000-0002-7031-4211': 12, '0000-0002-3278-9451': 7, '0000-0003-1085-4395': 1})
['0000-0002-1854-356X']
Total sample size after apply threshold:  30
For name:  j_coutinho
total sample size before apply threshold:  129
Counter({'0000-0002-3841-743X': 105, '0000-0002-6303-9549': 13, '0000-0002-1562-0099': 8, '0000-0003-0280-366X': 3})
['0000-0002-3841-743X']
Total sample size after apply threshold:  105
For name:  s_huber
total sample size before apply threshold:  44
Counter({'0000-0002-4125-159X': 26, '0000-0003-3558-351X': 12, '0000-0002-8271-7835': 3, '0000-0002-5842-5859': 2, '0000-0001-6303-5188': 1})
[]
Total sample size after apply threshold:  0
For name:  a_rocha
total sample size before apply threshold:  73
Counter({'0000-0003-3218-7001': 26, '0000-0001-9710-9835': 21, '0000-0003-2165-5519': 12, '0000-0002-4094-7982': 3, '0000-

Minimal sample size:  0
maximal sample size:  435
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(296, 2258)
1
296
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       1.00      0.97      0.98        30
           2       0.98      0.99      0.99       124
           3       0.99      1.00      0.99        99

   micro avg       0.99      0.99      0.99       296
   macro avg       0.99      0.98      0.99       296
weighted avg       0.99      0

Minimal sample size:  5
maximal sample size:  493
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(116, 1209)
1
116
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        75

   micro avg       1.00      1.00      1.00       116
   macro avg       1.00      1.00      1.00       116
weighted avg       1.00      1.00      1.00       116

[41  0  0 75]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1

Minimal sample size:  5
maximal sample size:  312
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(651, 4117)
1
651
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        31
           1       0.99      1.00      0.99        73
           2       1.00      0.94      0.97        33
           3       0.88      0.93      0.90        55
           4       0.97      1.00      0.98        92
           5       0.90      0.99      0.94        87
           6       0.96      0.

Minimal sample size:  2
maximal sample size:  307
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(303, 2211)
1
303
              precision    recall  f1-score   support

           0       0.93      0.96      0.95        57
           1       1.00      0.99      0.99        81
           2       1.00      0.96      0.98        48
           3       0.97      0.98      0.98       117

   micro avg       0.98      0.98      0.98       303
   macro avg       0.98      0.97      0.97       303
weighted avg       0.98      0

Minimal sample size:  1
maximal sample size:  249
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(91, 716)
1
91
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        54
           1       1.00      0.97      0.99        37

   micro avg       0.99      0.99      0.99        91
   macro avg       0.99      0.99      0.99        91
weighted avg       0.99      0.99      0.99        91

[54  0  1 36]
svc Accuracy:  0.989010989010989
svc F1:  0.9885635289682041
           

Minimal sample size:  16
maximal sample size:  243
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(86, 872)
1
86
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        46

   micro avg       1.00      1.00      1.00        86
   macro avg       1.00      1.00      1.00        86
weighted avg       1.00      1.00      1.00        86

[40  0  0 46]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-s

Minimal sample size:  8
maximal sample size:  303
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(149, 1271)
1
149
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       107
           1       1.00      1.00      1.00        42

   micro avg       1.00      1.00      1.00       149
   macro avg       1.00      1.00      1.00       149
weighted avg       1.00      1.00      1.00       149

[107   0   0  42]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall

[ 34   0   1   0   0   0  56  12   1   0   0   2 104   0   0   0   5   3
  32   0   0   0  11   0  34]
LR Accuracy:  0.8813559322033898
LR F1:  0.8898202096202714
For name:  r_thomas
total sample size before apply threshold:  368
Counter({'0000-0002-0518-8386': 95, '0000-0002-2340-0301': 95, '0000-0003-1448-7182': 74, '0000-0003-2062-8623': 46, '0000-0001-9251-5543': 13, '0000-0002-2970-6352': 10, '0000-0002-2165-5917': 8, '0000-0003-1282-7825': 5, '0000-0003-3588-2317': 5, '0000-0002-7286-2764': 4, '0000-0001-8784-1707': 2, '0000-0001-5256-3313': 2, '0000-0002-2069-1799': 2, '0000-0002-8745-7462': 2, '0000-0001-5296-3114': 1, '0000-0002-8872-7866': 1, '0000-0003-3473-2579': 1, '0000-0002-5362-4816': 1, '0000-0001-7194-3653': 1})
['0000-0002-0518-8386', '0000-0003-2062-8623', '0000-0003-1448-7182', '0000-0002-2340-0301']
Total sample size after apply threshold:  310
Minimal sample size:  4
maximal sample size:  447
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
 

total sample size before apply threshold:  248
Counter({'0000-0003-0224-6322': 70, '0000-0003-2434-7511': 33, '0000-0001-6303-3801': 26, '0000-0003-0420-2374': 23, '0000-0002-4379-0909': 19, '0000-0002-2736-4037': 13, '0000-0001-6623-4369': 11, '0000-0002-9971-0541': 10, '0000-0001-9223-8590': 9, '0000-0003-3053-0929': 8, '0000-0001-7886-1765': 5, '0000-0003-1265-0337': 3, '0000-0001-7350-9578': 3, '0000-0003-4262-1895': 3, '0000-0002-3863-1719': 2, '0000-0002-3463-0196': 2, '0000-0002-9252-0331': 2, '0000-0002-6631-3232': 2, '0000-0003-3160-4643': 1, '0000-0002-2932-440X': 1, '0000-0003-3497-2513': 1, '0000-0002-7357-2136': 1})
['0000-0003-0224-6322', '0000-0003-2434-7511']
Total sample size after apply threshold:  103
Minimal sample size:  25
maximal sample size:  269
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ng

Minimal sample size:  12
maximal sample size:  324
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(115, 1094)
1
115
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        72
           1       1.00      1.00      1.00        43

   micro avg       1.00      1.00      1.00       115
   macro avg       1.00      1.00      1.00       115
weighted avg       1.00      1.00      1.00       115

[72  0  0 43]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f

Minimal sample size:  5
maximal sample size:  355
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(121, 1145)
1
121
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        52
           1       1.00      1.00      1.00        69

   micro avg       1.00      1.00      1.00       121
   macro avg       1.00      1.00      1.00       121
weighted avg       1.00      1.00      1.00       121

[52  0  0 69]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1

For name:  b_thompson
total sample size before apply threshold:  83
Counter({'0000-0002-5885-0652': 65, '0000-0002-5358-0796': 8, '0000-0002-2302-0886': 7, '0000-0002-3845-824X': 3})
['0000-0002-5885-0652']
Total sample size after apply threshold:  65
For name:  j_blanco
total sample size before apply threshold:  362
Counter({'0000-0003-0264-4136': 102, '0000-0002-2225-0217': 91, '0000-0001-8142-0450': 74, '0000-0003-3765-0640': 41, '0000-0003-0647-3856': 40, '0000-0002-5071-4760': 7, '0000-0002-6524-4335': 5, '0000-0002-7351-5342': 1, '0000-0003-0191-2063': 1})
['0000-0003-0264-4136', '0000-0003-0647-3856', '0000-0002-2225-0217', '0000-0003-3765-0640', '0000-0001-8142-0450']
Total sample size after apply threshold:  348
Minimal sample size:  7
maximal sample size:  313
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ng

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       149
           1       1.00      0.90      0.95        39

   micro avg       0.98      0.98      0.98       188
   macro avg       0.99      0.95      0.97       188
weighted avg       0.98      0.98      0.98       188

[149   0   4  35]
svc Accuracy:  0.9787234042553191
svc F1:  0.9663504564166816
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       149
           1       1.00      0.15      0.27        39

   micro avg       0.82      0.82      0.82       188
   macro avg       0.91      0.58      0.58       188
weighted avg       0.86      0.82      0.77       188

[149   0  33   6]
LR Accuracy:  0.824468085106383
LR F1:  0.583484390735146
For name:  j_young
total sample size before apply threshold:  267
Counter({'0000-0002-1514-1522': 124, '0000-0003-4182-341X': 40, '0000-0003-3849-3392': 30, '0000-0002-1294-942X': 23, '0000-

Minimal sample size:  13
maximal sample size:  296
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(184, 1512)
1
184
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       112
           1       1.00      1.00      1.00        72

   micro avg       1.00      1.00      1.00       184
   macro avg       1.00      1.00      1.00       184
weighted avg       1.00      1.00      1.00       184

[112   0   0  72]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recal

Minimal sample size:  5
maximal sample size:  283
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(171, 1229)
1
171
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        61
           1       0.96      0.96      0.96        70
           2       0.97      0.97      0.97        40

   micro avg       0.96      0.96      0.96       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.96      0.96      0.96       171

[59  2  0  2 67  1  0  1 39]


              precision    recall  f1-score   support

           0       0.88      0.95      0.92        64
           1       0.89      0.88      0.88        56
           2       1.00      0.96      0.98        48
           3       0.96      0.92      0.94        48
           4       1.00      0.93      0.97        45
           5       0.88      1.00      0.94       104
           6       0.93      0.84      0.88        31
           7       1.00      0.78      0.88        37

   micro avg       0.93      0.93      0.93       433
   macro avg       0.94      0.91      0.92       433
weighted avg       0.93      0.93      0.93       433

[ 61   1   0   1   0   1   0   0   1  49   0   1   0   3   2   0   1   0
  46   0   0   1   0   0   4   0   0  44   0   0   0   0   0   0   0   0
  42   3   0   0   0   0   0   0   0 104   0   0   0   5   0   0   0   0
  26   0   2   0   0   0   0   6   0  29]
LR Accuracy:  0.9260969976905312
LR F1:  0.9222084652867977
For name:  m_young
total sam

Minimal sample size:  16
maximal sample size:  267
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(98, 846)
1
98
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        67
           1       1.00      1.00      1.00        31

   micro avg       1.00      1.00      1.00        98
   macro avg       1.00      1.00      1.00        98
weighted avg       1.00      1.00      1.00        98

[67  0  0 31]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-s

Minimal sample size:  1
maximal sample size:  457
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(537, 3435)
1
537
              precision    recall  f1-score   support

           0       1.00      0.82      0.90        39
           1       0.95      0.91      0.93       156
           2       0.95      0.99      0.97       158
           3       0.94      0.97      0.95        74
           4       0.97      0.93      0.95        30
           5       0.72      0.80      0.76        41
           6       1.00      0.

Minimal sample size:  20
maximal sample size:  328
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(286, 2159)
1
286
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       166
           1       1.00      0.97      0.99        80
           2       1.00      0.95      0.97        40

   micro avg       0.99      0.99      0.99       286
   macro avg       0.99      0.97      0.98       286
weighted avg       0.99      0.99      0.99       286

[166   0   0   2  78   0   2

Minimal sample size:  10
maximal sample size:  301
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(401, 2739)
1
401
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       175
           1       0.98      1.00      0.99       185
           2       1.00      0.95      0.97        41

   micro avg       0.99      0.99      0.99       401
   macro avg       0.99      0.98      0.99       401
weighted avg       0.99      0.99      0.99       401

[174   1   0   0 185   0   0

Minimal sample size:  7
maximal sample size:  296
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(420, 2611)
1
420
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       322
           1       1.00      0.92      0.96        37
           2       0.98      0.97      0.98        61

   micro avg       0.99      0.99      0.99       420
   macro avg       0.99      0.96      0.97       420
weighted avg       0.99      0.99      0.99       420

[321   0   1   3  34   0   2 

Minimal sample size:  14
maximal sample size:  304
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(78, 795)
1
78
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        35

   micro avg       1.00      1.00      1.00        78
   macro avg       1.00      1.00      1.00        78
weighted avg       1.00      1.00      1.00        78

[43  0  0 35]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-s

Minimal sample size:  8
maximal sample size:  268
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(229, 1555)
1
229
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       140
           1       1.00      1.00      1.00        89

   micro avg       1.00      1.00      1.00       229
   macro avg       1.00      1.00      1.00       229
weighted avg       1.00      1.00      1.00       229

[140   0   0  89]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall

Minimal sample size:  5
maximal sample size:  303
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(401, 2673)
1
401
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       188
           1       1.00      0.98      0.99        58
           2       0.95      0.98      0.97        57
           3       0.89      0.95      0.92        58
           4       0.97      0.82      0.89        40

   micro avg       0.97      0.97      0.97       401
   macro avg       0.96      0

              precision    recall  f1-score   support

           0       0.95      0.96      0.96        85
           1       0.94      0.94      0.94        33
           2       0.93      0.90      0.92        31
           3       1.00      1.00      1.00        38

   micro avg       0.96      0.96      0.96       187
   macro avg       0.96      0.95      0.95       187
weighted avg       0.96      0.96      0.96       187

[82  1  2  0  2 31  0  0  2  1 28  0  0  0  0 38]
svc Accuracy:  0.9572192513368984
svc F1:  0.9541227634411414
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        85
           1       0.96      0.73      0.83        33
           2       1.00      0.48      0.65        31
           3       1.00      0.97      0.99        38

   micro avg       0.86      0.86      0.86       187
   macro avg       0.93      0.80      0.83       187
weighted avg       0.89      0.86      0.85       187

[85  0  0  0  9 24

Minimal sample size:  16
maximal sample size:  254
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(107, 1081)
1
107
              precision    recall  f1-score   support

           0       0.94      0.90      0.92        50
           1       0.92      0.95      0.93        57

   micro avg       0.93      0.93      0.93       107
   macro avg       0.93      0.92      0.92       107
weighted avg       0.93      0.93      0.93       107

[45  5  3 54]
svc Accuracy:  0.9252336448598131
svc F1:  0.924700914848698
       

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       1.00      0.96      0.98       104
           1       1.00      0.31      0.47        39
           2       0.82      1.00      0.90       487
           3       0.00      0.00      0.00        35
           4       1.00      0.44      0.61        57
           5       1.00      0.94      0.97       105

   micro avg       0.87      0.87      0.87       827
   macro avg       0.80      0.61      0.66       827
weighted avg       0.85      0.87      0.84       827

[100   0   4   0   0   0   0  12  27   0   0   0   0   0 487   0   0   0
   0   0  35   0   0   0   0   0  32   0  25   0   0   0   6   0   0  99]
LR Accuracy:  0.8742442563482467
LR F1:  0.6558082952323575
For name:  w_choi
total sample size before apply threshold:  118
Counter({'0000-0003-1801-9386': 79, '0000-0002-7896-7655': 16, '0000-0002-6623-3806': 7, '0000-0002-4203-0457': 6, '0000-0001-8038-5876': 3, '0000-0002-7183-3400': 3, '0000-0003-4233-01

svc F1:  0.9810281517747859
              precision    recall  f1-score   support

           0       0.91      1.00      0.96        85
           1       1.00      0.79      0.89        39

   micro avg       0.94      0.94      0.94       124
   macro avg       0.96      0.90      0.92       124
weighted avg       0.94      0.94      0.93       124

[85  0  8 31]
LR Accuracy:  0.9354838709677419
LR F1:  0.9203852327447832
For name:  c_franco
total sample size before apply threshold:  64
Counter({'0000-0003-1958-3851': 28, '0000-0003-2288-1518': 18, '0000-0002-2861-3883': 17, '0000-0003-2729-4064': 1})
[]
Total sample size after apply threshold:  0
For name:  v_wong
total sample size before apply threshold:  35
Counter({'0000-0001-6751-7942': 14, '0000-0002-2951-8108': 12, '0000-0001-9356-7556': 8, '0000-0003-2844-3789': 1})
[]
Total sample size after apply threshold:  0
For name:  j_feng
total sample size before apply threshold:  147
Counter({'0000-0003-4762-7532': 102, '0000-0003-1

Minimal sample size:  0
maximal sample size:  305
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(326, 2036)
1
326
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       206
           1       1.00      0.98      0.99       120

   micro avg       0.99      0.99      0.99       326
   macro avg       1.00      0.99      0.99       326
weighted avg       0.99      0.99      0.99       326

[206   0   2 118]
svc Accuracy:  0.9938650306748467
svc F1:  0.993382860390533
    

Minimal sample size:  25
maximal sample size:  305
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(101, 979)
1
101
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        69
           1       1.00      0.88      0.93        32

   micro avg       0.96      0.96      0.96       101
   macro avg       0.97      0.94      0.95       101
weighted avg       0.96      0.96      0.96       101

[69  0  4 28]
svc Accuracy:  0.9603960396039604
svc F1:  0.9525821596244132
       

total sample size before apply threshold:  60
Counter({'0000-0002-7710-4284': 22, '0000-0002-7687-4746': 17, '0000-0002-6767-6596': 13, '0000-0001-7712-0025': 6, '0000-0002-4817-6385': 2})
[]
Total sample size after apply threshold:  0
For name:  t_ito
total sample size before apply threshold:  69
Counter({'0000-0001-7443-3157': 34, '0000-0003-4516-9283': 17, '0000-0003-0686-8129': 5, '0000-0002-4237-3564': 4, '0000-0003-1971-4313': 3, '0000-0001-9873-099X': 3, '0000-0003-1279-228X': 1, '0000-0001-6015-9302': 1, '0000-0002-9274-7050': 1})
['0000-0001-7443-3157']
Total sample size after apply threshold:  34
For name:  t_jackson
total sample size before apply threshold:  47
Counter({'0000-0001-6351-2773': 23, '0000-0001-6749-9959': 9, '0000-0003-1669-6666': 6, '0000-0003-3214-3973': 3, '0000-0001-8404-4251': 2, '0000-0002-0248-2627': 2, '0000-0002-5489-6020': 1, '0000-0003-2387-6411': 1})
[]
Total sample size after apply threshold:  0
For name:  m_romero
total sample size before apply th

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        77
           1       1.00      0.98      0.99        60

   micro avg       0.99      0.99      0.99       137
   macro avg       0.99      0.99      0.99       137
weighted avg       0.99      0.99      0.99       137

[77  0  1 59]
svc Accuracy:  0.9927007299270073
svc F1:  0.9925725128761181
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        77
           1       1.00      0.98      0.99        60

   micro avg       0.99      0.99      0.99       137
   macro avg       0.99      0.99      0.99       137
weighted avg       0.99      0.99      0.99       137

[77  0  1 59]
LR Accuracy:  0.9927007299270073
LR F1:  0.9925725128761181
For name:  b_zhou
total sample size before apply threshold:  20
Counter({'0000-0002-1535-6283': 13, '0000-0003-2846-1813': 2, '0000-0003-2634-1527': 1, '0000-0001-9774-2737': 1, '0000-0003-1560-49

Minimal sample size:  2
maximal sample size:  341
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(259, 2032)
1
259
              precision    recall  f1-score   support

           0       0.82      0.93      0.87        40
           1       0.98      0.89      0.93        45
           2       0.93      0.82      0.87        34
           3       0.98      0.93      0.95        44
           4       0.84      0.95      0.89        65
           5       0.93      0.81      0.86        31

   micro avg       0.90      0

Minimal sample size:  8
maximal sample size:  274
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(86, 726)
1
86
              precision    recall  f1-score   support

           0       0.92      0.96      0.94        49
           1       0.94      0.89      0.92        37

   micro avg       0.93      0.93      0.93        86
   macro avg       0.93      0.93      0.93        86
weighted avg       0.93      0.93      0.93        86

[47  2  4 33]
svc Accuracy:  0.9302325581395349
svc F1:  0.9283333333333335
          

Minimal sample size:  11
maximal sample size:  262
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(86, 744)
1
86
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        38
           1       1.00      1.00      1.00        48

   micro avg       1.00      1.00      1.00        86
   macro avg       1.00      1.00      1.00        86
weighted avg       1.00      1.00      1.00        86

[38  0  0 48]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-s

Minimal sample size:  9
maximal sample size:  350
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(280, 2380)
1
280
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        53
           1       1.00      1.00      1.00        44
           2       1.00      1.00      1.00        92
           3       0.98      1.00      0.99        91

   micro avg       0.99      0.99      0.99       280
   macro avg       0.99      0.99      0.99       280
weighted avg       0.99      0

Minimal sample size:  2
maximal sample size:  346
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(172, 1566)
1
172
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       130
           1       1.00      0.98      0.99        42

   micro avg       0.99      0.99      0.99       172
   macro avg       1.00      0.99      0.99       172
weighted avg       0.99      0.99      0.99       172

[130   0   1  41]
svc Accuracy:  0.9941860465116279
svc F1:  0.9920601948021972
   

total sample size before apply threshold:  91
Counter({'0000-0003-0116-1386': 50, '0000-0003-2283-1377': 23, '0000-0003-4880-6006': 14, '0000-0002-6155-8644': 2, '0000-0002-5268-290X': 1, '0000-0002-7437-4211': 1})
['0000-0003-0116-1386']
Total sample size after apply threshold:  50
For name:  a_murray
total sample size before apply threshold:  76
Counter({'0000-0002-4094-962X': 32, '0000-0002-0929-9315': 14, '0000-0001-7143-287X': 9, '0000-0001-6762-588X': 7, '0000-0001-5014-1096': 7, '0000-0001-7047-8139': 4, '0000-0001-9648-2902': 3})
['0000-0002-4094-962X']
Total sample size after apply threshold:  32
For name:  b_cao
total sample size before apply threshold:  58
Counter({'0000-0002-9462-496X': 39, '0000-0003-3588-972X': 14, '0000-0003-3401-6900': 4, '0000-0003-4443-2326': 1})
['0000-0002-9462-496X']
Total sample size after apply threshold:  39
For name:  k_sohn
total sample size before apply threshold:  31
Counter({'0000-0002-3237-044X': 17, '0000-0001-8941-1188': 12, '0000-0001-9

Minimal sample size:  18
maximal sample size:  255
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(131, 1249)
1
131
              precision    recall  f1-score   support

           0       0.95      0.97      0.96        60
           1       0.97      0.96      0.96        71

   micro avg       0.96      0.96      0.96       131
   macro avg       0.96      0.96      0.96       131
weighted avg       0.96      0.96      0.96       131

[58  2  3 68]
svc Accuracy:  0.9618320610687023
svc F1:  0.9616083465213059
      

Minimal sample size:  3
maximal sample size:  302
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(218, 1923)
1
218
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        98
           1       0.98      1.00      0.99        55
           2       1.00      0.98      0.99        65

   micro avg       1.00      1.00      1.00       218
   macro avg       0.99      0.99      0.99       218
weighted avg       1.00      1.00      1.00       218

[98  0  0  0 55  0  0  1 64]


Minimal sample size:  57
maximal sample size:  287
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(75, 791)
1
75
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        35
           1       0.97      0.97      0.97        40

   micro avg       0.97      0.97      0.97        75
   macro avg       0.97      0.97      0.97        75
weighted avg       0.97      0.97      0.97        75

[34  1  1 39]
svc Accuracy:  0.9733333333333334
svc F1:  0.9732142857142857
         

Minimal sample size:  8
maximal sample size:  212
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(99, 685)
1
99
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        33
           1       0.99      1.00      0.99        66

   micro avg       0.99      0.99      0.99        99
   macro avg       0.99      0.98      0.99        99
weighted avg       0.99      0.99      0.99        99

[32  1  0 66]
svc Accuracy:  0.98989898989899
svc F1:  0.9885482938114517
            

Minimal sample size:  6
maximal sample size:  384
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(234, 1958)
1
234
              precision    recall  f1-score   support

           0       0.88      0.83      0.86        36
           1       0.91      0.99      0.95       117
           2       0.87      0.81      0.84        48
           3       1.00      0.82      0.90        33

   micro avg       0.91      0.91      0.91       234
   macro avg       0.91      0.86      0.89       234
weighted avg       0.91      0

              precision    recall  f1-score   support

           0       1.00      0.93      0.96        40
           1       0.96      0.94      0.95        48
           2       1.00      0.94      0.97        48
           3       0.91      0.99      0.95        81

   micro avg       0.95      0.95      0.95       217
   macro avg       0.97      0.95      0.96       217
weighted avg       0.96      0.95      0.95       217

[37  1  0  2  0 45  0  3  0  0 45  3  0  1  0 80]
svc Accuracy:  0.9539170506912442
svc F1:  0.9557237199264103
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        40
           1       0.98      0.83      0.90        48
           2       1.00      0.94      0.97        48
           3       0.86      1.00      0.93        81

   micro avg       0.94      0.94      0.94       217
   macro avg       0.96      0.92      0.94       217
weighted avg       0.94      0.94      0.94       217

[37  1  0  2  0 40

              precision    recall  f1-score   support

           0       1.00      0.74      0.85        31
           1       0.96      1.00      0.98       194

   micro avg       0.96      0.96      0.96       225
   macro avg       0.98      0.87      0.92       225
weighted avg       0.97      0.96      0.96       225

[ 23   8   0 194]
svc Accuracy:  0.9644444444444444
svc F1:  0.9158249158249159
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        31
           1       0.86      1.00      0.93       194

   micro avg       0.86      0.86      0.86       225
   macro avg       0.43      0.50      0.46       225
weighted avg       0.74      0.86      0.80       225

[  0  31   0 194]
LR Accuracy:  0.8622222222222222
LR F1:  0.4630071599045346
For name:  k_hong
total sample size before apply threshold:  127
Counter({'0000-0002-4684-6111': 44, '0000-0002-2852-5111': 29, '0000-0001-7325-1036': 20, '0000-0003-3334-817X': 12, '0000-

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Minimal sample size:  4
maximal sample size:  281
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(120, 1250)
1
120
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        78
           1       1.00      1.00      1.00        42

   micro avg       1.00      1.00      1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120

[78  0  0 42]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1

Minimal sample size:  5
maximal sample size:  265
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(85, 893)
1
85
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.98      0.98      0.98        43

   micro avg       0.98      0.98      0.98        85
   macro avg       0.98      0.98      0.98        85
weighted avg       0.98      0.98      0.98        85

[41  1  1 42]
svc Accuracy:  0.9764705882352941
svc F1:  0.9764673311184939
          

Minimal sample size:  11
maximal sample size:  291
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(364, 2476)
1
364
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       194
           1       0.93      0.95      0.94        39
           2       0.96      0.87      0.91        30
           3       0.93      0.96      0.95       101

   micro avg       0.97      0.97      0.97       364
   macro avg       0.96      0.94      0.95       364
weighted avg       0.97      

Minimal sample size:  4
maximal sample size:  275
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(113, 1060)
1
113
              precision    recall  f1-score   support

           0       0.96      0.99      0.98        80
           1       0.97      0.91      0.94        33

   micro avg       0.96      0.96      0.96       113
   macro avg       0.97      0.95      0.96       113
weighted avg       0.96      0.96      0.96       113

[79  1  3 30]
svc Accuracy:  0.9646017699115044
svc F1:  0.9564043209876543
       

Minimal sample size:  2
maximal sample size:  331
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(135, 1421)
1
135
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        30
           1       1.00      1.00      1.00        59
           2       0.98      1.00      0.99        46

   micro avg       0.99      0.99      0.99       135
   macro avg       0.99      0.99      0.99       135
weighted avg       0.99      0.99      0.99       135

[29  0  1  0 59  0  0  0 46]


Minimal sample size:  10
maximal sample size:  283
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(111, 959)
1
111
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        76

   micro avg       1.00      1.00      1.00       111
   macro avg       1.00      1.00      1.00       111
weighted avg       1.00      1.00      1.00       111

[35  0  0 76]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1

Minimal sample size:  6
maximal sample size:  316
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(536, 3364)
1
536
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       113
           1       0.99      1.00      0.99       423

   micro avg       0.99      0.99      0.99       536
   macro avg       0.99      0.98      0.99       536
weighted avg       0.99      0.99      0.99       536

[108   5   0 423]
svc Accuracy:  0.9906716417910447
svc F1:  0.9857500624764052
   

Minimal sample size:  4
maximal sample size:  607
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(144, 1281)
1
144
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       102
           1       1.00      1.00      1.00        42

   micro avg       1.00      1.00      1.00       144
   macro avg       1.00      1.00      1.00       144
weighted avg       1.00      1.00      1.00       144

[102   0   0  42]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall

Minimal sample size:  10
maximal sample size:  313
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(346, 2526)
1
346
              precision    recall  f1-score   support

           0       0.86      0.90      0.88        41
           1       0.97      1.00      0.99        38
           2       0.93      0.93      0.93        42
           3       1.00      0.95      0.97        37
           4       1.00      0.98      0.99        50
           5       1.00      1.00      1.00        38
           6       1.00      0

Minimal sample size:  2
maximal sample size:  312
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(84, 747)
1
84
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        54
           1       1.00      0.90      0.95        30

   micro avg       0.96      0.96      0.96        84
   macro avg       0.97      0.95      0.96        84
weighted avg       0.97      0.96      0.96        84

[54  0  3 27]
svc Accuracy:  0.9642857142857143
svc F1:  0.9601706970128023
          

Minimal sample size:  3
maximal sample size:  206
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(81, 582)
1
81
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        44
           1       1.00      0.89      0.94        37

   micro avg       0.95      0.95      0.95        81
   macro avg       0.96      0.95      0.95        81
weighted avg       0.95      0.95      0.95        81

[44  0  4 33]
svc Accuracy:  0.9506172839506173
svc F1:  0.9496894409937888
          

Minimal sample size:  9
maximal sample size:  297
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(241, 2008)
1
241
              precision    recall  f1-score   support

           0       0.97      0.95      0.96        39
           1       0.99      1.00      0.99       202

   micro avg       0.99      0.99      0.99       241
   macro avg       0.98      0.97      0.98       241
weighted avg       0.99      0.99      0.99       241

[ 37   2   1 201]
svc Accuracy:  0.9875518672199171
svc F1:  0.9768157768157768
   

Minimal sample size:  82
maximal sample size:  283
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(90, 973)
1
90
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        54
           1       1.00      1.00      1.00        36

   micro avg       1.00      1.00      1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90

[54  0  0 36]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-s

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        64
           1       0.99      0.99      0.99        71
           2       0.99      1.00      0.99        70

   micro avg       0.99      0.99      0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205

[63  1  0  0 70  1  0  0 70]
svc Accuracy:  0.9902439024390244
svc F1:  0.990316426209385
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        64
           1       0.99      0.99      0.99        71
           2       0.99      1.00      0.99        70

   micro avg       0.99      0.99      0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205

[63  1  0  0 70  1  0  0 70]
LR Accuracy:  0.9902439024390244
LR F1:  0.990316426209385
For name:  j_alves
total sample size before apply threshold:

Minimal sample size:  7
maximal sample size:  299
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(661, 3576)
1
661
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       587
           1       0.97      0.79      0.87        43
           2       1.00      0.77      0.87        31

   micro avg       0.97      0.97      0.97       661
   macro avg       0.98      0.85      0.91       661
weighted avg       0.97      0.97      0.97       661

[586   1   0   9  34   0   7 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.89      1.00      0.94       587
           1       1.00      0.05      0.09        43
           2       0.00      0.00      0.00        31

   micro avg       0.89      0.89      0.89       661
   macro avg       0.63      0.35      0.34       661
weighted avg       0.86      0.89      0.84       661

[587   0   0  41   2   0  31   0   0]
LR Accuracy:  0.8910741301059002
LR F1:  0.3437013257237976
For name:  a_guerrero
total sample size before apply threshold:  57
Counter({'0000-0001-5474-1451': 28, '0000-0002-4389-5516': 12, '0000-0001-8602-1248': 9, '0000-0001-6050-8699': 6, '0000-0003-2550-6764': 2})
[]
Total sample size after apply threshold:  0
For name:  a_grant
total sample size before apply threshold:  45
Counter({'0000-0002-1147-2375': 22, '0000-0001-6146-101X': 9, '0000-0001-7205-5869': 7, '0000-0002-7032-3716': 4, '0000-0001-9746-2989': 2, '0000-0002-1553-596X': 1})
[]
Total sample size after apply

Total sample size after apply threshold:  105
Minimal sample size:  6
maximal sample size:  283
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(105, 1031)
1
105
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        75
           1       1.00      0.97      0.98        30

   micro avg       0.99      0.99      0.99       105
   macro avg       0.99      0.98      0.99       105
weighted avg       0.99      0.99      0.99       105

[75  0  1 29]
svc Accuracy:  0.990476

total sample size before apply threshold:  115
Counter({'0000-0002-2768-3572': 44, '0000-0003-2410-2135': 17, '0000-0003-0493-062X': 15, '0000-0003-4655-6496': 10, '0000-0003-4504-8609': 7, '0000-0003-1762-7224': 6, '0000-0002-5500-8195': 6, '0000-0003-2546-2415': 5, '0000-0003-1501-896X': 2, '0000-0002-4887-3711': 1, '0000-0003-3856-9887': 1, '0000-0002-9983-7948': 1})
['0000-0002-2768-3572']
Total sample size after apply threshold:  44
For name:  m_cruz
total sample size before apply threshold:  141
Counter({'0000-0001-9759-5466': 57, '0000-0001-9846-6754': 46, '0000-0003-1822-0514': 30, '0000-0002-4767-530X': 3, '0000-0001-8152-3054': 3, '0000-0003-3311-7582': 2})
['0000-0001-9846-6754', '0000-0001-9759-5466', '0000-0003-1822-0514']
Total sample size after apply threshold:  133
Minimal sample size:  13
maximal sample size:  519
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lower

Minimal sample size:  1
maximal sample size:  465
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(1214, 5564)
1
1214
              precision    recall  f1-score   support

           0       0.93      0.96      0.95        56
           1       0.81      0.98      0.88       124
           2       0.96      0.71      0.81        31
           3       1.00      0.91      0.95        32
           4       0.71      0.85      0.77        73
           5       0.96      0.72      0.82        32
           6       0.98      

              precision    recall  f1-score   support

           0       1.00      0.47      0.64        30
           1       1.00      0.72      0.84        36
           2       0.86      1.00      0.93       194
           3       1.00      0.85      0.92        34

   micro avg       0.89      0.89      0.89       294
   macro avg       0.97      0.76      0.83       294
weighted avg       0.91      0.89      0.89       294

[ 14   0  16   0   0  26  10   0   0   0 194   0   0   0   5  29]
LR Accuracy:  0.8945578231292517
LR F1:  0.8304306385567453
For name:  a_correia
total sample size before apply threshold:  136
Counter({'0000-0002-5115-1429': 81, '0000-0003-0408-6262': 26, '0000-0002-0119-9790': 11, '0000-0002-2831-025X': 7, '0000-0003-2414-0131': 4, '0000-0003-3000-9324': 4, '0000-0002-8946-8579': 2, '0000-0002-2172-6631': 1})
['0000-0002-5115-1429']
Total sample size after apply threshold:  81
For name:  a_reynolds
total sample size before apply threshold:  40
Counter({'000

Minimal sample size:  7
maximal sample size:  271
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(86, 799)
1
86
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        55
           1       1.00      1.00      1.00        31

   micro avg       1.00      1.00      1.00        86
   macro avg       1.00      1.00      1.00        86
weighted avg       1.00      1.00      1.00        86

[55  0  0 31]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-sc

Minimal sample size:  6
maximal sample size:  268
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(191, 1601)
1
191
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        36
           1       0.97      1.00      0.98       116
           2       1.00      0.95      0.97        39

   micro avg       0.98      0.98      0.98       191
   macro avg       0.99      0.96      0.98       191
weighted avg       0.98      0.98      0.98       191

[ 34   2   0   0 116   0   0 

              precision    recall  f1-score   support

           0       1.00      0.97      0.98        58
           1       0.98      1.00      0.99       113

   micro avg       0.99      0.99      0.99       171
   macro avg       0.99      0.98      0.99       171
weighted avg       0.99      0.99      0.99       171

[ 56   2   0 113]
svc Accuracy:  0.9883040935672515
svc F1:  0.9868421052631579
              precision    recall  f1-score   support

           0       1.00      0.83      0.91        58
           1       0.92      1.00      0.96       113

   micro avg       0.94      0.94      0.94       171
   macro avg       0.96      0.91      0.93       171
weighted avg       0.95      0.94      0.94       171

[ 48  10   0 113]
LR Accuracy:  0.9415204678362573
LR F1:  0.9316437480012791
For name:  l_wang
total sample size before apply threshold:  828
Counter({'0000-0001-9783-4383': 98, '0000-0003-3870-3388': 64, '0000-0002-5947-306X': 63, '0000-0002-5773-1627': 56, '0000-

Minimal sample size:  14
maximal sample size:  322
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(319, 2655)
1
319
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        39
           1       1.00      0.97      0.98        58
           2       0.99      1.00      1.00       222

   micro avg       0.99      0.99      0.99       319
   macro avg       1.00      0.99      0.99       319
weighted avg       0.99      0.99      0.99       319

[ 39   0   0   0  56   2   0

Minimal sample size:  5
maximal sample size:  350
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function dummy at 0x7f57f0c52488>, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function dummy at 0x7f57f0c52488>, use_idf=True,
        vocabulary=None)
(335, 2174)
1
335
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        33
           1       0.95      0.94      0.95       120
           2       1.00      0.98      0.99        43
           3       0.89      0.91      0.90        35
           4       0.88      0.92      0.90        65
           5       0.97      0.92      0.95        39

   micro avg       0.94      0

              precision    recall  f1-score   support

           0       1.00      0.87      0.93        31
           1       0.97      0.95      0.96        39
           2       0.92      1.00      0.96        57

   micro avg       0.95      0.95      0.95       127
   macro avg       0.96      0.94      0.95       127
weighted avg       0.96      0.95      0.95       127

[27  1  3  0 37  2  0  0 57]
LR Accuracy:  0.952755905511811
LR F1:  0.9500188790249643
For name:  f_esteves
total sample size before apply threshold:  34
Counter({'0000-0002-3046-1313': 18, '0000-0002-5403-0091': 12, '0000-0003-0589-0746': 3, '0000-0003-3172-6253': 1})
[]
Total sample size after apply threshold:  0
For name:  c_miller
total sample size before apply threshold:  252
Counter({'0000-0003-4341-1283': 51, '0000-0002-3989-7973': 40, '0000-0002-3813-1706': 39, '0000-0003-2772-9531': 27, '0000-0001-6082-9273': 22, '0000-0002-2601-4422': 22, '0000-0002-9448-8144': 19, '0000-0001-8628-4902': 15, '0000-000

Minimal sample size:  8
maximal sample size:  330
0.5303242514954936
(419, 100)
1
419
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        51
           1       0.97      0.99      0.98        79
           2       0.96      0.99      0.98        82
           3       0.99      1.00      1.00       141
           4       1.00      0.95      0.98        66

   micro avg       0.99      0.99      0.99       419
   macro avg       0.99      0.98      0.98       419
weighted avg       0.99      0.99      0.99       419

[ 50   1   0   0   0   0  78   1   0   0   0   0  81   1   0   0   0   0
 141   0   0   1   2   0  63]
svc Accuracy:  0.9856801909307876
svc F1:  0.9840690633944874
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        51
           1       0.99      0.95      0.97        79
           2       0.94      0.99      0.96        82
           3       0.96      1.00      0.98

0.832016655690358
(160, 100)
1
160
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       123
           1       1.00      0.92      0.96        37

   micro avg       0.98      0.98      0.98       160
   macro avg       0.99      0.96      0.97       160
weighted avg       0.98      0.98      0.98       160

[123   0   3  34]
svc Accuracy:  0.98125
svc F1:  0.9728491430510775
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       123
           1       1.00      0.68      0.81        37

   micro avg       0.93      0.93      0.93       160
   macro avg       0.96      0.84      0.88       160
weighted avg       0.93      0.93      0.92       160

[123   0  12  25]
LR Accuracy:  0.925
LR F1:  0.8799699924981246
For name:  k_xu
total sample size before apply threshold:  37
Counter({'0000-0002-2788-194X': 19, '0000-0003-2036-3469': 14, '0000-0002-3985-739X': 3, '0000-0001-7851-2629': 1})

Minimal sample size:  17
maximal sample size:  316
0.9451663020896924
(119, 100)
1
119
              precision    recall  f1-score   support

           0       0.94      0.97      0.96        34
           1       1.00      1.00      1.00        43
           2       0.98      0.95      0.96        42

   micro avg       0.97      0.97      0.97       119
   macro avg       0.97      0.97      0.97       119
weighted avg       0.98      0.97      0.97       119

[33  0  1  0 43  0  2  0 40]
svc Accuracy:  0.9747899159663865
svc F1:  0.9734590536057274
              precision    recall  f1-score   support

           0       0.94      0.97      0.96        34
           1       1.00      1.00      1.00        43
           2       0.98      0.95      0.96        42

   micro avg       0.97      0.97      0.97       119
   macro avg       0.97      0.97      0.97       119
weighted avg       0.98      0.97      0.97       119

[33  0  1  0 43  0  2  0 40]
LR Accuracy:  0.974789915966386

Minimal sample size:  8
maximal sample size:  351
0.6315180426195333
(283, 100)
1
283
              precision    recall  f1-score   support

           0       0.92      0.97      0.94       149
           1       0.95      0.90      0.92        98
           2       0.97      0.92      0.94        36

   micro avg       0.94      0.94      0.94       283
   macro avg       0.95      0.93      0.94       283
weighted avg       0.94      0.94      0.94       283

[144   4   1  10  88   0   2   1  33]
svc Accuracy:  0.9363957597173145
svc F1:  0.9361951355084992
              precision    recall  f1-score   support

           0       0.88      0.99      0.93       149
           1       0.98      0.88      0.92        98
           2       1.00      0.78      0.88        36

   micro avg       0.92      0.92      0.92       283
   macro avg       0.95      0.88      0.91       283
weighted avg       0.93      0.92      0.92       283

[147   2   0  12  86   0   8   0  28]
LR Accuracy:  

Minimal sample size:  19
maximal sample size:  295
0.9790144012985202
(106, 100)
1
106
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        49
           1       0.97      1.00      0.98        57

   micro avg       0.98      0.98      0.98       106
   macro avg       0.98      0.98      0.98       106
weighted avg       0.98      0.98      0.98       106

[47  2  0 57]
svc Accuracy:  0.9811320754716981
svc F1:  0.9809626436781609
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        49
           1       0.98      1.00      0.99        57

   micro avg       0.99      0.99      0.99       106
   macro avg       0.99      0.99      0.99       106
weighted avg       0.99      0.99      0.99       106

[48  1  0 57]
LR Accuracy:  0.9905660377358491
LR F1:  0.9904975347377858
For name:  p_teixeira
total sample size before apply threshold:  213
Counter({'0000-0002-7258-7977': 60, '000

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        44
           1       1.00      1.00      1.00        32
           2       0.97      1.00      0.99        68
           3       1.00      0.99      0.99        75

   micro avg       0.99      0.99      0.99       219
   macro avg       0.99      0.99      0.99       219
weighted avg       0.99      0.99      0.99       219

[43  0  1  0  0 32  0  0  0  0 68  0  0  0  1 74]
LR Accuracy:  0.9908675799086758
LR F1:  0.9918253960268187
For name:  j_yi
total sample size before apply threshold:  29
Counter({'0000-0001-5299-9897': 18, '0000-0002-9296-8443': 9, '0000-0002-1025-865X': 1, '0000-0003-1718-6326': 1})
[]
Total sample size after apply threshold:  0
For name:  s_khan
total sample size before apply threshold:  193
Counter({'0000-0001-5147-145X': 61, '0000-0001-5654-2835': 42, '0000-0003-4185-8882': 29, '0000-0002-6792-3577': 7, '0000-0003-0910-4095': 7, '0000-0002-0310-0424': 6

Minimal sample size:  29
maximal sample size:  277
1.0
(94, 94)
1
94
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        53

   micro avg       1.00      1.00      1.00        94
   macro avg       1.00      1.00      1.00        94
weighted avg       1.00      1.00      1.00        94

[41  0  0 53]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        53

   micro avg       1.00      1.00      1.00        94
   macro avg       1.00      1.00      1.00        94
weighted avg       1.00      1.00      1.00        94

[41  0  0 53]
LR Accuracy:  1.0
LR F1:  1.0
For name:  m_viana
total sample size before apply threshold:  139
Counter({'0000-0002-0464-4845': 34, '0000-0003-4356-8109': 31, '0000-0002-4073-3802': 29, '0000-0001-9665-2115': 26, '000

total sample size before apply threshold:  102
Counter({'0000-0003-4002-048X': 31, '0000-0001-8583-5362': 26, '0000-0002-8245-0181': 20, '0000-0003-3533-5268': 12, '0000-0002-0375-1040': 11, '0000-0002-3078-8404': 2})
['0000-0003-4002-048X']
Total sample size after apply threshold:  31
For name:  s_james
total sample size before apply threshold:  59
Counter({'0000-0001-9369-3288': 29, '0000-0003-0651-9842': 13, '0000-0001-7955-0491': 8, '0000-0001-6758-5726': 7, '0000-0003-1150-0628': 1, '0000-0002-8128-2139': 1})
[]
Total sample size after apply threshold:  0
For name:  p_persson
total sample size before apply threshold:  80
Counter({'0000-0001-9172-3068': 39, '0000-0001-7600-3230': 26, '0000-0001-9140-6724': 8, '0000-0003-4468-032X': 7})
['0000-0001-9172-3068']
Total sample size after apply threshold:  39
For name:  y_tanaka
total sample size before apply threshold:  20
Counter({'0000-0002-0674-660X': 12, '0000-0002-6190-4586': 5, '0000-0001-9598-5583': 2, '0000-0002-5163-7752': 1})


Minimal sample size:  21
maximal sample size:  366
0.9600184593656874
(113, 100)
1
113
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        74
           1       0.95      0.95      0.95        39

   micro avg       0.96      0.96      0.96       113
   macro avg       0.96      0.96      0.96       113
weighted avg       0.96      0.96      0.96       113

[72  2  2 37]
svc Accuracy:  0.9646017699115044
svc F1:  0.9608454608454609
              precision    recall  f1-score   support

           0       0.90      1.00      0.95        74
           1       1.00      0.79      0.89        39

   micro avg       0.93      0.93      0.93       113
   macro avg       0.95      0.90      0.92       113
weighted avg       0.94      0.93      0.93       113

[74  0  8 31]
LR Accuracy:  0.9292035398230089
LR F1:  0.9172161172161173
For name:  a_nielsen
total sample size before apply threshold:  132
Counter({'0000-0003-4372-9961': 70, '0000

Minimal sample size:  2
maximal sample size:  269
0.7431237432578472
(202, 100)
1
202
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        35
           1       0.99      1.00      1.00       167

   micro avg       1.00      1.00      1.00       202
   macro avg       1.00      0.99      0.99       202
weighted avg       1.00      1.00      1.00       202

[ 34   1   0 167]
svc Accuracy:  0.995049504950495
svc F1:  0.9912610858749729
              precision    recall  f1-score   support

           0       1.00      0.49      0.65        35
           1       0.90      1.00      0.95       167

   micro avg       0.91      0.91      0.91       202
   macro avg       0.95      0.74      0.80       202
weighted avg       0.92      0.91      0.90       202

[ 17  18   0 167]
LR Accuracy:  0.9108910891089109
LR F1:  0.8013548951048951
For name:  m_reilly
total sample size before apply threshold:  20
Counter({'0000-0001-8029-0084': 17, '

Minimal sample size:  3
maximal sample size:  624
0.45753168692005824
(533, 100)
1
533
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        32
           1       0.96      0.98      0.97        65
           2       0.99      0.96      0.98        84
           3       0.99      0.99      0.99       113
           4       0.99      1.00      1.00       154
           5       1.00      1.00      1.00        85

   micro avg       0.99      0.99      0.99       533
   macro avg       0.99      0.98      0.99       533
weighted avg       0.99      0.99      0.99       533

[ 31   1   0   0   0   0   0  64   0   0   1   0   0   2  81   1   0   0
   0   0   1 112   0   0   0   0   0   0 154   0   0   0   0   0   0  85]
svc Accuracy:  0.9887429643527205
svc F1:  0.9862736274674947
              precision    recall  f1-score   support

           0       1.00      0.78      0.88        32
           1       0.95      0.97      0.96        6

0.6207246133371274
(287, 100)
1
287
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        77
           1       0.98      1.00      0.99       210

   micro avg       0.98      0.98      0.98       287
   macro avg       0.99      0.97      0.98       287
weighted avg       0.98      0.98      0.98       287

[ 72   5   0 210]
svc Accuracy:  0.9825783972125436
svc F1:  0.9773391235688906
              precision    recall  f1-score   support

           0       1.00      0.79      0.88        77
           1       0.93      1.00      0.96       210

   micro avg       0.94      0.94      0.94       287
   macro avg       0.96      0.90      0.92       287
weighted avg       0.95      0.94      0.94       287

[ 61  16   0 210]
LR Accuracy:  0.9442508710801394
LR F1:  0.9236803616540354
For name:  j_abrantes
total sample size before apply threshold:  57
Counter({'0000-0002-8391-7134': 42, '0000-0003-1902-9017': 11, '0000-0003-4585-9831'

total sample size before apply threshold:  121
Counter({'0000-0003-2125-6841': 81, '0000-0003-3676-9228': 18, '0000-0001-8438-0319': 17, '0000-0002-7713-1928': 4, '0000-0003-2197-5539': 1})
['0000-0003-2125-6841']
Total sample size after apply threshold:  81
For name:  s_hussein
total sample size before apply threshold:  33
Counter({'0000-0002-7946-0717': 18, '0000-0002-6305-508X': 9, '0000-0003-3657-7410': 4, '0000-0002-5394-4385': 1, '0000-0002-0139-1483': 1})
[]
Total sample size after apply threshold:  0
For name:  z_luo
total sample size before apply threshold:  25
Counter({'0000-0002-3074-046X': 15, '0000-0002-2719-1025': 5, '0000-0002-8129-333X': 3, '0000-0003-0164-4492': 2})
[]
Total sample size after apply threshold:  0
For name:  c_pimentel
total sample size before apply threshold:  22
Counter({'0000-0002-5158-6414': 16, '0000-0002-1106-8962': 3, '0000-0002-8364-8990': 2, '0000-0002-4932-0174': 1})
[]
Total sample size after apply threshold:  0
For name:  s_ito
total sample s

              precision    recall  f1-score   support

           0       1.00      0.87      0.93        30
           1       0.83      1.00      0.90        38
           2       0.86      0.76      0.81        33

   micro avg       0.88      0.88      0.88       101
   macro avg       0.90      0.87      0.88       101
weighted avg       0.89      0.88      0.88       101

[26  0  4  0 38  0  0  8 25]
LR Accuracy:  0.8811881188118812
LR F1:  0.8799283154121863
For name:  r_radhakrishnan
total sample size before apply threshold:  62
Counter({'0000-0003-0088-4777': 35, '0000-0002-8220-655X': 14, '0000-0001-6616-8525': 7, '0000-0001-7170-699X': 5, '0000-0002-3560-1020': 1})
['0000-0003-0088-4777']
Total sample size after apply threshold:  35
For name:  k_saito
total sample size before apply threshold:  61
Counter({'0000-0003-4663-1134': 26, '0000-0002-2151-6204': 16, '0000-0002-5726-8775': 11, '0000-0003-2557-1726': 7, '0000-0001-6310-5342': 1})
[]
Total sample size after apply thres

Minimal sample size:  12
maximal sample size:  205
1.0000000000000002
(80, 80)
1
80
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        36
           1       1.00      1.00      1.00        44

   micro avg       1.00      1.00      1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80

[36  0  0 44]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        36
           1       0.98      1.00      0.99        44

   micro avg       0.99      0.99      0.99        80
   macro avg       0.99      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80

[35  1  0 44]
LR Accuracy:  0.9875
LR F1:  0.9873397689507832
For name:  d_fernandes
total sample size before apply threshold:  40
Counter({'0000-0003-0599-3200': 20, '0000-0002-5056-5734': 9, '0000-0001-5263-2737': 

              precision    recall  f1-score   support

           0       0.99      0.96      0.97        90
           1       0.88      0.99      0.93       166
           2       1.00      0.59      0.74        44

   micro avg       0.92      0.92      0.92       300
   macro avg       0.96      0.85      0.88       300
weighted avg       0.93      0.92      0.92       300

[ 86   4   0   1 165   0   0  18  26]
LR Accuracy:  0.9233333333333333
LR F1:  0.8831509159736942
For name:  y_jia
total sample size before apply threshold:  46
Counter({'0000-0002-2784-1905': 24, '0000-0003-3852-7302': 10, '0000-0002-8852-7557': 3, '0000-0001-9657-0806': 3, '0000-0001-7978-9312': 3, '0000-0001-9395-2139': 2, '0000-0003-4972-1004': 1})
[]
Total sample size after apply threshold:  0
For name:  p_gaspar
total sample size before apply threshold:  93
Counter({'0000-0003-4217-5717': 87, '0000-0001-5967-0584': 3, '0000-0002-4832-8537': 2, '0000-0003-3388-1724': 1})
['0000-0003-4217-5717']
Total sample

Minimal sample size:  11
maximal sample size:  311
1.0000000000000004
(74, 74)
1
74
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        36
           1       1.00      0.97      0.99        38

   micro avg       0.99      0.99      0.99        74
   macro avg       0.99      0.99      0.99        74
weighted avg       0.99      0.99      0.99        74

[36  0  1 37]
svc Accuracy:  0.9864864864864865
svc F1:  0.9864840182648402
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        36
           1       1.00      0.97      0.99        38

   micro avg       0.99      0.99      0.99        74
   macro avg       0.99      0.99      0.99        74
weighted avg       0.99      0.99      0.99        74

[36  0  1 37]
LR Accuracy:  0.9864864864864865
LR F1:  0.9864840182648402
For name:  s_woo
total sample size before apply threshold:  25
Counter({'0000-0003-3692-7169': 22, '0000-0001-87

              precision    recall  f1-score   support

           0       0.96      0.59      0.73        39
           1       0.81      1.00      0.89       100
           2       1.00      1.00      1.00        45
           3       0.92      0.92      0.92        39
           4       1.00      0.71      0.83        31

   micro avg       0.89      0.89      0.89       254
   macro avg       0.94      0.84      0.88       254
weighted avg       0.91      0.89      0.88       254

[ 23  13   0   3   0   0 100   0   0   0   0   0  45   0   0   1   2   0
  36   0   0   9   0   0  22]
LR Accuracy:  0.889763779527559
LR F1:  0.8752562950676157
For name:  s_bae
total sample size before apply threshold:  83
Counter({'0000-0003-0551-7618': 19, '0000-0002-3019-0584': 17, '0000-0002-4995-6543': 17, '0000-0002-8993-8884': 9, '0000-0003-0098-8816': 8, '0000-0003-1926-5466': 6, '0000-0001-7603-7676': 6, '0000-0003-0637-4110': 1})
[]
Total sample size after apply threshold:  0
For name:  s_ferna

              precision    recall  f1-score   support

           0       1.00      0.24      0.38        34
           1       0.77      1.00      0.87        86

   micro avg       0.78      0.78      0.78       120
   macro avg       0.88      0.62      0.62       120
weighted avg       0.83      0.78      0.73       120

[ 8 26  0 86]
LR Accuracy:  0.7833333333333333
LR F1:  0.6248196248196248
For name:  s_teixeira
total sample size before apply threshold:  36
Counter({'0000-0003-0419-2348': 12, '0000-0001-5845-058X': 11, '0000-0002-2462-8535': 3, '0000-0002-9473-0113': 3, '0000-0002-7464-3944': 3, '0000-0002-6603-7936': 3, '0000-0003-3664-2577': 1})
[]
Total sample size after apply threshold:  0
For name:  l_almeida
total sample size before apply threshold:  133
Counter({'0000-0002-4861-8649': 57, '0000-0002-7769-4712': 43, '0000-0003-1370-961X': 12, '0000-0003-0370-214X': 8, '0000-0002-0651-7014': 5, '0000-0001-9346-7520': 4, '0000-0002-1324-0068': 1, '0000-0002-9544-3028': 1, '0

1.0000000000000007
(81, 81)
1
81
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        38

   micro avg       1.00      1.00      1.00        81
   macro avg       1.00      1.00      1.00        81
weighted avg       1.00      1.00      1.00        81

[43  0  0 38]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        38

   micro avg       1.00      1.00      1.00        81
   macro avg       1.00      1.00      1.00        81
weighted avg       1.00      1.00      1.00        81

[43  0  0 38]
LR Accuracy:  1.0
LR F1:  1.0
For name:  d_franco
total sample size before apply threshold:  115
Counter({'0000-0002-5669-7164': 58, '0000-0002-0093-7042': 40, '0000-0003-3849-4272': 8, '0000-0001-5604-2531': 6, '0000-0002-8653-0488': 2, '0000-0002-2050

Minimal sample size:  10
maximal sample size:  388
0.6044823167518012
(295, 100)
1
295
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       155
           1       1.00      0.98      0.99        44
           2       0.93      0.85      0.89        60
           3       0.96      0.61      0.75        36

   micro avg       0.91      0.91      0.91       295
   macro avg       0.94      0.85      0.89       295
weighted avg       0.91      0.91      0.90       295

[152   0   3   0   1  43   0   0   8   0  51   1  13   0   1  22]
svc Accuracy:  0.9084745762711864
svc F1:  0.8863092846961713
              precision    recall  f1-score   support

           0       0.75      0.99      0.85       155
           1       1.00      0.82      0.90        44
           2       0.94      0.73      0.82        60
           3       0.88      0.19      0.32        36

   micro avg       0.81      0.81      0.81       295
   macro avg       0.89 

Total sample size after apply threshold:  120
Minimal sample size:  3
maximal sample size:  259
0.9312134938996433
(120, 100)
1
120
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        64
           1       1.00      0.98      0.99        56

   micro avg       0.99      0.99      0.99       120
   macro avg       0.99      0.99      0.99       120
weighted avg       0.99      0.99      0.99       120

[64  0  1 55]
svc Accuracy:  0.9916666666666667
svc F1:  0.9916195265032474
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        64
           1       1.00      0.98      0.99        56

   micro avg       0.99      0.99      0.99       120
   macro avg       0.99      0.99      0.99       120
weighted avg       0.99      0.99      0.99       120

[64  0  1 55]
LR Accuracy:  0.9916666666666667
LR F1:  0.9916195265032474
For name:  r_luz
total sample size before apply threshold:  20
C

Minimal sample size:  5
maximal sample size:  312
0.4136816441439162
(651, 100)
1
651
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        31
           1       0.99      1.00      0.99        73
           2       1.00      0.91      0.95        33
           3       0.88      0.93      0.90        55
           4       0.96      1.00      0.98        92
           5       0.86      0.97      0.91        87
           6       0.96      0.98      0.97        49
           7       0.92      0.84      0.88        43
           8       1.00      1.00      1.00        61
           9       1.00      0.88      0.94        41
          10       0.96      0.84      0.90        51
          11       0.91      0.83      0.87        35

   micro avg       0.94      0.94      0.94       651
   macro avg       0.95      0.93      0.94       651
weighted avg       0.95      0.94      0.94       651

[31  0  0  0  0  0  0  0  0  0  0  0  0 73  0  

Minimal sample size:  2
maximal sample size:  307
0.6229798112908742
(303, 100)
1
303
              precision    recall  f1-score   support

           0       0.96      0.95      0.96        57
           1       0.99      0.99      0.99        81
           2       1.00      0.96      0.98        48
           3       0.97      0.99      0.98       117

   micro avg       0.98      0.98      0.98       303
   macro avg       0.98      0.97      0.98       303
weighted avg       0.98      0.98      0.98       303

[ 54   1   0   2   1  80   0   0   0   0  46   2   1   0   0 116]
svc Accuracy:  0.976897689768977
svc F1:  0.975258222804713
              precision    recall  f1-score   support

           0       1.00      0.82      0.90        57
           1       0.99      0.99      0.99        81
           2       1.00      0.92      0.96        48
           3       0.89      1.00      0.94       117

   micro avg       0.95      0.95      0.95       303
   macro avg       0.97    

Minimal sample size:  16
maximal sample size:  293
1.0000000000000002
(94, 94)
1
94
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        39
           1       0.98      1.00      0.99        55

   micro avg       0.99      0.99      0.99        94
   macro avg       0.99      0.99      0.99        94
weighted avg       0.99      0.99      0.99        94

[38  1  0 55]
svc Accuracy:  0.9893617021276596
svc F1:  0.9890019890019889
              precision    recall  f1-score   support

           0       1.00      0.95      0.97        39
           1       0.96      1.00      0.98        55

   micro avg       0.98      0.98      0.98        94
   macro avg       0.98      0.97      0.98        94
weighted avg       0.98      0.98      0.98        94

[37  2  0 55]
LR Accuracy:  0.9787234042553191
LR F1:  0.9779135338345866
For name:  a_lombardi
total sample size before apply threshold:  90
Counter({'0000-0002-2013-3009': 49, '0000-00

Minimal sample size:  6
maximal sample size:  276
0.9522859787065344
(115, 100)
1
115
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        36
           1       1.00      1.00      1.00        79

   micro avg       1.00      1.00      1.00       115
   macro avg       1.00      1.00      1.00       115
weighted avg       1.00      1.00      1.00       115

[36  0  0 79]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        36
           1       0.99      1.00      0.99        79

   micro avg       0.99      0.99      0.99       115
   macro avg       0.99      0.99      0.99       115
weighted avg       0.99      0.99      0.99       115

[35  1  0 79]
LR Accuracy:  0.991304347826087
LR F1:  0.9898130923908228
For name:  w_zheng
total sample size before apply threshold:  93
Counter({'0000-0002-6236-9765': 48, '0000-0003-1034-0757': 24, '0000-0003-00

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       0.90      0.83      0.86        69
           2       0.80      0.98      0.88       106
           3       1.00      0.85      0.92        40
           4       1.00      0.73      0.85        45

   micro avg       0.89      0.89      0.89       295
   macro avg       0.94      0.88      0.90       295
weighted avg       0.91      0.89      0.89       295

[ 35   0   0   0   0   0  57  12   0   0   0   2 104   0   0   0   4   2
  34   0   0   0  12   0  33]
LR Accuracy:  0.8915254237288136
LR F1:  0.9020130121825037
For name:  r_thomas
total sample size before apply threshold:  368
Counter({'0000-0002-0518-8386': 95, '0000-0002-2340-0301': 95, '0000-0003-1448-7182': 74, '0000-0003-2062-8623': 46, '0000-0001-9251-5543': 13, '0000-0002-2970-6352': 10, '0000-0002-2165-5917': 8, '0000-0003-1282-7825': 5, '0000-0003-3588-2317': 5, '0000-0002-7286-2764': 4, '0000-

Minimal sample size:  12
maximal sample size:  285
0.5227766691871361
(372, 100)
1
372
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       112
           1       1.00      0.91      0.95        80
           2       0.94      1.00      0.97       180

   micro avg       0.97      0.97      0.97       372
   macro avg       0.98      0.96      0.97       372
weighted avg       0.97      0.97      0.97       372

[108   0   4   0  73   7   0   0 180]
svc Accuracy:  0.9704301075268817
svc F1:  0.9688056507146406
              precision    recall  f1-score   support

           0       1.00      0.95      0.97       112
           1       1.00      0.89      0.94        80
           2       0.92      1.00      0.96       180

   micro avg       0.96      0.96      0.96       372
   macro avg       0.97      0.94      0.96       372
weighted avg       0.96      0.96      0.96       372

[106   0   6   0  71   9   0   0 180]
LR Accuracy: 

Minimal sample size:  8
maximal sample size:  334
0.5068337075714512
(446, 100)
1
446
              precision    recall  f1-score   support

           0       0.85      0.74      0.79        39
           1       1.00      0.88      0.94        33
           2       0.91      0.88      0.90        49
           3       0.94      0.99      0.97       146
           4       0.94      0.97      0.95       115
           5       0.91      0.91      0.91        64

   micro avg       0.93      0.93      0.93       446
   macro avg       0.93      0.89      0.91       446
weighted avg       0.93      0.93      0.93       446

[ 29   0   3   1   2   4   0  29   0   4   0   0   2   0  43   0   3   1
   0   0   0 145   1   0   0   0   1   2 111   1   3   0   0   2   1  58]
svc Accuracy:  0.9304932735426009
svc F1:  0.9085906864139605
              precision    recall  f1-score   support

           0       0.91      0.54      0.68        39
           1       1.00      0.55      0.71        33

Minimal sample size:  8
maximal sample size:  332
0.9730152878258828
(111, 100)
1
111
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        42
           1       1.00      1.00      1.00        69

   micro avg       1.00      1.00      1.00       111
   macro avg       1.00      1.00      1.00       111
weighted avg       1.00      1.00      1.00       111

[42  0  0 69]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        42
           1       0.99      1.00      0.99        69

   micro avg       0.99      0.99      0.99       111
   macro avg       0.99      0.99      0.99       111
weighted avg       0.99      0.99      0.99       111

[41  1  0 69]
LR Accuracy:  0.990990990990991
LR F1:  0.9903787813122995
For name:  m_marino
total sample size before apply threshold:  69
Counter({'0000-0001-9155-6378': 14, '0000-0002-0045-0234': 11, '0000-0002-7

Minimal sample size:  8
maximal sample size:  404
0.772549108215789
(188, 100)
1
188
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       149
           1       1.00      0.90      0.95        39

   micro avg       0.98      0.98      0.98       188
   macro avg       0.99      0.95      0.97       188
weighted avg       0.98      0.98      0.98       188

[149   0   4  35]
svc Accuracy:  0.9787234042553191
svc F1:  0.9663504564166816
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       149
           1       1.00      0.23      0.38        39

   micro avg       0.84      0.84      0.84       188
   macro avg       0.92      0.62      0.64       188
weighted avg       0.87      0.84      0.80       188

[149   0  30   9]
LR Accuracy:  0.8404255319148937
LR F1:  0.6417682926829269
For name:  j_young
total sample size before apply threshold:  267
Counter({'0000-0002-1514-1522': 124, 

Total sample size after apply threshold:  184
Minimal sample size:  13
maximal sample size:  296
0.7658950579731241
(184, 100)
1
184
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       112
           1       1.00      1.00      1.00        72

   micro avg       1.00      1.00      1.00       184
   macro avg       1.00      1.00      1.00       184
weighted avg       1.00      1.00      1.00       184

[112   0   0  72]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       112
           1       1.00      1.00      1.00        72

   micro avg       1.00      1.00      1.00       184
   macro avg       1.00      1.00      1.00       184
weighted avg       1.00      1.00      1.00       184

[112   0   0  72]
LR Accuracy:  1.0
LR F1:  1.0
For name:  j_nguyen
total sample size before apply threshold:  27
Counter({'0000-0002-8578-7396': 20, '0000-0002-47

Minimal sample size:  9
maximal sample size:  282
0.8823265577623667
(139, 100)
1
139
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        57
           1       1.00      0.97      0.98        30
           2       0.98      1.00      0.99        52

   micro avg       0.99      0.99      0.99       139
   macro avg       0.99      0.98      0.99       139
weighted avg       0.99      0.99      0.99       139

[56  0  1  1 29  0  0  0 52]
svc Accuracy:  0.9856115107913669
svc F1:  0.9853277260948983
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        57
           1       1.00      0.90      0.95        30
           2       0.95      1.00      0.97        52

   micro avg       0.97      0.97      0.97       139
   macro avg       0.98      0.96      0.97       139
weighted avg       0.97      0.97      0.97       139

[56  0  1  1 27  2  0  0 52]
LR Accuracy:  0.9712230215827338

Total sample size after apply threshold:  0
For name:  y_park
total sample size before apply threshold:  627
Counter({'0000-0002-6281-489X': 171, '0000-0002-5879-6879': 78, '0000-0002-3671-6364': 67, '0000-0002-9553-8561': 56, '0000-0003-1191-7335': 35, '0000-0002-8288-9450': 32, '0000-0002-8808-4530': 28, '0000-0002-1310-148X': 28, '0000-0002-5466-2339': 22, '0000-0001-8336-8051': 20, '0000-0003-3652-591X': 16, '0000-0001-8583-4335': 15, '0000-0001-8495-9224': 14, '0000-0001-7025-8945': 13, '0000-0002-1959-0843': 9, '0000-0002-7574-4165': 7, '0000-0003-1997-6444': 6, '0000-0002-8536-0835': 3, '0000-0001-6587-6562': 3, '0000-0002-1702-0986': 1, '0000-0002-2801-2674': 1, '0000-0001-5110-5716': 1, '0000-0002-3019-5748': 1})
['0000-0002-3671-6364', '0000-0003-1191-7335', '0000-0002-5879-6879', '0000-0002-8288-9450', '0000-0002-6281-489X', '0000-0002-9553-8561']
Total sample size after apply threshold:  439
Minimal sample size:  11
maximal sample size:  358
0.5413023465086415
(439, 100)
1


Minimal sample size:  11
maximal sample size:  243
0.9999999999999991
(68, 68)
1
68
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        38
           1       0.97      0.97      0.97        30

   micro avg       0.97      0.97      0.97        68
   macro avg       0.97      0.97      0.97        68
weighted avg       0.97      0.97      0.97        68

[37  1  1 29]
svc Accuracy:  0.9705882352941176
svc F1:  0.9701754385964912
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        38
           1       1.00      0.90      0.95        30

   micro avg       0.96      0.96      0.96        68
   macro avg       0.96      0.95      0.95        68
weighted avg       0.96      0.96      0.96        68

[38  0  3 27]
LR Accuracy:  0.9558823529411765
LR F1:  0.9546968687541639
For name:  j_muller
total sample size before apply threshold:  113
Counter({'0000-0001-6009-7471': 58, '0000-000

Minimal sample size:  10
maximal sample size:  308
0.7129397401468001
(218, 100)
1
218
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       103
           1       0.98      0.93      0.96        46
           2       1.00      1.00      1.00        69

   micro avg       0.98      0.98      0.98       218
   macro avg       0.98      0.98      0.98       218
weighted avg       0.98      0.98      0.98       218

[102   1   0   3  43   0   0   0  69]
svc Accuracy:  0.981651376146789
svc F1:  0.9787749287749289
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       103
           1       0.97      0.83      0.89        46
           2       1.00      1.00      1.00        69

   micro avg       0.96      0.96      0.96       218
   macro avg       0.97      0.94      0.95       218
weighted avg       0.96      0.96      0.96       218

[102   1   0   8  38   0   0   0  69]
LR Accuracy:  

Minimal sample size:  15
maximal sample size:  376
1.0
(83, 83)
1
83
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        49
           1       1.00      1.00      1.00        34

   micro avg       1.00      1.00      1.00        83
   macro avg       1.00      1.00      1.00        83
weighted avg       1.00      1.00      1.00        83

[49  0  0 34]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        49
           1       1.00      0.97      0.99        34

   micro avg       0.99      0.99      0.99        83
   macro avg       0.99      0.99      0.99        83
weighted avg       0.99      0.99      0.99        83

[49  0  1 33]
LR Accuracy:  0.9879518072289156
LR F1:  0.9874868083823307
For name:  g_volpe
total sample size before apply threshold:  31
Counter({'0000-0001-9993-5348': 15, '0000-0001-5057-1846': 14, '0000-0002-3916-5393': 1, '00

Minimal sample size:  7
maximal sample size:  296
0.5302836115081038
(420, 100)
1
420
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       322
           1       1.00      0.89      0.94        37
           2       1.00      0.97      0.98        61

   micro avg       0.99      0.99      0.99       420
   macro avg       0.99      0.95      0.97       420
weighted avg       0.99      0.99      0.99       420

[322   0   0   4  33   0   2   0  59]
svc Accuracy:  0.9857142857142858
svc F1:  0.9723199023199024
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       322
           1       1.00      0.49      0.65        37
           2       1.00      0.77      0.87        61

   micro avg       0.92      0.92      0.92       420
   macro avg       0.97      0.75      0.83       420
weighted avg       0.93      0.92      0.91       420

[322   0   0  19  18   0  14   0  47]
LR Accuracy:  

['0000-0002-9173-7811']
Total sample size after apply threshold:  41
For name:  l_tavares
total sample size before apply threshold:  41
Counter({'0000-0001-8671-6285': 18, '0000-0001-9487-7978': 7, '0000-0001-8438-7887': 7, '0000-0002-1432-524X': 7, '0000-0003-3190-0194': 2})
[]
Total sample size after apply threshold:  0
For name:  t_murakami
total sample size before apply threshold:  63
Counter({'0000-0002-0314-8807': 59, '0000-0002-2661-2633': 2, '0000-0001-7924-8073': 1, '0000-0002-0754-2879': 1})
['0000-0002-0314-8807']
Total sample size after apply threshold:  59
For name:  x_xiao
total sample size before apply threshold:  31
Counter({'0000-0002-3987-8668': 11, '0000-0002-9753-6586': 8, '0000-0003-1749-4230': 7, '0000-0002-0240-0038': 5})
[]
Total sample size after apply threshold:  0
For name:  j_davies
total sample size before apply threshold:  122
Counter({'0000-0001-6660-4032': 55, '0000-0001-5888-664X': 14, '0000-0001-7415-6129': 10, '0000-0003-4035-6047': 9, '0000-0002-4108

Minimal sample size:  20
maximal sample size:  288
0.9497566296933923
(117, 100)
1
117
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        48
           1       1.00      1.00      1.00        69

   micro avg       1.00      1.00      1.00       117
   macro avg       1.00      1.00      1.00       117
weighted avg       1.00      1.00      1.00       117

[48  0  0 69]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        48
           1       0.99      1.00      0.99        69

   micro avg       0.99      0.99      0.99       117
   macro avg       0.99      0.99      0.99       117
weighted avg       0.99      0.99      0.99       117

[47  1  0 69]
LR Accuracy:  0.9914529914529915
LR F1:  0.9911397198031049
For name:  a_castro
total sample size before apply threshold:  126
Counter({'0000-0001-7526-6717': 39, '0000-0002-8311-0840': 17, '0000-000

Total sample size after apply threshold:  187
Minimal sample size:  6
maximal sample size:  275
0.7822915712195658
(187, 100)
1
187
              precision    recall  f1-score   support

           0       0.94      0.95      0.95        85
           1       0.89      0.94      0.91        33
           2       0.93      0.84      0.88        31
           3       1.00      1.00      1.00        38

   micro avg       0.94      0.94      0.94       187
   macro avg       0.94      0.93      0.94       187
weighted avg       0.94      0.94      0.94       187

[81  2  2  0  2 31  0  0  3  2 26  0  0  0  0 38]
svc Accuracy:  0.9411764705882353
svc F1:  0.9351222647845936
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        85
           1       1.00      0.73      0.84        33
           2       0.88      0.48      0.62        31
           3       1.00      1.00      1.00        38

   micro avg       0.87      0.87      0.87      

Minimal sample size:  16
maximal sample size:  254
0.980370506698348
(107, 100)
1
107
              precision    recall  f1-score   support

           0       0.92      0.90      0.91        50
           1       0.91      0.93      0.92        57

   micro avg       0.92      0.92      0.92       107
   macro avg       0.92      0.91      0.92       107
weighted avg       0.92      0.92      0.92       107

[45  5  4 53]
svc Accuracy:  0.9158878504672897
svc F1:  0.9154150197628458
              precision    recall  f1-score   support

           0       0.95      0.84      0.89        50
           1       0.87      0.96      0.92        57

   micro avg       0.91      0.91      0.91       107
   macro avg       0.91      0.90      0.91       107
weighted avg       0.91      0.91      0.91       107

[42  8  2 55]
LR Accuracy:  0.9065420560747663
LR F1:  0.9051418439716312
For name:  e_brown
total sample size before apply threshold:  71
Counter({'0000-0002-5995-834X': 28, '0000-000

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       1.00      0.97      0.99       104
           1       1.00      0.44      0.61        39
           2       0.84      1.00      0.91       487
           3       0.00      0.00      0.00        35
           4       1.00      0.47      0.64        57
           5       1.00      0.95      0.98       105

   micro avg       0.89      0.89      0.89       827
   macro avg       0.81      0.64      0.69       827
weighted avg       0.86      0.89      0.86       827

[101   0   3   0   0   0   0  17  22   0   0   0   0   0 487   0   0   0
   0   0  35   0   0   0   0   0  30   0  27   0   0   0   5   0   0 100]
LR Accuracy:  0.8851269649334945
LR F1:  0.6870179181211831
For name:  w_choi
total sample size before apply threshold:  118
Counter({'0000-0003-1801-9386': 79, '0000-0002-7896-7655': 16, '0000-0002-6623-3806': 7, '0000-0002-4203-0457': 6, '0000-0001-8038-5876': 3, '0000-0002-7183-3400': 3, '0000-0003-4233-01

Minimal sample size:  14
maximal sample size:  271
0.9584406944396857
(114, 100)
1
114
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        81
           1       1.00      1.00      1.00        33

   micro avg       1.00      1.00      1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114

[81  0  0 33]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        81
           1       1.00      0.91      0.95        33

   micro avg       0.97      0.97      0.97       114
   macro avg       0.98      0.95      0.97       114
weighted avg       0.97      0.97      0.97       114

[81  0  3 30]
LR Accuracy:  0.9736842105263158
LR F1:  0.967099567099567
For name:  b_moreno
total sample size before apply threshold:  8
Counter({'0000-0001-5799-9802': 6, '0000-0002-8881-4329': 1, '0000-0002-153

total sample size before apply threshold:  79
Counter({'0000-0001-9516-9551': 31, '0000-0001-9141-5751': 28, '0000-0002-4907-4292': 19, '0000-0003-3333-9783': 1})
['0000-0001-9516-9551']
Total sample size after apply threshold:  31
For name:  j_thomsen
total sample size before apply threshold:  28
Counter({'0000-0002-9336-5695': 18, '0000-0003-2143-8274': 8, '0000-0002-7368-6133': 1, '0000-0002-8275-4847': 1})
[]
Total sample size after apply threshold:  0
For name:  v_gupta
total sample size before apply threshold:  238
Counter({'0000-0002-8850-0485': 63, '0000-0002-6139-1346': 38, '0000-0002-1348-3545': 30, '0000-0002-9190-1757': 26, '0000-0003-4639-3316': 22, '0000-0001-6987-2550': 14, '0000-0002-6157-3705': 14, '0000-0002-1518-6624': 8, '0000-0002-2089-027X': 6, '0000-0003-2809-2966': 5, '0000-0003-1567-1037': 3, '0000-0001-7184-4663': 3, '0000-0003-1565-5918': 3, '0000-0003-2824-3402': 1, '0000-0001-6804-3830': 1, '0000-0001-6955-9134': 1})
['0000-0002-8850-0485', '0000-0002-1348-

Minimal sample size:  9
maximal sample size:  317
0.6542433385580181
(269, 100)
1
269
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        84
           1       0.97      1.00      0.99       102
           2       0.95      1.00      0.98        83

   micro avg       0.97      0.97      0.97       269
   macro avg       0.98      0.97      0.97       269
weighted avg       0.97      0.97      0.97       269

[ 77   3   4   0 102   0   0   0  83]
svc Accuracy:  0.9739776951672863
svc F1:  0.9728331912475134
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        84
           1       0.96      1.00      0.98       102
           2       0.98      1.00      0.99        83

   micro avg       0.98      0.98      0.98       269
   macro avg       0.98      0.98      0.98       269
weighted avg       0.98      0.98      0.98       269

[ 78   4   2   0 102   0   0   0  83]
LR Accuracy:  

Minimal sample size:  9
maximal sample size:  444
0.8235752857222426
(166, 100)
1
166
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       125
           1       1.00      0.95      0.97        41

   micro avg       0.99      0.99      0.99       166
   macro avg       0.99      0.98      0.98       166
weighted avg       0.99      0.99      0.99       166

[125   0   2  39]
svc Accuracy:  0.9879518072289156
svc F1:  0.9835317460317461
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       125
           1       1.00      0.88      0.94        41

   micro avg       0.97      0.97      0.97       166
   macro avg       0.98      0.94      0.96       166
weighted avg       0.97      0.97      0.97       166

[125   0   5  36]
LR Accuracy:  0.9698795180722891
LR F1:  0.95772854596384
For name:  c_guo
total sample size before apply threshold:  6
Counter({'0000-0001-9253-3469': 2, '0000-0

Minimal sample size:  6
maximal sample size:  315
0.4837748209029722
(493, 100)
1
493
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       244
           1       0.96      0.90      0.93        30
           2       1.00      0.97      0.98        67
           3       1.00      0.93      0.97        30
           4       0.97      0.97      0.97        71
           5       0.94      1.00      0.97        51

   micro avg       0.98      0.98      0.98       493
   macro avg       0.98      0.96      0.97       493
weighted avg       0.98      0.98      0.98       493

[243   0   0   0   0   1   1  27   0   0   2   0   1   1  65   0   0   0
   0   0   0  28   0   2   2   0   0   0  69   0   0   0   0   0   0  51]
svc Accuracy:  0.9797160243407708
svc F1:  0.9690794111569131
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       244
           1       1.00      0.83      0.91        30

              precision    recall  f1-score   support

           0       0.84      0.90      0.87        40
           1       0.98      0.91      0.94        45
           2       0.93      0.82      0.87        34
           3       0.98      0.93      0.95        44
           4       0.81      0.95      0.87        65
           5       0.92      0.74      0.82        31

   micro avg       0.89      0.89      0.89       259
   macro avg       0.91      0.88      0.89       259
weighted avg       0.90      0.89      0.89       259

[36  0  2  0  2  0  0 41  0  0  4  0  6  0 28  0  0  0  1  0  0 41  2  0
  0  0  0  1 62  2  0  1  0  0  7 23]
LR Accuracy:  0.8918918918918919
LR F1:  0.8888591658819281
For name:  m_magnusson
total sample size before apply threshold:  67
Counter({'0000-0003-1710-5936': 24, '0000-0002-6565-4027': 22, '0000-0002-3141-8544': 10, '0000-0002-7574-1095': 7, '0000-0002-8049-2142': 3, '0000-0001-5388-6608': 1})
[]
Total sample size after apply threshold:  0
F

0.739050096900825
(196, 100)
1
196
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        36
           1       0.92      1.00      0.96       105
           2       1.00      0.87      0.93        55

   micro avg       0.95      0.95      0.95       196
   macro avg       0.97      0.94      0.95       196
weighted avg       0.96      0.95      0.95       196

[ 34   2   0   0 105   0   0   7  48]
svc Accuracy:  0.9540816326530612
svc F1:  0.9541238386563563
              precision    recall  f1-score   support

           0       1.00      0.83      0.91        36
           1       0.85      1.00      0.92       105
           2       1.00      0.78      0.88        55

   micro avg       0.91      0.91      0.91       196
   macro avg       0.95      0.87      0.90       196
weighted avg       0.92      0.91      0.91       196

[ 30   6   0   0 105   0   0  12  43]
LR Accuracy:  0.9081632653061225
LR F1:  0.9025648536926733
For n

Minimal sample size:  10
maximal sample size:  314
0.7582837191162626
(196, 100)
1
196
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       142
           1       1.00      0.94      0.97        54

   micro avg       0.98      0.98      0.98       196
   macro avg       0.99      0.97      0.98       196
weighted avg       0.99      0.98      0.98       196

[142   0   3  51]
svc Accuracy:  0.9846938775510204
svc F1:  0.9804878048780488
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       142
           1       1.00      0.85      0.92        54

   micro avg       0.96      0.96      0.96       196
   macro avg       0.97      0.93      0.95       196
weighted avg       0.96      0.96      0.96       196

[142   0   8  46]
LR Accuracy:  0.9591836734693877
LR F1:  0.9463013698630136
For name:  j_moreno
total sample size before apply threshold:  138
Counter({'0000-0003-0087-4659': 44

              precision    recall  f1-score   support

           0       1.00      0.91      0.95        58
           1       0.94      1.00      0.97        83

   micro avg       0.96      0.96      0.96       141
   macro avg       0.97      0.96      0.96       141
weighted avg       0.97      0.96      0.96       141

[53  5  0 83]
LR Accuracy:  0.9645390070921985
LR F1:  0.9628575944365418
For name:  q_lu
total sample size before apply threshold:  35
Counter({'0000-0002-2804-0827': 22, '0000-0002-4261-5121': 5, '0000-0002-4514-0969': 4, '0000-0002-7952-2332': 3, '0000-0001-6234-4384': 1})
[]
Total sample size after apply threshold:  0
For name:  s_kumar
total sample size before apply threshold:  419
Counter({'0000-0003-4326-5941': 130, '0000-0002-4003-4411': 42, '0000-0003-2405-3791': 25, '0000-0003-0658-8709': 21, '0000-0001-8373-105X': 19, '0000-0001-5902-6641': 18, '0000-0001-5940-9490': 14, '0000-0003-0562-2645': 13, '0000-0003-2130-7493': 13, '0000-0003-0423-2880': 9, '000

Minimal sample size:  8
maximal sample size:  454
0.7486526309226937
(200, 100)
1
200
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       110
           1       1.00      1.00      1.00        90

   micro avg       1.00      1.00      1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

[110   0   0  90]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       110
           1       1.00      1.00      1.00        90

   micro avg       1.00      1.00      1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

[110   0   0  90]
LR Accuracy:  1.0
LR F1:  1.0
For name:  a_sharma
total sample size before apply threshold:  223
Counter({'0000-0002-2653-0806': 85, '0000-0003-3349-4417': 23, '0000-0002-7668-3501': 14, '0000

Minimal sample size:  10
maximal sample size:  356
0.609044324758479
(287, 100)
1
287
              precision    recall  f1-score   support

           0       0.98      0.88      0.93        50
           1       0.97      0.93      0.95        71
           2       0.86      0.96      0.91        71
           3       1.00      0.98      0.99        55
           4       0.93      0.95      0.94        40

   micro avg       0.94      0.94      0.94       287
   macro avg       0.95      0.94      0.94       287
weighted avg       0.94      0.94      0.94       287

[44  0  5  0  1  0 66  5  0  0  0  2 68  0  1  0  0  0 54  1  1  0  1  0
 38]
svc Accuracy:  0.9407665505226481
svc F1:  0.9423440073843603
              precision    recall  f1-score   support

           0       0.95      0.80      0.87        50
           1       0.94      0.93      0.94        71
           2       0.83      0.96      0.89        71
           3       0.96      0.98      0.97        55
           4  

Minimal sample size:  10
maximal sample size:  332
0.7022100848597225
(214, 100)
1
214
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       140
           1       0.97      0.89      0.93        37
           2       1.00      0.97      0.99        37

   micro avg       0.98      0.98      0.98       214
   macro avg       0.98      0.95      0.97       214
weighted avg       0.98      0.98      0.98       214

[140   0   0   4  33   0   0   1  36]
svc Accuracy:  0.9766355140186916
svc F1:  0.9672647758698308
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       140
           1       1.00      0.65      0.79        37
           2       1.00      0.68      0.81        37

   micro avg       0.88      0.88      0.88       214
   macro avg       0.95      0.77      0.84       214
weighted avg       0.90      0.88      0.88       214

[140   0   0  13  24   0  12   0  25]
LR Accuracy: 

Minimal sample size:  8
maximal sample size:  212
0.9999999999999999
(99, 99)
1
99
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        33
           1       0.99      1.00      0.99        66

   micro avg       0.99      0.99      0.99        99
   macro avg       0.99      0.98      0.99        99
weighted avg       0.99      0.99      0.99        99

[32  1  0 66]
svc Accuracy:  0.98989898989899
svc F1:  0.9885482938114517
              precision    recall  f1-score   support

           0       1.00      0.88      0.94        33
           1       0.94      1.00      0.97        66

   micro avg       0.96      0.96      0.96        99
   macro avg       0.97      0.94      0.95        99
weighted avg       0.96      0.96      0.96        99

[29  4  0 66]
LR Accuracy:  0.9595959595959596
LR F1:  0.9530360531309298
For name:  x_cao
total sample size before apply threshold:  74
Counter({'0000-0002-3004-7518': 25, '0000-0001-7222-

Minimal sample size:  6
maximal sample size:  384
0.6787877293055953
(234, 100)
1
234
              precision    recall  f1-score   support

           0       0.88      0.83      0.86        36
           1       0.89      0.99      0.94       117
           2       0.87      0.81      0.84        48
           3       1.00      0.76      0.86        33

   micro avg       0.90      0.90      0.90       234
   macro avg       0.91      0.85      0.87       234
weighted avg       0.90      0.90      0.90       234

[ 30   1   5   0   0 116   1   0   3   6  39   0   1   7   0  25]
svc Accuracy:  0.8974358974358975
svc F1:  0.8742981887850456
              precision    recall  f1-score   support

           0       0.90      0.53      0.67        36
           1       0.72      0.99      0.83       117
           2       0.85      0.58      0.69        48
           3       1.00      0.55      0.71        33

   micro avg       0.77      0.77      0.77       234
   macro avg       0.87  

Minimal sample size:  7
maximal sample size:  467
1.0000000000000004
(89, 89)
1
89
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        54
           1       1.00      1.00      1.00        35

   micro avg       1.00      1.00      1.00        89
   macro avg       1.00      1.00      1.00        89
weighted avg       1.00      1.00      1.00        89

[54  0  0 35]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        54
           1       1.00      1.00      1.00        35

   micro avg       1.00      1.00      1.00        89
   macro avg       1.00      1.00      1.00        89
weighted avg       1.00      1.00      1.00        89

[54  0  0 35]
LR Accuracy:  1.0
LR F1:  1.0
For name:  s_thompson
total sample size before apply threshold:  45
Counter({'0000-0003-0327-7155': 36, '0000-0003-4784-8386': 3, '0000-0001-9689-1490': 2, '0000-0001-9637-2

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


total sample size before apply threshold:  190
Counter({'0000-0003-3414-3440': 78, '0000-0002-5857-3851': 42, '0000-0003-1072-0223': 25, '0000-0001-6299-3747': 24, '0000-0003-1710-3914': 9, '0000-0002-3866-1344': 6, '0000-0003-0072-3316': 3, '0000-0002-9319-1701': 2, '0000-0003-3463-9200': 1})
['0000-0003-3414-3440', '0000-0002-5857-3851']
Total sample size after apply threshold:  120
Minimal sample size:  4
maximal sample size:  281
0.9332857388703321
(120, 100)
1
120
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        78
           1       1.00      1.00      1.00        42

   micro avg       1.00      1.00      1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120

[78  0  0 42]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        78
           1       1.00      0.88      0.94 

Minimal sample size:  5
maximal sample size:  265
0.9999999999999994
(85, 85)
1
85
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.98      0.98      0.98        43

   micro avg       0.98      0.98      0.98        85
   macro avg       0.98      0.98      0.98        85
weighted avg       0.98      0.98      0.98        85

[41  1  1 42]
svc Accuracy:  0.9764705882352941
svc F1:  0.9764673311184939
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        42
           1       0.98      1.00      0.99        43

   micro avg       0.99      0.99      0.99        85
   macro avg       0.99      0.99      0.99        85
weighted avg       0.99      0.99      0.99        85

[41  1  0 43]
LR Accuracy:  0.9882352941176471
LR F1:  0.9882287771776762
For name:  j_regan
total sample size before apply threshold:  27
Counter({'0000-0003-2164-9151': 10, '0000-0001-5

0.5643693524595701
(364, 100)
1
364
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       194
           1       0.86      0.92      0.89        39
           2       0.96      0.87      0.91        30
           3       0.93      0.94      0.94       101

   micro avg       0.96      0.96      0.96       364
   macro avg       0.94      0.93      0.93       364
weighted avg       0.96      0.96      0.96       364

[193   0   0   1   0  36   0   3   0   1  26   3   0   5   1  95]
svc Accuracy:  0.9615384615384616
svc F1:  0.9336365506120287
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       194
           1       0.97      0.77      0.86        39
           2       1.00      0.60      0.75        30
           3       0.84      0.96      0.89       101

   micro avg       0.93      0.93      0.93       364
   macro avg       0.94      0.83      0.87       364
weighted avg       0.

Minimal sample size:  8
maximal sample size:  324
0.5803981467061281
(321, 100)
1
321
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       102
           1       1.00      1.00      1.00        46
           2       0.99      1.00      1.00       173

   micro avg       1.00      1.00      1.00       321
   macro avg       1.00      1.00      1.00       321
weighted avg       1.00      1.00      1.00       321

[101   0   1   0  46   0   0   0 173]
svc Accuracy:  0.9968847352024922
svc F1:  0.9973973490817375
              precision    recall  f1-score   support

           0       1.00      0.95      0.97       102
           1       1.00      0.76      0.86        46
           2       0.92      1.00      0.96       173

   micro avg       0.95      0.95      0.95       321
   macro avg       0.97      0.90      0.93       321
weighted avg       0.95      0.95      0.95       321

[ 97   0   5   0  35  11   0   0 173]
LR Accuracy:  

Minimal sample size:  19
maximal sample size:  332
1.000000000000001
(72, 72)
1
72
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        35
           1       1.00      0.95      0.97        37

   micro avg       0.97      0.97      0.97        72
   macro avg       0.97      0.97      0.97        72
weighted avg       0.97      0.97      0.97        72

[35  0  2 35]
svc Accuracy:  0.9722222222222222
svc F1:  0.9722222222222222
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        35
           1       1.00      0.97      0.99        37

   micro avg       0.99      0.99      0.99        72
   macro avg       0.99      0.99      0.99        72
weighted avg       0.99      0.99      0.99        72

[35  0  1 36]
LR Accuracy:  0.9861111111111112
LR F1:  0.9861084314103801
For name:  a_moura
total sample size before apply threshold:  36
Counter({'0000-0003-0339-1230': 15, '0000-0002-2

Minimal sample size:  10
maximal sample size:  262
0.8148559520836104
(172, 100)
1
172
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       120
           1       1.00      0.98      0.99        52

   micro avg       0.99      0.99      0.99       172
   macro avg       1.00      0.99      0.99       172
weighted avg       0.99      0.99      0.99       172

[120   0   1  51]
svc Accuracy:  0.9941860465116279
svc F1:  0.9930709422712807
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       120
           1       1.00      0.96      0.98        52

   micro avg       0.99      0.99      0.99       172
   macro avg       0.99      0.98      0.99       172
weighted avg       0.99      0.99      0.99       172

[120   0   2  50]
LR Accuracy:  0.9883720930232558
LR F1:  0.9860638470264138
For name:  h_shin
total sample size before apply threshold:  114
Counter({'0000-0001-7615-9809': 34, 

Minimal sample size:  4
maximal sample size:  607
0.8703087950159599
(144, 100)
1
144
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       102
           1       1.00      1.00      1.00        42

   micro avg       1.00      1.00      1.00       144
   macro avg       1.00      1.00      1.00       144
weighted avg       1.00      1.00      1.00       144

[102   0   0  42]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       102
           1       1.00      0.86      0.92        42

   micro avg       0.96      0.96      0.96       144
   macro avg       0.97      0.93      0.95       144
weighted avg       0.96      0.96      0.96       144

[102   0   6  36]
LR Accuracy:  0.9583333333333334
LR F1:  0.9472527472527472
For name:  m_kang
total sample size before apply threshold:  131
Counter({'0000-0003-1595-1717': 38, '0000-0003-3245-144X': 19, '000

Minimal sample size:  10
maximal sample size:  313
0.5733392782249038
(346, 100)
1
346
              precision    recall  f1-score   support

           0       0.75      0.88      0.81        41
           1       0.97      0.97      0.97        38
           2       0.91      0.93      0.92        42
           3       0.94      0.89      0.92        37
           4       0.98      0.98      0.98        50
           5       1.00      0.97      0.99        38
           6       1.00      0.98      0.99        46
           7       1.00      0.93      0.96        54

   micro avg       0.94      0.94      0.94       346
   macro avg       0.94      0.94      0.94       346
weighted avg       0.95      0.94      0.94       346

[36  1  4  0  0  0  0  0  1 37  0  0  0  0  0  0  2  0 39  1  0  0  0  0
  4  0  0 33  0  0  0  0  1  0  0  0 49  0  0  0  0  0  0  1  0 37  0  0
  1  0  0  0  0  0 45  0  3  0  0  0  1  0  0 50]
svc Accuracy:  0.9421965317919075
svc F1:  0.9417753521596965
    

total sample size before apply threshold:  150
Counter({'0000-0002-8132-5610': 54, '0000-0002-5677-7332': 30, '0000-0003-4380-3711': 25, '0000-0001-6200-3686': 16, '0000-0003-0732-1571': 14, '0000-0003-1999-1206': 4, '0000-0003-1947-8605': 4, '0000-0001-8957-661X': 2, '0000-0001-9341-5827': 1})
['0000-0002-8132-5610', '0000-0002-5677-7332']
Total sample size after apply threshold:  84
Minimal sample size:  2
maximal sample size:  312
1.0000000000000004
(84, 84)
1
84
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        54
           1       1.00      1.00      1.00        30

   micro avg       1.00      1.00      1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84

[54  0  0 30]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        54
           1       1.00      0.87      0.93    

total sample size before apply threshold:  57
Counter({'0000-0002-8925-2567': 21, '0000-0003-1076-2669': 18, '0000-0003-3698-1608': 12, '0000-0002-3326-3670': 6})
[]
Total sample size after apply threshold:  0
For name:  g_coppola
total sample size before apply threshold:  142
Counter({'0000-0002-9574-0081': 61, '0000-0002-8510-6925': 57, '0000-0003-0147-6142': 16, '0000-0003-2675-783X': 7, '0000-0001-7139-3719': 1})
['0000-0002-9574-0081', '0000-0002-8510-6925']
Total sample size after apply threshold:  118
Minimal sample size:  3
maximal sample size:  365
0.9445336308232949
(118, 100)
1
118
              precision    recall  f1-score   support

           0       1.00      0.95      0.97        61
           1       0.95      1.00      0.97        57

   micro avg       0.97      0.97      0.97       118
   macro avg       0.97      0.98      0.97       118
weighted avg       0.98      0.97      0.97       118

[58  3  0 57]
svc Accuracy:  0.9745762711864406
svc F1:  0.97457444516268

Minimal sample size:  12
maximal sample size:  339
0.9974360525392357
(101, 100)
1
101
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        64
           1       1.00      0.97      0.99        37

   micro avg       0.99      0.99      0.99       101
   macro avg       0.99      0.99      0.99       101
weighted avg       0.99      0.99      0.99       101

[64  0  1 36]
svc Accuracy:  0.9900990099009901
svc F1:  0.9892747159392588
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        64
           1       1.00      0.86      0.93        37

   micro avg       0.95      0.95      0.95       101
   macro avg       0.96      0.93      0.94       101
weighted avg       0.95      0.95      0.95       101

[64  0  5 32]
LR Accuracy:  0.9504950495049505
LR F1:  0.944971123460826
For name:  c_cao
total sample size before apply threshold:  74
Counter({'0000-0003-2139-1648': 25, '0000-0003-

Minimal sample size:  0
maximal sample size:  283
0.5975310561298496
(325, 100)
1
325
              precision    recall  f1-score   support

           0       0.94      0.88      0.91        34
           1       0.99      1.00      0.99        95
           2       0.98      0.97      0.98       109
           3       0.89      1.00      0.94        48
           4       1.00      0.90      0.95        39

   micro avg       0.97      0.97      0.97       325
   macro avg       0.96      0.95      0.95       325
weighted avg       0.97      0.97      0.97       325

[ 30   0   2   2   0   0  95   0   0   0   2   0 106   1   0   0   0   0
  48   0   0   1   0   3  35]
svc Accuracy:  0.9661538461538461
svc F1:  0.9535872497752944
              precision    recall  f1-score   support

           0       0.97      0.85      0.91        34
           1       0.97      1.00      0.98        95
           2       0.87      0.98      0.92       109
           3       0.96      0.90      0.92

total sample size before apply threshold:  789
Counter({'0000-0002-2381-2349': 587, '0000-0002-5252-9649': 43, '0000-0001-5645-8422': 31, '0000-0002-9174-7681': 19, '0000-0001-8483-6777': 19, '0000-0003-1599-9171': 13, '0000-0003-0245-2265': 13, '0000-0001-9746-1230': 10, '0000-0002-8343-794X': 8, '0000-0002-6881-5690': 8, '0000-0003-4000-2919': 6, '0000-0002-3540-1133': 6, '0000-0003-2502-5098': 6, '0000-0001-9634-2918': 4, '0000-0002-6825-888X': 4, '0000-0003-1209-9653': 3, '0000-0002-9044-9199': 3, '0000-0003-2340-0042': 2, '0000-0002-3794-3788': 2, '0000-0002-7413-4189': 1, '0000-0001-7479-7778': 1})
['0000-0002-2381-2349', '0000-0002-5252-9649', '0000-0001-5645-8422']
Total sample size after apply threshold:  661
Minimal sample size:  7
maximal sample size:  299
0.4015587634840025
(661, 100)
1
661
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       587
           1       0.97      0.84      0.90        43
           2       0.95

['0000-0002-1051-7260', '0000-0001-5675-1258']
Total sample size after apply threshold:  105
Minimal sample size:  6
maximal sample size:  283
0.9841641203078756
(105, 100)
1
105
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        75
           1       1.00      0.97      0.98        30

   micro avg       0.99      0.99      0.99       105
   macro avg       0.99      0.98      0.99       105
weighted avg       0.99      0.99      0.99       105

[75  0  1 29]
svc Accuracy:  0.9904761904761905
svc F1:  0.9882141654506678
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        75
           1       1.00      0.93      0.97        30

   micro avg       0.98      0.98      0.98       105
   macro avg       0.99      0.97      0.98       105
weighted avg       0.98      0.98      0.98       105

[75  0  2 28]
LR Accuracy:  0.9809523809523809
LR F1:  0.9761796733212341
For name:  a_chen

Total sample size after apply threshold:  0
For name:  b_ryan
total sample size before apply threshold:  31
Counter({'0000-0002-6703-3718': 15, '0000-0001-7213-3273': 11, '0000-0002-5018-2952': 3, '0000-0003-3881-8556': 2})
[]
Total sample size after apply threshold:  0
For name:  j_kim
total sample size before apply threshold:  2116
Counter({'0000-0003-1835-9436': 200, '0000-0003-3477-1172': 146, '0000-0003-1232-5307': 124, '0000-0001-6537-0350': 78, '0000-0003-0934-3344': 73, '0000-0001-7964-106X': 56, '0000-0003-2337-6935': 52, '0000-0003-2068-7287': 51, '0000-0002-3573-638X': 46, '0000-0003-4085-293X': 41, '0000-0002-6349-6950': 41, '0000-0002-6931-8581': 38, '0000-0002-4171-3803': 38, '0000-0003-0373-5080': 36, '0000-0002-1299-4300': 36, '0000-0002-8383-8524': 33, '0000-0002-0087-1151': 32, '0000-0002-3500-7494': 32, '0000-0002-4687-6732': 31, '0000-0001-5979-5774': 30, '0000-0001-9660-6303': 29, '0000-0002-1903-8354': 28, '0000-0002-5390-8763': 27, '0000-0003-0767-1918': 26, '000

              precision    recall  f1-score   support

           0       0.87      0.96      0.92        56
           1       0.69      0.99      0.82       124
           2       0.94      0.55      0.69        31
           3       1.00      0.81      0.90        32
           4       0.69      0.78      0.73        73
           5       1.00      0.59      0.75        32
           6       0.95      0.87      0.91        46
           7       0.97      0.79      0.87        38
           8       1.00      0.97      0.98        30
           9       1.00      0.50      0.67        38
          10       0.78      0.41      0.54        51
          11       1.00      0.93      0.96        41
          12       1.00      0.39      0.56        36
          13       0.73      0.51      0.60        78
          14       0.91      0.30      0.45        33
          15       0.74      1.00      0.85       200
          16       0.93      0.75      0.83        52
          17       0.94    

Minimal sample size:  10
maximal sample size:  330
0.6497486585930867
(288, 100)
1
288
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       119
           1       1.00      0.92      0.96        48
           2       0.94      0.99      0.97        85
           3       1.00      0.97      0.99        36

   micro avg       0.98      0.98      0.98       288
   macro avg       0.98      0.97      0.97       288
weighted avg       0.98      0.98      0.98       288

[118   0   1   0   1  44   3   0   1   0  84   0   0   0   1  35]
svc Accuracy:  0.9756944444444444
svc F1:  0.9738505430530654
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       119
           1       1.00      0.77      0.87        48
           2       0.94      1.00      0.97        85
           3       1.00      0.94      0.97        36

   micro avg       0.95      0.95      0.95       288
   macro avg       0.97 

Minimal sample size:  7
maximal sample size:  295
0.8264105071918817
(157, 100)
1
157
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        60
           1       1.00      1.00      1.00        97

   micro avg       1.00      1.00      1.00       157
   macro avg       1.00      1.00      1.00       157
weighted avg       1.00      1.00      1.00       157

[60  0  0 97]
svc Accuracy:  1.0
svc F1:  1.0
              precision    recall  f1-score   support

           0       1.00      0.93      0.97        60
           1       0.96      1.00      0.98        97

   micro avg       0.97      0.97      0.97       157
   macro avg       0.98      0.97      0.97       157
weighted avg       0.98      0.97      0.97       157

[56  4  0 97]
LR Accuracy:  0.9745222929936306
LR F1:  0.9726576105886451
For name:  m_aguilar
total sample size before apply threshold:  108
Counter({'0000-0002-1935-6619': 59, '0000-0001-7395-5754': 18, '0000-000

Minimal sample size:  7
maximal sample size:  344
0.4625628505950121
(514, 100)
1
514
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        40
           1       0.96      0.98      0.97        53
           2       0.97      0.97      0.97        64
           3       1.00      0.87      0.93        31
           4       0.86      1.00      0.93        56
           5       0.95      1.00      0.98        98
           6       0.95      0.67      0.78        30
           7       0.93      0.98      0.95        63
           8       1.00      0.97      0.98        31
           9       1.00      0.92      0.96        48

   micro avg       0.95      0.95      0.95       514
   macro avg       0.96      0.93      0.94       514
weighted avg       0.96      0.95      0.95       514

[39  0  0  0  0  0  0  1  0  0  0 52  0  0  0  1  0  0  0  0  0  0 62  0
  1  1  0  0  0  0  0  0  2 27  0  1  1  0  0  0  0  0  0  0 56  0  0  0
  0  0  0

Minimal sample size:  14
maximal sample size:  310
0.8422001987742187
(148, 100)
1
148
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        89
           1       1.00      0.98      0.99        59

   micro avg       0.99      0.99      0.99       148
   macro avg       0.99      0.99      0.99       148
weighted avg       0.99      0.99      0.99       148

[89  0  1 58]
svc Accuracy:  0.9932432432432432
svc F1:  0.9929331996371102
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        89
           1       1.00      0.86      0.93        59

   micro avg       0.95      0.95      0.95       148
   macro avg       0.96      0.93      0.94       148
weighted avg       0.95      0.95      0.95       148

[89  0  8 51]
LR Accuracy:  0.9459459459459459
LR F1:  0.9421309872922776
For name:  m_sousa
total sample size before apply threshold:  211
Counter({'0000-0002-3009-3290': 117, '0000-

Minimal sample size:  19
maximal sample size:  305
0.7686120108452724
(180, 100)
1
180
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        48
           1       1.00      1.00      1.00        61
           2       1.00      1.00      1.00        36
           3       0.94      0.94      0.94        35

   micro avg       0.98      0.98      0.98       180
   macro avg       0.98      0.98      0.98       180
weighted avg       0.98      0.98      0.98       180

[46  0  0  2  0 61  0  0  0  0 36  0  2  0  0 33]
svc Accuracy:  0.9777777777777777
svc F1:  0.975297619047619
              precision    recall  f1-score   support

           0       0.92      0.98      0.95        48
           1       1.00      1.00      1.00        61
           2       1.00      0.97      0.99        36
           3       0.97      0.91      0.94        35

   micro avg       0.97      0.97      0.97       180
   macro avg       0.97      0.97      0.

In [32]:
# Print the F1 summary once per entry of all_features: the entry itself, the
# pp_textual list, then the SVC and LR F1 lists. The two score lists are not
# indexed by `feature`, so the same lists are printed on every pass.
for feature in all_features:
    print(feature, pp_textual, sep="\n")
    print("svc:  " + str(modelSVCf1))
    print("lr:  " + str(modelLRf1))

['tf', 'tf_idf', 'lsa']
svc:  [0.9464991405128569, 0.9731343812578639, 0.9706356660582325]
lr:  [0.9699976962200287, 0.9314359637774903, 0.9415017101135941]


In [19]:
# accuracy
from statistics import mean

# Keep only the float entries of each accuracy list; any other entry is skipped.
cleaned_svcLinear_accuracy = list(
    filter(lambda score: isinstance(score, float), all_svcLinear_accuracy)
)
cleaned_lr_accuracy = list(
    filter(lambda score: isinstance(score, float), all_LR_accuracy)
)

# Number of retained entries (SVC, then LR), followed by their arithmetic means.
print(len(cleaned_svcLinear_accuracy), len(cleaned_lr_accuracy), sep="\n")
print(mean(cleaned_svcLinear_accuracy))
print(mean(cleaned_lr_accuracy))

578
578
0.9798807689357634
0.9496564529342066


In [20]:
# f1
from statistics import mean

# Drop string entries from the results: only float F1 values are retained.
cleaned_svcLinear_f1 = list(
    filter(lambda score: isinstance(score, float), all_svcLinear_f1)
)
cleaned_lr_f1 = list(
    filter(lambda score: isinstance(score, float), all_LR_f1)
)

# Number of retained entries (SVC, then LR), followed by their arithmetic means.
print(len(cleaned_svcLinear_f1), len(cleaned_lr_f1), sep="\n")
print(mean(cleaned_svcLinear_f1))
print(mean(cleaned_lr_f1))

578
578
0.9764067884953497
0.9288583090243283


In [None]:
# IPython line magic: clear the names defined in the interactive namespace.
# NOTE(review): without the -f flag this is expected to ask for confirmation
# before clearing — confirm against the IPython magics documentation.
%reset

In [None]:
# IPython line magic: list the names currently defined in the interactive namespace.
%who