In [1]:
import com_func

# Dataset folder name; used to build the ../Data/<Dataset>/ input path and the
# ../result/<Dataset>/ output path further down.
Dataset = "pubmed"

# parameters
# Authors with fewer than `threshold` papers in a name group are dropped before training.
threshold = 10
# Passed as `min_df` to the textual vectorizers (see LSA / raw_text_to_vector).
cutoff = 3

# Embedding type per feature; the *_to_vector helpers accept "tf", "tf_idf" or "off".
coauthor_emb_type = "tf_idf"
venue_emb_type = "off"
year_emb_type = "off"
# NOTE(review): the two settings below are not read by any code visible in this notebook.
pp_textual_emb_type = "off"
citation_emb_type = "off"

In [16]:
def dummy(doc):
    """Return ``doc`` unchanged.

    Passed as both ``tokenizer`` and ``preprocessor`` to the vectorizers built in
    ``LSA`` and ``raw_text_to_vector`` so that those steps leave each document as-is.
    """
    return doc
def read_labeled_file(infile):
    """Load the fields used by this notebook from a tab-separated labeled file.

    Each line is split on tab characters. Lines with 12 or 13 fields are kept;
    for any other line the field count is printed and the line is skipped.

    Parameters
    ----------
    infile : str
        Path to the UTF-8 encoded labeled file.

    Returns
    -------
    pandas.DataFrame
        One row per kept line with the string columns "paperID", "authorID",
        "co-author", "venue_id" and "publish_year" (fields 0, 1, 5, 7 and 10).
        The columns are present even when no line is kept, so callers can
        always index by column name.
    """
    columns = ["paperID", "authorID", "co-author", "venue_id", "publish_year"]
    field_positions = [0, 1, 5, 7, 10]
    records = []
    # The context manager closes the file; no explicit close() is needed.
    with open(infile, 'r', encoding='utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # get rid of badly formatted lines
            if len(read_data) in (12, 13):
                records.append({col: read_data[pos]
                                for col, pos in zip(columns, field_positions)})
            else:
                print(len(read_data))
    # Passing `columns` keeps the schema stable for an empty / all-bad file.
    return pd.DataFrame(records, columns=columns)

In [3]:
def LSA(cleaned_token, dim=100):
    """Project documents into a latent space with TF-IDF followed by truncated SVD.

    Parameters
    ----------
    cleaned_token : iterable
        Documents handed to ``TfidfVectorizer``. ``dummy`` is used as tokenizer
        and preprocessor, so each document is consumed as given
        (presumably a list of tokens -- confirm against the caller).
    dim : int, default 100
        Requested number of SVD components. When the TF-IDF vocabulary has
        fewer than ``dim`` terms, the number of components is reduced to
        ``n_terms - 1``, but never below 1.

    Returns
    -------
    numpy.ndarray
        The SVD-transformed matrix, one row per document.

    Notes
    -----
    Prints the summed explained-variance ratio of the fitted SVD. Terms are
    filtered with ``min_df=cutoff`` (module-level setting).
    """
    # Tf-idf Transformation
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True, tokenizer=dummy,
                                       preprocessor=dummy, stop_words=None, min_df=cutoff)
    # Keep the vectorizer output as-is: TruncatedSVD accepts sparse input, so
    # materialising a dense copy first only costs memory.
    tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_token)
    n_terms = tfidf_matrix.shape[1]
    if n_terms < dim:
        # Clamp at 1 so a single-term vocabulary does not request 0 components.
        dim = max(n_terms - 1, 1)
    # tf-idf + svd
    svd = TruncatedSVD(n_components=dim)
    final_lsa_Matrix = svd.fit_transform(tfidf_matrix)
    print(svd.explained_variance_ratio_.sum())
    return final_lsa_Matrix

In [4]:
# co-author relation to frequency count
def co_author_to_vector(raw_co_author_data, emb_type="off"):
    """Turn raw co-author strings into a document-term matrix.

    Parameters
    ----------
    raw_co_author_data : iterable of str
        One co-author string per paper.
    emb_type : {"tf", "tf_idf", "off"}, default "off"
        "tf" uses ``CountVectorizer``, "tf_idf" uses ``TfidfVectorizer``
        (sublinear tf, smoothed idf) and "off" disables the feature. Any other
        value prints a notice and is treated as "off".

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        Dense matrix for "tf" / "tf_idf"; an empty ``DataFrame`` for "off".
    """
    # Resolve unsupported settings once, instead of looping back around.
    if emb_type not in ("tf", "tf_idf", "off"):
        print("Embedding type not available, selecting default setting")
        emb_type = "off"

    if emb_type == "off":
        return pd.DataFrame()

    if emb_type == "tf":
        vectorizer = CountVectorizer()
    else:
        vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True,
                                     stop_words=None)
    # Echo the vectorizer configuration before fitting.
    print(vectorizer)
    return vectorizer.fit_transform(raw_co_author_data).toarray()

In [5]:
# venue relation with author
def venue_to_vector(raw_venue_id, emb_type="off"):
    """Turn raw venue ids into a document-term matrix.

    Parameters
    ----------
    raw_venue_id : iterable of str
        One venue id string per paper.
    emb_type : {"tf", "tf_idf", "off"}, default "off"
        "tf" uses ``CountVectorizer``, "tf_idf" uses ``TfidfVectorizer``
        (sublinear tf, smoothed idf) and "off" disables the feature. Any other
        value prints a notice and is treated as "off".

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        Dense matrix for "tf" / "tf_idf"; an empty ``DataFrame`` for "off".
    """
    if emb_type not in ("tf", "tf_idf", "off"):
        print("Embedding type not available, selecting default setting")
        emb_type = "off"

    if emb_type == "off":
        return pd.DataFrame()

    if emb_type == "tf":
        vectorizer = CountVectorizer()
    else:
        vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True,
                                     stop_words=None)
    print(vectorizer)
    # Fit on this function's own argument. The tf-idf branch used to reference
    # `raw_co_author_data`, which is not defined in this scope.
    return vectorizer.fit_transform(raw_venue_id).toarray()

In [6]:
# author-year relation to emb
def year_to_vector(raw_year, emb_type="off"):
    """Turn raw publication-year strings into a document-term matrix.

    Parameters
    ----------
    raw_year : iterable of str
        One publication-year string per paper.
    emb_type : {"tf", "tf_idf", "off"}, default "off"
        "tf" uses ``CountVectorizer``, "tf_idf" uses ``TfidfVectorizer``
        (sublinear tf, smoothed idf) and "off" disables the feature. Any other
        value prints a notice and is treated as "off", the default of this
        function and of the sibling ``*_to_vector`` helpers.

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        Dense matrix for "tf" / "tf_idf"; an empty ``DataFrame`` for "off".
    """
    if emb_type not in ("tf", "tf_idf", "off"):
        print("Embedding type not available, selecting default setting")
        # The announced "default setting" is "off"; it used to silently
        # switch to "tf" here, unlike the other helpers.
        emb_type = "off"

    if emb_type == "off":
        return pd.DataFrame()

    if emb_type == "tf":
        count_vectorizer = CountVectorizer()
        return count_vectorizer.fit_transform(raw_year).toarray()

    tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True,
                                       stop_words=None)
    print(tfidf_vectorizer)
    # Fit on this function's own argument. The tf-idf branch used to reference
    # `raw_co_author_data`, which is not defined in this scope.
    return tfidf_vectorizer.fit_transform(raw_year).toarray()

In [7]:
# document relation wrt textual content
# convert raw text to numerical feature vectors
# bow(Bags of words) are used with uni-gram setting
def raw_text_to_vector(raw_textual_content, emb_type="off", stopword=True):
    """Clean raw text and convert it to numerical feature vectors.

    The text is first cleaned with ``com_func.clean_batch_of_raw``; the
    smallest and largest per-document sample sizes it reports are printed.

    Parameters
    ----------
    raw_textual_content : iterable
        Raw documents, forwarded to ``com_func.clean_batch_of_raw``.
    emb_type : {"tf_idf", "tf", "lsa", "off"}, default "off"
        Feature representation. Any other value prints a notice and is
        treated as "off".
    stopword : bool, default True
        Forwarded to ``com_func.clean_batch_of_raw``.

    Returns
    -------
    tuple
        ``(features, average_sample_size)`` where ``features`` is a dense
        matrix, or an empty ``DataFrame`` when the embedding is "off".
    """
    cleaned_token, sample_size = com_func.clean_batch_of_raw(raw_textual_content, stopword=stopword)
    average_sample_size = sum(sample_size) / len(sample_size)
    print("Minimal sample size: ", min(sample_size))
    print("maximal sample size: ", max(sample_size))

    # Normalise unsupported settings up front rather than looping back.
    if emb_type not in ("tf_idf", "tf", "lsa", "off"):
        print("Embedding type not available, selecting default setting")
        emb_type = "off"

    if emb_type == "tf_idf":
        # tf-idf weighted document-term matrix
        vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True, tokenizer=dummy,
                                     preprocessor=dummy, stop_words=None, min_df=cutoff)
        print(vectorizer)
        features = vectorizer.fit_transform(cleaned_token).toarray()
    elif emb_type == "tf":
        # document-term frequency matrix
        vectorizer = CountVectorizer(tokenizer=dummy, preprocessor=dummy, min_df=cutoff)
        features = vectorizer.fit_transform(cleaned_token).toarray()
    elif emb_type == "lsa":
        # latent semantic analysis on top of tf-idf
        features = LSA(cleaned_token, dim=100)
    else:
        features = pd.DataFrame()
    return features, average_sample_size

In [8]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score
# cross validation
def k_fold_cv(data, label, clf, k=10):
    """Evaluate ``clf`` with k-fold cross validation and pooled predictions.

    The data is split with ``KFold(n_splits=k, shuffle=False)``. For each fold
    ``clf`` is refitted on the training part, and the predictions for the test
    part are pooled. Metrics are computed once over all pooled predictions.

    Parameters
    ----------
    data : array-like
        Feature matrix supporting row selection with an integer index array.
    label : array-like
        One label per row of ``data``. It is converted to a NumPy array so that
        fold indices are always positional, even for a pandas Series whose
        index is not ``0..n-1``.
    clf : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is refitted in place.
    k : int, default 10
        Number of folds.

    Returns
    -------
    tuple of float
        ``(accuracy, macro_f1)`` over the pooled predictions.

    Notes
    -----
    Prints the classification report and the flattened confusion matrix.
    """
    # Local import so the function does not depend on notebook cell order.
    import numpy as np

    # Positional indexing only: a Series would be indexed by label instead.
    label = np.asarray(label)
    kf = KFold(n_splits=k, shuffle=False)
    allTrueLabel = []
    allPredLabel = []
    for train_index, test_index in kf.split(data):
        # split train and test
        data_train, data_test = data[train_index], data[test_index]
        label_train, label_test = label[train_index], label[test_index]
        # fit data to clf
        clf.fit(data_train, label_train)
        # get predicted label
        label_pred = clf.predict(data_test)
        allTrueLabel.extend(label_test)
        allPredLabel.extend(label_pred)

    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel, average='macro')

    print(metrics.classification_report(allTrueLabel, allPredLabel))
    print(metrics.confusion_matrix(allTrueLabel, allPredLabel).ravel())

    return accuracy, f1

In [21]:
# load the file
import sys
import io
import os
import collections
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


# Evaluate three classifiers (MNB, linear SVC, logistic regression) on every
# labeled name group and write one summary row per group to a CSV file.
fileDir = "../Data/"+Dataset+"/canopies_labeled/"
listfiles = os.listdir(fileDir)
# collect statistic to output
allname = []
num_class = []
per_class_count = []
average_textual_size = []

all_mnb_accuracy = []
all_mnb_f1 = []
all_svcLinear_accuracy = []
all_svcLinear_f1 = []
all_LR_accuracy = []
all_LR_f1 = []

# Lists padded with "Not applicable" whenever a name group cannot be evaluated,
# so every metric column keeps exactly one entry per name group.
not_applicable_targets = [average_textual_size,
                          all_mnb_accuracy, all_mnb_f1,
                          all_svcLinear_accuracy, all_svcLinear_f1,
                          all_LR_accuracy, all_LR_f1]

# read all file in labeled group
for file in listfiles:
    # group name
    temp = file.split("_")
    name = temp[1]+"_"+temp[-1]
    print("For name: ",name)
    allname.append(name)
    # read needed content in labeled file
    labeled_data = read_labeled_file(fileDir+file)
    print("total sample size before apply threshold: ",len(labeled_data))
    # count number of paper each author write based on author ID
    paperCounter = collections.Counter(labeled_data["authorID"])
    print(paperCounter)
    # collect per class statistic; iterate over a copy of the keys because
    # entries are deleted while looping
    for k in list(paperCounter):
        if paperCounter[k] < threshold:
            del paperCounter[k]
    temp =list(paperCounter.keys())
    print(temp)
    per_class_count.append(paperCounter)
    num_class.append(len(paperCounter))
    # remove samples that are smaller than threshold; copy so the label column
    # added below is written to an independent frame, not a slice of the original
    labeled_data = labeled_data[labeled_data.authorID.isin(temp)].copy()
    print("Total sample size after apply threshold: ",len(labeled_data))
    # if only have one class or no class pass the threshold, not applicable
    if len(paperCounter) < 2:
        for results in not_applicable_targets:
            results.append("Not applicable")
        continue

    # convert author id to label: the label is the author's position in `temp`
    label_of_author = {author: position for position, author in enumerate(temp)}
    labeled_data["label"] = labeled_data["authorID"].map(label_of_author)
    # shuffle the data
    # NOTE(review): no random_state is set, so results differ between runs
    labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
    # extract true label and pid
    label = labeled_data["label"]
    pid = labeled_data["paperID"]
    # list of different data field
    part_collection = []
    # data part 1, co-author matrix
    data_part_co_author = co_author_to_vector(labeled_data["co-author"], emb_type=coauthor_emb_type)
    print(data_part_co_author.shape)
    part_collection.append(data_part_co_author)
    # data part 2.1, venue_id that author attend
    data_part_venue = venue_to_vector(labeled_data["venue_id"], emb_type=venue_emb_type)
    print(data_part_venue.shape)
    part_collection.append(data_part_venue)
    # data part 2.2 year that author attend
    data_part_year = year_to_vector(labeled_data["publish_year"], emb_type=year_emb_type)
    print(data_part_year.shape)
    part_collection.append(data_part_year)
    # merge different part of data together by concatenating them column-wise
    # remove empty emb (when emb set off)
    part_collection = [part for part in part_collection if len(part)!=0]
    print(len(part_collection))
    if len(part_collection)>1:
        combinedata = np.concatenate(part_collection,axis=1)
    elif len(part_collection)==1:
        if isinstance(part_collection[0], pd.DataFrame):
            combinedata = part_collection[0].values
        else:
            combinedata = part_collection[0]
    else:
        # No feature is switched on for this group. Record the gap and move on;
        # breaking out here would skip all remaining groups and leave the
        # result lists with different lengths.
        print("No data available")
        for results in not_applicable_targets:
            results.append("Not applicable")
        continue
    print(len(combinedata))
    # using converted feature vector to train classifier
    # using Multinomial naive bayes
    clf = MultinomialNB()
    mnbaccuracy, mnbmarcof1 = k_fold_cv(combinedata, label, clf, k=10)
    print("MNB Accuracy: ",mnbaccuracy)
    print("MNB F1: ", mnbmarcof1)
    all_mnb_accuracy.append(mnbaccuracy)
    all_mnb_f1.append(mnbmarcof1)
    # using SVM with linear kernel
    clf = SVC(decision_function_shape='ovr', kernel='linear')
    svcaccuracy, svcmarcof1 = k_fold_cv(combinedata, label, clf, k=10)
    print("svc Accuracy: ",svcaccuracy)
    print("svc F1: ", svcmarcof1)
    all_svcLinear_accuracy.append(svcaccuracy)
    all_svcLinear_f1.append(svcmarcof1)
    # using logistic regression
    clf = LogisticRegression(multi_class='ovr')
    LRaccuracy, LRmarcof1 = k_fold_cv(combinedata, label, clf, k=10)
    print("LR Accuracy: ",LRaccuracy)
    print("LR F1: ", LRmarcof1)
    all_LR_accuracy.append(LRaccuracy)
    all_LR_f1.append(LRmarcof1)
# write evaluation result to csv
output = pd.DataFrame({'Name Group':allname,"Class number":num_class,"per_class_size":per_class_count, 
                       "svc(linear) accuracy":all_svcLinear_accuracy, "svc(linear) macro f1": all_svcLinear_f1, 
                       "mnb accuracy":all_mnb_accuracy, "mnb macro f1": all_mnb_f1,
                       "logistic regression accuracy":all_LR_accuracy, "logistic regression macro f1": all_LR_f1})

savePath = "../result/"+Dataset+"/coauthor_only/"
# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(savePath, exist_ok=True)
filename = "2004_coauthor_only_"+coauthor_emb_type+"_threshold="+str(threshold)+".csv"
output.to_csv(savePath+filename, encoding='utf-8',index=False)
print("Done")

For name:  j_read
total sample size before apply threshold:  136
Counter({'0000-0002-5159-1192': 57, '0000-0002-9029-5185': 39, '0000-0002-9697-0962': 31, '0000-0002-4739-9245': 3, '0000-0003-0605-5259': 3, '0000-0003-4316-7006': 1, '0000-0002-0784-0091': 1, '0000-0002-3888-6631': 1})
['0000-0002-9697-0962', '0000-0002-9029-5185', '0000-0002-5159-1192']
Total sample size after apply threshold:  127
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(127, 263)
(0, 0)
(0, 0)
1
127
             precision    recall  f1-score   support

          0       1.00      0.77      0.87        31
          1     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.55      0.71        31
          1       1.00      0.54      0.70        39
          2       0.64      1.00      0.78        57

avg / total       0.84      0.75      0.74       127

[17  0 14  0 21 18  0  0 57]
LR Accuracy:  0.7480314960629921
LR F1:  0.7297184170471841
For name:  f_esteves
total sample size before apply threshold:  34
Counter({'0000-0002-3046-1313': 18, '0000-0002-5403-0091': 12, '0000-0003-0589-0746': 3, '0000-0003-3172-6253': 1})
['0000-0002-5403-0091', '0000-0002-3046-1313']
Total sample size after apply threshold:  30
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern=

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  252
Counter({'0000-0003-4341-1283': 51, '0000-0002-3989-7973': 40, '0000-0002-3813-1706': 39, '0000-0003-2772-9531': 27, '0000-0001-6082-9273': 22, '0000-0002-2601-4422': 22, '0000-0002-9448-8144': 19, '0000-0001-8628-4902': 15, '0000-0002-2936-7717': 6, '0000-0003-3898-9734': 6, '0000-0002-5074-6914': 2, '0000-0003-4266-6700': 1, '0000-0002-9286-9787': 1, '0000-0002-0821-0892': 1})
['0000-0003-4341-1283', '0000-0002-9448-8144', '0000-0003-2772-9531', '0000-0001-6082-9273', '0000-0002-3813-1706', '0000-0001-8628-4902', '0000-0002-3989-7973', '0000-0002-2601-4422']
Total sample size after apply threshold:  235
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.48      1.00      0.65        51
          1       1.00      0.42      0.59        19
          2       1.00      0.70      0.83        27
          3       0.91      0.45      0.61        22
          4       0.86      0.95      0.90        39
          5       1.00      0.33      0.50        15
          6       0.97      0.78      0.86        40
          7       1.00      0.45      0.62        22

avg / total       0.85      0.73      0.73       235

[51  0  0  0  0  0  0  0  9  8  0  1  1  0  0  0  8  0 19  0  0  0  0  0
 12  0  0 10  0  0  0  0  1  0  0  0 37  0  1  0  7  0  0  0  3  5  0  0
  7  0  0  0  2  0 31  0 12  0  0  0  0  0  0 10]
MNB Accuracy:  0.7276595744680852
MNB F1:  0.6948574888661821
             precision    recall  f1-score   support

          0       0.54      1.00      0.70        51
          1       1.00      0.63      0.77        19
          2       1.00      0.67      0.80       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.40      0.57        10
          1       0.90      1.00      0.95        69
          2       1.00      0.91      0.95        22

avg / total       0.93      0.92      0.91       101

[ 4  6  0  0 69  0  0  2 20]
svc Accuracy:  0.9207920792079208
svc F1:  0.8230050010871928
             precision    recall  f1-score   support

          0       1.00      0.30      0.46        10
          1       0.79      1.00      0.88        69
          2       1.00      0.50      0.67        22

avg / total       0.86      0.82      0.80       101

[ 3  7  0  0 69  0  0 11 11]
LR Accuracy:  0.8217821782178217
LR F1:  0.6709401709401709
For name:  a_vega
total sample size before apply threshold:  20
Counter({'0000-0002-8207-9925': 10, '0000-0002-2178-2780': 8, '0000-0002-8148-5702': 1, '0000-0003-1082-0961': 1})
['0000-0002-8207-9925']
Total sample size after apply threshold:  10
For name:  k_smith
total sample size

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.87      0.93        15
          1       1.00      0.93      0.96        29
          2       1.00      0.38      0.56        13
          3       0.80      0.99      0.89       133
          4       1.00      0.64      0.78        14
          5       1.00      0.96      0.98        23
          6       0.98      0.87      0.92        75
          7       1.00      0.79      0.88        19

avg / total       0.92      0.90      0.89       321

[ 13   0   0   2   0   0   0   0   0  27   0   2   0   0   0   0   0   0
   5   8   0   0   0   0   0   0   0 132   0   0   1   0   0   0   0   5
   9   0   0   0   0   0   0   1   0  22   0   0   0   0   0  10   0   0
  65   0   0   0   0   4   0   0   0  15]
svc Accuracy:  0.897196261682243
svc F1:  0.8627533521888557
             precision    recall  f1-score   support

          0       1.00      0.60      0.75        15
          1       1.00      0.93      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


[ 0  9  1  0 40  6  0 13 30]
MNB Accuracy:  0.7070707070707071
MNB F1:  0.49691358024691357
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.75      0.85      0.80        46
          2       0.81      0.88      0.84        43

avg / total       0.70      0.78      0.74        99

[ 0  8  2  0 39  7  0  5 38]
svc Accuracy:  0.7777777777777778
svc F1:  0.5467876039304611
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.71      0.85      0.77        46
          2       0.82      0.84      0.83        43

avg / total       0.68      0.76      0.72        99

[ 0  9  1  0 39  7  0  7 36]
LR Accuracy:  0.7575757575757576
LR F1:  0.5332878115397747
For name:  j_qian
total sample size before apply threshold:  17
Counter({'0000-0002-8793-9330': 6, '0000-0001-6145-045X': 6, '0000-0003-3162-2913': 1, '0000-0002-9522-6445': 1, '0000-0002-1325

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  625
Counter({'0000-0001-5188-7957': 141, '0000-0002-6063-7615': 82, '0000-0001-6665-6596': 79, '0000-0002-4688-3000': 66, '0000-0001-7152-765X': 51, '0000-0001-8251-4176': 28, '0000-0003-1235-5186': 26, '0000-0002-8883-7838': 25, '0000-0001-8331-3181': 20, '0000-0001-8377-5175': 15, '0000-0002-8861-0596': 14, '0000-0002-3804-2594': 14, '0000-0003-3815-0891': 14, '0000-0002-4497-4961': 10, '0000-0002-9801-9580': 9, '0000-0003-4400-5180': 5, '0000-0002-3500-914X': 5, '0000-0002-0195-6771': 4, '0000-0001-6105-0296': 3, '0000-0002-4681-3360': 3, '0000-0003-0161-0532': 3, '0000-0002-6511-1284': 3, '0000-0002-0195-5509': 2, '0000-0003-0500-1961': 2, '0000-0002-5355-3210': 1})
['0000-0003-1235-5186', '0000-0002-8861-0596', '0000-0001-7152-765X', '0000-0001-6665-6596', '0000-0002-4497-4961', '0000-0002-6063-7615', '0000-0001-5188-7957', '0000-0002-3804-2594', '0000-0002-8883-7838', '0000-0001-8251-4176', '0000-0003-3815-0891', '0000-0001-8331-3181', '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.31      0.47        26
          1       1.00      0.86      0.92        14
          2       0.96      0.84      0.90        51
          3       0.99      0.89      0.93        79
          4       0.00      0.00      0.00        10
          5       0.88      0.73      0.80        82
          6       0.44      0.99      0.61       141
          7       0.00      0.00      0.00        14
          8       1.00      0.12      0.21        25
          9       1.00      0.18      0.30        28
         10       0.00      0.00      0.00        14
         11       1.00      0.25      0.40        20
         12       0.98      0.79      0.87        66
         13       0.00      0.00      0.00        15

avg / total       0.75      0.68      0.64       585

[  8   0   0   0   0   1  17   0   0   0   0   0   0   0   0  12   0   0
   0   0   2   0   0   0   0   0   0   0   0   0  43   0   0   0   8   0
   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  1111
Counter({'0000-0002-6929-5359': 211, '0000-0001-9498-284X': 154, '0000-0002-5878-8895': 139, '0000-0002-1864-3392': 92, '0000-0002-7045-8004': 57, '0000-0001-7896-6751': 57, '0000-0002-7991-9428': 55, '0000-0002-4010-1063': 45, '0000-0002-2186-3484': 28, '0000-0002-4899-1929': 25, '0000-0003-0487-4242': 24, '0000-0002-3642-1486': 22, '0000-0001-9965-3535': 17, '0000-0002-4168-757X': 17, '0000-0001-6525-3744': 14, '0000-0002-3897-0278': 14, '0000-0002-1181-5112': 12, '0000-0003-1447-9385': 11, '0000-0002-7305-8786': 11, '0000-0002-2655-7806': 10, '0000-0003-3466-5353': 9, '0000-0002-7359-663X': 8, '0000-0003-4600-8668': 6, '0000-0002-1382-7088': 5, '0000-0002-9505-4882': 5, '0000-0003-3667-9900': 4, '0000-0001-9714-6038': 4, '0000-0002-4760-0228': 3, '0000-0003-4188-7915': 3, '0000-0001-9454-0427': 3, '0000-0002-0333-6808': 3, '0000-0003-2134-4964': 3, '0000-0002-6658-047X': 3, '0000-0003-1273-379X': 3, '0000-0002-7047-3183': 3, '0000-0002

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       0.00      0.00      0.00        14
          2       0.39      0.84      0.53       154
          3       0.00      0.00      0.00        22
          4       0.00      0.00      0.00        17
          5       0.00      0.00      0.00        11
          6       0.00      0.00      0.00        25
          7       0.51      0.88      0.64       211
          8       0.00      0.00      0.00        57
          9       0.00      0.00      0.00        28
         10       0.53      0.88      0.66       139
         11       1.00      0.16      0.27        45
         12       0.00      0.00      0.00        24
         13       0.00      0.00      0.00        57
         14       0.00      0.00      0.00        14
         15       0.76      0.45      0.57        55
         16       0.81      0.32      0.45        92
         17       1.00      0.12      0.21   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Counter({'0000-0002-6694-4130': 41, '0000-0002-3050-7262': 16, '0000-0001-9570-135X': 7, '0000-0001-5680-2641': 2})
['0000-0002-6694-4130', '0000-0002-3050-7262']
Total sample size after apply threshold:  57
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(57, 173)
(0, 0)
(0, 0)
1
57
             precision    recall  f1-score   support

          0       1.00      0.98      0.99        41
          1       0.94      1.00      0.97        16

avg / total       0.98      0.98      0.98        57

[40  1  0 16]
MNB Accuracy:  0.9824561403508771
MNB F1:  0.978675645342312
             precision    rec

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


199
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.00      0.00      0.00        12
          2       0.76      1.00      0.87       151
          3       1.00      0.04      0.08        24

avg / total       0.70      0.76      0.67       199

[  0   0  12   0   0   0  12   0   0   0 151   0   0   0  23   1]
MNB Accuracy:  0.7638190954773869
MNB F1:  0.2363323782234957
             precision    recall  f1-score   support

          0       1.00      0.17      0.29        12
          1       1.00      0.33      0.50        12
          2       0.83      1.00      0.91       151
          3       1.00      0.46      0.63        24

avg / total       0.87      0.84      0.81       199

[  2   0  10   0   0   4   8   0   0   0 151   0   0   0  13  11]
svc Accuracy:  0.8442211055276382
svc F1:  0.5802981552981553
             precision    recall  f1-score   support

          0       0.00      0.00      0.00  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



Total sample size after apply threshold:  25
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(25, 42)
(0, 0)
(0, 0)
1
25
             precision    recall  f1-score   support

          0       0.83      0.50      0.62        10
          1       0.74      0.93      0.82        15

avg / total       0.78      0.76      0.74        25

[ 5  5  1 14]
MNB Accuracy:  0.76
MNB F1:  0.7242647058823529
             precision    recall  f1-score   support

          0       0.91      1.00      0.95        10
          1       1.00      0.93      0.97        15

avg / total       0.96      0.96      0.96  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.93      1.00      0.96       123
          1       1.00      0.76      0.86        37

avg / total       0.95      0.94      0.94       160

[123   0   9  28]
svc Accuracy:  0.94375
svc F1:  0.9131221719457014
             precision    recall  f1-score   support

          0       0.84      1.00      0.91       123
          1       1.00      0.38      0.55        37

avg / total       0.88      0.86      0.83       160

[123   0  23  14]
LR Accuracy:  0.85625
LR F1:  0.7317588745535388
For name:  k_xu
total sample size before apply threshold:  37
Counter({'0000-0002-2788-194X': 19, '0000-0003-2036-3469': 14, '0000-0002-3985-739X': 3, '0000-0001-7851-2629': 1})
['0000-0003-2036-3469', '0000-0002-2788-194X']
Total sample size after apply threshold:  33
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.86      0.92        14
          1       0.90      1.00      0.95        19

avg / total       0.95      0.94      0.94        33

[12  2  0 19]
LR Accuracy:  0.9393939393939394
LR F1:  0.9365384615384615
For name:  s_antunes
total sample size before apply threshold:  54
Counter({'0000-0002-6686-9919': 35, '0000-0002-5512-9093': 12, '0000-0003-3218-3924': 4, '0000-0002-2264-3774': 3})
['0000-0002-5512-9093', '0000-0002-6686-9919']
Total sample size after apply threshold:  47
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabular

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



(0, 0)
(0, 0)
1
42
             precision    recall  f1-score   support

          0       1.00      0.06      0.12        16
          1       0.52      1.00      0.68        16
          2       1.00      1.00      1.00        10

avg / total       0.82      0.64      0.54        42

[ 1 15  0  0 16  0  0  0 10]
MNB Accuracy:  0.6428571428571429
MNB F1:  0.5994993742177722
             precision    recall  f1-score   support

          0       0.83      0.94      0.88        16
          1       1.00      0.81      0.90        16
          2       0.91      1.00      0.95        10

avg / total       0.91      0.90      0.90        42

[15  0  1  3 13  0  0  0 10]
svc Accuracy:  0.9047619047619048
svc F1:  0.9104285392317847
             precision    recall  f1-score   support

          0       0.79      0.94      0.86        16
          1       0.93      0.81      0.87        16
          2       1.00      0.90      0.95        10

avg / total       0.89      0.88      0.88      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.74      0.85        27
          1       1.00      0.72      0.84        25
          2       0.95      1.00      0.97        19
          3       0.00      0.00      0.00        10
          4       1.00      0.09      0.17        11
          5       1.00      0.60      0.75        20
          6       1.00      0.78      0.88        18
          7       0.00      0.00      0.00        13
          8       0.48      1.00      0.65        61
          9       1.00      0.78      0.88        32

avg / total       0.77      0.72      0.69       236

[20  0  1  0  0  0  0  0  6  0  0 18  0  0  0  0  0  0  7  0  0  0 19  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0 10  0  0  0  0  0  1  0  0  0
 10  0  0  0  0  0  0 12  0  0  8  0  0  0  0  0  0  0 14  0  4  0  0  0
  0  0  0  0  0  0 13  0  0  0  0  0  0  0  0  0 61  0  0  0  0  0  0  0
  0  0  7 25]
MNB Accuracy:  0.7203389830508474
MNB F1:  0.5983898172

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.69      1.00      0.82        40
          1       1.00      0.62      0.76        13
          2       1.00      0.36      0.53        14
          3       1.00      0.87      0.93        30

avg / total       0.87      0.81      0.80        97

[40  0  0  0  5  8  0  0  9  0  5  0  4  0  0 26]
LR Accuracy:  0.8144329896907216
LR F1:  0.7582796276405299
For name:  g_guidi
total sample size before apply threshold:  37
Counter({'0000-0002-3061-9870': 15, '0000-0003-3199-6624': 11, '0000-0001-9535-9152': 5, '0000-0002-1393-326X': 4, '0000-0002-8857-0096': 2})
['0000-0003-3199-6624', '0000-0002-3061-9870']
Total sample size after apply threshold:  26
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smoot

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(134, 375)
(0, 0)
(0, 0)
1
134
             precision    recall  f1-score   support

          0       1.00      0.10      0.18        10
          1       1.00      0.81      0.89        26
          2       0.88      1.00      0.94        29
          3       0.97      1.00      0.99        33
          4       0.80      1.00      0.89        36

avg / total       0.91      0.90      0.87       134

[ 1  0  1  1  7  0 21  3  0  2  0  0 29  0  0  0  0  0 33  0  0  0  0  0
 36]
MNB Accuracy:  0.8955223880597015
MNB F1:  0.776976517963416
            

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(180, 623)
(0, 0)
(0, 0)
1
180
             precision    recall  f1-score   support

          0       1.00      0.53      0.69        19
          1       1.00      0.08      0.15        12
          2       0.73      0.94      0.82        96
          3       0.00      0.00      0.00        13
          4       1.00      0.82      0.90        17
          5       0.74      1.00      0.85        23

avg / total       0.75      0.77      0.71       180

[10  0  9  0  0  0  0  1 11  0  0  0  0  0 90  0  0  6  0  0 11  0  0  2
  0  0  3  0 14  0  0  0 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.74      0.85        19
          1       1.00      0.42      0.59        12
          2       0.81      0.94      0.87        96
          3       0.83      0.38      0.53        13
          4       1.00      0.88      0.94        17
          5       0.79      1.00      0.88        23

avg / total       0.86      0.84      0.83       180

[14  0  5  0  0  0  0  5  7  0  0  0  0  0 90  1  0  5  0  0  7  5  0  1
  0  0  2  0 15  0  0  0  0  0  0 23]
svc Accuracy:  0.8444444444444444
svc F1:  0.7757860890138114
             precision    recall  f1-score   support

          0       1.00      0.42      0.59        19
          1       0.00      0.00      0.00        12
          2       0.71      0.96      0.81        96
          3       1.00      0.08      0.14        13
          4       1.00      0.88      0.94        17
          5       0.81      0.91      0.86        23

avg / total       0.75     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      1.00      0.97        17
          1       1.00      0.92      0.96        12
          2       1.00      1.00      1.00        19

avg / total       0.98      0.98      0.98        48

[17  0  0  1 11  0  0  0 19]
LR Accuracy:  0.9791666666666666
LR F1:  0.975983436853002
For name:  x_fu
total sample size before apply threshold:  16
Counter({'0000-0001-6928-4396': 8, '0000-0001-9295-6314': 6, '0000-0002-8012-4753': 1, '0000-0002-4305-6624': 1})
[]
Total sample size after apply threshold:  0
For name:  f_ortega
total sample size before apply threshold:  368
Counter({'0000-0003-2001-1121': 205, '0000-0003-2111-769X': 86, '0000-0002-4730-9270': 38, '0000-0002-3172-2095': 22, '0000-0002-7431-354X': 9, '0000-0001-7850-2105': 7, '0000-0003-0231-2051': 1})
['0000-0002-3172-2095', '0000-0003-2001-1121', '0000-0002-4730-9270', '0000-0003-2111-769X']
Total sample size after apply threshold:  351
TfidfVectorizer(an

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.77      0.87        44
          1       1.00      0.86      0.92        21
          2       0.94      0.88      0.91        73
          3       1.00      0.80      0.89        10
          4       0.82      0.95      0.88       107
          5       1.00      0.70      0.82        20
          6       1.00      0.76      0.87        51
          7       0.66      0.92      0.77        59
          8       1.00      0.79      0.88        14

avg / total       0.89      0.86      0.87       399

[ 34   0   0   0   5   0   0   5   0   0  18   0   0   1   0   0   2   0
   0   0  64   0   3   0   0   6   0   0   0   0   8   2   0   0   0   0
   0   0   2   0 102   0   0   3   0   0   0   1   0   4  14   0   1   0
   0   0   0   0   3   0  39   9   0   0   0   1   0   4   0   0  54   0
   0   0   0   0   1   0   0   2  11]
svc Accuracy:  0.8621553884711779
svc F1:  0.8674473302519855
             precision

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      1.00      0.94       101
          1       1.00      0.43      0.60        21

avg / total       0.91      0.90      0.88       122

[101   0  12   9]
MNB Accuracy:  0.9016393442622951
MNB F1:  0.7719626168224298
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       101
          1       0.95      0.86      0.90        21

avg / total       0.97      0.97      0.97       122

[100   1   3  18]
svc Accuracy:  0.9672131147540983
svc F1:  0.9401960784313725
             precision    recall  f1-score   support

          0       0.83      1.00      0.91       101
          1       1.00      0.05      0.09        21

avg / total       0.86      0.84      0.77       122

[101   0  20   1]
LR Accuracy:  0.8360655737704918
LR F1:  0.5004095004095004
For name:  h_song
total sample size before apply threshold:  210
Counter({'0000-0001-5684-4059': 88, '0000-0001-5553-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(181, 1094)
(0, 0)
(0, 0)
1
181
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        29
          1       0.00      0.00      0.00        14
          2       0.69      1.00      0.82        88
          3       0.85      0.57      0.68        30
          4       1.00      0.40      0.57        20

avg / total       0.75      0.77      0.73       181

[26  0  1  2  0  0  0 14  0  0  0  0 88  0  0  0  0 13 17  0  0  0 11  1
  8]
MNB Accuracy:  0.7679558011049724
MNB F1:  0.6030975536091816
             precision    recall  f1-score   support

          0       1.00      0.97      0.98        29
          1       1.00      0.36      0.53        14
          2       0.88      1.00      0.94        88
          3       0.87      0.90      0.89        30
          4       0.94      0.80      0.86        20

avg / total       0.91      0.91      0.90       181

[28  0  0  1  0  0  5  9  0  0  0  0 88  0  0  0  0  2 27  1  0  0  1  3
 16]
sv

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.64      0.78        14
          1       1.00      0.25      0.40        12
          2       0.67      1.00      0.80        24
          3       0.92      1.00      0.96        23

avg / total       0.87      0.81      0.78        73

[ 9  0  4  1  0  3  8  1  0  0 24  0  0  0  0 23]
MNB Accuracy:  0.8082191780821918
MNB F1:  0.7352355072463769
             precision    recall  f1-score   support

          0       0.92      0.79      0.85        14
          1       0.91      0.83      0.87        12
          2       0.82      0.96      0.88        24
          3       1.00      0.96      0.98        23

avg / total       0.91      0.90      0.90        73

[11  1  2  0  0 10  2  0  1  0 23  0  0  0  1 22]
svc Accuracy:  0.9041095890410958
svc F1:  0.8945280564845782
             precision    recall  f1-score   support

          0       1.00      0.64      0.78        14
          1       1.00     

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.75      0.30      0.43        10
          1       0.95      0.99      0.97       128

avg / total       0.93      0.94      0.93       138

[  3   7   1 127]
svc Accuracy:  0.9420289855072463
svc F1:  0.6990185387131951
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.93      1.00      0.96       128

avg / total       0.86      0.93      0.89       138

[  0  10   0 128]
LR Accuracy:  0.927536231884058
LR F1:  0.48120300751879697
For name:  a_simon
total sample size before apply threshold:  117
Counter({'0000-0002-6141-7921': 60, '0000-0002-0151-0120': 19, '0000-0002-6509-4541': 14, '0000-0001-6023-6427': 14, '0000-0002-1879-5628': 5, '0000-0002-3286-5776': 4, '0000-0003-4641-6186': 1})
['0000-0002-6141-7921', '0000-0002-6509-4541', '0000-0002-0151-0120', '0000-0001-6023-6427']
Total sample size after apply threshold:  107
TfidfVector

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.77      0.97      0.86       149
          1       0.88      0.84      0.86        98
          2       0.00      0.00      0.00        36

avg / total       0.71      0.80      0.75       283

[145   4   0  15  82   1  29   7   0]
MNB Accuracy:  0.8021201413427562
MNB F1:  0.5722089697119902
             precision    recall  f1-score   support

          0       0.85      0.93      0.89       149
          1       0.86      0.93      0.89        98
          2       0.79      0.31      0.44        36

avg / total       0.85      0.85      0.83       283

[139   8   2   6  91   1  18   7  11]
svc Accuracy:  0.8515901060070671
svc F1:  0.7410608345902464
             precision    recall  f1-score   support

          0       0.78      0.95      0.86       149
          1       0.89      0.86      0.88        98
          2       0.83      0.14      0.24        36

avg / total       0.82      0.82      0.78       2

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



(403, 752)
(0, 0)
(0, 0)
1
403
             precision    recall  f1-score   support

          0       1.00      0.43      0.60        21
          1       0.44      1.00      0.61        68
          2       0.50      1.00      0.66        69
          3       0.00      0.00      0.00        17
          4       0.00      0.00      0.00        14
          5       1.00      0.68      0.81        25
          6       0.00      0.00      0.00        10
          7       1.00      0.38      0.55        32
          8       0.00      0.00      0.00        12
          9       0.00      0.00      0.00        15
         10       1.00      0.23      0.37        22
         11       1.00      0.22      0.36        18
         12       0.00      0.00      0.00        12
         13       0.92      0.85      0.89        68

avg / total       0.61      0.60      0.53       403

[ 9 10  2  0  0  0  0  0  0  0  0  0  0  0  0 68  0  0  0  0  0  0  0  0
  0  0  0  0  0  0 69  0  0  0  0  0  0  0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.96      0.98      0.97        66
          1       1.00      0.57      0.72        30
          2       0.87      1.00      0.93        72

avg / total       0.93      0.92      0.91       168

[65  0  1  3 17 10  0  0 72]
MNB Accuracy:  0.9166666666666666
MNB F1:  0.8741952557050027
             precision    recall  f1-score   support

          0       0.98      0.98      0.98        66
          1       1.00      0.70      0.82        30
          2       0.89      1.00      0.94        72

avg / total       0.95      0.94      0.94       168

[65  0  1  1 21  8  0  0 72]
svc Accuracy:  0.9404761904761905
svc F1:  0.9165181224004754
             precision    recall  f1-score   support

          0       1.00      0.85      0.92        66
          1       1.00      0.43      0.60        30
          2       0.73      1.00      0.84        72

avg / total       0.88      0.84      0.83       168

[56  0 10  0 1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.62      0.92      0.74        85
          1       1.00      0.73      0.84        22
          2       0.00      0.00      0.00        11
          3       0.00      0.00      0.00        12
          4       1.00      0.44      0.62        18
          5       0.00      0.00      0.00        12
          6       1.00      0.84      0.91        31
          7       1.00      0.25      0.40        16
          8       0.59      0.98      0.74        54
          9       1.00      0.40      0.57        15

avg / total       0.68      0.69      0.64       276

[78  0  0  0  0  0  0  0  7  0  6 16  0  0  0  0  0  0  0  0 10  0  0  0
  0  0  0  0  1  0 12  0  0  0  0  0  0  0  0  0  3  0  0  0  8  0  0  0
  7  0  2  0  0  0  0  0  0  0 10  0  3  0  0  0  0  0 26  0  2  0  2  0
  0  0  0  0  0  4 10  0  1  0  0  0  0  0  0  0 53  0  9  0  0  0  0  0
  0  0  0  6]
MNB Accuracy:  0.6920289855072463
MNB F1:  0.4816646755

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.88      1.00      0.94        15
          1       1.00      0.90      0.95        10
          2       1.00      0.92      0.96        12

avg / total       0.95      0.95      0.95        37

[15  0  0  1  9  0  1  0 11]
svc Accuracy:  0.9459459459459459
svc F1:  0.9471300533943555
             precision    recall  f1-score   support

          0       0.79      1.00      0.88        15
          1       1.00      0.70      0.82        10
          2       1.00      0.92      0.96        12

avg / total       0.91      0.89      0.89        37

[15  0  0  3  7  0  1  0 11]
LR Accuracy:  0.8918918918918919
LR F1:  0.887468030690537
For name:  j_matthews
total sample size before apply threshold:  65
Counter({'0000-0002-9815-8636': 46, '0000-0001-6184-1813': 7, '0000-0002-5993-7610': 5, '0000-0002-1832-4420': 4, '0000-0002-7282-8929': 1, '0000-0002-6888-9438': 1, '0000-0002-3968-8282': 1})
['0000-0002-9815-8636']


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.95      1.00      0.98       146
          1       1.00      0.30      0.46        10

avg / total       0.96      0.96      0.94       156

[146   0   7   3]
svc Accuracy:  0.9551282051282052
svc F1:  0.7190635451505016
             precision    recall  f1-score   support

          0       0.94      1.00      0.97       146
          1       0.00      0.00      0.00        10

avg / total       0.88      0.94      0.90       156

[146   0  10   0]
LR Accuracy:  0.9358974358974359
LR F1:  0.48344370860927155
For name:  r_harris
total sample size before apply threshold:  50
Counter({'0000-0002-4377-5063': 26, '0000-0002-7943-5650': 8, '0000-0002-2636-1520': 6, '0000-0003-1787-7784': 3, '0000-0002-9247-0768': 3, '0000-0003-0954-1981': 2, '0000-0003-3322-1371': 2})
['0000-0002-4377-5063']
Total sample size after apply threshold:  26
For name:  c_vaughan
total sample size before apply threshold:  83
Counter({'0000-0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(34, 160)
(0, 0)
(0, 0)
1
34
             precision    recall  f1-score   support

          0       0.74      1.00      0.85        20
          1       1.00      0.50      0.67        14

avg / total       0.85      0.79      0.78        34

[20  0  7  7]
MNB Accuracy:  0.7941176470588235
MNB F1:  0.7588652482269503
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        20
          1       0.88      1.00      0.93        14

avg / total       0.95      0.94      0.94        34

[18  2  0 14]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        17
          1       0.96      0.88      0.91        49
          2       0.90      0.96      0.93        57

avg / total       0.94      0.93      0.93       123

[17  0  0  0 43  6  0  2 55]
svc Accuracy:  0.9349593495934959
svc F1:  0.9490323356172617
             precision    recall  f1-score   support

          0       1.00      0.24      0.38        17
          1       0.95      0.84      0.89        49
          2       0.72      0.96      0.83        57

avg / total       0.85      0.81      0.79       123

[ 4  0 13  0 41  8  0  2 55]
LR Accuracy:  0.8130081300813008
LR F1:  0.6997747993171334
For name:  p_teixeira
total sample size before apply threshold:  213
Counter({'0000-0002-7258-7977': 60, '0000-0002-6296-5137': 55, '0000-0001-7202-0527': 48, '0000-0003-2315-2261': 26, '0000-0003-2735-6608': 22, '0000-0002-7596-9735': 1, '0000-0002-1593-8064': 1})
['0000-0001-7202-0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.83      1.00      0.91        48
          1       1.00      0.98      0.99        60
          2       0.95      0.73      0.83        26
          3       1.00      0.86      0.93        22
          4       1.00      1.00      1.00        55

avg / total       0.95      0.95      0.95       211

[48  0  0  0  0  1 59  0  0  0  7  0 19  0  0  2  0  1 19  0  0  0  0  0
 55]
svc Accuracy:  0.9478672985781991
svc F1:  0.9300346481656749
             precision    recall  f1-score   support

          0       1.00      0.83      0.91        48
          1       1.00      1.00      1.00        60
          2       1.00      0.62      0.76        26
          3       1.00      0.91      0.95        22
          4       0.73      1.00      0.85        55

avg / total       0.93      0.91      0.90       211

[40  0  0  0  8  0 60  0  0  0  0  0 16  0 10  0  0  0 20  2  0  0  0  0
 55]
LR Accuracy:  0.9052132701421801
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



['0000-0002-8214-1696', '0000-0002-7231-0185', '0000-0003-3399-055X']
Total sample size after apply threshold:  201
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(201, 228)
(0, 0)
(0, 0)
1
201
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       1.00      0.62      0.76        65
          2       0.77      1.00      0.87       124

avg / total       0.80      0.82      0.78       201

[  0   0  12   0  40  25   0   0 124]
MNB Accuracy:  0.8159203980099502
MNB F1:  0.5440267335004177
             precision    recall  f1-sco

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(146, 1028)
(0, 0)
(0, 0)
1
146
             precision    recall  f1-score   support

          0       0.83      0.99      0.90        84
          1       1.00      0.79      0.88        33
          2       0.95      0.66      0.78        29

avg / total       0.89      0.88      0.87       146

[83  0  1  7 26  0 10  0 19]
MNB Accuracy:  0.8767123287671232
MNB F1:  0.8530133497761669


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      1.00      0.94        84
          1       1.00      0.85      0.92        33
          2       1.00      0.83      0.91        29

avg / total       0.94      0.93      0.93       146

[84  0  0  5 28  0  5  0 24]
svc Accuracy:  0.9315068493150684
svc F1:  0.9225044629876124
             precision    recall  f1-score   support

          0       0.78      1.00      0.88        84
          1       1.00      0.70      0.82        33
          2       1.00      0.52      0.68        29

avg / total       0.87      0.84      0.82       146

[84  0  0 10 23  0 14  0 15]
LR Accuracy:  0.8356164383561644
LR F1:  0.7927489177489179
For name:  d_parsons
total sample size before apply threshold:  30
Counter({'0000-0002-3956-6031': 26, '0000-0002-1393-8431': 2, '0000-0002-9121-7859': 1, '0000-0002-5142-4466': 1})
['0000-0002-3956-6031']
Total sample size after apply threshold:  26
For name:  a_choudhury
total samp

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



[18  6  0 28]
LR Accuracy:  0.8846153846153846
LR F1:  0.880184331797235
For name:  c_richter
total sample size before apply threshold:  11
Counter({'0000-0002-5658-6173': 4, '0000-0002-6591-1118': 4, '0000-0001-6017-1520': 2, '0000-0002-6839-7994': 1})
[]
Total sample size after apply threshold:  0
For name:  m_hossain
total sample size before apply threshold:  102
Counter({'0000-0003-1408-2273': 26, '0000-0002-1878-8145': 17, '0000-0003-3967-2544': 10, '0000-0003-3399-581X': 9, '0000-0003-3303-5755': 7, '0000-0003-1271-1515': 7, '0000-0003-4733-0018': 6, '0000-0002-9953-586X': 5, '0000-0001-8019-843X': 4, '0000-0001-7996-9233': 3, '0000-0002-1917-8701': 1, '0000-0002-0984-984X': 1, '0000-0002-7673-8410': 1, '0000-0002-0977-4593': 1, '0000-0003-2970-2324': 1, '0000-0001-6753-4216': 1, '0000-0002-3929-6211': 1, '0000-0002-6621-8737': 1})
['0000-0003-3967-2544', '0000-0003-1408-2273', '0000-0002-1878-8145']
Total sample size after apply threshold:  53
TfidfVectorizer(analyzer='word', b

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.80      0.89        45
          1       0.93      1.00      0.96       122

avg / total       0.95      0.95      0.94       167

[ 36   9   0 122]
svc Accuracy:  0.9461077844311377
svc F1:  0.9266578831796224
             precision    recall  f1-score   support

          0       1.00      0.27      0.42        45
          1       0.79      1.00      0.88       122

avg / total       0.84      0.80      0.76       167

[ 12  33   0 122]
LR Accuracy:  0.8023952095808383
LR F1:  0.6509595287858636
For name:  m_soares
total sample size before apply threshold:  247
Counter({'0000-0001-9701-836X': 75, '0000-0002-9314-4833': 68, '0000-0001-6071-0272': 44, '0000-0003-1579-8513': 32, '0000-0002-5213-2377': 10, '0000-0001-8860-0470': 7, '0000-0003-4227-4141': 4, '0000-0002-7181-1906': 3, '0000-0002-4614-8209': 2, '0000-0002-8059-7067': 1, '0000-0002-9013-2570': 1})
['0000-0002-5213-2377', '0000-0001-6071-0272

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.70      0.82        10
          1       1.00      0.91      0.95        44
          2       1.00      0.75      0.86        32
          3       0.89      0.93      0.91        68
          4       0.83      0.96      0.89        75

avg / total       0.91      0.90      0.90       229

[ 7  0  0  0  3  0 40  0  2  2  0  0 24  3  5  0  0  0 63  5  0  0  0  3
 72]
svc Accuracy:  0.8995633187772926
svc F1:  0.8856833860642579
             precision    recall  f1-score   support

          0       1.00      0.40      0.57        10
          1       1.00      0.86      0.93        44
          2       1.00      0.62      0.77        32
          3       0.95      0.91      0.93        68
          4       0.73      0.99      0.84        75

avg / total       0.90      0.86      0.86       229

[ 4  0  0  0  6  0 38  0  1  5  0  0 20  1 11  0  0  0 62  6  0  0  0  1
 74]
LR Accuracy:  0.8646288209606987
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.83      0.91        29
          1       1.00      0.92      0.96        61
          2       0.81      1.00      0.89        42

avg / total       0.94      0.92      0.93       132

[24  0  5  0 56  5  0  0 42]
svc Accuracy:  0.9242424242424242
svc F1:  0.918847451966681
             precision    recall  f1-score   support

          0       1.00      0.86      0.93        29
          1       0.75      0.95      0.84        61
          2       0.87      0.62      0.72        42

avg / total       0.84      0.83      0.82       132

[25  3  1  0 58  3  0 16 26]
LR Accuracy:  0.8257575757575758
LR F1:  0.8295759527643586
For name:  a_rao
total sample size before apply threshold:  93
Counter({'0000-0002-2676-2762': 36, '0000-0003-0320-2962': 20, '0000-0002-2550-6097': 11, '0000-0001-6440-1274': 8, '0000-0003-2319-6539': 5, '0000-0002-2474-5010': 5, '0000-0003-4480-3190': 3, '0000-0003-4879-1123': 2, '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


 46
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(46, 154)
(0, 0)
(0, 0)
1
46
             precision    recall  f1-score   support

          0       0.91      1.00      0.95        29
          1       1.00      0.82      0.90        17

avg / total       0.94      0.93      0.93        46

[29  0  3 14]
MNB Accuracy:  0.9347826086956522
MNB F1:  0.9270227392913801
             precision    recall  f1-score   support

          0       0.91      1.00      0.95        29
          1       1.00      0.82      0.90        17

avg / total       0.94      0.93      0.93        46

[29  0  3 14]
svc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(37, 170)
(0, 0)
(0, 0)
1
37
             precision    recall  f1-score   support

          0       0.56      1.00      0.71        15
          1       1.00      0.45      0.62        11
          2       1.00      0.45      0.62        11

avg / total       0.82      0.68      0.66        37

[15  0  0  6  5  0  6  0  5]
MNB Accuracy:  0.6756756756756757
MNB F1:  0.6547619047619048
             precision    recall  f1-score   support

          0       0.60      0.80      0.69        15
          1       0.88      0.64      0.74        11
       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(21, 55)
(0, 0)
(0, 0)
1
21
             precision    recall  f1-score   support

          0       0.92      1.00      0.96        11
          1       1.00      0.90      0.95        10

avg / total       0.96      0.95      0.95        21

[11  0  1  9]
MNB Accuracy:  0.9523809523809523
MNB F1:  0.9519450800915332
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       0.91      1.00      0.95        10

avg / total       0.96      0.95      0.95        21

[10  1  0 10]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(104, 161)
(0, 0)
(0, 0)
1
104
             precision    recall  f1-score   support

          0       0.86      0.75      0.80        16
          1       0.94      0.88      0.91        17
          2       0.96      0.92      0.94        25
          3       0.77      1.00      0.87        20
          4       0.96      0.88      0.92        26

avg / total       0.90      0.89      0.89       104

[12  1  0  3  0  0 15  0  2  0  1  0 23  0  1  0  0  0 20  0  1  0  1  1
 23]
MNB Accuracy:  0.8942307692307693
MNB F1:  0.887486327337259
            

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(183, 329)
(0, 0)
(0, 0)
1
183
             precision    recall  f1-score   support

          0       0.88      1.00      0.93        63
          1       0.92      0.95      0.94        38
          2       0.92      0.85      0.88        27
          3       0.97      0.97      0.97        33
          4       1.00      0.64      0.78        22

avg / total       0.92      0.92      0.91       183

[63  0  0  0  0  1 36  1  0  0  1  2 23  1  0  0  1  0 32  0  7  0  1  0
 14]
MNB Accuracy:  0.9180327868852459
MNB F1:  0.9000976800976801


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.82      1.00      0.90        63
          1       0.94      0.89      0.92        38
          2       0.95      0.78      0.86        27
          3       1.00      0.97      0.98        33
          4       1.00      0.73      0.84        22

avg / total       0.92      0.91      0.91       183

[63  0  0  0  0  3 34  1  0  0  4  2 21  0  0  1  0  0 32  0  6  0  0  0
 16]
svc Accuracy:  0.907103825136612
svc F1:  0.9005564847670111
             precision    recall  f1-score   support

          0       0.75      1.00      0.86        63
          1       0.94      0.87      0.90        38
          2       1.00      0.74      0.85        27
          3       1.00      0.97      0.98        33
          4       1.00      0.55      0.71        22

avg / total       0.90      0.87      0.87       183

[63  0  0  0  0  5 33  0  0  0  5  2 20  0  0  1  0  0 32  0 10  0  0  0
 12]
LR Accuracy:  0.8743169398907104
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.55      0.71        11
          1       0.75      1.00      0.86        15

avg / total       0.86      0.81      0.79        26

[ 6  5  0 15]
LR Accuracy:  0.8076923076923077
LR F1:  0.7815126050420167
For name:  d_zhang
total sample size before apply threshold:  94
Counter({'0000-0002-4175-5982': 17, '0000-0002-7665-2182': 12, '0000-0003-0779-6438': 11, '0000-0003-4280-0068': 8, '0000-0001-9295-4992': 7, '0000-0001-9508-8209': 7, '0000-0001-6930-5994': 6, '0000-0001-9478-5344': 6, '0000-0001-5809-0027': 5, '0000-0002-4149-4938': 4, '0000-0002-1581-2357': 4, '0000-0001-5956-4618': 2, '0000-0001-7063-7742': 2, '0000-0002-2541-837X': 1, '0000-0001-6259-7082': 1, '0000-0002-4515-2070': 1})
['0000-0002-4175-5982', '0000-0002-7665-2182', '0000-0003-0779-6438']
Total sample size after apply threshold:  40
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int6

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(28, 145)
(0, 0)
(0, 0)
1
28
             precision    recall  f1-score   support

          0       0.73      1.00      0.84        16
          1       1.00      0.50      0.67        12

avg / total       0.84      0.79      0.77        28

[16  0  6  6]
MNB Accuracy:  0.7857142857142857
MNB F1:  0.7543859649122807
             precision    recall  f1-score   support

          0       1.00      0.88      0.93        16
          1       0.86      1.00      0.92        12

avg / total       0.94      0.93      0.93        28

[14  2  0 12]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(206, 440)
(0, 0)
(0, 0)
1
206
             precision    recall  f1-score   support

          0       0.58      0.99      0.73       102
          1       0.93      0.54      0.68        46
          2       0.00      0.00      0.00        14
          3       1.00      0.18      0.30        17
          4       0.00      0.00      0.00        17
          5       0.00      0.00      0.00        10

avg / total       0.58      0.63      0.54       206

[101   0   0   0   1   0  21  25   0   0   0   0  14   0   0   0   0   0
  13   1   0   3   0   0 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.79      0.97      0.87       102
          1       0.90      0.80      0.85        46
          2       0.88      0.50      0.64        14
          3       1.00      0.82      0.90        17
          4       1.00      0.35      0.52        17
          5       0.83      1.00      0.91        10

avg / total       0.86      0.84      0.83       206

[99  3  0  0  0  0  9 37  0  0  0  0  5  0  7  0  0  2  2  0  1 14  0  0
 10  1  0  0  6  0  0  0  0  0  0 10]
svc Accuracy:  0.8398058252427184
svc F1:  0.7822068151699769
             precision    recall  f1-score   support

          0       0.66      0.99      0.79       102
          1       0.94      0.70      0.80        46
          2       1.00      0.14      0.25        14
          3       1.00      0.29      0.45        17
          4       1.00      0.18      0.30        17
          5       0.89      0.80      0.84        10

avg / total       0.81     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.79      0.88        14
          1       0.91      0.83      0.87        12
          2       1.00      0.93      0.96        27
          3       1.00      0.89      0.94        18
          4       0.63      0.97      0.76        30
          5       1.00      0.36      0.53        11
          6       1.00      0.96      0.98        28

avg / total       0.91      0.87      0.87       140

[11  0  0  0  3  0  0  0 10  0  0  2  0  0  0  0 25  0  2  0  0  0  0  0
 16  2  0  0  0  1  0  0 29  0  0  0  0  0  0  7  4  0  0  0  0  0  1  0
 27]
svc Accuracy:  0.8714285714285714
svc F1:  0.847227079915194
             precision    recall  f1-score   support

          0       1.00      0.21      0.35        14
          1       1.00      0.67      0.80        12
          2       1.00      0.96      0.98        27
          3       1.00      0.89      0.94        18
          4       0.54      1.00      0.70

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.82      0.90      0.86        41
          1       0.92      0.87      0.89        53
          2       1.00      0.92      0.96        13

avg / total       0.89      0.89      0.89       107

[37  4  0  7 46  0  1  0 12]
svc Accuracy:  0.8878504672897196
svc F1:  0.9045563332580718
             precision    recall  f1-score   support

          0       0.91      0.76      0.83        41
          1       0.82      0.94      0.88        53
          2       1.00      0.92      0.96        13

avg / total       0.88      0.87      0.87       107

[31 10  0  3 50  0  0  1 12]
LR Accuracy:  0.8691588785046729
LR F1:  0.887953216374269
For name:  m_viana
total sample size before apply threshold:  139
Counter({'0000-0002-0464-4845': 34, '0000-0003-4356-8109': 31, '0000-0002-4073-3802': 29, '0000-0001-9665-2115': 26, '0000-0001-9288-2108': 13, '0000-0002-3074-767X': 5, '0000-0002-5657-5570': 1})
['0000-0001-9665-2115'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.76      0.96      0.85        26
          1       1.00      0.94      0.97        31
          2       1.00      0.93      0.96        29
          3       0.91      0.77      0.83        13
          4       1.00      0.97      0.99        34

avg / total       0.94      0.93      0.93       133

[25  0  0  1  0  2 29  0  0  0  2  0 27  0  0  3  0  0 10  0  1  0  0  0
 33]
svc Accuracy:  0.9323308270676691
svc F1:  0.919363593654006
             precision    recall  f1-score   support

          0       0.80      0.92      0.86        26
          1       0.91      0.94      0.92        31
          2       0.93      0.97      0.95        29
          3       0.88      0.54      0.67        13
          4       1.00      0.97      0.99        34

avg / total       0.91      0.91      0.91       133

[24  1  0  1  0  1 29  1  0  0  0  1 28  0  0  4  1  1  7  0  1  0  0  0
 33]
LR Accuracy:  0.9097744360902256
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      0.97      0.95        60
          1       0.91      0.84      0.87        25

avg / total       0.93      0.93      0.93        85

[58  2  4 21]
svc Accuracy:  0.9294117647058824
svc F1:  0.9129098360655737
             precision    recall  f1-score   support

          0       0.77      1.00      0.87        60
          1       1.00      0.28      0.44        25

avg / total       0.84      0.79      0.74        85

[60  0 18  7]
LR Accuracy:  0.788235294117647
LR F1:  0.6535326086956522
For name:  c_liao
total sample size before apply threshold:  35
Counter({'0000-0002-1324-9644': 11, '0000-0001-5168-6493': 11, '0000-0001-9777-3701': 6, '0000-0003-3459-1913': 6, '0000-0003-4156-0912': 1})
['0000-0002-1324-9644', '0000-0001-5168-6493']
Total sample size after apply threshold:  22
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(139, 378)
(0, 0)
(0, 0)
1
139
             precision    recall  f1-score   support

          0       1.00      0.93      0.96        68
          1       1.00      0.67      0.80        18
          2       0.83      1.00      0.91        53

avg / total       0.93      0.92      0.92       139

[63  0  5  0 12  6  0  0 53]
MNB Accuracy:  0.920863309352518
MNB F1:  0.8892716556838695
             precision    recall  f1-score   support

          0       0.96      1.00      0.98        68
          1       1.00      0.83      0.91        18
       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 27
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(27, 93)
(0, 0)
(0, 0)
1
27
             precision    recall  f1-score   support

          0       0.94      1.00      0.97        17
          1       1.00      0.90      0.95        10

avg / total       0.97      0.96      0.96        27

[17  0  1  9]
MNB Accuracy:  0.9629629629629629
MNB F1:  0.9593984962406015
             precision    recall  f1-score   support

          0       0.94      0.94      0.94        17
          1       0.90      0.90      0.90        10

avg / total       0.93      0.93      0.93        27

[16  1  1  9]
svc 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(100, 351)
(0, 0)
(0, 0)
1
100
             precision    recall  f1-score   support

          0       0.97      0.97      0.97        35
          1       0.98      0.98      0.98        65

avg / total       0.98      0.98      0.98       100

[34  1  1 64]
MNB Accuracy:  0.98
MNB F1:  0.9780219780219781
             precision    recall  f1-score   support

          0       1.00      0.94      0.97        35
          1       0.97      1.00      0.98        65

avg / total       0.98      0.98      0.98       100

[33  2  0 65]
svc Accuracy:  0.98

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.98      1.00      0.99       238
          1       1.00      0.81      0.89        31

avg / total       0.98      0.98      0.98       269

[238   0   6  25]
svc Accuracy:  0.9776951672862454
svc F1:  0.9402045050385299
             precision    recall  f1-score   support

          0       0.90      1.00      0.95       238
          1       1.00      0.13      0.23        31

avg / total       0.91      0.90      0.86       269

[238   0  27   4]
LR Accuracy:  0.8996282527881041
LR F1:  0.587446748082931
For name:  k_jacobsen
total sample size before apply threshold:  113
Counter({'0000-0002-4198-6246': 93, '0000-0002-1121-2979': 17, '0000-0002-3450-0850': 2, '0000-0003-0135-0988': 1})
['0000-0002-4198-6246', '0000-0002-1121-2979']
Total sample size after apply threshold:  110
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='con

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.93      1.00      0.96        93
          1       1.00      0.59      0.74        17

avg / total       0.94      0.94      0.93       110

[93  0  7 10]
svc Accuracy:  0.9363636363636364
svc F1:  0.8522356553444637
             precision    recall  f1-score   support

          0       0.86      1.00      0.93        93
          1       1.00      0.12      0.21        17

avg / total       0.88      0.86      0.81       110

[93  0 15  2]
LR Accuracy:  0.8636363636363636
LR F1:  0.5679497250589159
For name:  s_kelly
total sample size before apply threshold:  102
Counter({'0000-0003-4002-048X': 31, '0000-0001-8583-5362': 26, '0000-0002-8245-0181': 20, '0000-0003-3533-5268': 12, '0000-0002-0375-1040': 11, '0000-0002-3078-8404': 2})
['0000-0002-8245-0181', '0000-0001-8583-5362', '0000-0003-3533-5268', '0000-0002-0375-1040', '0000-0003-4002-048X']
Total sample size after apply threshold:  100
TfidfVectorizer(analy

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        20
          1       0.81      0.85      0.83        26
          2       1.00      0.75      0.86        12
          3       1.00      0.55      0.71        11
          4       0.74      0.90      0.81        31

avg / total       0.87      0.85      0.85       100

[20  0  0  0  0  0 22  0  0  4  0  1  9  0  2  0  1  0  6  4  0  3  0  0
 28]
svc Accuracy:  0.85
svc F1:  0.8409616184455734
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        20
          1       1.00      0.54      0.70        26
          2       1.00      0.75      0.86        12
          3       1.00      0.18      0.31        11
          4       0.56      1.00      0.72        31

avg / total       0.86      0.76      0.74       100

[20  0  0  0  0  0 14  0  0 12  0  0  9  0  3  0  0  0  2  9  0  0  0  0
 31]
LR Accuracy:  0.76
LR F1:  0.7171530794786609
Fo

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(65, 172)
(0, 0)
(0, 0)
1
65
             precision    recall  f1-score   support

          0       1.00      0.77      0.87        26
          1       0.87      1.00      0.93        39

avg / total       0.92      0.91      0.90        65

[20  6  0 39]
MNB Accuracy:  0.9076923076923077
MNB F1:  0.8990683229813665
             precision    recall  f1-score   support

          0       1.00      0.81      0.89        26
          1       0.89      1.00      0.94        39

avg / total       0.93      0.92      0.92        65

[21  5  0 39]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.29      0.45        17
          1       0.00      0.00      0.00        11
          2       1.00      0.62      0.76        13
          3       0.84      1.00      0.91       144

avg / total       0.81      0.85      0.80       185

[  5   0   0  12   0   0   0  11   0   0   8   5   0   0   0 144]
MNB Accuracy:  0.8486486486486486
MNB F1:  0.5319606553783769
             precision    recall  f1-score   support

          0       0.94      0.88      0.91        17
          1       0.88      0.64      0.74        11
          2       1.00      0.77      0.87        13
          3       0.94      0.99      0.96       144

avg / total       0.94      0.94      0.94       185

[ 15   0   0   2   0   7   0   4   0   0  10   3   1   1   0 142]
svc Accuracy:  0.9405405405405406
svc F1:  0.8695525240380377
             precision    recall  f1-score   support

          0       1.00      0.29      0.45      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(297, 1721)
(0, 0)
(0, 0)
1
297
             precision    recall  f1-score   support

          0       1.00      0.10      0.18        20
          1       1.00      0.05      0.10        19
          2       1.00      0.54      0.70        35
          3       1.00      0.77      0.87        69
          4       0.54      1.00      0.70       112
          5       1.00      0.20      0.33        15
          6       1.00      0.37      0.54        27

avg / total       0.82      0.67      0.63       297

[  2   0   0   0  18   0   0   0   1   0   0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.66      0.94      0.77        87
          1       0.98      0.95      0.97       117
          2       0.97      0.79      0.87        38
          3       1.00      0.40      0.57        10
          4       1.00      0.58      0.74        24
          5       0.90      0.76      0.82        70

avg / total       0.88      0.85      0.85       346

[ 82   0   1   0   0   4   5 111   0   0   0   1   8   0  30   0   0   0
   4   2   0   4   0   0   9   0   0   0  14   1  17   0   0   0   0  53]
svc Accuracy:  0.8497109826589595
svc F1:  0.7897239362340579
             precision    recall  f1-score   support

          0       0.65      0.93      0.76        87
          1       0.85      0.97      0.90       117
          2       0.96      0.71      0.82        38
          3       0.00      0.00      0.00        10
          4       1.00      0.25      0.40        24
          5       1.00      0.77      0.87   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.75      1.00      0.86        15
          1       1.00      0.62      0.76        13

avg / total       0.87      0.82      0.81        28

[15  0  5  8]
LR Accuracy:  0.8214285714285714
LR F1:  0.8095238095238095
For name:  b_kang
total sample size before apply threshold:  20
Counter({'0000-0001-5902-0549': 10, '0000-0001-6946-2279': 5, '0000-0003-2637-4695': 2, '0000-0003-0901-4903': 1, '0000-0002-4299-2170': 1, '0000-0002-1690-7753': 1})
['0000-0001-5902-0549']
Total sample size after apply threshold:  10
For name:  s_carter
total sample size before apply threshold:  205
Counter({'0000-0002-3585-9400': 124, '0000-0003-2617-8694': 44, '0000-0002-9080-519X': 15, '0000-0002-4670-0884': 12, '0000-0002-9817-0029': 5, '0000-0002-3619-8640': 2, '0000-0002-8169-4483': 2, '0000-0002-2907-9651': 1})
['0000-0002-3585-9400', '0000-0002-4670-0884', '0000-0003-2617-8694', '0000-0002-9080-519X']
Total sample size after appl

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.40      0.57        15
          1       0.74      1.00      0.85        56
          2       1.00      0.27      0.42        15

avg / total       0.83      0.77      0.73        86

[ 6  9  0  0 56  0  0 11  4]
svc Accuracy:  0.7674418604651163
svc F1:  0.6136553504974557
             precision    recall  f1-score   support

          0       1.00      0.07      0.12        15
          1       0.66      1.00      0.79        56
          2       0.00      0.00      0.00        15

avg / total       0.60      0.66      0.54        86

[ 1 14  0  0 56  0  0 15  0]
LR Accuracy:  0.6627906976744186
LR F1:  0.3064420803782506
For name:  m_gutierrez
total sample size before apply threshold:  32
Counter({'0000-0003-3199-0337': 30, '0000-0003-0964-6222': 2})
['0000-0003-3199-0337']
Total sample size after apply threshold:  30
For name:  s_moon
total sample size before apply threshold:  85
Counter({'0000-0001

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.50      0.67        14
          1       1.00      0.86      0.92        14
          2       0.96      0.86      0.91        29
          3       0.95      0.95      0.95        74
          4       0.73      0.95      0.82        39

avg / total       0.91      0.89      0.89       170

[ 7  0  0  1  6  0 12  0  1  1  0  0 25  1  3  0  0  0 70  4  0  0  1  1
 37]
svc Accuracy:  0.888235294117647
svc F1:  0.8534005334005335
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        14
          1       1.00      0.79      0.88        14
          2       0.96      0.83      0.89        29
          3       0.83      0.96      0.89        74
          4       0.76      0.79      0.77        39

avg / total       0.86      0.85      0.84       170

[ 7  0  0  3  4  0 11  0  2  1  0  0 24  3  2  0  0  0 71  3  0  0  1  7
 31]
LR Accuracy:  0.8470588235294118
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.70      0.83        27
          1       0.89      1.00      0.94        70
          2       1.00      0.94      0.97        17

avg / total       0.93      0.92      0.92       114

[19  8  0  0 70  0  0  1 16]
svc Accuracy:  0.9210526315789473
svc F1:  0.9117937472183169
             precision    recall  f1-score   support

          0       1.00      0.07      0.14        27
          1       0.72      1.00      0.84        70
          2       1.00      0.88      0.94        17

avg / total       0.83      0.76      0.69       114

[ 2 25  0  0 70  0  0  2 15]
LR Accuracy:  0.7631578947368421
LR F1:  0.6379181292587239
For name:  j_conde
total sample size before apply threshold:  84
Counter({'0000-0001-8422-6792': 35, '0000-0002-2187-479X': 29, '0000-0002-5677-3024': 19, '0000-0001-8739-6893': 1})
['0000-0001-8422-6792', '0000-0002-5677-3024', '0000-0002-2187-479X']
Total sample size after apply th

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.95      1.00      0.97        35
          1       1.00      0.89      0.94        19
          2       1.00      1.00      1.00        29

avg / total       0.98      0.98      0.98        83

[35  0  0  2 17  0  0  0 29]
svc Accuracy:  0.9759036144578314
svc F1:  0.9722222222222222
             precision    recall  f1-score   support

          0       0.95      1.00      0.97        35
          1       1.00      0.89      0.94        19
          2       1.00      1.00      1.00        29

avg / total       0.98      0.98      0.98        83

[35  0  0  2 17  0  0  0 29]
LR Accuracy:  0.9759036144578314
LR F1:  0.9722222222222222
For name:  k_wright
total sample size before apply threshold:  59
Counter({'0000-0003-0040-9247': 18, '0000-0002-9020-1572': 15, '0000-0003-3865-9743': 12, '0000-0002-0387-3048': 7, '0000-0001-6202-1737': 6, '0000-0003-0700-6010': 1})
['0000-0003-0040-9247', '0000-0002-9020-1572', '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.93      1.00      0.96       232
          1       1.00      0.55      0.71        11
          2       1.00      0.62      0.76        13
          3       1.00      0.30      0.46        10

avg / total       0.94      0.94      0.93       266

[232   0   0   0   5   6   0   0   5   0   8   0   7   0   0   3]
svc Accuracy:  0.9360902255639098
svc F1:  0.7234956352603412
             precision    recall  f1-score   support

          0       0.87      1.00      0.93       232
          1       0.00      0.00      0.00        11
          2       0.00      0.00      0.00        13
          3       0.00      0.00      0.00        10

avg / total       0.76      0.87      0.81       266

[232   0   0   0  11   0   0   0  13   0   0   0  10   0   0   0]
LR Accuracy:  0.8721804511278195
LR F1:  0.2329317269076305
For name:  h_huang
total sample size before apply threshold:  224
Counter({'0000-0002-3386-0934': 87, '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.88      0.78      0.82        18
          1       0.80      0.83      0.82        24
          2       1.00      0.79      0.88        14
          3       1.00      0.88      0.93        16
          4       0.89      0.98      0.93        87
          5       0.85      0.69      0.76        16

avg / total       0.89      0.89      0.88       175

[14  3  0  0  1  0  2 20  0  0  2  0  0  1 11  0  2  0  0  0  0 14  1  1
  0  1  0  0 85  1  0  0  0  0  5 11]
svc Accuracy:  0.8857142857142857
svc F1:  0.856795285666556
             precision    recall  f1-score   support

          0       0.92      0.67      0.77        18
          1       1.00      0.42      0.59        24
          2       1.00      0.57      0.73        14
          3       1.00      0.56      0.72        16
          4       0.67      1.00      0.81        87
          5       1.00      0.38      0.55        16

avg / total       0.83      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.10      0.18        20
          1       0.66      1.00      0.80        35

avg / total       0.78      0.67      0.57        55

[ 2 18  0 35]
LR Accuracy:  0.6727272727272727
LR F1:  0.48863636363636365
For name:  y_xu
total sample size before apply threshold:  137
Counter({'0000-0002-2195-1695': 47, '0000-0002-6689-7768': 19, '0000-0002-6406-7832': 17, '0000-0001-6643-3173': 9, '0000-0002-0763-9953': 8, '0000-0002-4479-6157': 8, '0000-0001-7429-4724': 5, '0000-0002-5578-4960': 4, '0000-0002-1887-0632': 4, '0000-0002-9834-3006': 3, '0000-0002-9945-3514': 3, '0000-0001-8488-0399': 2, '0000-0001-9106-0049': 1, '0000-0003-4549-6110': 1, '0000-0002-2341-7971': 1, '0000-0003-4420-6353': 1, '0000-0002-7963-6890': 1, '0000-0002-7962-6668': 1, '0000-0003-1355-0055': 1, '0000-0002-1563-8811': 1})
['0000-0002-6406-7832', '0000-0002-2195-1695', '0000-0002-6689-7768']
Total sample size after apply threshold:  83

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.95      0.98        22
          1       0.97      1.00      0.99       216
          2       1.00      0.55      0.71        11

avg / total       0.98      0.98      0.97       249

[ 21   1   0   0 216   0   0   5   6]
MNB Accuracy:  0.9759036144578314
MNB F1:  0.8896426362835673
             precision    recall  f1-score   support

          0       1.00      0.86      0.93        22
          1       0.99      1.00      0.99       216
          2       1.00      1.00      1.00        11

avg / total       0.99      0.99      0.99       249

[ 19   3   0   0 216   0   0   0  11]
svc Accuracy:  0.9879518072289156
svc F1:  0.9733109055228484
             precision    recall  f1-score   support

          0       1.00      0.32      0.48        22
          1       0.90      1.00      0.95       216
          2       1.00      0.27      0.43        11

avg / total       0.92      0.91      0.89       2

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.92      1.00      0.96       110
          1       1.00      0.89      0.94        57
          2       1.00      0.60      0.75        10

avg / total       0.95      0.94      0.94       177

[110   0   0   6  51   0   4   0   6]
svc Accuracy:  0.943502824858757
svc F1:  0.8836553945249598
             precision    recall  f1-score   support

          0       0.87      1.00      0.93       110
          1       1.00      0.82      0.90        57
          2       1.00      0.30      0.46        10

avg / total       0.92      0.90      0.89       177

[110   0   0  10  47   0   7   0   3]
LR Accuracy:  0.903954802259887
LR F1:  0.7645515525262362
For name:  p_robinson
total sample size before apply threshold:  275
Counter({'0000-0002-7878-0313': 133, '0000-0002-0736-9199': 119, '0000-0002-3156-3418': 19, '0000-0002-0577-3147': 4})
['0000-0002-0736-9199', '0000-0002-7878-0313', '0000-0002-3156-3418']
Total samp

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.95      0.97       119
          1       0.90      1.00      0.95       133
          2       1.00      0.53      0.69        19

avg / total       0.95      0.94      0.94       271

[113   6   0   0 133   0   0   9  10]
svc Accuracy:  0.9446494464944649
svc F1:  0.8701374401767089
             precision    recall  f1-score   support

          0       1.00      0.91      0.95       119
          1       0.83      1.00      0.91       133
          2       1.00      0.16      0.27        19

avg / total       0.92      0.90      0.88       271

[108  11   0   0 133   0   0  16   3]
LR Accuracy:  0.9003690036900369
LR F1:  0.7107063174330243
For name:  c_zou
total sample size before apply threshold:  32
Counter({'0000-0003-2484-7292': 22, '0000-0001-8569-3747': 8, '0000-0003-4305-5055': 1, '0000-0002-9712-4282': 1})
['0000-0003-2484-7292']
Total sample size after apply threshold:  22
For name:  s_rana
t

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.45      0.62        11
          1       0.75      1.00      0.86        18

avg / total       0.84      0.79      0.77        29

[ 5  6  0 18]
LR Accuracy:  0.7931034482758621
LR F1:  0.7410714285714286
For name:  s_jeong
total sample size before apply threshold:  93
Counter({'0000-0001-6178-8338': 33, '0000-0002-1958-8436': 21, '0000-0002-6376-7001': 13, '0000-0002-6480-7685': 7, '0000-0002-9084-5183': 6, '0000-0001-8995-3497': 5, '0000-0002-8370-3566': 1, '0000-0002-4004-3510': 1, '0000-0001-9175-9642': 1, '0000-0001-9197-1184': 1, '0000-0002-9868-621X': 1, '0000-0002-3309-0693': 1, '0000-0001-9575-0354': 1, '0000-0001-9588-1928': 1})
['0000-0002-6376-7001', '0000-0002-1958-8436', '0000-0001-6178-8338']
Total sample size after apply threshold:  67
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowe

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.74      0.85        35
          1       0.95      1.00      0.97       167

avg / total       0.96      0.96      0.95       202

[ 26   9   0 167]
svc Accuracy:  0.9554455445544554
svc F1:  0.9131099746690245
             precision    recall  f1-score   support

          0       1.00      0.11      0.21        35
          1       0.84      1.00      0.92       167

avg / total       0.87      0.85      0.79       202

[  4  31   0 167]
LR Accuracy:  0.8465346534653465
LR F1:  0.560098349139445
For name:  m_reilly
total sample size before apply threshold:  20
Counter({'0000-0001-8029-0084': 17, '0000-0002-5526-8245': 1, '0000-0001-8746-3224': 1, '0000-0003-2506-3190': 1})
['0000-0001-8029-0084']
Total sample size after apply threshold:  17
For name:  d_nguyen
total sample size before apply threshold:  25
Counter({'0000-0002-4997-555X': 8, '0000-0002-3283-3504': 7, '0000-0001-6420-7308': 3, '0000-0002

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        33
          1       0.88      0.64      0.74        11
          2       0.92      0.85      0.88        13
          3       0.95      0.90      0.92        20
          4       0.74      1.00      0.85        29
          5       1.00      0.81      0.90        16
          6       1.00      0.85      0.92        13

avg / total       0.92      0.90      0.90       135

[33  0  0  0  0  0  0  0  7  1  1  2  0  0  0  0 11  0  2  0  0  0  1  0
 18  1  0  0  0  0  0  0 29  0  0  0  0  0  0  3 13  0  0  0  0  0  2  0
 11]
svc Accuracy:  0.9037037037037037
svc F1:  0.8865826565164666
             precision    recall  f1-score   support

          0       0.97      1.00      0.99        33
          1       1.00      0.64      0.78        11
          2       0.89      0.62      0.73        13
          3       1.00      0.90      0.95        20
          4       0.64      0.97      0.7

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.84      1.00      0.92       125
          1       1.00      0.20      0.33        15
          2       1.00      0.94      0.97        18
          3       1.00      0.92      0.96        12
          4       0.86      0.75      0.80        16
          5       1.00      0.61      0.76        18

avg / total       0.89      0.88      0.86       204

[125   0   0   0   0   0  12   3   0   0   0   0   1   0  17   0   0   0
   1   0   0  11   0   0   4   0   0   0  12   0   5   0   0   0   2  11]
svc Accuracy:  0.8774509803921569
svc F1:  0.7892758748830713
             precision    recall  f1-score   support

          0       0.71      1.00      0.83       125
          1       0.00      0.00      0.00        15
          2       1.00      0.72      0.84        18
          3       1.00      0.58      0.74        12
          4       1.00      0.31      0.48        16
          5       1.00      0.22      0.36   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.96      0.98        28
          1       0.96      1.00      0.98        24

avg / total       0.98      0.98      0.98        52

[27  1  0 24]
LR Accuracy:  0.9807692307692307
LR F1:  0.9807050092764378
For name:  d_collins
total sample size before apply threshold:  31
Counter({'0000-0001-6754-9290': 8, '0000-0002-6248-9644': 7, '0000-0002-3283-0733': 6, '0000-0003-2274-0889': 5, '0000-0003-2484-1640': 2, '0000-0002-8432-7021': 1, '0000-0001-8891-1893': 1, '0000-0002-7981-3586': 1})
[]
Total sample size after apply threshold:  0
For name:  l_davies
total sample size before apply threshold:  96
Counter({'0000-0001-8801-3559': 62, '0000-0002-0451-8670': 19, '0000-0002-4876-6270': 11, '0000-0002-2986-705X': 4})
['0000-0001-8801-3559', '0000-0002-4876-6270', '0000-0002-0451-8670']
Total sample size after apply threshold:  92
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dty

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.36      0.53        22
          1       0.88      1.00      0.94       104

avg / total       0.90      0.89      0.87       126

[  8  14   0 104]
svc Accuracy:  0.8888888888888888
svc F1:  0.7351351351351352
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        22
          1       0.83      1.00      0.90       104

avg / total       0.68      0.83      0.75       126

[  0  22   0 104]
LR Accuracy:  0.8253968253968254
LR F1:  0.45217391304347826
For name:  a_fontana
total sample size before apply threshold:  203
Counter({'0000-0002-6660-5315': 65, '0000-0002-5453-461X': 59, '0000-0002-5391-7520': 44, '0000-0002-8481-1219': 16, '0000-0002-4791-8746': 14, '0000-0003-3820-2823': 3, '0000-0003-1556-2770': 2})
['0000-0002-5391-7520', '0000-0002-5453-461X', '0000-0002-4791-8746', '0000-0002-6660-5315', '0000-0002-8481-1219']
Total sample size after apply 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.95      0.98        44
          1       0.88      0.98      0.93        59
          2       1.00      0.64      0.78        14
          3       0.98      0.98      0.98        65
          4       1.00      1.00      1.00        16

avg / total       0.96      0.95      0.95       198

[42  2  0  0  0  0 58  0  1  0  0  5  9  0  0  0  1  0 64  0  0  0  0  0
 16]
svc Accuracy:  0.9545454545454546
svc F1:  0.934393653262814
             precision    recall  f1-score   support

          0       0.97      0.89      0.93        44
          1       0.91      0.85      0.88        59
          2       1.00      0.50      0.67        14
          3       0.77      1.00      0.87        65
          4       1.00      0.75      0.86        16

avg / total       0.89      0.87      0.87       198

[39  1  0  4  0  1 50  0  8  0  0  3  7  4  0  0  0  0 65  0  0  1  0  3
 12]
LR Accuracy:  0.8737373737373737
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.68      0.84      0.75        32
          1       0.00      0.00      0.00        11
          2       0.63      0.84      0.72        32
          3       0.83      1.00      0.91       179
          4       1.00      0.45      0.62        20
          5       1.00      0.21      0.34        24
          6       0.90      0.43      0.58        21
          7       0.90      0.82      0.86        34

avg / total       0.81      0.80      0.77       353

[ 27   0   3   1   0   0   1   0   0   0   1  10   0   0   0   0   3   0
  27   2   0   0   0   0   0   0   0 179   0   0   0   0   0   0   0   8
   9   0   0   3   4   0   6   9   0   5   0   0   5   0   5   2   0   0
   9   0   1   0   1   4   0   0   0  28]
MNB Accuracy:  0.8045325779036827
MNB F1:  0.5982912882290575
             precision    recall  f1-score   support

          0       0.82      0.88      0.85        32
          1       1.00      0.55     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.64      0.78        11
          1       0.80      1.00      0.89        43
          2       1.00      0.36      0.53        11

avg / total       0.87      0.83      0.81        65

[ 7  4  0  0 43  0  0  7  4]
svc Accuracy:  0.8307692307692308
svc F1:  0.7325696830851469
             precision    recall  f1-score   support

          0       1.00      0.36      0.53        11
          1       0.70      1.00      0.83        43
          2       0.00      0.00      0.00        11

avg / total       0.64      0.72      0.64        65

[ 4  7  0  0 43  0  0 11  0]
LR Accuracy:  0.7230769230769231
LR F1:  0.4534188034188034
For name:  t_smith
total sample size before apply threshold:  603
Counter({'0000-0002-3650-9381': 154, '0000-0003-1673-2954': 113, '0000-0002-2120-2766': 85, '0000-0002-6279-9685': 84, '0000-0003-3528-6793': 65, '0000-0003-4453-9713': 32, '0000-0002-5197-5030': 26, '0000-0002-3945-63

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       1.00      0.09      0.17        32
          2       0.92      0.71      0.80        65
          3       0.93      0.68      0.79        84
          4       0.88      0.93      0.90       113
          5       0.66      1.00      0.79       154
          6       0.96      0.92      0.94        85
          7       1.00      0.73      0.84        26

avg / total       0.84      0.81      0.79       569

[  0   0   0   1   2   7   0   0   0   3   2   1   9  17   0   0   0   0
  46   1   2  13   3   0   0   0   2  57   2  23   0   0   0   0   0   0
 105   8   0   0   0   0   0   0   0 154   0   0   0   0   0   0   0   7
  78   0   0   0   0   1   0   6   0  19]
MNB Accuracy:  0.81195079086116
MNB F1:  0.654362535142212
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       1.00      0.50      0.

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.46      0.63        13
          1       1.00      1.00      1.00        10
          2       0.72      1.00      0.84        18

avg / total       0.88      0.83      0.81        41

[ 6  0  7  0 10  0  0  0 18]
MNB Accuracy:  0.8292682926829268
MNB F1:  0.8229294165646674
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        13
          1       1.00      1.00      1.00        10
          2       0.95      1.00      0.97        18

avg / total       0.98      0.98      0.98        41

[12  0  1  0 10  0  0  0 18]
svc Accuracy:  0.975609756097561
svc F1:  0.9776576576576576
             precision    recall  f1-score   support

          0       1.00      0.46      0.63        13
          1       1.00      1.00      1.00        10
          2       0.72      1.00      0.84        18

avg / total       0.88      0.83      0.81        41

[ 6  0  7  0 10

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



Total sample size after apply threshold:  15
For name:  t_wong
total sample size before apply threshold:  14
Counter({'0000-0002-1045-2698': 9, '0000-0002-5752-7917': 2, '0000-0001-9234-4529': 1, '0000-0001-6187-8851': 1, '0000-0001-8611-4911': 1})
[]
Total sample size after apply threshold:  0
For name:  s_ross
total sample size before apply threshold:  25
Counter({'0000-0002-2302-8415': 17, '0000-0001-7305-3451': 3, '0000-0002-3094-3769': 2, '0000-0003-3512-9579': 1, '0000-0001-5676-4489': 1, '0000-0001-5523-2376': 1})
['0000-0002-2302-8415']
Total sample size after apply threshold:  17
For name:  d_richardson
total sample size before apply threshold:  456
Counter({'0000-0003-0960-6415': 231, '0000-0002-7751-1058': 167, '0000-0002-3992-8610': 22, '0000-0003-0247-9118': 17, '0000-0002-3189-2190': 12, '0000-0002-0054-6850': 7})
['0000-0002-3189-2190', '0000-0003-0960-6415', '0000-0002-7751-1058', '0000-0002-3992-8610', '0000-0003-0247-9118']
Total sample size after apply threshold:  4

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.86      1.00      0.93       231
          2       0.98      0.95      0.96       167
          3       0.92      0.50      0.65        22
          4       1.00      0.47      0.64        17

avg / total       0.89      0.91      0.89       449

[  0  11   1   0   0   0 230   1   0   0   0   8 159   0   0   0  11   0
  11   0   0   6   2   1   8]
MNB Accuracy:  0.9086859688195991
MNB F1:  0.6352497014170584
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        12
          1       0.89      1.00      0.94       231
          2       0.99      0.95      0.97       167
          3       1.00      0.73      0.84        22
          4       1.00      0.53      0.69        17

avg / total       0.94      0.94      0.93       449

[  6   6   0   0   0   0 231   0   0   0   0   9 158   0   0   0   6   0
  16   0   0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.87      1.00      0.93        26
          1       1.00      0.79      0.88        14
          2       0.85      0.81      0.83        21

avg / total       0.89      0.89      0.88        61

[26  0  0  0 11  3  4  0 17]
MNB Accuracy:  0.8852459016393442
MNB F1:  0.8792799070847851
             precision    recall  f1-score   support

          0       0.87      1.00      0.93        26
          1       1.00      0.71      0.83        14
          2       0.86      0.86      0.86        21

avg / total       0.89      0.89      0.88        61

[26  0  0  1 10  3  3  0 18]
svc Accuracy:  0.8852459016393442
svc F1:  0.873015873015873
             precision    recall  f1-score   support

          0       0.74      1.00      0.85        26
          1       1.00      0.71      0.83        14
          2       1.00      0.76      0.86        21

avg / total       0.89      0.85      0.85        61

[26  0  0  4 10

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.84      1.00      0.91        85
          1       1.00      0.70      0.82        10
          2       1.00      0.68      0.81        22
          3       1.00      0.40      0.57        10

avg / total       0.89      0.87      0.86       127

[85  0  0  0  3  7  0  0  7  0 15  0  6  0  0  4]
svc Accuracy:  0.8740157480314961
svc F1:  0.779936822156936
             precision    recall  f1-score   support

          0       0.71      1.00      0.83        85
          1       0.00      0.00      0.00        10
          2       1.00      0.36      0.53        22
          3       0.00      0.00      0.00        10

avg / total       0.65      0.73      0.65       127

[85  0  0  0 10  0  0  0 14  0  8  0 10  0  0  0]
LR Accuracy:  0.7322834645669292
LR F1:  0.3416666666666667
For name:  a_lin
total sample size before apply threshold:  46
Counter({'0000-0003-4236-7233': 27, '0000-0001-6310-9765': 10, '0000-0001-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  70
Counter({'0000-0001-6998-5686': 48, '0000-0001-5807-5820': 11, '0000-0003-3957-6288': 4, '0000-0003-4964-2197': 2, '0000-0002-9066-6935': 2, '0000-0003-4872-0632': 2, '0000-0002-7297-9639': 1})
['0000-0001-5807-5820', '0000-0001-6998-5686']
Total sample size after apply threshold:  59
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(59, 586)
(0, 0)
(0, 0)
1
59
             precision    recall  f1-score   support

          0       0.89      0.73      0.80        11
          1       0.94      0.98      0.96        48

avg / total       0.93      0.93  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


[ 8  3  1 47]
MNB Accuracy:  0.9322033898305084
MNB F1:  0.8795918367346938
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        11
          1       1.00      1.00      1.00        48

avg / total       1.00      1.00      1.00        59

[11  0  0 48]
svc Accuracy:  1.0
svc F1:  1.0
             precision    recall  f1-score   support

          0       1.00      0.18      0.31        11
          1       0.84      1.00      0.91        48

avg / total       0.87      0.85      0.80        59

[ 2  9  0 48]
LR Accuracy:  0.847457627118644
LR F1:  0.610989010989011
For name:  h_vogel
total sample size before apply threshold:  15
Counter({'0000-0001-9821-7731': 5, '0000-0002-9902-8120': 4, '0000-0003-2404-9485': 4, '0000-0003-0072-4239': 2})
[]
Total sample size after apply threshold:  0
For name:  m_campos
total sample size before apply threshold:  148
Counter({'0000-0001-7738-9892': 107, '0000-0003-3217-9001': 12, '0000-0003-4313-706

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      1.00      0.97       107
          1       1.00      0.42      0.59        12

avg / total       0.94      0.94      0.93       119

[107   0   7   5]
LR Accuracy:  0.9411764705882353
LR F1:  0.7782805429864253
For name:  d_stewart
total sample size before apply threshold:  294
Counter({'0000-0002-8157-7746': 210, '0000-0001-7360-8592': 77, '0000-0002-6764-4842': 3, '0000-0002-8499-7105': 1, '0000-0002-4087-5544': 1, '0000-0001-5144-1234': 1, '0000-0002-3690-9844': 1})
['0000-0001-7360-8592', '0000-0002-8157-7746']
Total sample size after apply threshold:  287
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(287, 519)
(0, 0)
(0, 0)
1
287
             precision    recall  f1-score   support

          0       0.97      0.92      0.95        77
          1       0.97      0.99      0.98       210

avg / total       0.97      0.97      0.97       287

[ 71   6   2 208]
MNB Accuracy:  0.9721254355400697
MNB F1:  0.9638993710691823
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        77
          1       0.96      1.00      0.98       210

avg / total       0.97      0.97      0.97       287

[ 69   8   0 210]
svc Accuracy:  0.9721254355400697
svc F1:  0.963256945333504
             precision    recall  f1-score   support

          0       1.00      0.57      0.73        77
          1       0.86      1.00      0.93       210

avg / total       0.90      0.89      0.87       287

[ 44  33   0 210]
LR Accuracy:  0.8850174216027874
LR F1:  0.827212522576761
For name:  j_abrantes
total sample size before apply threshold:  57
Counter({'0000-0002-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



Total sample size after apply threshold:  103
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(103, 412)
(0, 0)
(0, 0)
1
103
             precision    recall  f1-score   support

          0       0.80      0.22      0.35        18
          1       1.00      0.10      0.18        10
          2       0.00      0.00      0.00        10
          3       0.66      0.98      0.79        65

avg / total       0.65      0.67      0.58       103

[ 4  0  0 14  0  1  0  9  0  0  0 10  1  0  0 64]
MNB Accuracy:  0.6699029126213593
MNB F1:  0.32994193139120676
             precision    recall  f1-score  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.75      0.86        36
          1       0.95      1.00      0.97       155

avg / total       0.96      0.95      0.95       191

[ 27   9   0 155]
svc Accuracy:  0.9528795811518325
svc F1:  0.9144648454993283
             precision    recall  f1-score   support

          0       1.00      0.31      0.47        36
          1       0.86      1.00      0.93       155

avg / total       0.89      0.87      0.84       191

[ 11  25   0 155]
LR Accuracy:  0.8691099476439791
LR F1:  0.6967291203556685
For name:  f_campos
total sample size before apply threshold:  49
Counter({'0000-0001-8376-0977': 14, '0000-0002-5948-472X': 12, '0000-0002-1132-3257': 10, '0000-0001-8332-5043': 9, '0000-0001-9826-751X': 2, '0000-0001-5828-2862': 2})
['0000-0001-8376-0977', '0000-0002-5948-472X', '0000-0002-1132-3257']
Total sample size after apply threshold:  36
TfidfVectorizer(analyzer='word', binary=False, decode_error='s

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(418, 1043)
(0, 0)
(0, 0)
1
418
             precision    recall  f1-score   support

          0       1.00      0.10      0.18        20
          1       0.73      1.00      0.84       188
          2       1.00      0.30      0.47        23
          3       1.00      0.91      0.95        65
          4       1.00      0.85      0.92        98
          5       1.00      0.33      0.50        24

avg / total       0.88      0.83      0.80       418

[  2  18   0   0   0   0   0 188   0   0   0   0   0  16   7   0   0   0
   0   6   0  59   0   0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.93      0.50      0.65        28
          1       1.00      0.78      0.88        23
          2       1.00      0.20      0.33        15
          3       0.56      0.98      0.71        41

avg / total       0.82      0.70      0.68       107

[14  0  0 14  0 18  0  5  0  0  3 12  1  0  0 40]
MNB Accuracy:  0.7009345794392523
MNB F1:  0.6442076547011317
             precision    recall  f1-score   support

          0       0.95      0.75      0.84        28
          1       1.00      0.91      0.95        23
          2       1.00      0.27      0.42        15
          3       0.67      0.98      0.79        41

avg / total       0.86      0.80      0.79       107

[21  0  0  7  0 21  0  2  0  0  4 11  1  0  0 40]
svc Accuracy:  0.8037383177570093
svc F1:  0.7519193235112984
             precision    recall  f1-score   support

          0       1.00      0.46      0.63        28
          1       1.00     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



(139, 750)
(0, 0)
(0, 0)
1
139
             precision    recall  f1-score   support

          0       0.92      0.89      0.90        64
          1       0.84      1.00      0.92        65
          2       0.00      0.00      0.00        10

avg / total       0.82      0.88      0.84       139

[57  7  0  0 65  0  5  5  0]
MNB Accuracy:  0.8776978417266187
MNB F1:  0.6067516208361279
             precision    recall  f1-score   support

          0       0.85      1.00      0.92        64
          1       1.00      0.97      0.98        65
          2       1.00      0.10      0.18        10

avg / total       0.93      0.92      0.90       139

[64  0  0  2 63  0  9  0  1]
svc Accuracy:  0.920863309352518
svc F1:  0.6956854970568999
             precision    recall  f1-score   support

          0       0.84      1.00      0.91        64
          1       1.00      0.97      0.98        65
          2       0.00      0.00      0.00        10

avg / total       0.86      0.91     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.68      0.81        22
          1       1.00      0.68      0.81        44
          2       1.00      0.71      0.83        14
          3       0.76      1.00      0.86        78

avg / total       0.88      0.84      0.84       158

[15  0  0  7  0 30  0 14  0  0 10  4  0  0  0 78]
svc Accuracy:  0.8417721518987342
svc F1:  0.8292083519984073
             precision    recall  f1-score   support

          0       1.00      0.32      0.48        22
          1       1.00      0.55      0.71        44
          2       1.00      0.07      0.13        14
          3       0.62      1.00      0.76        78

avg / total       0.81      0.70      0.65       158

[ 7  0  0 15  0 24  0 20  0  0  1 13  0  0  0 78]
LR Accuracy:  0.6962025316455697
LR F1:  0.5216700473292766
For name:  m_king
total sample size before apply threshold:  58
Counter({'0000-0002-2587-9117': 26, '0000-0001-6030-5154': 13, '0000-000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 27
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(27, 105)
(0, 0)
(0, 0)
1
27
             precision    recall  f1-score   support

          0       1.00      0.86      0.92        14
          1       0.87      1.00      0.93        13

avg / total       0.94      0.93      0.93        27

[12  2  0 13]
MNB Accuracy:  0.9259259259259259
MNB F1:  0.9258241758241759
             precision    recall  f1-score   support

          0       0.93      1.00      0.97        14
          1       1.00      0.92      0.96        13

avg / total       0.97      0.96      0.96        27

[14  0  1 12]
svc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.31      0.48        16
          1       0.96      1.00      0.98       279

avg / total       0.96      0.96      0.95       295

[  5  11   0 279]
MNB Accuracy:  0.9627118644067797
MNB F1:  0.7284291572516528
             precision    recall  f1-score   support

          0       1.00      0.31      0.48        16
          1       0.96      1.00      0.98       279

avg / total       0.96      0.96      0.95       295

[  5  11   0 279]
svc Accuracy:  0.9627118644067797
svc F1:  0.7284291572516528


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        16
          1       0.95      1.00      0.97       279

avg / total       0.89      0.95      0.92       295

[  0  16   0 279]
LR Accuracy:  0.9457627118644067
LR F1:  0.48606271777003485
For name:  j_cooper
total sample size before apply threshold:  147
Counter({'0000-0003-1339-4750': 85, '0000-0001-6009-3542': 24, '0000-0001-8163-2306': 19, '0000-0002-9014-4395': 14, '0000-0002-8626-7827': 4, '0000-0002-4932-1740': 1})
['0000-0002-9014-4395', '0000-0001-6009-3542', '0000-0001-8163-2306', '0000-0003-1339-4750']
Total sample size after apply threshold:  142
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sub

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.71      0.83        17
          1       0.91      1.00      0.95        81
          2       1.00      0.83      0.91        18

avg / total       0.94      0.93      0.93       116

[12  5  0  0 81  0  0  3 15]
svc Accuracy:  0.9310344827586207
svc F1:  0.8965394308193497
             precision    recall  f1-score   support

          0       1.00      0.06      0.11        17
          1       0.72      1.00      0.84        81
          2       1.00      0.17      0.29        18

avg / total       0.81      0.73      0.65       116

[ 1 16  0  0 81  0  0 15  3]
LR Accuracy:  0.7327586206896551
LR F1:  0.412067878389122
For name:  s_hussein
total sample size before apply threshold:  33
Counter({'0000-0002-7946-0717': 18, '0000-0002-6305-508X': 9, '0000-0003-3657-7410': 4, '0000-0002-5394-4385': 1, '0000-0002-0139-1483': 1})
['0000-0002-7946-0717']
Total sample size after apply threshold:  18
For name

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



[18  0  0  0 14  0  0  0 22]
LR Accuracy:  1.0
LR F1:  1.0
For name:  f_zhang
total sample size before apply threshold:  103
Counter({'0000-0001-6035-4829': 27, '0000-0001-7434-7339': 23, '0000-0002-0480-7501': 11, '0000-0001-9542-6634': 10, '0000-0003-1298-9795': 9, '0000-0002-1371-266X': 7, '0000-0002-1957-0543': 5, '0000-0002-2822-2049': 4, '0000-0002-9309-9577': 2, '0000-0003-1709-7788': 2, '0000-0001-7550-9483': 1, '0000-0002-8438-7155': 1, '0000-0003-2829-0735': 1})
['0000-0001-7434-7339', '0000-0002-0480-7501', '0000-0001-6035-4829', '0000-0001-9542-6634']
Total sample size after apply threshold:  71
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.7171717171717171
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        13
          1       0.97      1.00      0.98        29

avg / total       0.98      0.98      0.98        42

[12  1  0 29]
svc Accuracy:  0.9761904761904762
svc F1:  0.9715254237288136
             precision    recall  f1-score   support

          0       1.00      0.08      0.14        13
          1       0.71      1.00      0.83        29

avg / total       0.80      0.71      0.62        42

[ 1 12  0 29]
LR Accuracy:  0.7142857142857143
LR F1:  0.4857142857142857
For name:  a_palma
total sample size before apply threshold:  61
Counter({'0000-0003-2099-1297': 34, '0000-0002-8530-4913': 13, '0000-0002-5971-3676': 8, '0000-0003-0420-1785': 3, '0000-0002-1682-7032': 2, '0000-0002-7263-4868': 1})
['0000-0002-8530-4913', '0000-0003-2099-1297']
Total sample size after apply threshold:  47
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       1.00      1.00      1.00        34

avg / total       1.00      1.00      1.00        47

[13  0  0 34]
MNB Accuracy:  1.0
MNB F1:  1.0
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       1.00      1.00      1.00        34

avg / total       1.00      1.00      1.00        47

[13  0  0 34]
svc Accuracy:  1.0
svc F1:  1.0
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        13
          1       0.97      1.00      0.99        34

avg / total       0.98      0.98      0.98        47

[12  1  0 34]
LR Accuracy:  0.9787234042553191
LR F1:  0.9727536231884057
For name:  e_shaw
total sample size before apply threshold:  16
Counter({'0000-0003-1424-7568': 9, '0000-0002-5653-0145': 4, '0000-0002-4148-3526': 2, '0000-0002-4334-1900': 1})
[]
Total sa

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(36, 101)
(0, 0)
(0, 0)
1
36
             precision    recall  f1-score   support

          0       0.95      1.00      0.97        18
          1       1.00      0.94      0.97        18

avg / total       0.97      0.97      0.97        36

[18  0  1 17]
MNB Accuracy:  0.9722222222222222
MNB F1:  0.9722007722007722
             precision    recall  f1-score   support

          0       0.85      0.94      0.89        18
          1       0.94      0.83      0.88        18

avg / total       0.89      0.89      0.89        36

[17  1  3 15]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(55, 176)
(0, 0)
(0, 0)
1
55
             precision    recall  f1-score   support

          0       1.00      0.81      0.90        16
          1       0.79      1.00      0.88        23
          2       1.00      0.81      0.90        16

avg / total       0.91      0.89      0.89        55

[13  3  0  0 23  0  0  3 13]
MNB Accuracy:  0.8909090909090909
MNB F1:  0.8925729442970822
             precision    recall  f1-score   support

          0       1.00      0.81      0.90        16
          1       0.85      1.00      0.92        23
        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.86      0.60      0.71        10
          1       0.76      0.93      0.84        14

avg / total       0.80      0.79      0.78        24

[ 6  4  1 13]
MNB Accuracy:  0.7916666666666666
MNB F1:  0.7722960151802656
             precision    recall  f1-score   support

          0       1.00      0.60      0.75        10
          1       0.78      1.00      0.88        14

avg / total       0.87      0.83      0.82        24

[ 6  4  0 14]
svc Accuracy:  0.8333333333333334
svc F1:  0.8125
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        10
          1       0.74      1.00      0.85        14

avg / total       0.85      0.79      0.77        24

[ 5  5  0 14]
LR Accuracy:  0.7916666666666666
LR F1:  0.7575757575757576
For name:  l_simon
total sample size before apply threshold:  14
Counter({'0000-0003-4321-8539': 7, '0000-0003-4870-1052': 4, '0000-0002-5010

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.96      0.96      0.96        23
          1       0.96      0.96      0.96        23

avg / total       0.96      0.96      0.96        46

[22  1  1 22]
MNB Accuracy:  0.9565217391304348
MNB F1:  0.9565217391304348
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        23
          1       0.92      1.00      0.96        23

avg / total       0.96      0.96      0.96        46

[21  2  0 23]
svc Accuracy:  0.9565217391304348
svc F1:  0.9564393939393939
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        23
          1       0.92      1.00      0.96        23

avg / total       0.96      0.96      0.96        46

[21  2  0 23]
LR Accuracy:  0.9565217391304348
LR F1:  0.9564393939393939
For name:  l_torres
total sample size before apply threshold:  65
Counter({'0000-0002-0194-7875': 56, '0000-0002-4598-1899': 7, '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(123, 377)
(0, 0)
(0, 0)
1
123
             precision    recall  f1-score   support

          0       1.00      0.56      0.72        16
          1       1.00      0.57      0.72        23
          2       1.00      0.47      0.64        19
          3       1.00      0.27      0.42        15
          4       0.57      1.00      0.72        50

avg / total       0.82      0.69      0.67       123

[ 9  0  0  0  7  0 13  0  0 10  0  0  9  0 10  0  0  0  4 11  0  0  0  0
 50]
MNB Accuracy:  0.6910569105691057
MNB F1:  0.6461539355635465
           

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        21
          1       1.00      1.00      1.00        93
          2       1.00      1.00      1.00        10

avg / total       1.00      1.00      1.00       124

[21  0  0  0 93  0  0  0 10]
svc Accuracy:  1.0
svc F1:  1.0
             precision    recall  f1-score   support

          0       1.00      0.43      0.60        21
          1       0.82      1.00      0.90        93
          2       1.00      0.20      0.33        10

avg / total       0.87      0.84      0.81       124

[ 9 12  0  0 93  0  0  8  2]
LR Accuracy:  0.8387096774193549
LR F1:  0.6120819848975189
For name:  r_hu
total sample size before apply threshold:  128
Counter({'0000-0001-6709-031X': 93, '0000-0001-7412-8451': 27, '0000-0001-6893-529X': 4, '0000-0001-5549-3082': 2, '0000-0002-7126-4076': 1, '0000-0001-5921-6891': 1})
['0000-0001-6709-031X', '0000-0001-7412-8451']
Total sample size after apply thresh

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


j_braun
total sample size before apply threshold:  72
Counter({'0000-0002-8886-078X': 37, '0000-0002-4504-6235': 25, '0000-0002-8309-6401': 5, '0000-0002-2491-5788': 5})
['0000-0002-8886-078X', '0000-0002-4504-6235']
Total sample size after apply threshold:  62
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(62, 157)
(0, 0)
(0, 0)
1
62
             precision    recall  f1-score   support

          0       0.88      1.00      0.94        37
          1       1.00      0.80      0.89        25

avg / total       0.93      0.92      0.92        62

[37  0  5 20]
MNB Accuracy:  0.9193548387096774
MN

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


w_lu
total sample size before apply threshold:  138
Counter({'0000-0003-4731-1976': 38, '0000-0001-6722-1527': 33, '0000-0001-5358-305X': 30, '0000-0001-7421-347X': 13, '0000-0002-1405-4806': 6, '0000-0001-9798-8964': 4, '0000-0003-4334-5722': 3, '0000-0002-6570-3044': 3, '0000-0002-5243-5554': 2, '0000-0001-5508-342X': 2, '0000-0002-1398-9933': 1, '0000-0001-6214-4024': 1, '0000-0002-5101-9778': 1, '0000-0002-4528-2246': 1})
['0000-0001-5358-305X', '0000-0003-4731-1976', '0000-0001-7421-347X', '0000-0001-6722-1527']
Total sample size after apply threshold:  114
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        voca

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.81      1.00      0.90        35
          1       1.00      0.43      0.60        14

avg / total       0.87      0.84      0.81        49

[35  0  8  6]
MNB Accuracy:  0.8367346938775511
MNB F1:  0.7487179487179487
             precision    recall  f1-score   support

          0       0.88      1.00      0.93        35
          1       1.00      0.64      0.78        14

avg / total       0.91      0.90      0.89        49

[35  0  5  9]
svc Accuracy:  0.8979591836734694
svc F1:  0.8579710144927537
             precision    recall  f1-score   support

          0       0.78      1.00      0.88        35
          1       1.00      0.29      0.44        14

avg / total       0.84      0.80      0.75        49

[35  0 10  4]
LR Accuracy:  0.7959183673469388
LR F1:  0.6597222222222223
For name:  k_saito
total sample size before apply threshold:  61
Counter({'0000-0003-4663-1134': 26, '0000-0002-2151-6204': 16, '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.24      0.39        25
          1       0.00      0.00      0.00        12
          2       0.00      0.00      0.00        11
          3       1.00      0.11      0.19        28
          4       0.00      0.00      0.00        14
          5       1.00      0.41      0.58        17
          6       1.00      0.83      0.91        35
          7       1.00      0.10      0.18        31
          8       1.00      0.19      0.31        27
          9       0.00      0.00      0.00        12
         10       0.00      0.00      0.00        13
         11       1.00      0.07      0.13        14
         12       1.00      0.41      0.59        29
         13       0.00      0.00      0.00        11
         14       0.00      0.00      0.00        11
         15       1.00      0.18      0.30        17
         16       1.00      0.71      0.83        24
         17       0.75      0.10      0.18   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.96      0.96      0.96        24
          1       0.91      0.71      0.80        14
          2       0.85      0.92      0.88        36
          3       0.77      0.93      0.85        44
          4       0.85      0.88      0.87        26
          5       1.00      0.64      0.78        11
          6       0.90      0.75      0.82        12
          7       1.00      0.92      0.96        12
          8       0.82      0.70      0.76        20

avg / total       0.87      0.86      0.86       199

[23  0  0  1  0  0  0  0  0  0 10  2  2  0  0  0  0  0  0  1 33  1  0  0
  1  0  0  0  0  0 41  2  0  0  0  1  0  0  0  2 23  0  0  0  1  0  0  2
  2  0  7  0  0  0  0  0  1  0  2  0  9  0  0  0  0  0  0  0  0  0 11  1
  1  0  1  4  0  0  0  0 14]
svc Accuracy:  0.8592964824120602
svc F1:  0.8512063086915862
             precision    recall  f1-score   support

          0       1.00      0.88      0.93        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(52, 300)
(0, 0)
(0, 0)
1
52
             precision    recall  f1-score   support

          0       0.79      1.00      0.89        31
          1       1.00      0.62      0.76        21

avg / total       0.88      0.85      0.84        52

[31  0  8 13]
MNB Accuracy:  0.8461538461538461
MNB F1:  0.8252100840336135
             precision    recall  f1-score   support

          0       0.97      0.94      0.95        31
          1       0.91      0.95      0.93        21

avg / total       0.94      0.94      0.94        52

[29  2  1 20]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 130
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(130, 274)
(0, 0)
(0, 0)
1
130
             precision    recall  f1-score   support

          0       0.59      0.98      0.74        57
          1       0.96      0.72      0.82        32
          2       1.00      0.50      0.67        16
          3       0.00      0.00      0.00        11
          4       1.00      0.21      0.35        14

avg / total       0.73      0.69      0.65       130

[56  1  0  0  0  9 23  0  0  0  8  0  8  0  0 11  0  0  0  0 11  0  0  0
  3]
MNB Accuracy:  0.6923076923076923
MNB F1:  0.5155757039657968


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.68      0.98      0.81        57
          1       0.96      0.81      0.88        32
          2       1.00      0.75      0.86        16
          3       1.00      0.27      0.43        11
          4       1.00      0.43      0.60        14

avg / total       0.85      0.79      0.78       130

[56  1  0  0  0  6 26  0  0  0  4  0 12  0  0  8  0  0  3  0  8  0  0  0
  6]
svc Accuracy:  0.7923076923076923
svc F1:  0.7145651227202257
             precision    recall  f1-score   support

          0       0.54      0.98      0.70        57
          1       0.94      0.53      0.68        32
          2       1.00      0.38      0.55        16
          3       0.00      0.00      0.00        11
          4       1.00      0.14      0.25        14

avg / total       0.70      0.62      0.57       130

[56  1  0  0  0 15 17  0  0  0 10  0  6  0  0 11  0  0  0  0 12  0  0  0
  2]
LR Accuracy:  0.6230769230769231
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.91      0.98      0.95        44
          1       1.00      1.00      1.00        32
          2       1.00      0.11      0.20        18
          3       0.95      0.95      0.95        19
          4       1.00      0.54      0.70        13
          5       1.00      0.95      0.97        20
          6       0.00      0.00      0.00        16
          7       0.51      0.95      0.67        42

avg / total       0.80      0.79      0.74       204

[43  0  0  0  0  0  0  1  0 32  0  0  0  0  0  0  0  0  2  0  0  0  0 16
  0  0  0 18  0  0  0  1  0  0  0  0  7  0  0  6  0  0  0  0  0 19  0  1
  3  0  0  0  0  0  0 13  1  0  0  1  0  0  0 40]
MNB Accuracy:  0.7892156862745098
MNB F1:  0.6791811258916522
             precision    recall  f1-score   support

          0       0.73      1.00      0.85        44
          1       1.00      1.00      1.00        32
          2       0.83      0.56      0.67       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(169, 220)
(0, 0)
(0, 0)
1
169
             precision    recall  f1-score   support

          0       1.00      0.08      0.15        12
          1       0.79      1.00      0.88        91
          2       0.96      0.61      0.75        36
          3       0.90      0.90      0.90        30

avg / total       0.86      0.83      0.81       169

[ 1  9  0  2  0 91  0  0  0 13 22  1  0  2  1 27]
MNB Accuracy:  0.834319526627219
MNB F1:  0.6707760028354072
             precision    recall  f1-score   support

          0       0.90      0.75      0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(332, 492)
(0, 0)
(0, 0)
1
332
             precision    recall  f1-score   support

          0       1.00      0.98      0.99        90
          1       0.00      0.00      0.00        19
          2       0.76      1.00      0.86       166
          3       0.00      0.00      0.00        13
          4       1.00      0.57      0.72        44

avg / total       0.78      0.84      0.80       332

[ 88   0   2   0   0   0   0  19   0   0   0   0 166   0   0   0   0  13
   0   0   0   0  19   0  25]
MNB Accuracy:  0.8403614457831325
MNB F1:  0.515

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.98      0.99        90
          1       1.00      0.95      0.97        19
          2       0.93      0.99      0.96       166
          3       1.00      0.85      0.92        13
          4       0.97      0.82      0.89        44

avg / total       0.96      0.96      0.96       332

[ 88   0   2   0   0   0  18   1   0   0   0   0 165   0   1   0   0   2
  11   0   0   0   8   0  36]
svc Accuracy:  0.9578313253012049
svc F1:  0.9453189798107487
             precision    recall  f1-score   support

          0       1.00      0.94      0.97        90
          1       1.00      0.37      0.54        19
          2       0.78      1.00      0.88       166
          3       1.00      0.08      0.14        13
          4       1.00      0.59      0.74        44

avg / total       0.89      0.86      0.84       332

[ 85   0   5   0   0   0   7  12   0   0   0   0 166   0   0   0   0  12
   1   0   0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


[10  0  0 24]
svc Accuracy:  1.0
svc F1:  1.0
             precision    recall  f1-score   support

          0       1.00      0.70      0.82        10
          1       0.89      1.00      0.94        24

avg / total       0.92      0.91      0.91        34

[ 7  3  0 24]
LR Accuracy:  0.9117647058823529
LR F1:  0.8823529411764706
For name:  p_gaspar
total sample size before apply threshold:  93
Counter({'0000-0003-4217-5717': 87, '0000-0001-5967-0584': 3, '0000-0002-4832-8537': 2, '0000-0003-3388-1724': 1})
['0000-0003-4217-5717']
Total sample size after apply threshold:  87
For name:  r_o'connor
total sample size before apply threshold:  82
Counter({'0000-0003-4426-2507': 36, '0000-0002-4643-9794': 27, '0000-0002-6869-7954': 13, '0000-0002-3916-3101': 6})
['0000-0002-6869-7954', '0000-0003-4426-2507', '0000-0002-4643-9794']
Total sample size after apply threshold:  76
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.8153846153846154
MNB F1:  0.8047329637792471
             precision    recall  f1-score   support

          0       1.00      0.86      0.92        14
          1       1.00      0.93      0.97        15
          2       1.00      0.86      0.92        21
          3       1.00      1.00      1.00        13
          4       1.00      0.71      0.83        17
          5       0.82      1.00      0.90        50

avg / total       0.93      0.92      0.91       130

[12  0  0  0  0  2  0 14  0  0  0  1  0  0 18  0  0  3  0  0  0 13  0  0
  0  0  0  0 12  5  0  0  0  0  0 50]
svc Accuracy:  0.9153846153846154
svc F1:  0.9233596992217681
             precision    recall  f1-score   support

          0       1.00      0.36      0.53        14
          1       1.00      0.67      0.80        15
          2       1.00      0.86      0.92        21
          3       1.00      0.62      0.76        13
          4       1.00      0.65      0.79        17
          5       0.64      1.00  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(288, 528)
(0, 0)
(0, 0)
1
288
             precision    recall  f1-score   support

          0       1.00      0.80      0.89        20
          1       1.00      0.20      0.33        15
          2       0.89      1.00      0.94       237
          3       1.00      0.19      0.32        16

avg / total       0.91      0.90      0.87       288

[ 16   0   4   0   0   3  12   0   0   0 237   0   0   0  13   3]
MNB Accuracy:  0.8993055555555556
MNB F1:  0.6200894050899284


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.80      0.89        20
          1       1.00      0.53      0.70        15
          2       0.94      1.00      0.97       237
          3       1.00      0.75      0.86        16

avg / total       0.95      0.95      0.94       288

[ 16   0   4   0   0   8   7   0   0   0 237   0   0   0   4  12]
svc Accuracy:  0.9479166666666666
svc F1:  0.8527522683297557
             precision    recall  f1-score   support

          0       1.00      0.35      0.52        20
          1       0.00      0.00      0.00        15
          2       0.85      1.00      0.92       237
          3       1.00      0.12      0.22        16

avg / total       0.82      0.85      0.80       288

[  7   0  13   0   0   0  15   0   0   0 237   0   0   0  14   2]
LR Accuracy:  0.8541666666666666
LR F1:  0.41483634797588287
For name:  s_rossi
total sample size before apply threshold:  199
Counter({'0000-0003-3257-8248': 86, '

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.86      0.92        14
          1       1.00      0.88      0.94        25
          2       1.00      0.92      0.96        13
          3       0.94      0.91      0.93        34
          4       1.00      0.80      0.89        10
          5       0.91      1.00      0.95        86

avg / total       0.94      0.94      0.94       182

[12  0  0  1  0  1  0 22  0  1  0  2  0  0 12  0  0  1  0  0  0 31  0  3
  0  0  0  0  8  2  0  0  0  0  0 86]
svc Accuracy:  0.9395604395604396
svc F1:  0.9306309003590084
             precision    recall  f1-score   support

          0       1.00      0.43      0.60        14
          1       1.00      0.84      0.91        25
          2       1.00      0.62      0.76        13
          3       1.00      0.79      0.89        34
          4       1.00      0.40      0.57        10
          5       0.74      1.00      0.85        86

avg / total       0.88     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(22, 89)
(0, 0)
(0, 0)
1
22
             precision    recall  f1-score   support

          0       1.00      0.82      0.90        11
          1       0.85      1.00      0.92        11

avg / total       0.92      0.91      0.91        22

[ 9  2  0 11]
MNB Accuracy:  0.9090909090909091
MNB F1:  0.9083333333333333
             precision    recall  f1-score   support

          0       0.92      1.00      0.96        11
          1       1.00      0.91      0.95        11

avg / total       0.96      0.95      0.95        22

[11  0  1 10]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(159, 507)
(0, 0)
(0, 0)
1
159
             precision    recall  f1-score   support

          0       0.88      1.00      0.94       112
          1       1.00      0.68      0.81        47

avg / total       0.92      0.91      0.90       159

[112   0  15  32]
MNB Accuracy:  0.9056603773584906
MNB F1:  0.8736825380011652
             precision    recall  f1-score   support

          0       0.90      1.00      0.95       112
          1       1.00      0.72      0.84        47

avg / total       0.93      0.92      0.91       159

[112   0  13  34]
svc Accuracy:  0.9182389937106918
svc F1:  0.8923269260822004
             precision    recall  f1-score   support

          0       0.81      1.00      0.89       112
          1       1.00      0.43      0.60        47

avg / total       0.86      0.83      0.81       159

[112   0  27  20]
LR Accuracy:  0.8301886792452831
LR F1:  0.7447226021287983
For name:  l_rasmussen
total sample size before apply threshold:  249
Counter({'0000-0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.71      0.83        24
          1       0.97      1.00      0.98       214

avg / total       0.97      0.97      0.97       238

[ 17   7   0 214]
svc Accuracy:  0.9705882352941176
svc F1:  0.9065881693299691
             precision    recall  f1-score   support

          0       1.00      0.25      0.40        24
          1       0.92      1.00      0.96       214

avg / total       0.93      0.92      0.90       238

[  6  18   0 214]
LR Accuracy:  0.9243697478991597
LR F1:  0.6798206278026906
For name:  m_saad
total sample size before apply threshold:  4
Counter({'0000-0003-0458-5942': 1, '0000-0002-8071-2328': 1, '0000-0002-5655-8674': 1, '0000-0003-1291-366X': 1})
[]
Total sample size after apply threshold:  0
For name:  j_carr
total sample size before apply threshold:  271
Counter({'0000-0002-4398-8237': 179, '0000-0002-6445-2992': 42, '0000-0002-5028-2160': 40, '0000-0002-2729-0920': 6, '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.71      0.83        42
          1       1.00      0.68      0.81        40
          2       0.88      1.00      0.93       179

avg / total       0.92      0.90      0.90       261

[ 30   0  12   0  27  13   0   0 179]
svc Accuracy:  0.9042145593869731
svc F1:  0.8580097770503444
             precision    recall  f1-score   support

          0       1.00      0.21      0.35        42
          1       1.00      0.25      0.40        40
          2       0.74      1.00      0.85       179

avg / total       0.82      0.76      0.70       261

[  9   0  33   0  10  30   0   0 179]
LR Accuracy:  0.7586206896551724
LR F1:  0.5344324903358019
For name:  j_fraser
total sample size before apply threshold:  101
Counter({'0000-0002-5080-2859': 38, '0000-0002-6505-1883': 36, '0000-0002-5980-3989': 9, '0000-0003-0111-9137': 6, '0000-0002-8020-2985': 6, '0000-0001-9697-3795': 3, '0000-0003-4941-1997': 3})
['000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.92      0.96        36
          1       0.93      1.00      0.96        38

avg / total       0.96      0.96      0.96        74

[33  3  0 38]
LR Accuracy:  0.9594594594594594
LR F1:  0.9592735277930655
For name:  s_woo
total sample size before apply threshold:  25
Counter({'0000-0003-3692-7169': 22, '0000-0001-8788-2875': 1, '0000-0001-6765-4322': 1, '0000-0001-6902-0315': 1})
['0000-0003-3692-7169']
Total sample size after apply threshold:  22
For name:  s_bartlett
total sample size before apply threshold:  104
Counter({'0000-0001-9755-2490': 80, '0000-0003-4387-670X': 18, '0000-0002-7044-4454': 3, '0000-0003-0699-2250': 3})
['0000-0003-4387-670X', '0000-0001-9755-2490']
Total sample size after apply threshold:  98
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_featu

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



[11  0  0  3  0 14  0  1  0  0 14  5  0  0  0 25]
LR Accuracy:  0.8767123287671232
LR F1:  0.8853649292457008
For name:  w_lee
total sample size before apply threshold:  590
Counter({'0000-0003-3171-7672': 108, '0000-0001-5833-989X': 100, '0000-0003-3231-9764': 82, '0000-0002-1082-7592': 62, '0000-0003-3267-4811': 40, '0000-0001-7805-869X': 36, '0000-0003-2883-0391': 21, '0000-0002-0607-038X': 21, '0000-0002-5461-6770': 16, '0000-0002-3912-6095': 11, '0000-0001-6757-885X': 11, '0000-0001-6408-7668': 10, '0000-0002-9873-1033': 9, '0000-0001-7801-083X': 8, '0000-0001-8430-4797': 7, '0000-0002-2572-7287': 5, '0000-0002-6766-8481': 5, '0000-0001-8706-6026': 4, '0000-0002-0036-2859': 4, '0000-0002-9624-0505': 3, '0000-0002-3413-4029': 3, '0000-0003-1817-8395': 3, '0000-0003-1744-8525': 3, '0000-0001-8052-2420': 2, '0000-0003-0853-8561': 2, '0000-0001-7285-4054': 2, '0000-0001-9645-8179': 2, '0000-0002-4383-756X': 2, '0000-0003-1911-3454': 2, '0000-0003-4333-5444': 1, '0000-0002-7324-5792':

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



[ 7  0  0  0  0 13  0 15  0  1  0  0  0  0  0  0  0 10  0  1  0  0  0  0
  0  0  2  0  0 18  1  0  0  0  0  0  0  0  0  4  0  0  0  5  0  1  0  0
  0  0  0  0  5 34  1  0  0  0  0  0  0  0  0  0  1 98  7  2  0  0  0  0
  0  0  0  0  0 26 56  0  0  0  0  0  0  0  0  0  0  0  0 99  0  1  0  0
  0  0  0  0  0  7  0  0 14  0  0  0  0  0  0  0  0  2  0  7  0 53  0  0
  0  0  0  0  0  2  0  8  0  1  0  0  0  0  0  0  0  1  0 14  0  1  0  0]
MNB Accuracy:  0.6525096525096525
MNB F1:  0.42806809117095507
             precision    recall  f1-score   support

          0       0.96      0.67      0.79        36
          1       1.00      0.09      0.17        11
          2       1.00      0.43      0.60        21
          3       1.00      0.70      0.82        10
          4       0.71      0.68      0.69        40
          5       0.56      0.88      0.68       108
          6       0.87      0.71      0.78        82
          7       0.89      0.98      0.93       100
          8       1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.97      1.00      0.98       343

avg / total       0.93      0.96      0.95       355

[  0  12   1 342]
MNB Accuracy:  0.9633802816901409
MNB F1:  0.49067431850789095
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        12
          1       1.00      1.00      1.00       343

avg / total       1.00      1.00      1.00       355

[ 11   1   0 343]
svc Accuracy:  0.9971830985915493
svc F1:  0.9775330675273717
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.97      1.00      0.98       343

avg / total       0.93      0.97      0.95       355

[  0  12   0 343]
LR Accuracy:  0.9661971830985916
LR F1:  0.49140401146131807
For name:  j_albert
total sample size before apply threshold:  78
Counter({'0000-0002-3420-7371': 40, '0000-0001-65

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.95      1.00      0.98        40
          1       1.00      0.95      0.97        19
          2       1.00      0.92      0.96        13

avg / total       0.97      0.97      0.97        72

[40  0  0  1 18  0  1  0 12]
svc Accuracy:  0.9722222222222222
svc F1:  0.9695275763568447
             precision    recall  f1-score   support

          0       0.78      1.00      0.88        40
          1       1.00      0.84      0.91        19
          2       1.00      0.38      0.56        13

avg / total       0.88      0.85      0.83        72

[40  0  0  3 16  0  8  0  5]
LR Accuracy:  0.8472222222222222
LR F1:  0.782987382987383
For name:  k_goh
total sample size before apply threshold:  42
Counter({'0000-0002-2839-8722': 22, '0000-0002-3623-4891': 5, '0000-0003-0599-9696': 5, '0000-0001-5499-5187': 4, '0000-0002-2367-8303': 3, '0000-0001-5416-9627': 2, '0000-0002-8265-3421': 1})
['0000-0002-2839-8722']
Total

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.97      1.00      0.98       118
          1       1.00      0.67      0.80        12

avg / total       0.97      0.97      0.97       130

[118   0   4   8]
svc Accuracy:  0.9692307692307692
svc F1:  0.8916666666666666
             precision    recall  f1-score   support

          0       0.91      1.00      0.95       118
          1       0.00      0.00      0.00        12

avg / total       0.82      0.91      0.86       130

[118   0  12   0]
LR Accuracy:  0.9076923076923077
LR F1:  0.47580645161290325
For name:  p_pathak
total sample size before apply threshold:  9
Counter({'0000-0003-0118-3235': 4, '0000-0002-1157-5550': 3, '0000-0002-9771-6624': 1, '0000-0003-2152-3938': 1})
[]
Total sample size after apply threshold:  0
For name:  h_zeng
total sample size before apply threshold:  82
Counter({'0000-0002-8246-2000': 42, '0000-0002-0260-1059': 21, '0000-0002-9909-7732': 6, '0000-0002-9150-214X': 6, '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.64      0.78        11
          1       0.78      0.90      0.83        39
          2       1.00      0.76      0.87        17
          3       1.00      0.54      0.70        13
          4       0.80      1.00      0.89       100
          5       1.00      0.92      0.96        26
          6       0.98      0.91      0.94        45
          7       0.88      0.78      0.82        18
          8       0.92      0.92      0.92        39
          9       1.00      0.30      0.46        10
         10       0.96      0.87      0.92        31

avg / total       0.90      0.88      0.87       349

[  7   0   0   0   4   0   0   0   0   0   0   0  35   0   0   2   0   0
   1   1   0   0   0   0  13   0   3   0   0   1   0   0   0   0   0   0
   7   6   0   0   0   0   0   0   0   0   0   0 100   0   0   0   0   0
   0   0   0   0   0   2  24   0   0   0   0   0   0   1   0   0   2   0
  41   0   1   0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.90      1.00      0.95        19
          1       0.86      0.35      0.50        17
          2       0.64      0.94      0.76        17

avg / total       0.80      0.77      0.75        53

[19  0  0  2  6  9  0  1 16]
MNB Accuracy:  0.7735849056603774
MNB F1:  0.7373015873015873
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        19
          1       0.93      0.82      0.87        17
          2       0.84      0.94      0.89        17

avg / total       0.93      0.92      0.92        53

[19  0  0  0 14  3  0  1 16]
svc Accuracy:  0.9245283018867925
svc F1:  0.9212962962962963
             precision    recall  f1-score   support

          0       0.95      1.00      0.97        19
          1       0.88      0.82      0.85        17
          2       0.88      0.88      0.88        17

avg / total       0.90      0.91      0.90        53

[19  0  0  1 1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        40
          1       1.00      1.00      1.00        73

avg / total       1.00      1.00      1.00       113

[40  0  0 73]
svc Accuracy:  1.0
svc F1:  1.0
             precision    recall  f1-score   support

          0       1.00      0.88      0.93        40
          1       0.94      1.00      0.97        73

avg / total       0.96      0.96      0.96       113

[35  5  0 73]
LR Accuracy:  0.9557522123893806
LR F1:  0.9501103752759382
For name:  r_moore
total sample size before apply threshold:  221
Counter({'0000-0002-0776-5861': 75, '0000-0001-7221-6693': 51, '0000-0003-1072-2755': 45, '0000-0003-2027-2428': 44, '0000-0003-4196-1804': 6})
['0000-0003-2027-2428', '0000-0003-1072-2755', '0000-0001-7221-6693', '0000-0002-0776-5861']
Total sample size after apply threshold:  215
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.82      0.90        44
          1       1.00      0.82      0.90        45
          2       1.00      0.96      0.98        51
          3       0.81      1.00      0.89        75

avg / total       0.93      0.92      0.92       215

[36  0  0  8  0 37  0  8  0  0 49  2  0  0  0 75]
svc Accuracy:  0.9162790697674419
svc F1:  0.9188240418118466
             precision    recall  f1-score   support

          0       1.00      0.55      0.71        44
          1       1.00      0.78      0.88        45
          2       1.00      0.96      0.98        51
          3       0.70      1.00      0.82        75

avg / total       0.90      0.85      0.85       215

[24  0  0 20  0 35  0 10  0  0 49  2  0  0  0 75]
LR Accuracy:  0.8511627906976744
LR F1:  0.8462645442792502
For name:  m_thomsen
total sample size before apply threshold:  98
Counter({'0000-0002-2469-6458': 37, '0000-0003-2453-5141': 32, '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.8953488372093024
svc F1:  0.8839863222510608
             precision    recall  f1-score   support

          0       0.94      0.94      0.94        32
          1       0.74      0.95      0.83        37
          2       1.00      0.41      0.58        17

avg / total       0.87      0.84      0.82        86

[30  2  0  2 35  0  0 10  7]
LR Accuracy:  0.8372093023255814
LR F1:  0.7847222222222223
For name:  l_ng
total sample size before apply threshold:  44
Counter({'0000-0003-1905-3586': 37, '0000-0002-6973-9466': 3, '0000-0001-7500-9403': 1, '0000-0001-5988-008X': 1, '0000-0003-3135-244X': 1, '0000-0002-7189-1272': 1})
['0000-0003-1905-3586']
Total sample size after apply threshold:  37
For name:  a_phillips
total sample size before apply threshold:  170
Counter({'0000-0002-5461-0598': 98, '0000-0001-6367-9784': 24, '0000-0001-5599-6499': 24, '0000-0003-4883-0022': 9, '0000-0003-4225-0158': 7, '0000-0003-4473-5108': 4, '0000-0001-6618-0145': 3, '0000-0001-6335-9430': 1})
['0000-0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(55, 148)
(0, 0)
(0, 0)
1
55
             precision    recall  f1-score   support

          0       1.00      0.85      0.92        13
          1       0.95      1.00      0.98        42

avg / total       0.97      0.96      0.96        55

[11  2  0 42]
MNB Accuracy:  0.9636363636363636
MNB F1:  0.9467054263565892
             precision    recall  f1-score   support

          0       1.00      0.85      0.92        13
          1       0.95      1.00      0.98        42

avg / total       0.97      0.96      0.96        55

[11  2  0 42]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.90      0.79      0.84        34
          1       0.90      0.98      0.94        86
          2       1.00      0.84      0.91        19

avg / total       0.92      0.91      0.91       139

[27  7  0  2 84  0  1  2 16]
svc Accuracy:  0.9136690647482014
svc F1:  0.8988610667730779
             precision    recall  f1-score   support

          0       0.87      0.59      0.70        34
          1       0.80      0.98      0.88        86
          2       1.00      0.58      0.73        19

avg / total       0.84      0.83      0.82       139

[20 14  0  2 84  0  1  7 11]
LR Accuracy:  0.8273381294964028
LR F1:  0.7715562903769021
For name:  s_teixeira
total sample size before apply threshold:  36
Counter({'0000-0003-0419-2348': 12, '0000-0001-5845-058X': 11, '0000-0002-2462-8535': 3, '0000-0002-9473-0113': 3, '0000-0002-7464-3944': 3, '0000-0002-6603-7936': 3, '0000-0003-3664-2577': 1})
['0000-0003-0419-2348'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.30      0.46        10
          1       0.88      1.00      0.94        91
          2       1.00      0.55      0.71        11

avg / total       0.91      0.89      0.87       112

[ 3  7  0  0 91  0  0  5  6]
MNB Accuracy:  0.8928571428571429
MNB F1:  0.7018550481255149
             precision    recall  f1-score   support

          0       1.00      0.80      0.89        10
          1       0.93      1.00      0.96        91
          2       1.00      0.55      0.71        11

avg / total       0.94      0.94      0.93       112

[ 8  2  0  0 91  0  0  5  6]
svc Accuracy:  0.9375
svc F1:  0.8525780682643429
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.81      1.00      0.90        91
          2       0.00      0.00      0.00        11

avg / total       0.66      0.81      0.73       112

[ 0 10  0  0 91  0  0 11  

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(164, 224)
(0, 0)
(0, 0)
1
164
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       0.93      0.99      0.96       153

avg / total       0.87      0.92      0.89       164

[  0  11   2 151]
MNB Accuracy:  0.9207317073170732
MNB F1:  0.4793650793650794
             precision    recall  f1-score   support

          0       1.00      0.18      0.31        11
          1       0.94      1.00      0.97       153

avg / total       0.95      0.95      0.93       164

[  2   9   0 15

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.90      0.95        10
          1       0.99      1.00      0.99        92

avg / total       0.99      0.99      0.99       102

[ 9  1  0 92]
MNB Accuracy:  0.9901960784313726
MNB F1:  0.9709815078236131
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        10
          1       0.99      1.00      0.99        92

avg / total       0.99      0.99      0.99       102

[ 9  1  0 92]
svc Accuracy:  0.9901960784313726
svc F1:  0.9709815078236131
             precision    recall  f1-score   support

          0       1.00      0.40      0.57        10
          1       0.94      1.00      0.97        92

avg / total       0.94      0.94      0.93       102

[ 4  6  0 92]
LR Accuracy:  0.9411764705882353
LR F1:  0.7699248120300752
For name:  p_lima
total sample size before apply threshold:  24
Counter({'0000-0002-1252-2565': 8, '0000-0002-9739-0783': 8, '000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      1.00      0.97       174
          1       1.00      0.66      0.79        35

avg / total       0.95      0.94      0.94       209

[174   0  12  23]
MNB Accuracy:  0.9425837320574163
MNB F1:  0.8798850574712643
             precision    recall  f1-score   support

          0       0.96      1.00      0.98       174
          1       1.00      0.77      0.87        35

avg / total       0.96      0.96      0.96       209

[174   0   8  27]
svc Accuracy:  0.9617224880382775
svc F1:  0.9242479159115622
             precision    recall  f1-score   support

          0       0.87      1.00      0.93       174
          1       1.00      0.26      0.41        35

avg / total       0.89      0.88      0.84       209

[174   0  26   9]
LR Accuracy:  0.8755980861244019
LR F1:  0.6697860962566844
For name:  h_moreira
total sample size before apply threshold:  28
Counter({'0000-0002-1487-0539': 13, '0000-0002-548

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(59, 52)
(0, 0)
(0, 0)
1
59
             precision    recall  f1-score   support

          0       0.80      0.97      0.88        29
          1       1.00      0.55      0.71        11
          2       0.89      0.84      0.86        19

avg / total       0.87      0.85      0.84        59

[28  0  1  4  6  1  3  0 16]
MNB Accuracy:  0.847457627118644
MNB F1:  0.8152490726020138
             precision    recall  f1-score   support

          0       1.00      0.86      0.93        29
          1       1.00      0.55      0.71        11
          

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      0.87      0.88        47
          1       1.00      0.17      0.29        18
          2       0.84      0.84      0.84        62
          3       1.00      0.08      0.15        24
          4       1.00      0.77      0.87        26
          5       0.58      1.00      0.74        62

avg / total       0.83      0.75      0.71       239

[41  0  0  0  0  6  1  3  1  0  0 13  0  0 52  0  0 10  3  0  7  2  0 12
  1  0  2  0 20  3  0  0  0  0  0 62]
MNB Accuracy:  0.7531380753138075
MNB F1:  0.6279418337623106
             precision    recall  f1-score   support

          0       0.96      0.91      0.93        47
          1       0.85      0.61      0.71        18
          2       0.85      1.00      0.92        62
          3       0.79      0.79      0.79        24
          4       1.00      0.92      0.96        26
          5       0.98      0.95      0.97        62

avg / total       0.92     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.88      0.94        17
          1       0.75      0.60      0.67        15
          2       1.00      0.93      0.96        14
          3       0.72      0.95      0.82        19
          4       1.00      1.00      1.00        20

avg / total       0.89      0.88      0.88        85

[15  1  0  1  0  0  9  0  6  0  0  1 13  0  0  0  1  0 18  0  0  0  0  0
 20]
svc Accuracy:  0.8823529411764706
svc F1:  0.8770622895622896
             precision    recall  f1-score   support

          0       1.00      0.94      0.97        17
          1       1.00      0.40      0.57        15
          2       1.00      0.79      0.88        14
          3       0.62      0.95      0.75        19
          4       0.87      1.00      0.93        20

avg / total       0.88      0.84      0.83        85

[16  0  0  1  0  0  6  0  8  1  0  0 11  2  1  0  0  0 18  1  0  0  0  0
 20]
LR Accuracy:  0.8352941176470589
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.58      0.98      0.72        43
          1       1.00      0.95      0.97        38
          2       1.00      0.20      0.33        10
          3       0.89      0.53      0.67        15
          4       1.00      0.62      0.77        24
          5       1.00      0.62      0.76        13

avg / total       0.86      0.78      0.77       143

[42  0  0  1  0  0  2 36  0  0  0  0  8  0  2  0  0  0  7  0  0  8  0  0
  9  0  0  0 15  0  5  0  0  0  0  8]
svc Accuracy:  0.7762237762237763
svc F1:  0.7047077391904978
             precision    recall  f1-score   support

          0       0.55      0.98      0.70        43
          1       1.00      0.97      0.99        38
          2       0.00      0.00      0.00        10
          3       0.88      0.47      0.61        15
          4       1.00      0.62      0.77        24
          5       1.00      0.46      0.63        13

avg / total       0.78     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.93      0.96        40
          1       0.95      1.00      0.97        58

avg / total       0.97      0.97      0.97        98

[37  3  0 58]
svc Accuracy:  0.9693877551020408
svc F1:  0.9679144385026738
             precision    recall  f1-score   support

          0       1.00      0.85      0.92        40
          1       0.91      1.00      0.95        58

avg / total       0.94      0.94      0.94        98

[34  6  0 58]
LR Accuracy:  0.9387755102040817
LR F1:  0.9348692955250332
For name:  h_brown
total sample size before apply threshold:  48
Counter({'0000-0001-8578-5510': 17, '0000-0002-0067-991X': 9, '0000-0003-4870-8369': 8, '0000-0001-7418-5536': 6, '0000-0001-6227-5147': 3, '0000-0001-9404-9515': 3, '0000-0003-2292-7766': 2})
['0000-0001-8578-5510']
Total sample size after apply threshold:  17
For name:  s_martins
total sample size before apply threshold:  84
Counter({'0000-0002-9396-5

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(198, 651)
(0, 0)
(0, 0)
1
198
             precision    recall  f1-score   support

          0       0.50      1.00      0.67        78
          1       1.00      0.22      0.36        18
          2       1.00      0.87      0.93        23
          3       0.00      0.00      0.00        11
          4       1.00      0.59      0.75        32
          5       0.00      0.00      0.00        13
          6       0.00      0.00      0.00        23

avg / total       0.57      0.61      0.53       198

[78  0  0  0  0  0  0 14  4  0  0  0  0  0  3

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.64      1.00      0.78        78
          1       1.00      0.56      0.71        18
          2       1.00      0.91      0.95        23
          3       1.00      0.45      0.62        11
          4       1.00      0.69      0.81        32
          5       1.00      0.46      0.63        13
          6       0.92      0.52      0.67        23

avg / total       0.85      0.78      0.77       198

[78  0  0  0  0  0  0  8 10  0  0  0  0  0  2  0 21  0  0  0  0  6  0  0
  5  0  0  0 10  0  0  0 22  0  0  6  0  0  0  0  6  1 11  0  0  0  0  0
 12]
svc Accuracy:  0.7777777777777778
svc F1:  0.7415444565244317
             precision    recall  f1-score   support

          0       0.48      1.00      0.65        78
          1       1.00      0.17      0.29        18
          2       1.00      0.83      0.90        23
          3       0.00      0.00      0.00        11
          4       1.00      0.44      0.6

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        29
          1       1.00      0.83      0.91        12
          2       0.95      1.00      0.97        97
          3       1.00      0.79      0.88        14

avg / total       0.97      0.97      0.97       152

[29  0  0  0  0 10  2  0  0  0 97  0  0  0  3 11]
svc Accuracy:  0.9671052631578947
svc F1:  0.9409913202375514
             precision    recall  f1-score   support

          0       1.00      0.79      0.88        29
          1       1.00      0.83      0.91        12
          2       0.87      1.00      0.93        97
          3       1.00      0.50      0.67        14

avg / total       0.91      0.90      0.89       152

[23  0  6  0  0 10  2  0  0  0 97  0  0  0  7  7]
LR Accuracy:  0.9013157894736842
LR F1:  0.8471506563611827
For name:  m_sahin
total sample size before apply threshold:  48
Counter({'0000-0001-7044-2953': 41, '0000-0002-3490-6009': 3, '0000-000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.97      0.98        30
          1       0.90      1.00      0.95        26
          2       1.00      0.83      0.91        12
          3       1.00      1.00      1.00        12

avg / total       0.97      0.96      0.96        80

[29  1  0  0  0 26  0  0  0  2 10  0  0  0  0 12]
svc Accuracy:  0.9625
svc F1:  0.9593990755007704
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        30
          1       0.93      1.00      0.96        26
          2       1.00      0.83      0.91        12
          3       1.00      1.00      1.00        12

avg / total       0.98      0.97      0.97        80

[30  0  0  0  0 26  0  0  0  2 10  0  0  0  0 12]
LR Accuracy:  0.975
LR F1:  0.968013468013468
For name:  j_coutinho
total sample size before apply threshold:  129
Counter({'0000-0002-3841-743X': 105, '0000-0002-6303-9549': 13, '0000-0002-1562-0099': 8, '00

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        13
          1       0.89      1.00      0.94       105

avg / total       0.79      0.89      0.84       118

[  0  13   0 105]
LR Accuracy:  0.8898305084745762
LR F1:  0.47085201793721976
For name:  s_huber
total sample size before apply threshold:  44
Counter({'0000-0002-4125-159X': 26, '0000-0003-3558-351X': 12, '0000-0002-8271-7835': 3, '0000-0002-5842-5859': 2, '0000-0001-6303-5188': 1})
['0000-0003-3558-351X', '0000-0002-4125-159X']
Total sample size after apply threshold:  38
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, us

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


73
Counter({'0000-0003-3218-7001': 26, '0000-0001-9710-9835': 21, '0000-0003-2165-5519': 12, '0000-0002-4094-7982': 3, '0000-0002-5637-1041': 3, '0000-0001-6528-9034': 3, '0000-0003-4940-6522': 2, '0000-0003-0298-8246': 2, '0000-0001-8679-2886': 1})
['0000-0001-9710-9835', '0000-0003-2165-5519', '0000-0003-3218-7001']
Total sample size after apply threshold:  59
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(59, 108)
(0, 0)
(0, 0)
1
59
             precision    recall  f1-score   support

          0       1.00      0.86      0.92        21
          1       1.00      0.92      0.96        12
  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


 350
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(350, 1067)
(0, 0)
(0, 0)
1
350
             precision    recall  f1-score   support

          0       0.89      0.94      0.91        85
          1       1.00      0.19      0.32        16
          2       0.76      0.99      0.86       108
          3       0.00      0.00      0.00        13
          4       1.00      0.73      0.85        41
          5       0.98      0.98      0.98        87

avg / total       0.86      0.87      0.84       350

[ 80   0   4   0   0   1   5   3   8   0   0   0   0   0 107   0   0   1
   2   0  11   0   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.64      1.00      0.78       155
          1       1.00      0.41      0.58        44
          2       1.00      0.65      0.79        60
          3       1.00      0.44      0.62        36
          4       0.00      0.00      0.00        21

avg / total       0.76      0.72      0.68       316

[155   0   0   0   0  26  18   0   0   0  21   0  39   0   0  20   0   0
  16   0  21   0   0   0   0]
MNB Accuracy:  0.7215189873417721
MNB F1:  0.552560607383107
             precision    recall  f1-score   support

          0       0.74      1.00      0.85       155
          1       1.00      0.61      0.76        44
          2       1.00      0.72      0.83        60
          3       1.00      0.75      0.86        36
          4       1.00      0.48      0.65        21

avg / total       0.87      0.83      0.82       316

[155   0   0   0   0  17  27   0   0   0  17   0  43   0   0   9   0   0
  27   0  11   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       0.64      0.96      0.77        57
          2       1.00      0.13      0.24        15
          3       0.60      0.93      0.73        42
          4       1.00      0.28      0.44        25
          5       1.00      0.09      0.17        11

avg / total       0.70      0.65      0.56       161

[ 0  1  0 10  0  0  0 55  0  2  0  0  0  2  2 11  0  0  0  3  0 39  0  0
  0 15  0  3  7  0  0 10  0  0  0  1]
MNB Accuracy:  0.6459627329192547
MNB F1:  0.3896105860268862
             precision    recall  f1-score   support

          0       0.75      0.27      0.40        11
          1       0.78      0.95      0.86        57
          2       0.75      0.40      0.52        15
          3       0.79      0.88      0.83        42
          4       0.62      0.60      0.61        25
          5       1.00      0.82      0.90        11

avg / total       0.77     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.96      0.63      0.76        43
          1       1.00      0.67      0.80        21
          2       1.00      0.18      0.30        17
          3       1.00      0.83      0.91        30
          4       1.00      0.92      0.96        12
          5       0.88      0.87      0.87       124
          6       1.00      0.59      0.74        17
          7       1.00      0.39      0.56        18
          8       0.57      0.93      0.71        99

avg / total       0.85      0.78      0.77       381

[ 27   0   0   0   0   2   0   0  14   0  14   0   0   0   1   0   0   6
   0   0   3   0   0   1   0   0  13   0   0   0  25   0   1   0   0   4
   0   0   0   0  11   0   0   0   1   0   0   0   0   0 108   0   0  16
   0   0   0   0   0   2  10   0   5   1   0   0   0   0   1   0   7   9
   0   0   0   0   0   7   0   0  92]
svc Accuracy:  0.7795275590551181
svc F1:  0.7346483785326657
             precision

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(41, 278)
(0, 0)
(0, 0)
1
41
             precision    recall  f1-score   support

          0       1.00      0.20      0.33        10
          1       0.79      1.00      0.89        31

avg / total       0.84      0.80      0.75        41

[ 2  8  0 31]
MNB Accuracy:  0.8048780487804879
MNB F1:  0.6095238095238096
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        10
          1       0.97      1.00      0.98        31

avg / total       0.98      0.98      0.98        41

[ 9  1  0 31]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.80      0.89        10
          1       0.89      1.00      0.94        17

avg / total       0.93      0.93      0.92        27

[ 8  2  0 17]
svc Accuracy:  0.9259259259259259
svc F1:  0.9166666666666667
             precision    recall  f1-score   support

          0       1.00      0.60      0.75        10
          1       0.81      1.00      0.89        17

avg / total       0.88      0.85      0.84        27

[ 6  4  0 17]
LR Accuracy:  0.8518518518518519
LR F1:  0.8223684210526315
For name:  y_zhao
total sample size before apply threshold:  338
Counter({'0000-0003-1215-2565': 48, '0000-0002-7916-8687': 47, '0000-0001-6783-5182': 20, '0000-0002-9408-9979': 20, '0000-0002-6541-0612': 18, '0000-0002-5455-2586': 17, '0000-0002-2903-4218': 16, '0000-0003-0302-3470': 16, '0000-0002-6184-2530': 15, '0000-0003-1035-2272': 15, '0000-0002-6923-1099': 13, '0000-0002-1442-992X': 12, '0000-0001-6747-1665':

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.71      0.42      0.53        12
          1       0.85      0.61      0.71        18
          2       1.00      0.45      0.62        11
          3       1.00      0.95      0.97        20
          4       0.78      0.47      0.58        15
          5       0.75      0.50      0.60        12
          6       0.94      0.94      0.94        16
          7       0.93      0.82      0.87        17
          8       1.00      0.55      0.71        11
          9       1.00      0.31      0.47        13
         10       1.00      0.67      0.80        15
         11       0.54      0.94      0.68        48
         12       0.68      0.94      0.79        47
         13       0.69      0.69      0.69        16
         14       0.86      0.60      0.71        20

avg / total       0.79      0.74      0.73       291

[ 5  0  0  0  1  1  0  0  0  0  0  3  2  0  0  1 11  0  0  0  0  0  0  0
  0  0  5  1  0  0  0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       0.95      1.00      0.97        18

avg / total       0.97      0.97      0.97        29

[10  1  0 18]
svc Accuracy:  0.9655172413793104
svc F1:  0.9626769626769627
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       0.95      1.00      0.97        18

avg / total       0.97      0.97      0.97        29

[10  1  0 18]
LR Accuracy:  0.9655172413793104
LR F1:  0.9626769626769627
For name:  k_scott
total sample size before apply threshold:  16
Counter({'0000-0001-7263-6778': 11, '0000-0001-7952-0348': 3, '0000-0002-7066-887X': 1, '0000-0003-0345-5417': 1})
['0000-0001-7263-6778']
Total sample size after apply threshold:  11
For name:  a_martinez
total sample size before apply threshold:  180
Counter({'0000-0003-1643-6506': 64, '0000-0002-2707-8110': 56, '0000-0002-4804-6687': 20, '0000-0003-4

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        10
          1       1.00      0.24      0.38        17
          2       0.75      1.00      0.86        64
          3       1.00      0.89      0.94        56
          4       1.00      0.90      0.95        20

avg / total       0.91      0.87      0.86       167

[10  0  0  0  0  0  4 13  0  0  0  0 64  0  0  0  0  6 50  0  0  0  2  0
 18]
svc Accuracy:  0.874251497005988
svc F1:  0.8261554862209343
             precision    recall  f1-score   support

          0       1.00      0.40      0.57        10
          1       0.00      0.00      0.00        17
          2       0.64      1.00      0.78        64
          3       0.96      0.86      0.91        56
          4       1.00      0.65      0.79        20

avg / total       0.75      0.77      0.73       167

[ 4  0  6  0  0  0  0 15  2  0  0  0 64  0  0  0  0  8 48  0  0  0  7  0
 13]
LR Accuracy:  0.7724550898203593
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.18      0.31        11
          1       1.00      0.85      0.92        13
          2       1.00      0.76      0.86        41
          3       0.78      1.00      0.88        75

avg / total       0.88      0.85      0.83       140

[ 2  0  0  9  0 11  0  2  0  0 31 10  0  0  0 75]
svc Accuracy:  0.85
svc F1:  0.7406657669815565
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       1.00      0.54      0.70        13
          2       1.00      0.71      0.83        41
          3       0.72      1.00      0.84        75

avg / total       0.77      0.79      0.76       140

[ 0  0  0 11  0  7  0  6  0  0 29 12  0  0  0 75]
LR Accuracy:  0.7928571428571428
LR F1:  0.5916400638467677
For name:  l_you
total sample size before apply threshold:  32
Counter({'0000-0001-7304-0474': 12, '0000-0003-3058-2884': 12, '0000-0003-1162-0064': 7

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


119
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(119, 135)
(0, 0)
(0, 0)
1
119
             precision    recall  f1-score   support

          0       0.99      0.99      0.99        86
          1       0.97      0.97      0.97        33

avg / total       0.98      0.98      0.98       119

[85  1  1 32]
MNB Accuracy:  0.9831932773109243
MNB F1:  0.9790345313601128
             precision    recall  f1-score   support

          0       0.98      1.00      0.99        86
          1       1.00      0.94      0.97        33

avg / total       0.98      0.98      0.98       119

[86  0  2 31]
s

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.67      0.80        12
          1       0.91      1.00      0.96        43

avg / total       0.93      0.93      0.92        55

[ 8  4  0 43]
MNB Accuracy:  0.9272727272727272
MNB F1:  0.8777777777777778
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        12
          1       0.98      1.00      0.99        43

avg / total       0.98      0.98      0.98        55

[11  1  0 43]
svc Accuracy:  0.9818181818181818
svc F1:  0.9725137431284359
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        12
          1       0.88      1.00      0.93        43

avg / total       0.90      0.89      0.88        55

[ 6  6  0 43]
LR Accuracy:  0.8909090909090909
LR F1:  0.8007246376811594
For name:  m_amorim
total sample size before apply threshold:  95
Counter({'0000-0001-8137-3295': 55, '0000-0002-4159-4023': 20, 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.92      1.00      0.96       113
          1       1.00      0.76      0.86        42

avg / total       0.94      0.94      0.93       155

[113   0  10  32]
svc Accuracy:  0.9354838709677419
svc F1:  0.9112459917544663
             precision    recall  f1-score   support

          0       0.86      1.00      0.93       113
          1       1.00      0.57      0.73        42

avg / total       0.90      0.88      0.87       155

[113   0  18  24]
LR Accuracy:  0.8838709677419355
LR F1:  0.8267511177347243
For name:  w_he
total sample size before apply threshold:  48
Counter({'0000-0003-3254-1242': 20, '0000-0003-3137-8420': 16, '0000-0003-0161-3274': 7, '0000-0003-1236-3047': 5})
['0000-0003-3254-1242', '0000-0003-3137-8420']
Total sample size after apply threshold:  36
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(236, 546)
(0, 0)
(0, 0)
1
236
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        18
          1       0.92      0.99      0.95       218

avg / total       0.85      0.91      0.88       236

[  0  18   3 215]
MNB Accuracy:  0.9110169491525424
MNB F1:  0.47671840354767187


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        18
          1       0.92      1.00      0.96       218

avg / total       0.85      0.92      0.89       236

[  0  18   0 218]
svc Accuracy:  0.923728813559322
svc F1:  0.4801762114537445
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        18
          1       0.92      1.00      0.96       218

avg / total       0.85      0.92      0.89       236

[  0  18   0 218]
LR Accuracy:  0.923728813559322
LR F1:  0.4801762114537445
For name:  a_ward
total sample size before apply threshold:  164
Counter({'0000-0001-7945-7975': 92, '0000-0003-4102-8694': 40, '0000-0002-7000-2453': 10, '0000-0001-6948-4814': 9, '0000-0002-6376-0061': 6, '0000-0003-0038-9426': 4, '0000-0002-9774-8677': 2, '0000-0003-1321-3358': 1})
['0000-0001-7945-7975', '0000-0002-7000-2453', '0000-0003-4102-8694']
Total sample size after apply threshold:  142
TfidfVectori

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.90      0.87      0.89        31
          1       0.70      0.84      0.76        73
          2       1.00      0.76      0.86        33
          3       0.50      0.95      0.65        55
          4       1.00      0.93      0.96        28
          5       1.00      0.64      0.78        14
          6       1.00      0.76      0.87        17
          7       0.72      0.85      0.78        92
          8       0.00      0.00      0.00        11
          9       0.85      0.93      0.89        87
         10       1.00      1.00      1.00        16
         11       0.71      0.65      0.68        49
         12       0.67      0.33      0.44        12
         13       1.00      0.70      0.82        10
         14       0.86      0.50      0.63        12
         15       0.82      0.72      0.77        43
         16       1.00      0.89      0.94        18
         17       0.93      0.68      0.79   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.80      0.89        10
          1       1.00      0.27      0.42        15
          2       0.76      0.54      0.63        24
          3       0.81      1.00      0.89        85

avg / total       0.84      0.82      0.79       134

[ 8  0  1  1  0  4  3  8  0  0 13 11  0  0  0 85]
LR Accuracy:  0.8208955223880597
LR F1:  0.7097061760091286
For name:  c_henderson
total sample size before apply threshold:  107
Counter({'0000-0002-4764-639X': 97, '0000-0002-9936-3279': 6, '0000-0001-6954-7328': 2, '0000-0002-4020-0854': 2})
['0000-0002-4764-639X']
Total sample size after apply threshold:  97
For name:  j_mcdonald
total sample size before apply threshold:  21
Counter({'0000-0003-1955-6052': 7, '0000-0002-7494-1466': 7, '0000-0002-8317-0069': 4, '0000-0002-7953-1458': 1, '0000-0003-4115-7875': 1, '0000-0002-6328-3752': 1})
[]
Total sample size after apply threshold:  0
For name:  m_ismail
total sample s

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.80      0.71      0.75        17
          1       1.00      0.86      0.92        28
          2       1.00      0.85      0.92        13
          3       0.65      0.97      0.78       107
          4       1.00      0.65      0.79        17
          5       0.96      0.76      0.85        34
          6       1.00      0.53      0.70        15
          7       1.00      0.89      0.94        19
          8       1.00      0.84      0.92        32
          9       0.82      0.71      0.76        38
         10       0.67      0.22      0.33        18

avg / total       0.84      0.80      0.80       338

[ 12   0   0   3   0   0   0   0   0   1   1   0  24   0   2   0   0   0
   0   0   2   0   0   0  11   1   0   0   0   0   0   1   0   2   0   0
 104   0   0   0   0   0   1   0   0   0   0   6  11   0   0   0   0   0
   0   0   0   0   8   0  26   0   0   0   0   0   0   0   0   7   0   0
   8   0   0   0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.24      0.38        17
          1       1.00      0.56      0.72        16
          2       1.00      0.18      0.31        11
          3       1.00      0.50      0.67        16
          4       1.00      1.00      1.00        13
          5       1.00      0.95      0.98        22
          6       0.94      1.00      0.97        30
          7       0.46      1.00      0.63        31

avg / total       0.88      0.76      0.74       156

[ 4  0  0  0  0  0  1 12  0  9  0  0  0  0  1  6  0  0  2  0  0  0  0  9
  0  0  0  8  0  0  0  8  0  0  0  0 13  0  0  0  0  0  0  0  0 21  0  1
  0  0  0  0  0  0 30  0  0  0  0  0  0  0  0 31]
MNB Accuracy:  0.7564102564102564
MNB F1:  0.7065563172582785
             precision    recall  f1-score   support

          0       0.92      0.71      0.80        17
          1       1.00      0.62      0.77        16
          2       0.88      0.64      0.74       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.41      0.58        22
          1       0.00      0.00      0.00        13
          2       0.57      1.00      0.73        63
          3       1.00      0.16      0.27        19
          4       1.00      0.62      0.76        13

avg / total       0.69      0.64      0.57       130

[ 9  0 13  0  0  0  0 13  0  0  0  0 63  0  0  0  0 16  3  0  0  0  5  0
  8]
MNB Accuracy:  0.6384615384615384
MNB F1:  0.46872017906886454
             precision    recall  f1-score   support

          0       1.00      0.59      0.74        22
          1       1.00      0.15      0.27        13
          2       0.68      1.00      0.81        63
          3       1.00      0.53      0.69        19
          4       1.00      0.92      0.96        13

avg / total       0.84      0.77      0.74       130

[13  0  9  0  0  0  2 11  0  0  0  0 63  0  0  0  0  9 10  0  0  0  1  0
 12]
svc Accuracy:  0.7692307692307693

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.08      0.15        12
          1       0.83      1.00      0.91        55

avg / total       0.86      0.84      0.77        67

[ 1 11  0 55]
MNB Accuracy:  0.835820895522388
MNB F1:  0.5314685314685315
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        12
          1       0.98      1.00      0.99        55

avg / total       0.99      0.99      0.98        67

[11  1  0 55]
svc Accuracy:  0.9850746268656716
svc F1:  0.9737563650607128
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.82      1.00      0.90        55

avg / total       0.67      0.82      0.74        67

[ 0 12  0 55]
LR Accuracy:  0.8208955223880597
LR F1:  0.4508196721311476
For name:  s_howell
total sample size before apply threshold:  31
Counter({'0000-0001-8141-6515': 28, '0000-0001-8184-0324': 1, '0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.62      0.77        16
          1       1.00      0.77      0.87        13
          2       0.62      1.00      0.77        15

avg / total       0.87      0.80      0.80        44

[10  0  6  0 10  3  0  0 15]
LR Accuracy:  0.7954545454545454
LR F1:  0.802675585284281
For name:  s_morris
total sample size before apply threshold:  33
Counter({'0000-0003-2551-9717': 14, '0000-0002-5334-5809': 11, '0000-0002-7023-8634': 4, '0000-0002-8056-0934': 2, '0000-0003-4866-110X': 2})
['0000-0002-5334-5809', '0000-0003-2551-9717']
Total sample size after apply threshold:  25
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(376, 1096)
(0, 0)
(0, 0)
1
376
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        28
          1       1.00      0.42      0.59        57
          2       0.00      0.00      0.00        10
          3       0.00      0.00      0.00        14
          4       1.00      0.71      0.83        21
          5       0.95      0.88      0.91        81
          6       1.00      0.83      0.91        48
          7       0.56      0.99      0.71       117

avg / total       0.79      0.74      0.72   

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.79      0.88        28
          1       0.83      0.61      0.71        57
          2       1.00      0.40      0.57        10
          3       1.00      0.21      0.35        14
          4       1.00      0.95      0.98        21
          5       0.97      0.80      0.88        81
          6       1.00      0.77      0.87        48
          7       0.62      0.97      0.76       117

avg / total       0.85      0.80      0.79       376

[ 22   0   0   0   0   0   0   6   0  35   0   0   0   1   0  21   0   1
   4   0   0   0   0   5   0   1   0   3   0   0   0  10   0   0   0   0
  20   0   0   1   0   1   0   0   0  65   0  15   0   1   0   0   0   0
  37  10   0   3   0   0   0   1   0 113]
svc Accuracy:  0.7952127659574468
svc F1:  0.7493007608106113
             precision    recall  f1-score   support

          0       1.00      0.61      0.76        28
          1       1.00      0.39     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(258, 768)
(0, 0)
(0, 0)
1
258
             precision    recall  f1-score   support

          0       1.00      0.45      0.62        47
          1       0.00      0.00      0.00        11
          2       0.67      1.00      0.80       115
          3       0.98      0.89      0.93        71
          4       1.00      0.14      0.25        14

avg / total       0.81      0.78      0.74       258

[ 21   0  25   1   0   0   0  11   0   0   0   0 115   0   0   0   0   8
  63   0   0   0  12   0   2]
MNB Accuracy:  0.7790697674418605
MNB F1:  0.5210352392705333
             precision    recall  f1-score   support

          0       1.00      0.74      0.85        47
          1       1.00      0.64      0.78        11
          2       0.79      1.00      0.88       115
          3       1.00      0.87      0.93        71
          4       1.00      0.64      0.78        14

avg / total       0.91      0.88      0.88       258

[ 35   0  12   0   0   0   7   4   0   0   0   0 115   0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



For name:  m_fischer
total sample size before apply threshold:  48
Counter({'0000-0002-3429-1876': 10, '0000-0001-5133-1537': 9, '0000-0002-9429-0859': 8, '0000-0002-4014-3626': 8, '0000-0002-1888-1809': 7, '0000-0002-1885-0535': 3, '0000-0002-7826-9726': 2, '0000-0003-0810-6064': 1})
['0000-0002-3429-1876']
Total sample size after apply threshold:  10
For name:  y_zeng
total sample size before apply threshold:  26
Counter({'0000-0001-7483-5017': 20, '0000-0002-5310-0473': 3, '0000-0002-6164-5502': 1, '0000-0003-1193-3335': 1, '0000-0002-4237-6669': 1})
['0000-0001-7483-5017']
Total sample size after apply threshold:  20
For name:  j_turner
total sample size before apply threshold:  178
Counter({'0000-0003-0076-8434': 78, '0000-0002-2760-1071': 26, '0000-0003-2427-1430': 23, '0000-0002-7258-1639': 17, '0000-0003-4106-6295': 14, '0000-0002-0023-4275': 13, '0000-0001-7311-0313': 4, '0000-0003-0286-8949': 1, '0000-0002-4327-9385': 1, '0000-0003-0793-4159': 1})
['0000-0002-7258-1639', '00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  86
Counter({'0000-0001-8950-1036': 28, '0000-0002-9381-3320': 21, '0000-0003-3317-8756': 13, '0000-0003-4392-4644': 7, '0000-0003-3421-7833': 4, '0000-0003-2100-0280': 4, '0000-0002-1937-6548': 4, '0000-0002-9602-2452': 3, '0000-0001-9718-3867': 1, '0000-0002-8132-0625': 1})
['0000-0003-3317-8756', '0000-0002-9381-3320', '0000-0001-8950-1036']
Total sample size after apply threshold:  62
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(62, 222)
(0, 0)
(0, 0)
1
62
             precision    recall  f1-score   support

          0       1.00      0.46      0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.70      0.96      0.81        47
          1       0.95      0.80      0.87        45
          2       0.00      0.00      0.00        10

avg / total       0.74      0.79      0.76       102

[45  2  0  9 36  0 10  0  0]
MNB Accuracy:  0.7941176470588235
MNB F1:  0.5594268967762943
             precision    recall  f1-score   support

          0       0.79      0.94      0.85        47
          1       0.87      0.89      0.88        45
          2       0.00      0.00      0.00        10

avg / total       0.75      0.82      0.78       102

[44  3  0  5 40  0  7  3  0]
svc Accuracy:  0.8235294117647058
svc F1:  0.577829937053238
             precision    recall  f1-score   support

          0       0.77      0.87      0.82        47
          1       0.82      0.89      0.85        45
          2       0.00      0.00      0.00        10

avg / total       0.72      0.79      0.75       102

[41  6  0  5 40

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      0.73      0.82        22
          1       0.95      0.97      0.96        39
          2       1.00      1.00      1.00        11
          3       0.89      0.98      0.93        55
          4       1.00      0.89      0.94        18

avg / total       0.93      0.93      0.93       145

[16  0  0  6  0  0 38  0  1  0  0  0 11  0  0  1  0  0 54  0  0  2  0  0
 16]
svc Accuracy:  0.9310344827586207
svc F1:  0.9309498180630745
             precision    recall  f1-score   support

          0       0.93      0.64      0.76        22
          1       0.86      0.95      0.90        39
          2       1.00      0.36      0.53        11
          3       0.75      1.00      0.86        55
          4       1.00      0.56      0.71        18

avg / total       0.86      0.83      0.81       145

[14  0  0  8  0  0 37  0  2  0  0  2  4  5  0  0  0  0 55  0  1  4  0  3
 10]
LR Accuracy:  0.8275862068965517
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



             precision    recall  f1-score   support

          0       1.00      0.43      0.60        21
          1       0.71      1.00      0.83        49
          2       1.00      0.47      0.64        15

avg / total       0.83      0.76      0.74        85

[ 9 12  0  0 49  0  0  8  7]
LR Accuracy:  0.7647058823529411
LR F1:  0.6889573703133025
For name:  c_correia
total sample size before apply threshold:  55
Counter({'0000-0001-5564-6675': 20, '0000-0001-5481-2010': 13, '0000-0002-4979-3254': 9, '0000-0002-6996-0734': 6, '0000-0003-2482-7873': 5, '0000-0002-0527-3206': 2})
['0000-0001-5564-6675', '0000-0001-5481-2010']
Total sample size after apply threshold:  33
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_ac

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(46, 103)
(0, 0)
(0, 0)
1
46
             precision    recall  f1-score   support

          0       1.00      0.42      0.59        12
          1       0.83      1.00      0.91        34

avg / total       0.87      0.85      0.82        46

[ 5  7  0 34]
MNB Accuracy:  0.8478260869565217
MNB F1:  0.747450980392157
             precision    recall  f1-score   support

          0       1.00      0.42      0.59        12
          1       0.83      1.00      0.91        34

avg / total       0.87      0.85      0.82        46

[ 5  7  0 34]
svc Accu

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.92      1.00      0.96        55
          1       1.00      1.00      1.00        17
          2       1.00      0.64      0.78        14

avg / total       0.95      0.94      0.94        86

[55  0  0  0 17  0  5  0  9]
svc Accuracy:  0.9418604651162791
svc F1:  0.9130434782608696
             precision    recall  f1-score   support

          0       0.69      1.00      0.81        55
          1       1.00      0.35      0.52        17
          2       0.00      0.00      0.00        14

avg / total       0.64      0.71      0.62        86

[55  0  0 11  6  0 14  0  0]
LR Accuracy:  0.7093023255813954
LR F1:  0.4455179817498658
For name:  s_yoon
total sample size before apply threshold:  73
Counter({'0000-0002-8556-423X': 27, '0000-0003-3487-6863': 16, '0000-0001-8904-0292': 15, '0000-0003-1787-7282': 8, '0000-0003-1868-1054': 1, '0000-0001-7263-8036': 1, '0000-0001-8323-6462': 1, '0000-0002-5330-8784': 1, 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(38, 80)
(0, 0)
(0, 0)
1
38
             precision    recall  f1-score   support

          0       0.80      1.00      0.89        16
          1       1.00      1.00      1.00        12
          2       1.00      0.60      0.75        10

avg / total       0.92      0.89      0.89        38

[16  0  0  0 12  0  4  0  6]
MNB Accuracy:  0.8947368421052632
MNB F1:  0.8796296296296297
             precision    recall  f1-score   support

          0       0.84      1.00      0.91        16
          1       1.00      0.83      0.91        12
        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.84      0.93      0.88        40
          1       0.93      0.78      0.85        18
          2       0.88      0.91      0.89        46
          3       1.00      0.92      0.96        12
          4       1.00      0.89      0.94        18

avg / total       0.90      0.90      0.90       134

[37  0  3  0  0  3 14  1  0  0  3  1 42  0  0  1  0  0 11  0  0  0  2  0
 16]
svc Accuracy:  0.8955223880597015
svc F1:  0.9041504920864991
             precision    recall  f1-score   support

          0       0.78      0.88      0.82        40
          1       0.93      0.78      0.85        18
          2       0.84      0.93      0.89        46
          3       1.00      0.92      0.96        12
          4       1.00      0.67      0.80        18

avg / total       0.87      0.86      0.86       134

[35  0  5  0  0  3 14  1  0  0  2  1 43  0  0  1  0  0 11  0  4  0  2  0
 12]
LR Accuracy:  0.8582089552238806
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.46      0.63        13
          1       1.00      0.94      0.97        17
          2       0.72      1.00      0.84        38
          3       1.00      0.65      0.79        20

avg / total       0.88      0.83      0.82        88

[ 6  0  7  0  0 16  1  0  0  0 38  0  0  0  7 13]
svc Accuracy:  0.8295454545454546
svc F1:  0.8060798850272535
             precision    recall  f1-score   support

          0       1.00      0.31      0.47        13
          1       1.00      0.71      0.83        17
          2       0.62      1.00      0.77        38
          3       1.00      0.55      0.71        20

avg / total       0.84      0.74      0.72        88

[ 4  0  9  0  0 12  5  0  0  0 38  0  0  0  9 11]
LR Accuracy:  0.7386363636363636
LR F1:  0.6938821573055689
For name:  j_yue
total sample size before apply threshold:  62
Counter({'0000-0001-9694-7722': 25, '0000-0001-6384-5447': 24, '0000-0002

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(61, 168)
(0, 0)
(0, 0)
1
61
             precision    recall  f1-score   support

          0       0.86      0.96      0.91        25
          1       0.95      1.00      0.98        21
          2       1.00      0.73      0.85        15

avg / total       0.93      0.92      0.92        61

[24  1  0  0 21  0  4  0 11]
MNB Accuracy:  0.9180327868852459
MNB F1:  0.9095194698529493
             precision    recall  f1-score   support

          0       0.89      1.00      0.94        25
          1       1.00      1.00      1.00        21
        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.67      0.80        15
          1       1.00      0.75      0.86        16
          2       0.71      1.00      0.83        25
          3       1.00      0.90      0.95        10
          4       1.00      1.00      1.00        11

avg / total       0.91      0.87      0.87        77

[10  0  5  0  0  0 12  4  0  0  0  0 25  0  0  0  0  1  9  0  0  0  0  0
 11]
svc Accuracy:  0.8701298701298701
svc F1:  0.8875689223057645
             precision    recall  f1-score   support

          0       1.00      0.53      0.70        15
          1       1.00      0.56      0.72        16
          2       0.56      1.00      0.71        25
          3       1.00      0.40      0.57        10
          4       1.00      1.00      1.00        11

avg / total       0.86      0.74      0.73        77

[ 8  0  7  0  0  0  9  7  0  0  0  0 25  0  0  0  0  6  4  0  0  0  0  0
 11]
LR Accuracy:  0.7402597402597403
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.46      0.63        13
          1       0.76      0.96      0.85        27
          2       1.00      0.80      0.89        15
          3       0.76      0.83      0.79        23
          4       0.86      0.90      0.88        20

avg / total       0.85      0.83      0.82        98

[ 6  3  0  2  2  0 26  0  0  1  0  0 12  3  0  0  4  0 19  0  0  1  0  1
 18]
LR Accuracy:  0.826530612244898
LR F1:  0.8085284599610448
For name:  p_li
total sample size before apply threshold:  118
Counter({'0000-0002-5715-548X': 20, '0000-0001-9602-9550': 18, '0000-0001-9098-7598': 14, '0000-0002-5876-2177': 9, '0000-0001-5836-1069': 9, '0000-0002-2572-5935': 7, '0000-0001-9339-3111': 7, '0000-0002-4273-4577': 7, '0000-0002-4684-4909': 6, '0000-0001-8771-3369': 5, '0000-0001-7960-1025': 4, '0000-0002-5192-8509': 4, '0000-0001-5761-9435': 3, '0000-0001-7603-7852': 2, '0000-0002-9330-5713': 1, '0000-0002-7112-9974': 1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.73      0.84        11
          1       0.89      0.44      0.59        18
          2       0.75      0.97      0.85        40

avg / total       0.83      0.80      0.78        69

[ 8  0  3  0  8 10  0  1 39]
svc Accuracy:  0.7971014492753623
svc F1:  0.7608413142356696
             precision    recall  f1-score   support

          0       1.00      0.18      0.31        11
          1       1.00      0.11      0.20        18
          2       0.62      1.00      0.76        40

avg / total       0.78      0.64      0.54        69

[ 2  0  9  0  2 16  0  0 40]
LR Accuracy:  0.6376811594202898
LR F1:  0.4231990231990232
For name:  h_gomes
total sample size before apply threshold:  11
Counter({'0000-0003-1131-7604': 7, '0000-0001-6898-2408': 2, '0000-0003-3664-4740': 1, '0000-0002-6222-9180': 1})
[]
Total sample size after apply threshold:  0
For name:  m_matos
total sample size before apply threshol

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  182
Counter({'0000-0002-1059-9681': 79, '0000-0003-3670-8505': 36, '0000-0001-5304-2026': 23, '0000-0003-4563-3744': 22, '0000-0001-9149-260X': 11, '0000-0002-0582-3693': 7, '0000-0002-9454-8768': 3, '0000-0002-6057-452X': 1})
['0000-0001-9149-260X', '0000-0003-3670-8505', '0000-0001-5304-2026', '0000-0003-4563-3744', '0000-0002-1059-9681']
Total sample size after apply threshold:  171
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(171, 1543)
(0, 0)
(0, 0)
1
171
             precision    recall  f1-score   support

          0       0.00      0.00      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.55      0.71        11
          1       0.97      0.81      0.88        36
          2       1.00      0.70      0.82        23
          3       1.00      1.00      1.00        22
          4       0.80      0.99      0.89        79

avg / total       0.90      0.88      0.88       171

[ 6  0  0  0  5  0 29  0  0  7  0  0 16  0  7  0  0  0 22  0  0  1  0  0
 78]
svc Accuracy:  0.8830409356725146
svc F1:  0.8583093377211025
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       0.96      0.72      0.83        36
          2       1.00      0.13      0.23        23
          3       1.00      0.86      0.93        22
          4       0.64      0.99      0.78        79

avg / total       0.76      0.74      0.68       171

[ 0  0  0  0 11  0 26  0  0 10  0  0  3  0 20  0  0  0 19  3  0  1  0  0
 78]
LR Accuracy:  0.7368421052631579
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.76      1.00      0.86        48
          1       1.00      0.38      0.55        24

avg / total       0.84      0.79      0.76        72

[48  0 15  9]
LR Accuracy:  0.7916666666666666
LR F1:  0.7051597051597052
For name:  j_franco
total sample size before apply threshold:  85
Counter({'0000-0002-3874-8618': 46, '0000-0001-9255-8084': 16, '0000-0002-0898-3510': 13, '0000-0002-3165-394X': 9, '0000-0002-8249-5224': 1})
['0000-0001-9255-8084', '0000-0002-3874-8618', '0000-0002-0898-3510']
Total sample size after apply threshold:  75
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.66      0.79        29
          1       0.83      1.00      0.91       107
          2       1.00      0.92      0.96        12
          3       1.00      0.74      0.85        42

avg / total       0.90      0.88      0.88       190

[ 19  10   0   0   0 107   0   0   0   1  11   0   0  11   0  31]
svc Accuracy:  0.8842105263157894
svc F1:  0.8760707838268003
             precision    recall  f1-score   support

          0       1.00      0.24      0.39        29
          1       0.66      1.00      0.80       107
          2       1.00      0.25      0.40        12
          3       1.00      0.45      0.62        42

avg / total       0.81      0.72      0.67       190

[  7  22   0   0   0 107   0   0   0   9   3   0   0  23   0  19]
LR Accuracy:  0.7157894736842105
LR F1:  0.5525867928118968
For name:  a_gordon
total sample size before apply threshold:  126
Counter({'0000-0003-1676-9853': 36, '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      0.92      0.93        36
          1       0.83      0.76      0.79        25
          2       0.72      0.90      0.80        29
          3       1.00      0.85      0.92        27

avg / total       0.88      0.86      0.87       117

[33  0  3  0  2 19  4  0  0  3 26  0  0  1  3 23]
svc Accuracy:  0.8632478632478633
svc F1:  0.8603110328638497
             precision    recall  f1-score   support

          0       0.72      1.00      0.84        36
          1       1.00      0.72      0.84        25
          2       0.96      0.86      0.91        29
          3       1.00      0.85      0.92        27

avg / total       0.90      0.87      0.87       117

[36  0  0  0  7 18  0  0  4  0 25  0  3  0  1 23]
LR Accuracy:  0.8717948717948718
LR F1:  0.8758773784355179
For name:  z_yin
total sample size before apply threshold:  52
Counter({'0000-0003-1752-644X': 12, '0000-0002-3547-0606': 12, '0000-0002

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(66, 52)
(0, 0)
(0, 0)
1
66
             precision    recall  f1-score   support

          0       1.00      0.31      0.47        13
          1       0.85      1.00      0.92        53

avg / total       0.88      0.86      0.83        66

[ 4  9  0 53]
MNB Accuracy:  0.8636363636363636
MNB F1:  0.6961636828644502
             precision    recall  f1-score   support

          0       1.00      0.77      0.87        13
          1       0.95      1.00      0.97        53

avg / total       0.96      0.95      0.95        66

[10  3  0 53]
svc Accu

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        20
          1       1.00      1.00      1.00        12

avg / total       1.00      1.00      1.00        32

[20  0  0 12]
MNB Accuracy:  1.0
MNB F1:  1.0
             precision    recall  f1-score   support

          0       0.95      1.00      0.98        20
          1       1.00      0.92      0.96        12

avg / total       0.97      0.97      0.97        32

[20  0  1 11]
svc Accuracy:  0.96875
svc F1:  0.9660657476139979
             precision    recall  f1-score   support

          0       0.83      1.00      0.91        20
          1       1.00      0.67      0.80        12

avg / total       0.90      0.88      0.87        32

[20  0  4  8]
LR Accuracy:  0.875
LR F1:  0.8545454545454545
For name:  s_phillips
total sample size before apply threshold:  183
Counter({'0000-0002-1956-4098': 138, '0000-0002-5694-0670': 20, '0000-0002-2549-8111': 11, '0000-0001-7157-4122': 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  15
Counter({'0000-0003-2544-2705': 7, '0000-0002-7543-575X': 4, '0000-0001-7312-4299': 3, '0000-0001-5886-0650': 1})
[]
Total sample size after apply threshold:  0
For name:  a_lau
total sample size before apply threshold:  35
Counter({'0000-0002-5933-9290': 21, '0000-0002-6489-204X': 8, '0000-0003-3802-828X': 4, '0000-0002-7338-7176': 2})
['0000-0002-5933-9290']
Total sample size after apply threshold:  21
For name:  j_berg
total sample size before apply threshold:  171
Counter({'0000-0003-0157-5888': 86, '0000-0003-3022-0963': 66, '0000-0003-2360-2664': 11, '0000-0003-2126-6476': 4, '0000-0001-8583-6349': 2, '0000-0001-7947-5073': 2})
['0000-0003-3022-0963', '0000-0003-2360-2664', '0000-0003-0157-5888']
Total sample size after apply threshold:  163
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None,

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.74      0.98      0.84        66
          1       1.00      0.18      0.31        11
          2       0.99      0.84      0.91        86

avg / total       0.89      0.85      0.84       163

[65  0  1  9  2  0 14  0 72]
svc Accuracy:  0.852760736196319
svc F1:  0.6858361764022142
             precision    recall  f1-score   support

          0       0.73      0.67      0.70        66
          1       0.00      0.00      0.00        11
          2       0.74      0.88      0.80        86

avg / total       0.69      0.74      0.71       163

[44  0 22  6  0  5 10  0 76]
LR Accuracy:  0.7361963190184049
LR F1:  0.5008818342151676
For name:  l_wilson
total sample size before apply threshold:  59
Counter({'0000-0001-8709-8968': 18, '0000-0003-4175-7125': 11, '0000-0001-6659-6001': 11, '0000-0002-3779-8277': 11, '0000-0002-3532-0309': 5, '0000-0002-8333-5660': 3})
['0000-0001-8709-8968', '0000-0003-4175-7125', '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(346, 469)
(0, 0)
(0, 0)
1
346
             precision    recall  f1-score   support

          0       0.97      0.83      0.89        35
          1       0.00      0.00      0.00        10
          2       0.00      0.00      0.00        16
          3       0.00      0.00      0.00        11
          4       0.93      0.81      0.87        69
          5       0.52      1.00      0.69       106
          6       1.00      0.40      0.57        40
          7       1.00      0.64      0.78        14
          8       1.00      0.64      0.78     

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.97      0.89      0.93        35
          1       1.00      0.90      0.95        10
          2       1.00      0.62      0.77        16
          3       1.00      0.36      0.53        11
          4       0.91      0.84      0.87        69
          5       0.75      0.95      0.84       106
          6       0.92      0.82      0.87        40
          7       1.00      0.79      0.88        14
          8       0.84      0.84      0.84        45

avg / total       0.87      0.85      0.85       346

[ 31   0   0   0   0   1   0   0   3   0   9   0   0   0   1   0   0   0
   0   0  10   0   0   3   2   0   1   0   0   0   4   1   5   0   0   1
   0   0   0   0  58  10   1   0   0   0   0   0   0   3 101   0   0   2
   0   0   0   0   0   7  33   0   0   0   0   0   0   0   3   0  11   0
   1   0   0   0   2   4   0   0  38]
svc Accuracy:  0.8526011560693642
svc F1:  0.8309473200008728
             precision

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(333, 1474)
(0, 0)
(0, 0)
1
333
             precision    recall  f1-score   support

          0       0.64      0.94      0.76        95
          1       1.00      0.59      0.74        46
          2       0.00      0.00      0.00        13
          3       1.00      0.86      0.93        74
          4       0.83      0.91      0.86        95
          5       0.00      0.00      0.00        10

avg / total       0.78      0.80      0.77       333

[89  0  0  0  6  0 13 27  0  0  6  0 10  0  0  0  3  0  8  0  0 64  2  0
  9  0  0  0 86  0  9  0  0  0  1  0]
MNB Accuracy:  0.7987987987987988
MNB F1:  0.5492553941959328
             precision    recall  f1-score   support

          0       0.66      0.99      0.79        95
          1       1.00      0.67      0.81        46
          2       1.00      0.46      0.63        13
          3       1.00      0.84      0.91        74
          4       0.99      0.86      0.92        95
          5       1.00      0.90      0.95       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.91      1.00      0.95        87
          1       1.00      0.84      0.91        55

avg / total       0.94      0.94      0.94       142

[87  0  9 46]
MNB Accuracy:  0.9366197183098591
MNB F1:  0.9308553806200291
             precision    recall  f1-score   support

          0       0.91      0.99      0.95        87
          1       0.98      0.85      0.91        55

avg / total       0.94      0.94      0.94       142

[86  1  8 47]
svc Accuracy:  0.9366197183098591
svc F1:  0.9314488011586118
             precision    recall  f1-score   support

          0       0.90      1.00      0.95        87
          1       1.00      0.82      0.90        55

avg / total       0.94      0.93      0.93       142

[87  0 10 45]
LR Accuracy:  0.9295774647887324
LR F1:  0.9228260869565218
For name:  s_henderson
total sample size before apply threshold:  82
Counter({'0000-0002-1076-3867': 52, '0000-0002-9032-3828': 2

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.81      1.00      0.90        44
          1       1.00      0.67      0.80        15
          2       1.00      0.55      0.71        11

avg / total       0.88      0.86      0.85        70

[44  0  0  5 10  0  5  0  6]
MNB Accuracy:  0.8571428571428571
MNB F1:  0.801280512204882
             precision    recall  f1-score   support

          0       0.88      1.00      0.94        44
          1       1.00      0.73      0.85        15
          2       1.00      0.82      0.90        11

avg / total       0.92      0.91      0.91        70

[44  0  0  4 11  0  2  0  9]
svc Accuracy:  0.9142857142857143
svc F1:  0.8941080196399346
             precision    recall  f1-score   support

          0       0.75      1.00      0.85        44
          1       1.00      0.40      0.57        15
          2       1.00      0.45      0.62        11

avg / total       0.84      0.79      0.76        70

[44  0  0  9  6

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.91      0.98      0.94        41
          1       0.98      0.94      0.96        65

avg / total       0.95      0.95      0.95       106

[40  1  4 61]
svc Accuracy:  0.9528301886792453
svc F1:  0.9509031959240388
             precision    recall  f1-score   support

          0       1.00      0.73      0.85        41
          1       0.86      1.00      0.92        65

avg / total       0.91      0.90      0.89       106

[30 11  0 65]
LR Accuracy:  0.8962264150943396
LR F1:  0.8835281190690241
For name:  z_xie
total sample size before apply threshold:  99
Counter({'0000-0003-2974-1825': 48, '0000-0001-5816-6159': 17, '0000-0002-8348-4455': 16, '0000-0002-1539-5100': 8, '0000-0002-4526-9746': 6, '0000-0003-0308-5233': 1, '0000-0002-3137-561X': 1, '0000-0003-2492-0592': 1, '0000-0002-6600-8190': 1})
['0000-0003-2974-1825', '0000-0002-8348-4455', '0000-0001-5816-6159']
Total sample size after apply threshold:

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.87      0.98      0.92        48
          1       1.00      0.94      0.97        16
          2       0.92      0.65      0.76        17

avg / total       0.91      0.90      0.90        81

[47  0  1  1 15  0  6  0 11]
svc Accuracy:  0.9012345679012346
svc F1:  0.8826437508633412
             precision    recall  f1-score   support

          0       0.71      1.00      0.83        48
          1       1.00      0.50      0.67        16
          2       1.00      0.29      0.45        17

avg / total       0.83      0.75      0.72        81

[48  0  0  8  8  0 12  0  5]
LR Accuracy:  0.7530864197530864
LR F1:  0.6495994427028909
For name:  m_wright
total sample size before apply threshold:  379
Counter({'0000-0001-7133-4970': 213, '0000-0002-0541-7556': 87, '0000-0002-2650-2426': 25, '0000-0001-8036-1161': 17, '0000-0003-2731-4707': 15, '0000-0002-9348-8740': 13, '0000-0001-7121-504X': 6, '0000-0001-5522-779

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.06      0.11        17
          1       0.76      1.00      0.87       213
          2       1.00      0.08      0.15        25
          3       0.00      0.00      0.00        13
          4       0.94      0.94      0.94        87
          5       1.00      0.07      0.12        15

avg / total       0.82      0.81      0.74       370

[  1  16   0   0   0   0   0 213   0   0   0   0   0  18   2   0   5   0
   0  13   0   0   0   0   0   5   0   0  82   0   0  14   0   0   0   1]
MNB Accuracy:  0.8081081081081081
MNB F1:  0.365440275571338
             precision    recall  f1-score   support

          0       1.00      0.47      0.64        17
          1       0.88      1.00      0.94       213
          2       1.00      0.56      0.72        25
          3       1.00      0.77      0.87        13
          4       1.00      0.97      0.98        87
          5       1.00      0.80      0.89    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.62      0.70      0.65        23
          1       1.00      0.96      0.98        26
          2       0.85      0.97      0.91        70
          3       0.71      0.82      0.76        33
          4       0.80      0.36      0.50        11
          5       1.00      0.54      0.70        13
          6       0.94      0.84      0.89        19
          7       1.00      0.70      0.82        10

avg / total       0.84      0.83      0.82       205

[16  0  1  6  0  0  0  0  0 25  1  0  0  0  0  0  1  0 68  0  0  0  1  0
  5  0  0 27  1  0  0  0  2  0  0  5  4  0  0  0  1  0  5  0  0  7  0  0
  0  0  3  0  0  0 16  0  1  0  2  0  0  0  0  7]
svc Accuracy:  0.8292682926829268
svc F1:  0.7766377161193116
             precision    recall  f1-score   support

          0       0.59      0.57      0.58        23
          1       1.00      0.81      0.89        26
          2       0.66      0.99      0.79       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.93      0.96       112
          1       0.98      0.80      0.88        80
          2       0.84      0.99      0.91       180
          3       1.00      0.53      0.69        19

avg / total       0.93      0.91      0.91       391

[104   0   8   0   0  64  16   0   0   1 179   0   0   0   9  10]
MNB Accuracy:  0.9130434782608695
MNB F1:  0.8621605155472151
             precision    recall  f1-score   support

          0       1.00      0.87      0.93       112
          1       1.00      0.75      0.86        80
          2       0.84      1.00      0.91       180
          3       1.00      1.00      1.00        19

avg / total       0.93      0.91      0.91       391

[ 97   0  15   0   0  60  20   0   0   0 180   0   0   0   0  19]
svc Accuracy:  0.9104859335038363
svc F1:  0.9241912318194796
             precision    recall  f1-score   support

          0       1.00      0.74      0.85      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.91      1.00      0.96        32
          1       1.00      0.94      0.97        33
          2       1.00      0.93      0.96        14

avg / total       0.97      0.96      0.96        79

[32  0  0  2 31  0  1  0 13]
svc Accuracy:  0.9620253164556962
svc F1:  0.9623122811866592
             precision    recall  f1-score   support

          0       0.86      1.00      0.93        32
          1       0.97      0.94      0.95        33
          2       1.00      0.71      0.83        14

avg / total       0.93      0.92      0.92        79

[32  0  0  2 31  0  3  1 10]
LR Accuracy:  0.9240506329113924
LR F1:  0.9049052396878484
For name:  y_su
total sample size before apply threshold:  190
Counter({'0000-0002-1771-9017': 83, '0000-0002-5390-4113': 24, '0000-0003-3537-6246': 23, '0000-0003-3398-6294': 17, '0000-0001-8434-1758': 15, '0000-0003-2660-9183': 10, '0000-0003-2193-5473': 5, '0000-0002-4293-5037': 2

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.59      0.67      0.62        15
          1       1.00      1.00      1.00        17
          2       0.88      0.58      0.70        24
          3       1.00      0.91      0.95        23
          4       0.82      0.98      0.89        83
          5       1.00      0.20      0.33        10

avg / total       0.86      0.84      0.83       172

[10  0  0  0  5  0  0 17  0  0  0  0  1  0 14  0  9  0  0  0  0 21  2  0
  0  0  2  0 81  0  6  0  0  0  2  2]
svc Accuracy:  0.8430232558139535
svc F1:  0.750498112998113
             precision    recall  f1-score   support

          0       1.00      0.07      0.12        15
          1       1.00      0.88      0.94        17
          2       0.93      0.54      0.68        24
          3       1.00      0.78      0.88        23
          4       0.66      0.99      0.79        83
          5       0.00      0.00      0.00        10

avg / total       0.77      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       1.00      0.80      0.89        15
          2       0.97      1.00      0.99        99

avg / total       0.98      0.98      0.98       127

[13  0  0  0 12  3  0  0 99]
svc Accuracy:  0.9763779527559056
svc F1:  0.9579878385848536
             precision    recall  f1-score   support

          0       1.00      0.54      0.70        13
          1       0.00      0.00      0.00        15
          2       0.82      1.00      0.90        99

avg / total       0.75      0.83      0.78       127

[ 7  0  6  0  0 15  0  0 99]
LR Accuracy:  0.8346456692913385
LR F1:  0.534703196347032
For name:  w_liao
total sample size before apply threshold:  79
Counter({'0000-0001-5362-6953': 29, '0000-0001-6383-3470': 25, '0000-0002-5619-4997': 16, '0000-0002-9768-0959': 5, '0000-0001-7221-5906': 3, '0000-0002-5333-2717': 1})
['0000-0001-5362-6953', '0000-0002-5619-4997', '0000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(262, 508)
(0, 0)
(0, 0)
1
262
             precision    recall  f1-score   support

          0       1.00      0.57      0.72        37
          1       0.00      0.00      0.00        19
          2       1.00      0.19      0.32        21
          3       0.68      0.96      0.80       115
          4       0.83      0.89      0.86        70

avg / total       0.74      0.75      0.71       262

[ 21   1   0  15   0   0   0   0  15   4   0   0   4  13   4   0   0   0
 110   5   0   0   0   8  62]
MNB Accuracy:  0.7519083969465649
MNB F1:  0.539

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.80      1.00      0.89        72
          1       1.00      0.27      0.42        15
          2       1.00      0.84      0.91        43

avg / total       0.89      0.86      0.84       130

[72  0  0 11  4  0  7  0 36]
svc Accuracy:  0.8615384615384616
svc F1:  0.7404446418437093
             precision    recall  f1-score   support

          0       0.73      1.00      0.85        72
          1       1.00      0.13      0.24        15
          2       1.00      0.70      0.82        43

avg / total       0.85      0.80      0.77       130

[72  0  0 13  2  0 13  0 30]
LR Accuracy:  0.8
LR F1:  0.6347569164652164
For name:  m_walsh
total sample size before apply threshold:  37
Counter({'0000-0001-5683-1151': 30, '0000-0001-8920-7419': 3, '0000-0002-1770-3314': 2, '0000-0003-0982-4105': 2})
['0000-0001-5683-1151']
Total sample size after apply threshold:  30
For name:  r_figueiredo
total sample size before a

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 667
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(667, 786)
(0, 0)
(0, 0)
1
667
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        13
          1       0.00      0.00      0.00        13
          2       1.00      0.13      0.23        39
          3       0.00      0.00      0.00        20
          4       0.00      0.00      0.00        33
          5       0.82      0.82      0.82        49
          6       0.00      0.00      0.00        20
          7       0.53      0.97      0.69       146
          8       1.00      0.13      0.23

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.69      0.82        13
          1       1.00      0.62      0.76        13
          2       0.92      0.59      0.72        39
          3       0.45      0.25      0.32        20
          4       0.77      0.73      0.75        33
          5       0.92      0.96      0.94        49
          6       1.00      0.55      0.71        20
          7       0.71      0.99      0.83       146
          8       0.95      0.78      0.86        23
          9       0.88      0.58      0.70        12
         10       0.76      0.54      0.63        24
         11       0.58      0.47      0.52        15
         12       0.73      0.92      0.82       115
         13       0.92      0.84      0.88        64
         14       1.00      1.00      1.00        15
         15       0.12      0.05      0.07        22
         16       0.90      0.86      0.88        21
         17       0.87      0.57      0.68   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(24, 60)
(0, 0)
(0, 0)
1
24
             precision    recall  f1-score   support

          0       1.00      0.27      0.43        11
          1       0.62      1.00      0.76        13

avg / total       0.79      0.67      0.61        24

[ 3  8  0 13]
MNB Accuracy:  0.6666666666666666
MNB F1:  0.5966386554621849
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        11
          1       1.00      1.00      1.00        13

avg / total       1.00      1.00      1.00        24

[11  0  0 13]
svc Accu

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.76      1.00      0.86        34
          1       1.00      0.21      0.35        14

avg / total       0.83      0.77      0.71        48

[34  0 11  3]
LR Accuracy:  0.7708333333333334
LR F1:  0.6068503350707372
For name:  s_chou
total sample size before apply threshold:  39
Counter({'0000-0001-9237-4517': 16, '0000-0003-1155-6082': 8, '0000-0003-0787-0044': 6, '0000-0001-8081-1679': 4, '0000-0001-5512-9977': 2, '0000-0002-4121-019X': 2, '0000-0001-8163-7430': 1})
['0000-0001-9237-4517']
Total sample size after apply threshold:  16
For name:  s_hughes
total sample size before apply threshold:  106
Counter({'0000-0001-8227-9225': 74, '0000-0002-9409-9405': 12, '0000-0001-8360-929X': 6, '0000-0002-2264-8479': 5, '0000-0002-9778-140X': 3, '0000-0001-6340-2646': 3, '0000-0001-7689-4272': 1, '0000-0002-8187-4871': 1, '0000-0003-4542-1821': 1})
['0000-0002-9409-9405', '0000-0001-8227-9225']
Total sample size after a

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(200, 837)
(0, 0)
(0, 0)
1
200
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        24
          1       0.00      0.00      0.00        10
          2       1.00      0.64      0.78        14
          3       0.00      0.00      0.00        13
          4       0.87      0.79      0.83        52
          5       0.55      1.00      0.71        69
          6       1.00      0.33      0.50        18

avg / total       0.70      0.69      0.64       200

[12  0  0  0  0 12  0  0  0  0  0  0 10  0  0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       0.59      1.00      0.74        68
          2       1.00      0.09      0.17        11
          3       0.96      0.79      0.87        34
          4       0.00      0.00      0.00        11
          5       0.00      0.00      0.00        10

avg / total       0.58      0.66      0.56       145

[ 0 11  0  0  0  0  0 68  0  0  0  0  0  9  1  1  0  0  0  7  0 27  0  0
  0 11  0  0  0  0  0 10  0  0  0  0]
MNB Accuracy:  0.6620689655172414
MNB F1:  0.2961274738974599
             precision    recall  f1-score   support

          0       1.00      0.73      0.84        11
          1       0.77      1.00      0.87        68
          2       1.00      0.55      0.71        11
          3       1.00      0.88      0.94        34
          4       1.00      0.45      0.62        11
          5       1.00      0.80      0.89        10

avg / total       0.89     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.75      0.86        16
          1       0.91      1.00      0.95        41

avg / total       0.94      0.93      0.93        57

[12  4  0 41]
svc Accuracy:  0.9298245614035088
svc F1:  0.9053156146179402
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        16
          1       0.84      1.00      0.91        41

avg / total       0.88      0.86      0.84        57

[ 8  8  0 41]
LR Accuracy:  0.8596491228070176
LR F1:  0.7888888888888889
For name:  a_figueiredo
total sample size before apply threshold:  150
Counter({'0000-0002-9105-9619': 79, '0000-0002-3239-3190': 19, '0000-0001-6956-0514': 16, '0000-0001-8156-7700': 14, '0000-0001-8386-8216': 9, '0000-0001-7039-5341': 6, '0000-0003-2329-2854': 3, '0000-0003-0487-8956': 3, '0000-0002-8555-8649': 1})
['0000-0001-6956-0514', '0000-0001-8156-7700', '0000-0002-9105-9619', '0000-0002-3239-3190']
Total sa

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.94      0.97        16
          1       1.00      0.93      0.96        14
          2       0.96      1.00      0.98        79
          3       1.00      0.95      0.97        19

avg / total       0.98      0.98      0.98       128

[15  0  1  0  0 13  1  0  0  0 79  0  0  0  1 18]
svc Accuracy:  0.9765625
svc F1:  0.971261082761784
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        16
          1       1.00      0.43      0.60        14
          2       0.81      1.00      0.89        79
          3       1.00      0.84      0.91        19

avg / total       0.88      0.85      0.84       128

[ 8  0  8  0  0  6  8  0  0  0 79  0  0  0  3 16]
LR Accuracy:  0.8515625
LR F1:  0.7684019370460048
For name:  s_clark
total sample size before apply threshold:  39
Counter({'0000-0001-5907-9671': 12, '0000-0002-7488-3438': 9, '0000-0001-7328-0726': 8, '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



[]
Total sample size after apply threshold:  0
For name:  s_ma
total sample size before apply threshold:  136
Counter({'0000-0002-1897-7069': 69, '0000-0002-2029-7943': 42, '0000-0002-1810-8357': 9, '0000-0002-0232-8590': 6, '0000-0001-8581-2216': 3, '0000-0002-2704-3540': 2, '0000-0001-8087-0249': 1, '0000-0001-6361-9706': 1, '0000-0002-7995-2041': 1, '0000-0003-4846-9513': 1, '0000-0002-8992-1177': 1})
['0000-0002-2029-7943', '0000-0002-1897-7069']
Total sample size after apply threshold:  111
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(111, 212)
(0, 0)
(0, 0)
1
111
             precision 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(51, 127)
(0, 0)
(0, 0)
1
51
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        26
          1       0.93      1.00      0.96        25

avg / total       0.96      0.96      0.96        51

[24  2  0 25]
MNB Accuracy:  0.9607843137254902
MNB F1:  0.9607692307692308
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        26
          1       1.00      1.00      1.00        25

avg / total       1.00      1.00      1.00        51

[26  0  0 25]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.40      0.57        10
          1       1.00      0.85      0.92        20
          2       1.00      0.86      0.92        14
          3       0.77      1.00      0.87        47
          4       1.00      0.75      0.86        12

avg / total       0.90      0.86      0.86       103

[ 4  0  0  6  0  0 17  0  3  0  0  0 12  2  0  0  0  0 47  0  0  0  0  3
  9]
svc Accuracy:  0.8640776699029126
svc F1:  0.8281875281875282
             precision    recall  f1-score   support

          0       1.00      0.10      0.18        10
          1       1.00      0.70      0.82        20
          2       1.00      0.79      0.88        14
          3       0.63      1.00      0.77        47
          4       1.00      0.17      0.29        12

avg / total       0.83      0.73      0.68       103

[ 1  0  0  9  0  0 14  0  6  0  0  0 11  3  0  0  0  0 47  0  0  0  0 10
  2]
LR Accuracy:  0.7281553398058253
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.86      0.99      0.92       102
          1       1.00      0.95      0.97        40
          2       1.00      0.98      0.99        91
          3       1.00      0.61      0.76        41
          4       0.94      0.99      0.96        74

avg / total       0.94      0.94      0.93       348

[101   0   0   0   1   2  38   0   0   0   2   0  89   0   0  12   0   0
  25   4   1   0   0   0  73]
MNB Accuracy:  0.9367816091954023
MNB F1:  0.9199063509589825
             precision    recall  f1-score   support

          0       0.91      1.00      0.95       102
          1       1.00      0.95      0.97        40
          2       1.00      0.97      0.98        91
          3       1.00      0.93      0.96        41
          4       1.00      0.97      0.99        74

avg / total       0.97      0.97      0.97       348

[102   0   0   0   0   2  38   0   0   0   3   0  88   0   0   3   0   0
  38   0   2  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.94      0.97        31
          1       0.94      1.00      0.97        32

avg / total       0.97      0.97      0.97        63

[29  2  0 32]
svc Accuracy:  0.9682539682539683
svc F1:  0.9681818181818181
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        31
          1       1.00      1.00      1.00        32

avg / total       1.00      1.00      1.00        63

[31  0  0 32]
LR Accuracy:  1.0
LR F1:  1.0
For name:  r_menezes
total sample size before apply threshold:  29
Counter({'0000-0003-0552-8480': 15, '0000-0002-6612-3543': 6, '0000-0003-3109-9683': 5, '0000-0003-4316-2168': 2, '0000-0002-4842-641X': 1})
['0000-0003-0552-8480']
Total sample size after apply threshold:  15
For name:  s_tsang
total sample size before apply threshold:  20
Counter({'0000-0003-0788-4905': 6, '0000-0002-9862-8503': 5, '0000-0001-6099-6696': 5, '0000-0002-2232-9814'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.8
LR F1:  0.794396551724138
For name:  b_zheng
total sample size before apply threshold:  90
Counter({'0000-0002-7682-6648': 82, '0000-0002-3272-843X': 5, '0000-0003-1551-0970': 2, '0000-0002-2044-2848': 1})
['0000-0002-7682-6648']
Total sample size after apply threshold:  82
For name:  f_xu
total sample size before apply threshold:  94
Counter({'0000-0003-4351-0222': 29, '0000-0002-8465-5834': 22, '0000-0001-5239-4572': 19, '0000-0001-7958-3787': 12, '0000-0002-0245-057X': 5, '0000-0002-8166-0275': 4, '0000-0003-1600-6346': 2, '0000-0002-2598-2528': 1})
['0000-0001-7958-3787', '0000-0001-5239-4572', '0000-0003-4351-0222', '0000-0002-8465-5834']
Total sample size after apply threshold:  82
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_word

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(260, 977)
(0, 0)
(0, 0)
1
260
             precision    recall  f1-score   support

          0       1.00      0.47      0.64        30
          1       1.00      0.26      0.41        23
          2       1.00      0.06      0.12        16
          3       1.00      0.65      0.79        40
          4       1.00      0.06      0.11        17
          5       0.00      0.00      0.00        10
          6       0.58      1.00      0.74       124

avg / total       0.76      0.66      0.60       260

[ 14   0   0   0   0   0  16   0   6   0   0 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.63      0.78        30
          1       1.00      0.52      0.69        23
          2       1.00      0.69      0.81        16
          3       0.97      0.75      0.85        40
          4       1.00      0.71      0.83        17
          5       1.00      0.70      0.82        10
          6       0.74      1.00      0.85       124

avg / total       0.87      0.83      0.82       260

[ 19   0   0   0   0   0  11   0  12   0   0   0   0  11   0   0  11   0
   0   0   5   0   0   0  30   0   0  10   0   0   0   0  12   0   5   0
   0   0   1   0   7   2   0   0   0   0   0   0 124]
svc Accuracy:  0.8269230769230769
svc F1:  0.8030772020429074
             precision    recall  f1-score   support

          0       1.00      0.40      0.57        30
          1       1.00      0.26      0.41        23
          2       1.00      0.06      0.12        16
          3       1.00      0.53      0.69   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       1.00      1.00      1.00        13

avg / total       1.00      1.00      1.00        26

[13  0  0 13]
LR Accuracy:  1.0
LR F1:  1.0
For name:  j_matos
total sample size before apply threshold:  25
Counter({'0000-0002-3754-3709': 12, '0000-0002-0505-8282': 5, '0000-0001-9917-6126': 4, '0000-0003-1335-0635': 3, '0000-0003-0570-7913': 1})
['0000-0002-3754-3709']
Total sample size after apply threshold:  12
For name:  l_santos
total sample size before apply threshold:  172
Counter({'0000-0003-3040-0358': 55, '0000-0002-2712-0622': 32, '0000-0002-7013-8852': 15, '0000-0002-1915-6780': 13, '0000-0001-5166-530X': 11, '0000-0003-0986-9880': 10, '0000-0002-0694-733X': 9, '0000-0001-8366-1557': 5, '0000-0001-8906-9976': 5, '0000-0002-4453-5766': 4, '0000-0001-7551-5605': 3, '0000-0003-0458-427X': 3, '0000-0001-5915-1186': 2, '0000-0001-9172-6429': 1, '0000-0003-0568-917X

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(107, 379)
(0, 0)
(0, 0)
1
107
             precision    recall  f1-score   support

          0       0.76      0.81      0.79        32
          1       1.00      0.10      0.18        10
          2       1.00      0.50      0.67        12
          3       1.00      0.62      0.76        13
          4       0.64      0.93      0.76        40

avg / total       0.79      0.73      0.70       107

[26  0  0  0  6  1  1  0  0  8  0  0  6  0  6  4  0  0  8  1  3  0  0  0
 37]
MNB Accuracy:  0.7289719626168224
MNB F1:  0.630674087816945
            

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.92      1.00      0.96        47
          1       1.00      0.79      0.88        19

avg / total       0.94      0.94      0.94        66

[47  0  4 15]
svc Accuracy:  0.9393939393939394
svc F1:  0.9207683073229291
             precision    recall  f1-score   support

          0       0.87      1.00      0.93        47
          1       1.00      0.63      0.77        19

avg / total       0.91      0.89      0.89        66

[47  0  7 12]
LR Accuracy:  0.8939393939393939
LR F1:  0.8524433088470138
For name:  p_wong
total sample size before apply threshold:  36
Counter({'0000-0002-6360-849X': 13, '0000-0003-1592-4823': 8, '0000-0001-7935-7245': 7, '0000-0003-4982-8127': 3, '0000-0003-4645-0384': 3, '0000-0003-3804-3041': 1, '0000-0002-8171-3242': 1})
['0000-0002-6360-849X']
Total sample size after apply threshold:  13
For name:  a_cooper
total sample size before apply threshold:  265
Counter({'0000-0001-6709-73

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


MNB F1:  0.4275806318129152
             precision    recall  f1-score   support

          0       1.00      0.74      0.85        23
          1       0.67      1.00      0.80       112
          2       1.00      0.47      0.64        15
          3       0.00      0.00      0.00        12
          4       1.00      0.68      0.81        72
          5       1.00      0.62      0.77        16

avg / total       0.80      0.78      0.76       250

[ 17   6   0   0   0   0   0 112   0   0   0   0   0   8   7   0   0   0
   0  12   0   0   0   0   0  23   0   0  49   0   0   6   0   0   0  10]
svc Accuracy:  0.78
svc F1:  0.6447298574131418
             precision    recall  f1-score   support

          0       1.00      0.26      0.41        23
          1       0.58      1.00      0.73       112
          2       1.00      0.07      0.12        15
          3       0.00      0.00      0.00        12
          4       1.00      0.64      0.78        72
          5       1.00      0.2

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.88      1.00      0.94        83
          1       0.00      0.00      0.00        11

avg / total       0.78      0.88      0.83        94

[83  0 11  0]
svc Accuracy:  0.8829787234042553
svc F1:  0.4689265536723164
             precision    recall  f1-score   support

          0       0.88      1.00      0.94        83
          1       0.00      0.00      0.00        11

avg / total       0.78      0.88      0.83        94

[83  0 11  0]
LR Accuracy:  0.8829787234042553
LR F1:  0.4689265536723164
For name:  s_russo
total sample size before apply threshold:  45
Counter({'0000-0003-3589-3040': 33, '0000-0002-9699-4681': 10, '0000-0001-9137-9391': 1, '0000-0002-5490-3155': 1})
['0000-0003-3589-3040', '0000-0002-9699-4681']
Total sample size after apply threshold:  43
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.68      0.81        19
          1       0.96      1.00      0.98       131

avg / total       0.96      0.96      0.96       150

[ 13   6   0 131]
svc Accuracy:  0.96
svc F1:  0.8950559701492538
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        19
          1       0.87      1.00      0.93       131

avg / total       0.76      0.87      0.81       150

[  0  19   0 131]
LR Accuracy:  0.8733333333333333
LR F1:  0.46619217081850534
For name:  m_moore
total sample size before apply threshold:  112
Counter({'0000-0002-5127-4509': 45, '0000-0003-3074-6631': 38, '0000-0002-7853-5756': 18, '0000-0003-4768-5329': 7, '0000-0002-7914-0166': 4})
['0000-0002-7853-5756', '0000-0003-3074-6631', '0000-0002-5127-4509']
Total sample size after apply threshold:  101
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.in

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.72      0.84        18
          1       0.82      0.95      0.88        38
          2       0.93      0.91      0.92        45

avg / total       0.90      0.89      0.89       101

[13  4  1  0 36  2  0  4 41]
svc Accuracy:  0.8910891089108911
svc F1:  0.8793689241713004
             precision    recall  f1-score   support

          0       1.00      0.61      0.76        18
          1       1.00      0.87      0.93        38
          2       0.79      1.00      0.88        45

avg / total       0.91      0.88      0.88       101

[11  0  7  0 33  5  0  0 45]
LR Accuracy:  0.8811881188118812
LR F1:  0.8568503652067919
For name:  c_johnson
total sample size before apply threshold:  300
Counter({'0000-0002-6864-6604': 114, '0000-0002-9719-3771': 47, '0000-0001-9616-6205': 44, '0000-0002-9511-905X': 21, '0000-0001-9190-8441': 18, '0000-0003-3892-7082': 16, '0000-0003-4428-3594': 14, '0000-0002-2298-7

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.38      0.55        21
          1       0.52      0.73      0.61        44
          2       1.00      0.19      0.32        16
          3       1.00      0.92      0.96        12
          4       0.76      0.96      0.85       114
          5       1.00      0.78      0.88        18
          6       0.87      0.72      0.79        47
          7       1.00      0.36      0.53        14

avg / total       0.81      0.76      0.74       286

[  8   8   0   0   4   0   1   0   0  32   0   0  12   0   0   0   0   6
   3   0   7   0   0   0   0   1   0  11   0   0   0   0   0   3   0   0
 110   0   1   0   0   0   0   0   4  14   0   0   0   5   0   0   8   0
  34   0   0   6   0   0   0   0   3   5]
svc Accuracy:  0.7587412587412588
svc F1:  0.6843741841978285
             precision    recall  f1-score   support

          0       1.00      0.24      0.38        21
          1       1.00      0.36     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.67      1.00      0.80        18
          1       1.00      0.10      0.18        10

avg / total       0.79      0.68      0.58        28

[18  0  9  1]
LR Accuracy:  0.6785714285714286
LR F1:  0.49090909090909096
For name:  x_xie
total sample size before apply threshold:  24
Counter({'0000-0002-2701-8660': 13, '0000-0002-1964-4370': 6, '0000-0003-2988-3065': 2, '0000-0002-6796-8521': 1, '0000-0002-3103-3724': 1, '0000-0002-7970-2974': 1})
['0000-0002-2701-8660']
Total sample size after apply threshold:  13
For name:  x_jin
total sample size before apply threshold:  62
Counter({'0000-0002-1550-2199': 27, '0000-0003-2454-1621': 11, '0000-0002-2809-7882': 11, '0000-0003-4293-8665': 9, '0000-0001-7339-2920': 2, '0000-0001-6742-1799': 1, '0000-0003-3033-758X': 1})
['0000-0002-1550-2199', '0000-0003-2454-1621', '0000-0002-2809-7882']
Total sample size after apply threshold:  49
TfidfVectorizer(analyzer='word', binar

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.08      0.14        13
          1       1.00      0.80      0.89        15
          2       1.00      0.94      0.97        18
          3       0.92      0.79      0.85        14
          4       1.00      0.80      0.89        61
          5       0.56      0.99      0.71        70
          6       1.00      0.45      0.62        11
          7       0.97      0.70      0.81        40
          8       1.00      0.63      0.77        19

avg / total       0.87      0.78      0.78       261

[ 1  0  0  0  0 12  0  0  0  0 12  0  0  0  3  0  0  0  0  0 17  0  0  1
  0  0  0  0  0  0 11  0  3  0  0  0  0  0  0  0 49 11  0  1  0  0  0  0
  1  0 69  0  0  0  0  0  0  0  0  6  5  0  0  0  0  0  0  0 12  0 28  0
  0  0  0  0  0  7  0  0 12]
svc Accuracy:  0.7816091954022989
svc F1:  0.7402628330787505
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.81      0.95      0.87        57
          1       1.00      0.77      0.87        30
          2       0.88      0.83      0.85        52

avg / total       0.87      0.86      0.86       139

[54  0  3  4 23  3  9  0 43]
svc Accuracy:  0.8633093525179856
svc F1:  0.8634591395840742
             precision    recall  f1-score   support

          0       0.68      1.00      0.81        57
          1       1.00      0.47      0.64        30
          2       0.98      0.77      0.86        52

avg / total       0.86      0.80      0.79       139

[57  0  0 15 14  1 12  0 40]
LR Accuracy:  0.7985611510791367
LR F1:  0.7683631094749832
For name:  y_zhang
total sample size before apply threshold:  1244
Counter({'0000-0001-8642-4071': 104, '0000-0002-3254-8965': 64, '0000-0001-7307-9408': 56, '0000-0002-9956-3879': 48, '0000-0003-2932-4159': 48, '0000-0003-2317-2190': 45, '0000-0003-2753-7601': 37, '0000-0001-6118-66

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.88      1.00      0.94        22
          1       1.00      0.71      0.83        14
          2       0.88      0.67      0.76        21
          3       0.71      0.25      0.37        20
          4       1.00      1.00      1.00        13
          5       0.56      0.23      0.32        22
          6       0.52      0.73      0.61        64
          7       0.61      0.70      0.65        56
          8       1.00      0.77      0.87        22
          9       0.78      0.96      0.86        26
         10       1.00      0.86      0.92        28
         11       0.95      0.88      0.91        48
         12       0.90      0.64      0.75        14
         13       0.86      0.50      0.63        12
         14       0.69      0.85      0.77        48
         15       1.00      0.74      0.85        27
         16       0.76      0.85      0.80        26
         17       0.42      0.56      0.48   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 51
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(51, 39)
(0, 0)
(0, 0)
1
51
             precision    recall  f1-score   support

          0       1.00      0.62      0.76        13
          1       0.88      1.00      0.94        38

avg / total       0.91      0.90      0.89        51

[ 8  5  0 38]
MNB Accuracy:  0.9019607843137255
MNB F1:  0.8500881834215168
             precision    recall  f1-score   support

          0       1.00      0.77      0.87        13
          1       0.93      1.00      0.96        38

avg / total       0.95      0.94      0.94        51

[10  3  0 38]
svc 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.90      1.00      0.95        36
          1       1.00      0.81      0.89        21

avg / total       0.94      0.93      0.93        57

[36  0  4 17]
svc Accuracy:  0.9298245614035088
svc F1:  0.9210526315789473
             precision    recall  f1-score   support

          0       0.88      1.00      0.94        36
          1       1.00      0.76      0.86        21

avg / total       0.92      0.91      0.91        57

[36  0  5 16]
LR Accuracy:  0.9122807017543859
LR F1:  0.8999648999648999
For name:  m_brito
total sample size before apply threshold:  86
Counter({'0000-0002-8493-4649': 51, '0000-0001-6394-658X': 31, '0000-0002-8973-104X': 2, '0000-0001-9689-7040': 1, '0000-0002-1779-4535': 1})
['0000-0002-8493-4649', '0000-0001-6394-658X']
Total sample size after apply threshold:  82
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        13
          1       0.00      0.00      0.00        22
          2       0.00      0.00      0.00        16
          3       0.37      0.90      0.52        67
          4       0.00      0.00      0.00        28
          5       1.00      0.40      0.57        35
          6       0.62      0.88      0.73        78
          7       0.00      0.00      0.00        14
          8       1.00      0.25      0.40        32
          9       1.00      0.05      0.10        20
         10       0.00      0.00      0.00        15
         11       1.00      0.54      0.70        28
         12       0.72      0.99      0.84       171
         13       0.68      0.57      0.62        56

avg / total       0.59      0.62      0.54       595

[  0   0   0   2   0   0   3   0   0   0   0   0   7   1   0   0   0  13
   0   0   2   0   0   0   0   0   5   2   0   0   0  14   0   0   2   0
   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       1.00      0.13      0.24        15
          2       1.00      0.50      0.67        18
          3       1.00      0.62      0.77        24
          4       0.00      0.00      0.00        14
          5       0.00      0.00      0.00        10
          6       0.51      1.00      0.67        97
          7       1.00      0.10      0.18        10
          8       0.00      0.00      0.00        14
          9       1.00      0.50      0.67        12

avg / total       0.57      0.58      0.48       225

[ 0  0  0  0  0  0 11  0  0  0  0  2  0  0  0  0 13  0  0  0  0  0  9  0
  0  0  9  0  0  0  0  0  0 15  0  0  9  0  0  0  0  0  0  0  0  0 14  0
  0  0  0  0  0  0  0  0 10  0  0  0  0  0  0  0  0  0 97  0  0  0  0  0
  0  0  0  0  9  1  0  0  0  0  0  0  0  0 14  0  0  0  0  0  0  0  0  0
  6  0  0  6]
MNB Accuracy:  0.5777777777777777
MNB F1:  0.3190956678

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.98      1.00      0.99        42
          1       1.00      0.91      0.95        11

avg / total       0.98      0.98      0.98        53

[42  0  1 10]
svc Accuracy:  0.9811320754716981
svc F1:  0.9703081232492996
             precision    recall  f1-score   support

          0       0.89      1.00      0.94        42
          1       1.00      0.55      0.71        11

avg / total       0.92      0.91      0.89        53

[42  0  5  6]
LR Accuracy:  0.9056603773584906
LR F1:  0.8248512888301387
For name:  r_rao
total sample size before apply threshold:  94
Counter({'0000-0002-5776-8366': 52, '0000-0002-0262-5122': 14, '0000-0002-2285-6788': 12, '0000-0002-1475-3893': 9, '0000-0002-6415-0185': 7})
['0000-0002-2285-6788', '0000-0002-5776-8366', '0000-0002-0262-5122']
Total sample size after apply threshold:  78
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.i

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.79      1.00      0.88        67
          1       1.00      0.38      0.56        13
          2       1.00      0.81      0.89        21
          3       0.89      0.81      0.85        31
          4       1.00      0.73      0.84        11

avg / total       0.88      0.85      0.84       143

[67  0  0  0  0  7  5  0  1  0  2  0 17  2  0  6  0  0 25  0  3  0  0  0
  8]
svc Accuracy:  0.8531468531468531
svc F1:  0.8042868470611557
             precision    recall  f1-score   support

          0       0.66      1.00      0.80        67
          1       1.00      0.31      0.47        13
          2       1.00      0.81      0.89        21
          3       0.95      0.65      0.77        31
          4       0.00      0.00      0.00        11

avg / total       0.76      0.76      0.71       143

[67  0  0  0  0  8  4  0  1  0  4  0 17  0  0 11  0  0 20  0 11  0  0  0
  0]
LR Accuracy:  0.7552447552447552
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.68      1.00      0.81        36
          1       1.00      0.87      0.93        15
          2       1.00      0.72      0.84        18
          3       1.00      0.45      0.62        11
          4       1.00      0.67      0.80        12

avg / total       0.87      0.82      0.81        92

[36  0  0  0  0  2 13  0  0  0  5  0 13  0  0  6  0  0  5  0  4  0  0  0
  8]
LR Accuracy:  0.8152173913043478
LR F1:  0.8002539740071454
For name:  s_bose
total sample size before apply threshold:  28
Counter({'0000-0001-7310-9881': 16, '0000-0003-2397-4740': 6, '0000-0002-6569-4643': 5, '0000-0003-0137-4322': 1})
['0000-0001-7310-9881']
Total sample size after apply threshold:  16
For name:  j_dyer
total sample size before apply threshold:  61
Counter({'0000-0002-7220-6062': 44, '0000-0002-3275-8612': 13, '0000-0002-7570-9941': 3, '0000-0001-6215-0053': 1})
['0000-0002-3275-8612', '0000-0002-7220-6062']
Total sample 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 123
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(123, 1030)
(0, 0)
(0, 0)
1
123
             precision    recall  f1-score   support

          0       1.00      0.95      0.97        38
          1       1.00      0.71      0.83        14
          2       1.00      0.19      0.32        16
          3       0.74      1.00      0.85        55

avg / total       0.89      0.85      0.82       123

[36  0  0  2  0 10  0  4  0  0  3 13  0  0  0 55]
MNB Accuracy:  0.8455284552845529
MNB F1:  0.7437022395712727
             precision    recall  f1-score   support

          0       1.00      0.95

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(85, 80)
(0, 0)
(0, 0)
1
85
             precision    recall  f1-score   support

          0       0.73      1.00      0.84        38
          1       0.80      0.24      0.36        17
          2       0.96      0.90      0.93        30

avg / total       0.83      0.81      0.78        85

[38  0  0 12  4  1  2  1 27]
MNB Accuracy:  0.8117647058823529
MNB F1:  0.7130384302798095
             precision    recall  f1-score   support

          0       0.84      0.95      0.89        38
          1       0.85      0.65      0.73        17
         

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.86      0.92        42
          1       0.00      0.00      0.00        12
          2       0.76      1.00      0.87        58

avg / total       0.77      0.84      0.79       112

[36  0  6  0  0 12  0  0 58]
MNB Accuracy:  0.8392857142857143
MNB F1:  0.596249521622656
             precision    recall  f1-score   support

          0       1.00      0.98      0.99        42
          1       1.00      0.33      0.50        12
          2       0.87      1.00      0.93        58

avg / total       0.93      0.92      0.90       112

[41  0  1  0  4  8  0  0 58]
svc Accuracy:  0.9196428571428571
svc F1:  0.8053172690763052
             precision    recall  f1-score   support

          0       1.00      0.88      0.94        42
          1       0.00      0.00      0.00        12
          2       0.77      1.00      0.87        58

avg / total       0.78      0.85      0.80       112

[37  0  5  0  0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



             precision    recall  f1-score   support

          0       0.95      1.00      0.98        20
          1       1.00      0.94      0.97        17

avg / total       0.97      0.97      0.97        37

[20  0  1 16]
LR Accuracy:  0.972972972972973
LR F1:  0.9726533628972653
For name:  b_jackson
total sample size before apply threshold:  29
Counter({'0000-0002-4917-1199': 14, '0000-0001-6313-0812': 10, '0000-0002-7127-1735': 4, '0000-0001-6405-8111': 1})
['0000-0001-6313-0812', '0000-0002-4917-1199']
Total sample size after apply threshold:  24
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabular

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(21, 37)
(0, 0)
(0, 0)
1
21
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        10
          1       0.92      1.00      0.96        11

avg / total       0.96      0.95      0.95        21

[ 9  1  0 11]
MNB Accuracy:  0.9523809523809523
MNB F1:  0.9519450800915332
             precision    recall  f1-score   support

          0       1.00      0.80      0.89        10
          1       0.85      1.00      0.92        11

avg / total       0.92      0.90      0.90        21

[ 8  2  0 11]
svc Accu

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



[ 8  4  0  0 25  0  0  3 14]
svc Accuracy:  0.8703703703703703
svc F1:  0.8601395963025844
             precision    recall  f1-score   support

          0       1.00      0.33      0.50        12
          1       0.68      1.00      0.81        25
          2       1.00      0.76      0.87        17

avg / total       0.85      0.78      0.76        54

[ 4  8  0  0 25  0  0  4 13]
LR Accuracy:  0.7777777777777778
LR F1:  0.7243727598566307
For name:  a_silva
total sample size before apply threshold:  786
Counter({'0000-0003-2861-8286': 158, '0000-0001-5525-0494': 156, '0000-0002-8984-8600': 74, '0000-0001-5790-5116': 41, '0000-0002-7524-9914': 39, '0000-0002-7802-8690': 39, '0000-0003-4968-5138': 30, '0000-0002-7713-1813': 22, '0000-0002-9968-3707': 18, '0000-0002-6332-5182': 16, '0000-0002-5668-7134': 16, '0000-0001-5554-7714': 14, '0000-0002-4839-8279': 14, '0000-0002-1112-1209': 11, '0000-0003-0423-2514': 10, '0000-0002-4386-5851': 10, '0000-0002-9679-8357': 10, '0000-0003-3786

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.77      0.87        39
          1       1.00      0.50      0.67        18
          2       0.83      0.36      0.50        14
          3       1.00      0.70      0.82        10
          4       0.88      0.70      0.78        10
          5       1.00      0.70      0.82        10
          6       0.66      0.96      0.78       156
          7       0.75      0.94      0.83       158
          8       1.00      0.92      0.96        74
          9       1.00      0.82      0.90        22
         10       0.00      0.00      0.00        10
         11       1.00      0.10      0.18        10
         12       1.00      0.62      0.77        16
         13       1.00      0.73      0.85        30
         14       0.83      0.59      0.69        41
         15       1.00      0.87      0.93        39
         16       1.00      0.64      0.78        14
         17       1.00      0.64      0.78   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.78      0.97      0.86        39
          1       1.00      0.43      0.60        14
          2       0.92      0.80      0.86        15

avg / total       0.85      0.82      0.81        68

[38  0  1  8  6  0  3  0 12]
MNB Accuracy:  0.8235294117647058
MNB F1:  0.7735930735930735
             precision    recall  f1-score   support

          0       0.89      1.00      0.94        39
          1       1.00      0.79      0.88        14
          2       1.00      0.87      0.93        15

avg / total       0.93      0.93      0.92        68

[39  0  0  3 11  0  2  0 13]
svc Accuracy:  0.9264705882352942
svc F1:  0.9161101549053358
             precision    recall  f1-score   support

          0       0.72      1.00      0.84        39
          1       1.00      0.29      0.44        14
          2       1.00      0.67      0.80        15

avg / total       0.84      0.78      0.75        68

[39  0  0 10  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        13
          1       0.73      0.99      0.84       103
          2       0.97      0.72      0.82        46
          3       1.00      0.83      0.90        69

avg / total       0.82      0.83      0.81       231

[  0  13   0   0   0 102   1   0   0  13  33   0   0  12   0  57]
svc Accuracy:  0.8311688311688312
svc F1:  0.6423170194003527
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        13
          1       0.67      1.00      0.80       103
          2       1.00      0.54      0.70        46
          3       1.00      0.77      0.87        69

avg / total       0.80      0.78      0.76       231

[  0  13   0   0   0 103   0   0   0  21  25   0   0  16   0  53]
LR Accuracy:  0.7835497835497836
LR F1:  0.5944413277822673
For name:  m_kobayashi
total sample size before apply threshold:  51
Counter({'0000-0002-6657-1928': 33,

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.81      1.00      0.90        44
          1       0.00      0.00      0.00        10

avg / total       0.66      0.81      0.73        54

[44  0 10  0]
LR Accuracy:  0.8148148148148148
LR F1:  0.44897959183673464
For name:  a_mills
total sample size before apply threshold:  169
Counter({'0000-0001-9863-9950': 115, '0000-0003-4880-7332': 34, '0000-0002-6997-5581': 15, '0000-0002-6893-3857': 3, '0000-0003-4932-8413': 1, '0000-0002-9065-0458': 1})
['0000-0001-9863-9950', '0000-0002-6997-5581', '0000-0003-4880-7332']
Total sample size after apply threshold:  164
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        toke

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.97      1.00      0.98       155
          1       1.00      0.75      0.86        20

avg / total       0.97      0.97      0.97       175

[155   0   5  15]
svc Accuracy:  0.9714285714285714
svc F1:  0.9206349206349206
             precision    recall  f1-score   support

          0       0.90      1.00      0.95       155
          1       1.00      0.10      0.18        20

avg / total       0.91      0.90      0.86       175

[155   0  18   2]
LR Accuracy:  0.8971428571428571
LR F1:  0.563470066518847
For name:  a_marino
total sample size before apply threshold:  15
Counter({'0000-0002-1709-538X': 7, '0000-0002-0528-4925': 6, '0000-0003-0308-859X': 1, '0000-0001-8751-8811': 1})
[]
Total sample size after apply threshold:  0
For name:  r_jiang
total sample size before apply threshold:  102
Counter({'0000-0002-8280-6029': 54, '0000-0002-7533-3753': 28, '0000-0002-3816-4639': 19, '0000-0001-5857-8540': 1})
['0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



             precision    recall  f1-score   support

          0       0.84      1.00      0.92        54
          1       0.92      0.58      0.71        19
          2       0.92      0.82      0.87        28

avg / total       0.88      0.87      0.86       101

[54  0  0  6 11  2  4  1 23]
LR Accuracy:  0.8712871287128713
LR F1:  0.8309520616482869
For name:  t_becker
total sample size before apply threshold:  21
Counter({'0000-0002-4117-8249': 12, '0000-0002-5656-4564': 5, '0000-0003-3432-783X': 3, '0000-0002-5193-4044': 1})
['0000-0002-4117-8249']
Total sample size after apply threshold:  12
For name:  s_pedersen
total sample size before apply threshold:  322
Counter({'0000-0002-7838-8063': 166, '0000-0002-3044-7714': 80, '0000-0002-6500-9263': 40, '0000-0002-4786-6464': 21, '0000-0001-8055-3251': 11, '0000-0002-8566-7693': 1, '0000-0002-4355-1764': 1, '0000-0002-3822-5075': 1, '0000-0001-8017-4227': 1})
['0000-0002-7838-8063', '0000-0002-3044-7714', '0000-0002-4786-6464', '00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.96      0.99      0.98       166
          1       0.85      0.96      0.90        80
          2       1.00      0.62      0.76        21
          3       1.00      0.55      0.71        11
          4       1.00      0.95      0.97        40

avg / total       0.94      0.94      0.93       318

[164   2   0   0   0   3  77   0   0   0   1   7  13   0   0   2   3   0
   6   0   0   2   0   0  38]
svc Accuracy:  0.9371069182389937
svc F1:  0.8643444962330411
             precision    recall  f1-score   support

          0       0.80      1.00      0.89       166
          1       0.98      0.81      0.89        80
          2       1.00      0.57      0.73        21
          3       0.00      0.00      0.00        11
          4       1.00      0.80      0.89        40

avg / total       0.86      0.86      0.85       318

[166   0   0   0   0  15  65   0   0   0   9   0  12   0   0  11   0   0
   0   0   7  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



[15  0  1 10]
LR Accuracy:  0.9615384615384616
LR F1:  0.9600614439324117
For name:  k_jones
total sample size before apply threshold:  607
Counter({'0000-0001-7108-9776': 331, '0000-0002-0294-0851': 74, '0000-0001-8923-2999': 55, '0000-0001-8398-2190': 32, '0000-0003-4764-7031': 29, '0000-0002-7380-9797': 18, '0000-0001-9136-0877': 15, '0000-0002-7216-2506': 13, '0000-0003-3815-5713': 9, '0000-0002-8819-8992': 6, '0000-0002-7127-1612': 4, '0000-0002-0242-7097': 4, '0000-0002-6916-8640': 4, '0000-0001-5692-653X': 3, '0000-0002-9982-8742': 3, '0000-0002-0478-8021': 2, '0000-0001-9373-0982': 1, '0000-0001-7335-1379': 1, '0000-0001-6553-8897': 1, '0000-0002-1552-7847': 1, '0000-0001-9115-4192': 1})
['0000-0002-7380-9797', '0000-0001-8923-2999', '0000-0002-7216-2506', '0000-0003-4764-7031', '0000-0001-7108-9776', '0000-0001-9136-0877', '0000-0002-0294-0851', '0000-0001-8398-2190']
Total sample size after apply threshold:  567
TfidfVectorizer(analyzer='word', binary=False, decode_error='st

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.22      0.36        18
          1       1.00      0.60      0.75        55
          2       0.00      0.00      0.00        13
          3       0.00      0.00      0.00        29
          4       0.72      1.00      0.84       331
          5       1.00      0.20      0.33        15
          6       0.96      0.73      0.83        74
          7       1.00      0.34      0.51        32

avg / total       0.76      0.77      0.72       567

[  4   0   0   0  14   0   0   0   0  33   0   0  22   0   0   0   0   0
   0   0  13   0   0   0   0   0   0   0  28   0   1   0   0   0   0   0
 331   0   0   0   0   0   0   0  11   3   1   0   0   0   0   0  20   0
  54   0   0   0   0   0  21   0   0  11]
MNB Accuracy:  0.7689594356261023
MNB F1:  0.453285266475995
             precision    recall  f1-score   support

          0       1.00      0.89      0.94        18
          1       1.00      0.73      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.89      0.94        28
          1       0.91      1.00      0.96        32
          2       1.00      1.00      1.00        29

avg / total       0.97      0.97      0.97        89

[25  3  0  0 32  0  0  0 29]
svc Accuracy:  0.9662921348314607
svc F1:  0.9662067023373698
             precision    recall  f1-score   support

          0       1.00      0.82      0.90        28
          1       0.86      1.00      0.93        32
          2       1.00      1.00      1.00        29

avg / total       0.95      0.94      0.94        89

[23  5  0  0 32  0  0  0 29]
LR Accuracy:  0.9438202247191011
LR F1:  0.9431656720659278
For name:  s_rafiq
total sample size before apply threshold:  33
Counter({'0000-0003-4873-4540': 23, '0000-0002-9295-3065': 9, '0000-0003-4821-5783': 1})
['0000-0003-4873-4540']
Total sample size after apply threshold:  23
For name:  h_liang
total sample size before apply threshold: 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.40      0.57        15
          1       0.70      0.85      0.77        27
          2       0.72      0.97      0.83        30
          3       1.00      0.50      0.67        10
          4       1.00      0.83      0.91        12

avg / total       0.83      0.78      0.76        94

[ 6  7  2  0  0  0 23  4  0  0  0  1 29  0  0  0  2  3  5  0  0  0  2  0
 10]
LR Accuracy:  0.776595744680851
LR F1:  0.7484848484848485
For name:  c_davis
total sample size before apply threshold:  43
Counter({'0000-0002-5045-0507': 34, '0000-0002-3971-3505': 2, '0000-0003-0866-7822': 2, '0000-0002-0024-2742': 2, '0000-0002-3274-5707': 2, '0000-0001-6205-9719': 1})
['0000-0002-5045-0507']
Total sample size after apply threshold:  34
For name:  e_hall
total sample size before apply threshold:  115
Counter({'0000-0001-5999-5020': 49, '0000-0002-5306-082X': 34, '0000-0002-9477-8619': 24, '0000-0002-9206-4436': 4, '0000-0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.84      0.94      0.88        49
          1       1.00      0.79      0.88        24
          2       0.91      0.88      0.90        34

avg / total       0.90      0.89      0.89       107

[46  0  3  5 19  0  4  0 30]
MNB Accuracy:  0.8878504672897196
MNB F1:  0.8879529009692148
             precision    recall  f1-score   support

          0       0.86      1.00      0.92        49
          1       1.00      0.83      0.91        24
          2       1.00      0.88      0.94        34

avg / total       0.94      0.93      0.93       107

[49  0  0  4 20  0  4  0 30]
svc Accuracy:  0.9252336448598131
svc F1:  0.9237064036592338
             precision    recall  f1-score   support

          0       0.68      1.00      0.81        49
          1       1.00      0.46      0.63        24
          2       1.00      0.71      0.83        34

avg / total       0.85      0.79      0.77       107

[49  0  0 13 1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.65      0.73      0.69        15
          1       0.67      0.57      0.62        14

avg / total       0.66      0.66      0.65        29

[11  4  6  8]
LR Accuracy:  0.6551724137931034
LR F1:  0.6514423076923077
For name:  r_lewis
total sample size before apply threshold:  427
Counter({'0000-0003-3470-923X': 185, '0000-0002-2002-4339': 175, '0000-0003-4044-9104': 41, '0000-0002-4598-7553': 7, '0000-0003-1395-3276': 6, '0000-0003-1859-0021': 4, '0000-0001-9929-2629': 3, '0000-0001-6642-5771': 3, '0000-0002-2680-6235': 1, '0000-0002-6644-6385': 1, '0000-0003-1046-811X': 1})
['0000-0002-2002-4339', '0000-0003-3470-923X', '0000-0003-4044-9104']
Total sample size after apply threshold:  401
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.99      0.95      0.97       175
          1       0.89      1.00      0.94       185
          2       1.00      0.63      0.78        41

avg / total       0.95      0.94      0.94       401

[167   8   0   0 185   0   1  14  26]
MNB Accuracy:  0.942643391521197
MNB F1:  0.8979192956500297
             precision    recall  f1-score   support

          0       1.00      0.93      0.96       175
          1       0.88      1.00      0.93       185
          2       1.00      0.68      0.81        41

avg / total       0.94      0.94      0.93       401

[162  13   0   0 185   0   0  13  28]
svc Accuracy:  0.9351620947630923
svc F1:  0.9024539898620662
             precision    recall  f1-score   support

          0       1.00      0.90      0.95       175
          1       0.83      1.00      0.91       185
          2       1.00      0.51      0.68        41

avg / total       0.92      0.91      0.90       40

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.94      0.97        18
          1       0.96      1.00      0.98        23

avg / total       0.98      0.98      0.98        41

[17  1  0 23]
LR Accuracy:  0.975609756097561
LR F1:  0.9750759878419453
For name:  p_hall
total sample size before apply threshold:  22
Counter({'0000-0001-6015-7841': 11, '0000-0001-9218-6233': 9, '0000-0002-4239-4226': 1, '0000-0002-8214-0351': 1})
['0000-0001-6015-7841']
Total sample size after apply threshold:  11
For name:  r_srivastava
total sample size before apply threshold:  184
Counter({'0000-0002-0065-4069': 144, '0000-0003-3112-4252': 22, '0000-0002-6703-9642': 7, '0000-0001-9328-146X': 6, '0000-0002-0165-1556': 3, '0000-0002-9965-851X': 2})
['0000-0002-0065-4069', '0000-0003-3112-4252']
Total sample size after apply threshold:  166
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input=

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.87      1.00      0.93       144
          1       0.00      0.00      0.00        22

avg / total       0.75      0.87      0.81       166

[144   0  22   0]
svc Accuracy:  0.8674698795180723
svc F1:  0.46451612903225803
             precision    recall  f1-score   support

          0       0.87      1.00      0.93       144
          1       0.00      0.00      0.00        22

avg / total       0.75      0.87      0.81       166

[144   0  22   0]
LR Accuracy:  0.8674698795180723
LR F1:  0.46451612903225803
For name:  a_macedo
total sample size before apply threshold:  29
Counter({'0000-0002-2613-4838': 18, '0000-0003-3436-2010': 8, '0000-0002-6854-9855': 2, '0000-0001-6985-4520': 1})
['0000-0002-2613-4838']
Total sample size after apply threshold:  18
For name:  m_schultz
total sample size before apply threshold:  40
Counter({'0000-0003-3458-1811': 16, '0000-0002-7689-6531': 16, '0000-0003-3455-774X': 4, '000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(66, 207)
(0, 0)
(0, 0)
1
66
             precision    recall  f1-score   support

          0       1.00      0.93      0.96        14
          1       0.56      0.88      0.68        16
          2       1.00      0.91      0.95        11
          3       0.56      0.38      0.45        13
          4       1.00      0.75      0.86        12

avg / total       0.81      0.77      0.77        66

[13  1  0  0  0  0 14  0  2  0  0  1 10  0  0  0  8  0  5  0  0  1  0  2
  9]
MNB Accuracy:  0.7727272727272727
MNB F1:  0.7819918112601039
             

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.50      0.08      0.13        13
          1       0.83      0.98      0.90        61

avg / total       0.77      0.82      0.77        74

[ 1 12  1 60]
MNB Accuracy:  0.8243243243243243
MNB F1:  0.5177944862155389
             precision    recall  f1-score   support

          0       1.00      0.23      0.38        13
          1       0.86      1.00      0.92        61

avg / total       0.88      0.86      0.83        74

[ 3 10  0 61]
svc Accuracy:  0.8648648648648649
svc F1:  0.6496212121212122
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        13
          1       0.82      1.00      0.90        61

avg / total       0.68      0.82      0.74        74

[ 0 13  0 61]
LR Accuracy:  0.8243243243243243
LR F1:  0.45185185185185184
For name:  r_gross
total sample size before apply threshold:  71
Counter({'0000-0001-5884-3607': 38, '0000-0003-4524-7552': 23, 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



(109, 274)
(0, 0)
(0, 0)
1
109
             precision    recall  f1-score   support

          0       1.00      0.15      0.27        26
          1       0.50      0.07      0.12        14
          2       0.66      0.99      0.79        69

avg / total       0.72      0.67      0.58       109

[ 4  0 22  0  1 13  0  1 68]
MNB Accuracy:  0.6697247706422018
MNB F1:  0.3941214470284238
             precision    recall  f1-score   support

          0       0.83      0.58      0.68        26
          1       1.00      0.71      0.83        14
          2       0.83      0.97      0.89        69

avg / total       0.85      0.84      0.84       109

[15  0 11  1 10  3  2  0 67]
svc Accuracy:  0.8440366972477065
svc F1:  0.8028282828282828
             precision    recall  f1-score   support

          0       1.00      0.12      0.21        26
          1       1.00      0.21      0.35        14
          2       0.67      1.00      0.80        69

avg / total       0.79      0.69    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(27, 114)
(0, 0)
(0, 0)
1
27
             precision    recall  f1-score   support

          0       0.80      1.00      0.89        16
          1       1.00      0.64      0.78        11

avg / total       0.88      0.85      0.84        27

[16  0  4  7]
MNB Accuracy:  0.8518518518518519
MNB F1:  0.8333333333333334
             precision    recall  f1-score   support

          0       0.83      0.94      0.88        16
          1       0.89      0.73      0.80        11

avg / total       0.86      0.85      0.85        27

[15  1  3  8]
svc Accuracy:  0.8518518518518519
svc F1:  0.8411764705882352
             precision    recall  f1-score   support

          0       0.73      1.00      0.84        16
          1       1.00      0.45      0.62        11

avg / total       0.84      0.78      0.75        27

[16  0  6  5]
LR Accuracy:  0.7777777777777778
LR F1:  0.7335526315789473
For name:  d_lloyd
total sample size before apply threshold:  157
Counter({'0000-0002-0824-9682': 10

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.60      0.75        10
          1       0.98      1.00      0.99       174

avg / total       0.98      0.98      0.98       184

[  6   4   0 174]
svc Accuracy:  0.9782608695652174
svc F1:  0.8693181818181818
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.95      1.00      0.97       174

avg / total       0.89      0.95      0.92       184

[  0  10   0 174]
LR Accuracy:  0.9456521739130435
LR F1:  0.4860335195530726
For name:  s_chang
total sample size before apply threshold:  592
Counter({'0000-0001-6505-4139': 322, '0000-0002-5620-0867': 61, '0000-0003-3751-1720': 37, '0000-0002-6164-0875': 28, '0000-0002-7624-439X': 22, '0000-0002-2663-5042': 20, '0000-0002-5015-8178': 19, '0000-0003-1523-7986': 15, '0000-0003-4160-7549': 12, '0000-0002-2564-2945': 11, '0000-0003-1488-1649': 11, '0000-0002-0558-0038': 8, '0000-0003-08

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.21      0.35        19
          1       0.67      0.99      0.80       322
          2       0.00      0.00      0.00        11
          3       0.00      0.00      0.00        11
          4       0.00      0.00      0.00        15
          5       0.67      0.11      0.19        37
          6       0.00      0.00      0.00        28
          7       0.86      0.97      0.91        61
          8       0.00      0.00      0.00        20
          9       0.00      0.00      0.00        12
         10       0.00      0.00      0.00        22

avg / total       0.56      0.69      0.58       558

[  4  14   0   0   0   1   0   0   0   0   0   0 319   0   0   0   0   0
   3   0   0   0   0   9   0   0   0   0   0   2   0   0   0   0  11   0
   0   0   0   0   0   0   0   0   0  15   0   0   0   0   0   0   0   0
   0   0  33   0   0   0   4   0   0   0   0   0   0  28   0   0   0   0
   0   0   0   0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(214, 693)
(0, 0)
(0, 0)
1
214
             precision    recall  f1-score   support

          0       0.96      0.89      0.93        85
          1       0.80      1.00      0.89       102
          2       1.00      0.30      0.46        27

avg / total       0.89      0.87      0.85       214

[ 76   9   0   0 102   0   3  16   8]
MNB Accuracy:  0.8691588785046729
MNB F1:  0.7582672732528949


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.99      0.88      0.93        85
          1       0.81      1.00      0.89       102
          2       1.00      0.44      0.62        27

avg / total       0.90      0.88      0.87       214

[ 75  10   0   0 102   0   1  14  12]
svc Accuracy:  0.883177570093458
svc F1:  0.813932825374473
             precision    recall  f1-score   support

          0       1.00      0.85      0.92        85
          1       0.74      1.00      0.85       102
          2       1.00      0.15      0.26        27

avg / total       0.88      0.83      0.80       214

[ 72  13   0   0 102   0   0  23   4]
LR Accuracy:  0.8317757009345794
LR F1:  0.6750873227861106
For name:  h_yoo
total sample size before apply threshold:  22
Counter({'0000-0001-6186-3262': 11, '0000-0001-9677-0947': 4, '0000-0001-9819-3135': 3, '0000-0002-8039-9482': 3, '0000-0003-3810-1811': 1})
['0000-0001-6186-3262']
Total sample size after apply threshold:

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



[ 7  3  2  0 47  1  0  0 37]
MNB Accuracy:  0.9381443298969072
MNB F1:  0.8856882465905023
             precision    recall  f1-score   support

          0       0.90      0.75      0.82        12
          1       0.94      0.96      0.95        48
          2       0.97      1.00      0.99        37

avg / total       0.95      0.95      0.95        97

[ 9  3  0  1 46  1  0  0 37]
svc Accuracy:  0.9484536082474226
svc F1:  0.9177673643653025
             precision    recall  f1-score   support

          0       1.00      0.67      0.80        12
          1       0.94      0.96      0.95        48
          2       0.93      1.00      0.96        37

avg / total       0.94      0.94      0.93        97

[ 8  3  1  0 46  2  0  0 37]
LR Accuracy:  0.9381443298969072
LR F1:  0.903164189762128
For name:  m_vitale
total sample size before apply threshold:  217
Counter({'0000-0002-3261-6868': 98, '0000-0001-5372-7885': 63, '0000-0003-2084-2718': 35, '0000-0002-6740-2472': 12, '0000-000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        43
          1       1.00      1.00      1.00        11
          2       1.00      1.00      1.00        35

avg / total       1.00      1.00      1.00        89

[43  0  0  0 11  0  0  0 35]
svc Accuracy:  1.0
svc F1:  1.0
             precision    recall  f1-score   support

          0       0.83      1.00      0.91        43
          1       1.00      0.27      0.43        11
          2       1.00      0.97      0.99        35

avg / total       0.92      0.90      0.88        89

[43  0  0  8  3  0  1  0 34]
LR Accuracy:  0.898876404494382
LR F1:  0.7731139442809923
For name:  a_hassan
total sample size before apply threshold:  16
Counter({'0000-0001-9509-9266': 7, '0000-0002-7719-0805': 4, '0000-0001-9346-3765': 2, '0000-0001-8842-1798': 1, '0000-0002-1853-7987': 1, '0000-0002-5574-8791': 1})
[]
Total sample size after apply threshold:  0
For name:  w_martin
total sample siz

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.85      1.00      0.92       180
          1       0.98      0.73      0.84        60
          2       0.00      0.00      0.00        18

avg / total       0.82      0.87      0.83       258

[180   0   0  16  44   0  17   1   0]
svc Accuracy:  0.8682170542635659
svc F1:  0.584708590815461
             precision    recall  f1-score   support

          0       0.78      1.00      0.88       180
          1       1.00      0.45      0.62        60
          2       0.00      0.00      0.00        18

avg / total       0.78      0.80      0.76       258

[180   0   0  33  27   0  18   0   0]
LR Accuracy:  0.8023255813953488
LR F1:  0.498867354643846
For name:  a_krishnan
total sample size before apply threshold:  46
Counter({'0000-0002-9173-7811': 41, '0000-0002-7489-9229': 3, '0000-0002-7980-4110': 1, '0000-0002-9677-9092': 1})
['0000-0002-9173-7811']
Total sample size after apply threshold:  41
For name:  l_tav

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.74      1.00      0.85        55
          1       1.00      0.20      0.33        10
          2       1.00      0.21      0.35        14

avg / total       0.82      0.76      0.70        79

[55  0  0  8  2  0 11  0  3]
svc Accuracy:  0.759493670886076
svc F1:  0.5129958960328317
             precision    recall  f1-score   support

          0       0.70      1.00      0.82        55
          1       0.00      0.00      0.00        10
          2       0.00      0.00      0.00        14

avg / total       0.48      0.70      0.57        79

[55  0  0 10  0  0 14  0  0]
LR Accuracy:  0.6962025316455697
LR F1:  0.2736318407960199
For name:  a_schmidt
total sample size before apply threshold:  90
Counter({'0000-0002-1090-8165': 51, '0000-0002-3925-9429': 14, '0000-0003-1327-0424': 12, '0000-0002-1185-3012': 9, '0000-0001-8946-1310': 1, '0000-0002-9963-7786': 1, '0000-0002-6448-6367': 1, '0000-0001-6144-9950': 1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(295, 1580)
(0, 0)
(0, 0)
1
295
             precision    recall  f1-score   support

          0       1.00      0.42      0.59        48
          1       0.98      0.96      0.97        51
          2       1.00      0.12      0.21        17
          3       0.00      0.00      0.00        13
          4       0.67      1.00      0.80       149
          5       0.00      0.00      0.00        17

avg / total       0.73      0.75      0.68       295

[ 20   1   0   0  27   0   0  49   0   0   2   0   0   0   2   0  15   0
   0   0   0   0  13   0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.95      0.85      0.90        48
          1       1.00      0.96      0.98        51
          2       1.00      0.71      0.83        17
          3       1.00      0.46      0.63        13
          4       0.86      1.00      0.93       149
          5       1.00      0.71      0.83        17

avg / total       0.92      0.91      0.91       295

[ 41   0   0   0   7   0   0  49   0   0   2   0   0   0  12   0   5   0
   2   0   0   6   5   0   0   0   0   0 149   0   0   0   0   0   5  12]
svc Accuracy:  0.911864406779661
svc F1:  0.8488860167949571
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        48
          1       1.00      0.94      0.97        51
          2       1.00      0.29      0.45        17
          3       1.00      0.15      0.27        13
          4       0.71      1.00      0.83       149
          5       1.00      0.29      0.45    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


[20  1  0 17]
MNB Accuracy:  0.9736842105263158
MNB F1:  0.9735191637630662
             precision    recall  f1-score   support

          0       1.00      0.95      0.98        21
          1       0.94      1.00      0.97        17

avg / total       0.98      0.97      0.97        38

[20  1  0 17]
svc Accuracy:  0.9736842105263158
svc F1:  0.9735191637630662
             precision    recall  f1-score   support

          0       0.95      0.95      0.95        21
          1       0.94      0.94      0.94        17

avg / total       0.95      0.95      0.95        38

[20  1  1 16]
LR Accuracy:  0.9473684210526315
LR F1:  0.9467787114845938
For name:  s_lam
total sample size before apply threshold:  90
Counter({'0000-0003-3294-6637': 69, '0000-0001-7468-1142': 6, '0000-0002-5318-1760': 5, '0000-0002-2982-9192': 3, '0000-0002-1888-1067': 3, '0000-0001-7943-5004': 3, '0000-0002-1471-5176': 1})
['0000-0003-3294-6637']
Total sample size after apply threshold:  69
For name:  t_tran
t

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.72      1.00      0.84       140
          1       1.00      0.41      0.58        27
          2       1.00      0.84      0.91        89
          3       0.00      0.00      0.00        12
          4       0.00      0.00      0.00        12

avg / total       0.78      0.81      0.77       280

[140   0   0   0   0  16  11   0   0   0  14   0  75   0   0  12   0   0
   0   0  12   0   0   0   0]
MNB Accuracy:  0.8071428571428572
MNB F1:  0.4663809736111858
             precision    recall  f1-score   support

          0       0.86      0.99      0.92       140
          1       0.90      0.67      0.77        27
          2       0.94      0.90      0.92        89
          3       1.00      0.33      0.50        12
          4       1.00      0.75      0.86        12

avg / total       0.90      0.89      0.88       280

[139   0   1   0   0   5  18   4   0   0   9   0  80   0   0   6   2   0
   4   0   3  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.80      0.97      0.88        29
          1       0.94      1.00      0.97        31
          2       1.00      0.27      0.43        11

avg / total       0.89      0.87      0.85        71

[28  1  0  0 31  0  7  1  3]
LR Accuracy:  0.8732394366197183
LR F1:  0.7574404761904762
For name:  j_hong
total sample size before apply threshold:  143
Counter({'0000-0002-2476-3737': 29, '0000-0002-4592-7083': 26, '0000-0002-2891-5785': 20, '0000-0001-9467-6463': 16, '0000-0001-9912-633X': 12, '0000-0003-2212-2861': 12, '0000-0002-9915-8072': 8, '0000-0003-0617-9307': 6, '0000-0001-7979-5966': 5, '0000-0002-0109-5975': 5, '0000-0001-5172-6889': 4})
['0000-0001-9912-633X', '0000-0002-2891-5785', '0000-0003-2212-2861', '0000-0002-2476-3737', '0000-0001-9467-6463', '0000-0002-4592-7083']
Total sample size after apply threshold:  115
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(127, 434)
(0, 0)
(0, 0)
1
127
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.95      0.73      0.82        48
          2       0.76      0.99      0.86        69

avg / total       0.77      0.81      0.78       127

[ 0  1  9  0 35 13  0  1 68]
MNB Accuracy:  0.8110236220472441
MNB F1:  0.5596251079047971
             precision    recall  f1-score   support

          0       1.00      0.40      0.57        10
          1       0.97      0.81      0.89        48
      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(109, 284)
(0, 0)
(0, 0)
1
109
             precision    recall  f1-score   support

          0       1.00      0.88      0.94        17
          1       1.00      0.69      0.82        13
          2       1.00      0.29      0.44        14
          3       1.00      0.45      0.62        11
          4       1.00      0.20      0.33        15
          5       0.53      1.00      0.70        39

avg / total       0.83      0.69      0.66       109

[15  0  0  0  0  2  0  9  0  0  0  4  0  0  4  0  0 10  0  0  0  5  0  6
  0  0  0  0  3 12  0  0 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 734
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(734, 830)
(0, 0)
(0, 0)
1
734
             precision    recall  f1-score   support

          0       1.00      0.21      0.34        24
          1       0.00      0.00      0.00        12
          2       0.00      0.00      0.00        15
          3       0.96      0.86      0.91        28
          4       0.00      0.00      0.00        18
          5       0.00      0.00      0.00        16
          6       0.00      0.00      0.00        20
          7       0.00      0.00      0.00        10
          8       1.00      0.10      0.17

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.95      0.88      0.91        24
          1       1.00      0.67      0.80        12
          2       0.67      0.53      0.59        15
          3       0.93      0.93      0.93        28
          4       0.86      0.33      0.48        18
          5       0.93      0.81      0.87        16
          6       0.86      0.95      0.90        20
          7       1.00      0.90      0.95        10
          8       0.95      0.86      0.90        21
          9       0.67      0.97      0.79       188
         10       0.94      0.89      0.91        18
         11       0.93      0.58      0.72        24
         12       1.00      0.97      0.98        58
         13       0.56      0.31      0.40        16
         14       0.93      0.74      0.82        19
         15       0.81      0.93      0.87        14
         16       1.00      0.35      0.52        20
         17       0.77      0.53      0.62   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.92      0.96        12
          1       1.00      0.96      0.98        23
          2       0.94      0.79      0.86        19
          3       1.00      1.00      1.00        16
          4       1.00      0.75      0.86        20
          5       1.00      0.79      0.88        24
          6       1.00      0.94      0.97        34
          7       0.63      1.00      0.77        39
          8       1.00      0.75      0.86        12
          9       1.00      0.77      0.87        13

avg / total       0.93      0.89      0.89       212

[11  0  0  0  0  0  0  1  0  0  0 22  0  0  0  0  0  1  0  0  0  0 15  0
  0  0  0  4  0  0  0  0  0 16  0  0  0  0  0  0  0  0  0  0 15  0  0  5
  0  0  0  0  1  0  0 19  0  4  0  0  0  0  0  0  0  0 32  2  0  0  0  0
  0  0  0  0  0 39  0  0  0  0  0  0  0  0  0  3  9  0  0  0  0  0  0  0
  0  3  0 10]
svc Accuracy:  0.8867924528301887
svc F1:  0.9000988433

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.89      0.94        38
          1       1.00      0.75      0.86        12
          2       0.87      1.00      0.93        45

avg / total       0.94      0.93      0.93        95

[34  0  4  0  9  3  0  0 45]
svc Accuracy:  0.9263157894736842
svc F1:  0.9098074510445645
             precision    recall  f1-score   support

          0       1.00      0.87      0.93        38
          1       1.00      0.67      0.80        12
          2       0.83      1.00      0.91        45

avg / total       0.92      0.91      0.90        95

[33  0  5  0  8  4  0  0 45]
LR Accuracy:  0.9052631578947369
LR F1:  0.8795561246265472
For name:  c_barros
total sample size before apply threshold:  34
Counter({'0000-0003-4666-5000': 16, '0000-0003-3244-7467': 13, '0000-0003-2330-398X': 2, '0000-0002-5863-2874': 2, '0000-0003-2236-4553': 1})
['0000-0003-3244-7467', '0000-0003-4666-5000']
Total sample size after apply

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 157
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(157, 794)
(0, 0)
(0, 0)
1
157
             precision    recall  f1-score   support

          0       0.91      1.00      0.96       139
          1       1.00      0.28      0.43        18

avg / total       0.92      0.92      0.90       157

[139   0  13   5]
MNB Accuracy:  0.9171974522292994
MNB F1:  0.695054534588376
             precision    recall  f1-score   support

          0       0.95      1.00      0.97       139
          1       1.00      0.56      0.71        18

avg / total       0.95      0.95      0.94       157

[139   0   

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.60      0.75        15
          1       1.00      1.00      1.00        14
          2       0.81      1.00      0.89        79
          3       1.00      0.93      0.96        14
          4       1.00      0.80      0.89        10
          5       1.00      0.86      0.92        14
          6       1.00      0.65      0.79        23

avg / total       0.91      0.89      0.88       169

[ 9  0  6  0  0  0  0  0 14  0  0  0  0  0  0  0 79  0  0  0  0  0  0  1
 13  0  0  0  0  0  2  0  8  0  0  0  0  2  0  0 12  0  0  0  8  0  0  0
 15]
svc Accuracy:  0.8875739644970414
svc F1:  0.8867225466244201
             precision    recall  f1-score   support

          0       1.00      0.47      0.64        15
          1       1.00      1.00      1.00        14
          2       0.61      1.00      0.76        79
          3       1.00      0.29      0.44        14
          4       1.00      0.10      0.1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.93      0.96        29
          1       0.95      1.00      0.98        40

avg / total       0.97      0.97      0.97        69

[27  2  0 40]
LR Accuracy:  0.9710144927536232
LR F1:  0.9699477351916377
For name:  j_chang
total sample size before apply threshold:  360
Counter({'0000-0001-5726-9797': 85, '0000-0002-8423-5987': 38, '0000-0002-6596-931X': 33, '0000-0002-0890-9302': 31, '0000-0002-3880-3787': 29, '0000-0002-2717-0101': 23, '0000-0001-5582-0928': 17, '0000-0002-4655-1516': 17, '0000-0002-6477-6938': 15, '0000-0003-3773-182X': 12, '0000-0001-8651-2602': 11, '0000-0001-5039-2186': 9, '0000-0001-7843-2688': 9, '0000-0002-6711-1739': 8, '0000-0002-3974-8089': 5, '0000-0001-7449-4080': 4, '0000-0003-3469-9553': 4, '0000-0001-5241-8175': 3, '0000-0002-3811-1254': 2, '0000-0003-4633-587X': 2, '0000-0003-2613-7585': 1, '0000-0003-0041-4804': 1, '0000-0002-4296-4065': 1})
['0000-0001-5726-9797', '0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.94      0.89      0.92        85
          1       1.00      1.00      1.00        12
          2       0.93      0.82      0.87        17
          3       0.91      0.87      0.89        23
          4       0.97      0.88      0.92        33
          5       0.91      0.67      0.77        15
          6       0.93      0.97      0.95        29
          7       0.78      0.90      0.84        31
          8       0.62      0.82      0.70        38
          9       1.00      0.88      0.94        17
         10       1.00      0.82      0.90        11

avg / total       0.89      0.87      0.88       311

[76  0  0  0  0  1  0  0  8  0  0  0 12  0  0  0  0  0  0  0  0  0  0  0
 14  0  0  0  1  2  0  0  0  1  0  0 20  0  0  0  0  2  0  0  0  0  0  0
 29  0  0  1  3  0  0  2  0  0  0  0 10  0  0  3  0  0  0  0  0  0  0  0
 28  1  0  0  0  0  0  1  0  0  0  0 28  2  0  0  2  0  0  2  1  0  0  2
 31  0  0  0  0 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.82      0.82      0.82        11
          1       0.67      0.95      0.78        19
          2       1.00      0.85      0.92        13
          3       1.00      0.93      0.96        14
          4       1.00      0.62      0.76        13

avg / total       0.88      0.84      0.85        70

[ 9  2  0  0  0  1 18  0  0  0  1  1 11  0  0  0  1  0 13  0  0  5  0  0
  8]
svc Accuracy:  0.8428571428571429
svc F1:  0.8484649810736767
             precision    recall  f1-score   support

          0       0.86      0.55      0.67        11
          1       0.58      1.00      0.73        19
          2       1.00      0.77      0.87        13
          3       1.00      1.00      1.00        14
          4       1.00      0.46      0.63        13

avg / total       0.86      0.79      0.78        70

[ 6  5  0  0  0  0 19  0  0  0  1  2 10  0  0  0  0  0 14  0  0  7  0  0
  6]
LR Accuracy:  0.7857142857142857
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        15
          1       1.00      0.08      0.15        25
          2       1.00      0.09      0.16        23
          3       0.00      0.00      0.00        13
          4       0.97      0.86      0.91        42
          5       0.00      0.00      0.00        21
          6       1.00      0.10      0.18        10
          7       0.00      0.00      0.00        21
          8       0.00      0.00      0.00        15
          9       0.00      0.00      0.00        13
         10       0.00      0.00      0.00        20
         11       0.31      1.00      0.47        98
         12       0.00      0.00      0.00        14
         13       1.00      0.58      0.73        19
         14       0.00      0.00      0.00        12
         15       0.00      0.00      0.00        10

avg / total       0.40      0.40      0.29       371

[ 0  0  0  0  0  0  0  0  0  0  0 15  0  0 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


28
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(28, 91)
(0, 0)
(0, 0)
1
28
             precision    recall  f1-score   support

          0       1.00      0.93      0.96        14
          1       0.93      1.00      0.97        14

avg / total       0.97      0.96      0.96        28

[13  1  0 14]
MNB Accuracy:  0.9642857142857143
MNB F1:  0.9642401021711366
             precision    recall  f1-score   support

          0       1.00      0.93      0.96        14
          1       0.93      1.00      0.97        14

avg / total       0.97      0.96      0.96        28

[13  1  0 14]
svc A

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.92      1.00      0.96        92
          1       1.00      0.80      0.89        41

avg / total       0.94      0.94      0.94       133

[92  0  8 33]
svc Accuracy:  0.9398496240601504
svc F1:  0.9251126126126126
             precision    recall  f1-score   support

          0       0.81      1.00      0.90        92
          1       1.00      0.49      0.66        41

avg / total       0.87      0.84      0.82       133

[92  0 21 20]
LR Accuracy:  0.8421052631578947
LR F1:  0.7766493402638945
For name:  d_hwang
total sample size before apply threshold:  52
Counter({'0000-0002-2487-2255': 40, '0000-0002-9684-3998': 9, '0000-0001-5275-0354': 2, '0000-0001-6899-1769': 1})
['0000-0002-2487-2255']
Total sample size after apply threshold:  40
For name:  c_shen
total sample size before apply threshold:  111
Counter({'0000-0002-0747-217X': 56, '0000-0003-2833-2771': 22, '0000-0002-2517-3472': 7, '0000-0001-7392-0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.23      0.37        22
          1       0.77      1.00      0.87        56

avg / total       0.83      0.78      0.73        78

[ 5 17  0 56]
LR Accuracy:  0.782051282051282
LR F1:  0.6192937123169682
For name:  v_lopes
total sample size before apply threshold:  26
Counter({'0000-0003-1599-2180': 20, '0000-0003-2278-8559': 3, '0000-0003-2079-4170': 2, '0000-0001-8276-4490': 1})
['0000-0003-1599-2180']
Total sample size after apply threshold:  20
For name:  m_quintana
total sample size before apply threshold:  68
Counter({'0000-0003-3601-0262': 29, '0000-0002-7036-8658': 17, '0000-0002-3808-8189': 16, '0000-0002-7934-4361': 3, '0000-0001-6190-3324': 2, '0000-0002-2677-6179': 1})
['0000-0002-3808-8189', '0000-0002-7036-8658', '0000-0003-3601-0262']
Total sample size after apply threshold:  62
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, enco

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(887, 1729)
(0, 0)
(0, 0)
1
887
             precision    recall  f1-score   support

          0       0.99      0.68      0.81       104
          1       1.00      0.08      0.14        39
          2       0.00      0.00      0.00        24
          3       0.67      1.00      0.80       487
          4       0.00      0.00      0.00        35
          5       1.00      0.11      0.19        57
          6       0.00      0.00      0.00        13
          7       0.00      0.00      0.00        23
          8       0.97      0.71      0.82    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.95      0.80      0.87       104
          1       1.00      0.54      0.70        39
          2       1.00      0.46      0.63        24
          3       0.81      0.99      0.89       487
          4       1.00      0.46      0.63        35
          5       0.86      0.67      0.75        57
          6       1.00      0.85      0.92        13
          7       1.00      0.70      0.82        23
          8       1.00      0.82      0.90       105

avg / total       0.88      0.86      0.85       887

[ 83   0   0  21   0   0   0   0   0   0  21   0  18   0   0   0   0   0
   1   0  11  10   0   2   0   0   0   3   0   0 482   0   2   0   0   0
   0   0   0  17  16   2   0   0   0   0   0   0  19   0  38   0   0   0
   0   0   0   2   0   0  11   0   0   0   0   0   7   0   0   0  16   0
   0   0   0  19   0   0   0   0  86]
svc Accuracy:  0.8613303269447576
svc F1:  0.7895837055808141
             precision

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(25, 92)
(0, 0)
(0, 0)
1
25
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       0.93      1.00      0.97        14

avg / total       0.96      0.96      0.96        25

[10  1  0 14]
MNB Accuracy:  0.96
MNB F1:  0.9589490968801313
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       0.93      1.00      0.97        14

avg / total       0.96      0.96      0.96        25

[10  1  0 14]
svc Accuracy:  0.96
sv

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.84      0.91        19
          1       1.00      1.00      1.00        11
          2       1.00      0.78      0.88        27
          3       0.84      1.00      0.92        76
          4       0.86      0.55      0.67        11
          5       1.00      0.91      0.95        11

avg / total       0.91      0.90      0.90       155

[16  0  0  3  0  0  0 11  0  0  0  0  0  0 21  5  1  0  0  0  0 76  0  0
  0  0  0  5  6  0  0  0  0  1  0 10]
svc Accuracy:  0.9032258064516129
svc F1:  0.8873326639892904
             precision    recall  f1-score   support

          0       1.00      0.37      0.54        19
          1       1.00      0.45      0.62        11
          2       1.00      0.63      0.77        27
          3       0.63      1.00      0.78        76
          4       1.00      0.18      0.31        11
          5       1.00      0.36      0.53        11

avg / total       0.82     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.83      0.91        24
          1       1.00      0.69      0.81        16
          2       0.81      1.00      0.89        38

avg / total       0.91      0.88      0.88        78

[20  0  4  0 11  5  0  0 38]
svc Accuracy:  0.8846153846153846
svc F1:  0.8726744569881824
             precision    recall  f1-score   support

          0       0.95      0.75      0.84        24
          1       1.00      0.12      0.22        16
          2       0.67      1.00      0.80        38

avg / total       0.82      0.74      0.69        78

[18  0  6  1  2 13  0  0 38]
LR Accuracy:  0.7435897435897436
LR F1:  0.6198105081826012
For name:  c_shao
total sample size before apply threshold:  96
Counter({'0000-0003-2618-9342': 61, '0000-0002-6953-2203': 23, '0000-0001-8260-4761': 9, '0000-0002-8691-5177': 3})
['0000-0002-6953-2203', '0000-0003-2618-9342']
Total sample size after apply threshold:  84
TfidfVectori

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.74      0.85        31
          1       1.00      0.71      0.83        31
          2       0.79      1.00      0.88        64

avg / total       0.89      0.87      0.86       126

[23  0  8  0 22  9  0  0 64]
MNB Accuracy:  0.8650793650793651
MNB F1:  0.8549330505955967
             precision    recall  f1-score   support

          0       1.00      0.87      0.93        31
          1       0.96      0.74      0.84        31
          2       0.85      1.00      0.92        64

avg / total       0.92      0.90      0.90       126

[27  1  3  0 23  8  0  0 64]
svc Accuracy:  0.9047619047619048
svc F1:  0.896087142824925
             precision    recall  f1-score   support

          0       1.00      0.58      0.73        31
          1       1.00      0.61      0.76        31
          2       0.72      1.00      0.84        64

avg / total       0.86      0.80      0.79       126

[18  0 13  0 19

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.93      0.78      0.85        18
          1       0.93      0.99      0.96        85
          2       0.95      0.90      0.92        39

avg / total       0.94      0.94      0.94       142

[14  3  1  0 84  1  1  3 35]
svc Accuracy:  0.9366197183098591
svc F1:  0.909845826687932
             precision    recall  f1-score   support

          0       1.00      0.22      0.36        18
          1       0.78      1.00      0.88        85
          2       0.97      0.72      0.82        39

avg / total       0.86      0.82      0.80       142

[ 4 13  1  0 85  0  0 11 28]
LR Accuracy:  0.823943661971831
LR F1:  0.6878181450649613
For name:  c_franco
total sample size before apply threshold:  64
Counter({'0000-0003-1958-3851': 28, '0000-0003-2288-1518': 18, '0000-0002-2861-3883': 17, '0000-0003-2729-4064': 1})
['0000-0003-2288-1518', '0000-0002-2861-3883', '0000-0003-1958-3851']
Total sample size after apply thr

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.83      0.91        18
          1       1.00      0.65      0.79        17
          2       0.76      1.00      0.86        28

avg / total       0.89      0.86      0.85        63

[15  0  3  0 11  6  0  0 28]
LR Accuracy:  0.8571428571428571
LR F1:  0.8521145521145521
For name:  v_wong
total sample size before apply threshold:  35
Counter({'0000-0001-6751-7942': 14, '0000-0002-2951-8108': 12, '0000-0001-9356-7556': 8, '0000-0003-2844-3789': 1})
['0000-0001-6751-7942', '0000-0002-2951-8108']
Total sample size after apply threshold:  26
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(131, 252)
(0, 0)
(0, 0)
1
131
             precision    recall  f1-score   support

          0       1.00      0.65      0.79        17
          1       0.92      1.00      0.96        81
          2       1.00      0.97      0.98        33

avg / total       0.95      0.95      0.94       131

[11  6  0  0 81  0  0  1 32]
MNB Accuracy:  0.9465648854961832
MNB F1:  0.909636517328825
             precision    recall  f1-score   support

          0       1.00      0.88      0.94        17
          1       0.95      1.00      0.98        81
       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(111, 380)
(0, 0)
(0, 0)
1
111
             precision    recall  f1-score   support

          0       0.58      1.00      0.73        40
          1       1.00      0.42      0.59        19
          2       1.00      0.78      0.88        18
          3       1.00      0.47      0.64        15
          4       1.00      0.68      0.81        19

avg / total       0.85      0.74      0.73       111

[40  0  0  0  0 11  8  0  0  0  4  0 14  0  0  8  0  0  7  0  6  0  0  0
 13]
MNB Accuracy:  0.7387387387387387
MNB F1:  0.7300802366169339
           

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(25, 109)
(0, 0)
(0, 0)
1
25
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        10
          1       0.94      1.00      0.97        15

avg / total       0.96      0.96      0.96        25

[ 9  1  0 15]
MNB Accuracy:  0.96
MNB F1:  0.9575551782682513
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        10
          1       0.94      1.00      0.97        15

avg / total       0.96      0.96      0.96        25

[ 9  1  0 15]
svc Accuracy:  0.96
s

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



(467, 419)
(0, 0)
(0, 0)
1
467
             precision    recall  f1-score   support

          0       0.52      0.83      0.64        53
          1       0.00      0.00      0.00        15
          2       0.00      0.00      0.00        13
          3       0.82      0.22      0.35        41
          4       0.34      0.97      0.50        71
          5       1.00      0.46      0.63        13
          6       1.00      0.75      0.86        12
          7       0.00      0.00      0.00        12
          8       0.60      0.07      0.13        42
          9       0.00      0.00      0.00        17
         10       0.00      0.00      0.00        10
         11       1.00      0.09      0.17        11
         12       0.68      0.88      0.77        32
         13       1.00      0.68      0.81        25
         14       0.52      0.79      0.63        48
         15       0.87      0.72      0.79        18
         16       1.00      0.06      0.11        17
         17  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.83      1.00      0.91       206
          1       0.00      0.00      0.00        14
          2       1.00      0.67      0.80        18
          3       0.98      0.81      0.89       120

avg / total       0.86      0.88      0.86       358

[206   0   0   0  14   0   0   0   4   0  12   2  23   0   0  97]
svc Accuracy:  0.8798882681564246
svc F1:  0.6488342556472829
             precision    recall  f1-score   support

          0       0.75      1.00      0.86       206
          1       0.00      0.00      0.00        14
          2       0.00      0.00      0.00        18
          3       1.00      0.70      0.82       120

avg / total       0.77      0.81      0.77       358

[206   0   0   0  14   0   0   0  18   0   0   0  36   0   0  84]
LR Accuracy:  0.8100558659217877
LR F1:  0.42046568627450975
For name:  s_keating
total sample size before apply threshold:  54
Counter({'0000-0002-8324-3694': 28, 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.86      0.92        14
          1       0.85      1.00      0.92        28
          2       1.00      0.70      0.82        10

avg / total       0.92      0.90      0.90        52

[12  2  0  0 28  0  0  3  7]
svc Accuracy:  0.9038461538461539
svc F1:  0.888213040575625
             precision    recall  f1-score   support

          0       1.00      0.43      0.60        14
          1       0.70      1.00      0.82        28
          2       1.00      0.60      0.75        10

avg / total       0.84      0.77      0.75        52

[ 6  8  0  0 28  0  0  4  6]
LR Accuracy:  0.7692307692307693
LR F1:  0.7245098039215686
For name:  a_bennett
total sample size before apply threshold:  56
Counter({'0000-0003-3829-0309': 51, '0000-0001-8895-6418': 2, '0000-0001-7448-8182': 1, '0000-0003-4194-9741': 1, '0000-0001-6968-9465': 1})
['0000-0003-3829-0309']
Total sample size after apply threshold:  51
For name

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(90, 224)
(0, 0)
(0, 0)
1
90
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       0.70      1.00      0.82        47
          2       1.00      0.64      0.78        11
          3       1.00      0.76      0.86        21

avg / total       0.72      0.78      0.73        90

[ 0 11  0  0  0 47  0  0  0  4  7  0  0  5  0 16]
MNB Accuracy:  0.7777777777777778
MNB F1:  0.6168010115378536
             precision    recall  f1-score   support

          0       0.00      0.00      0.

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(133, 423)
(0, 0)
(0, 0)
1
133
             precision    recall  f1-score   support

          0       1.00      0.36      0.53        11
          1       1.00      0.69      0.82        26
          2       0.59      1.00      0.74        40
          3       1.00      0.57      0.73        14
          4       1.00      0.67      0.80        15
          5       0.88      0.81      0.85        27

avg / total       0.85      0.77      0.77       133

[ 4  0  7  0  0  0  0 18  6  0  0  2  0  0 40  0  0  0  0  0  5  8  0  1
  0  0  5  0 10  0  0  0  5  0  0 22]
MNB Accuracy:  0.7669172932330827
MNB F1:  0.7442804109470775
             precision    recall  f1-score   support

          0       1.00      0.73      0.84        11
          1       0.96      0.88      0.92        26
          2       0.75      1.00      0.86        40
          3       1.00      0.79      0.88        14
          4       1.00      0.80      0.89        15
          5       0.92      0.85      0.88        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(207, 407)
(0, 0)
(0, 0)
1
207
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        14
          1       0.90      0.41      0.56        22
          2       1.00      0.36      0.53        14
          3       0.52      0.98      0.68        63
          4       1.00      0.87      0.93        30
          5       1.00      0.46      0.63        26
          6       0.97      0.89      0.93        38

avg / total       0.77      0.71      0.69       207

[ 0  0  0 14  0  0  0  0  9  0 13  0  0  0  0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.64      0.78        14
          1       0.79      0.50      0.61        22
          2       1.00      0.64      0.78        14
          3       0.66      0.98      0.79        63
          4       1.00      0.83      0.91        30
          5       0.91      0.77      0.83        26
          6       1.00      0.89      0.94        38

avg / total       0.86      0.82      0.82       207

[ 9  0  0  5  0  0  0  0 11  0 10  0  1  0  0  1  9  3  0  1  0  0  1  0
 62  0  0  0  0  1  0  4 25  0  0  0  0  0  6  0 20  0  0  0  0  4  0  0
 34]
svc Accuracy:  0.821256038647343
svc F1:  0.8075723009259426
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        14
          1       0.91      0.45      0.61        22
          2       1.00      0.50      0.67        14
          3       0.56      0.98      0.71        63
          4       0.93      0.87      0.90

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.93      1.00      0.96        82
          1       1.00      0.45      0.62        11

avg / total       0.94      0.94      0.92        93

[82  0  6  5]
svc Accuracy:  0.9354838709677419
svc F1:  0.7948529411764707
             precision    recall  f1-score   support

          0       0.88      1.00      0.94        82
          1       0.00      0.00      0.00        11

avg / total       0.78      0.88      0.83        93

[82  0 11  0]
LR Accuracy:  0.8817204301075269
LR F1:  0.4685714285714285
For name:  y_ding
total sample size before apply threshold:  106
Counter({'0000-0003-1352-1000': 21, '0000-0002-6823-4722': 21, '0000-0001-7772-6449': 19, '0000-0002-8845-4618': 15, '0000-0001-7461-0213': 8, '0000-0001-8161-2743': 7, '0000-0003-4761-5486': 4, '0000-0003-0465-7870': 4, '0000-0003-1176-6397': 3, '0000-0001-8312-8672': 2, '0000-0002-9713-5694': 1, '0000-0002-0010-8279': 1})
['0000-0003-1352-1000', '0000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  302
Counter({'0000-0001-9103-6532': 41, '0000-0003-2186-8100': 24, '0000-0002-6714-5939': 24, '0000-0002-0107-9940': 24, '0000-0001-8012-4203': 20, '0000-0001-8638-5594': 20, '0000-0003-3787-9138': 12, '0000-0003-2790-6294': 12, '0000-0001-6837-5739': 10, '0000-0001-5445-1032': 8, '0000-0001-8753-4950': 8, '0000-0002-7898-5503': 7, '0000-0003-4516-6904': 6, '0000-0003-3162-250X': 6, '0000-0002-0330-3643': 5, '0000-0003-1554-4687': 5, '0000-0003-0402-2971': 5, '0000-0003-4158-6098': 5, '0000-0002-0747-7835': 4, '0000-0002-6477-5345': 4, '0000-0002-0841-4844': 4, '0000-0002-0685-2963': 4, '0000-0001-5526-8109': 4, '0000-0001-5611-6385': 3, '0000-0002-2308-9904': 3, '0000-0001-9287-0959': 3, '0000-0001-6532-1700': 3, '0000-0003-0593-4665': 3, '0000-0002-6859-084X': 3, '0000-0002-2220-5862': 2, '0000-0001-9605-6276': 2, '0000-0001-9955-0915': 2, '0000-0003-1202-7748': 2, '0000-0002-5614-229X': 2, '0000-0003-1214-8240': 2, '0000-0001-8144-4583': 1,

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.83      0.91        12
          1       1.00      0.90      0.95        20
          2       0.93      0.65      0.76        20
          3       1.00      0.96      0.98        24
          4       1.00      0.58      0.74        12
          5       1.00      0.90      0.95        10
          6       1.00      0.96      0.98        24
          7       1.00      0.92      0.96        24
          8       0.66      0.98      0.78        41

avg / total       0.92      0.88      0.89       187

[10  0  0  0  0  0  0  0  2  0 18  0  0  0  0  0  0  2  0  0 13  0  0  0
  0  0  7  0  0  0 23  0  0  0  0  1  0  0  0  0  7  0  0  0  5  0  0  0
  0  0  9  0  0  1  0  0  0  0  0  0 23  0  1  0  0  0  0  0  0  0 22  2
  0  0  1  0  0  0  0  0 40]
svc Accuracy:  0.8823529411764706
svc F1:  0.8892953346603935
             precision    recall  f1-score   support

          0       1.00      0.33      0.50        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.77      0.97      0.86        69
          1       0.95      0.62      0.75        32
          2       0.83      0.38      0.53        13

avg / total       0.83      0.81      0.79       114

[67  1  1 12 20  0  8  0  5]
svc Accuracy:  0.8070175438596491
svc F1:  0.7133357098600395
             precision    recall  f1-score   support

          0       0.70      1.00      0.83        69
          1       1.00      0.44      0.61        32
          2       1.00      0.15      0.27        13

avg / total       0.82      0.75      0.70       114

[69  0  0 18 14  0 11  0  2]
LR Accuracy:  0.7456140350877193
LR F1:  0.5672365414099337
For name:  k_brown
total sample size before apply threshold:  231
Counter({'0000-0003-2434-0037': 89, '0000-0002-0729-4959': 61, '0000-0003-3382-5546': 33, '0000-0002-6803-5336': 12, '0000-0003-2472-5754': 9, '0000-0001-7716-1425': 7, '0000-0001-9428-9420': 6, '0000-0002-1047-4328': 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.99      0.91      0.95        89
          1       1.00      0.82      0.90        33
          2       0.79      1.00      0.88        61
          3       1.00      0.75      0.86        12

avg / total       0.93      0.91      0.91       195

[81  0  8  0  1 27  5  0  0  0 61  0  0  0  3  9]
svc Accuracy:  0.9128205128205128
svc F1:  0.8971423123024954
             precision    recall  f1-score   support

          0       0.83      0.96      0.89        89
          1       1.00      0.73      0.84        33
          2       0.89      0.89      0.89        61
          3       1.00      0.58      0.74        12

avg / total       0.88      0.87      0.87       195

[85  0  4  0  8 24  1  0  7  0 54  0  3  0  2  7]
LR Accuracy:  0.8717948717948718
LR F1:  0.837402484181766
For name:  s_hong
total sample size before apply threshold:  383
Counter({'0000-0002-8344-6774': 102, '0000-0002-8888-6007': 84, '0000-00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.77      0.87        84
          1       0.00      0.00      0.00        12
          2       0.00      0.00      0.00        10
          3       0.64      0.99      0.78       102
          4       0.00      0.00      0.00        19
          5       0.00      0.00      0.00        15
          6       0.00      0.00      0.00        12
          7       1.00      0.48      0.65        27
          8       0.57      0.89      0.70        83

avg / total       0.62      0.70      0.63       364

[ 65   0   0  10   0   0   0   0   9   0   0   0   2   0   0   0   0  10
   0   0   0   4   0   0   0   0   6   0   0   0 101   0   0   0   0   1
   0   0   0  10   0   0   0   0   9   0   0   0  10   0   0   0   0   5
   0   0   0   5   0   0   0   0   7   0   0   0   6   0   0   0  13   8
   0   0   0   9   0   0   0   0  74]
MNB Accuracy:  0.695054945054945
MNB F1:  0.33339102321627334
             precision

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        16
          1       1.00      1.00      1.00        10

avg / total       1.00      1.00      1.00        26

[16  0  0 10]
svc Accuracy:  1.0
svc F1:  1.0
             precision    recall  f1-score   support

          0       0.94      1.00      0.97        16
          1       1.00      0.90      0.95        10

avg / total       0.96      0.96      0.96        26

[16  0  1  9]
LR Accuracy:  0.9615384615384616
LR F1:  0.9585326953748007
For name:  h_jiang
total sample size before apply threshold:  135
Counter({'0000-0002-2975-7977': 52, '0000-0002-5778-4008': 16, '0000-0002-1947-4420': 15, '0000-0002-4388-6548': 13, '0000-0003-0561-5058': 10, '0000-0002-1156-9046': 8, '0000-0001-9892-4292': 4, '0000-0002-4577-2886': 4, '0000-0003-3187-2023': 3, '0000-0002-5840-007X': 3, '0000-0003-4173-8565': 3, '0000-0002-7827-0719': 1, '0000-0002-0962-902X': 1, '0000-0003-0951-0624': 1, '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(122, 806)
(0, 0)
(0, 0)
1
122
             precision    recall  f1-score   support

          0       0.91      1.00      0.95        29
          1       1.00      0.97      0.99        34
          2       0.92      0.89      0.91        27
          3       1.00      0.94      0.97        18
          4       1.00      1.00      1.00        14

avg / total       0.96      0.96      0.96       122

[29  0  0  0  0  0 33  1  0  0  3  0 24  0  0  0  0  1 17  0  0  0  0  0
 14]
MNB Accuracy:  0.9590163934426229
MNB F1:  0.9625966495567763
           

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


226
Counter({'0000-0001-8640-8530': 59, '0000-0002-2571-3600': 24, '0000-0001-5129-756X': 19, '0000-0002-0760-8647': 14, '0000-0003-3490-799X': 13, '0000-0003-3655-2854': 12, '0000-0003-0254-3546': 11, '0000-0002-0751-0930': 9, '0000-0001-5955-3783': 9, '0000-0002-9325-6640': 8, '0000-0002-8748-1841': 7, '0000-0002-8748-4065': 5, '0000-0003-4057-8053': 5, '0000-0002-3806-5956': 5, '0000-0002-5796-6573': 4, '0000-0002-6655-129X': 3, '0000-0001-6079-0567': 3, '0000-0002-0007-2536': 3, '0000-0001-9293-3999': 2, '0000-0002-3746-5034': 2, '0000-0002-2048-225X': 2, '0000-0001-9338-9323': 2, '0000-0002-7524-6270': 1, '0000-0003-3340-3036': 1, '0000-0003-1562-2577': 1, '0000-0001-7763-1490': 1, '0000-0002-0338-8325': 1})
['0000-0003-3490-799X', '0000-0001-5129-756X', '0000-0002-0760-8647', '0000-0003-3655-2854', '0000-0003-0254-3546', '0000-0001-8640-8530', '0000-0002-2571-3600']
Total sample size after apply threshold:  152
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.93      0.93      0.93        29
          1       0.93      0.93      0.93        28

avg / total       0.93      0.93      0.93        57

[27  2  2 26]
LR Accuracy:  0.9298245614035088
LR F1:  0.9298029556650247
For name:  m_aslam
total sample size before apply threshold:  55
Counter({'0000-0003-1361-5357': 29, '0000-0002-8529-4217': 17, '0000-0001-8812-6887': 4, '0000-0001-9418-3714': 4, '0000-0003-2498-3526': 1})
['0000-0003-1361-5357', '0000-0002-8529-4217']
Total sample size after apply threshold:  46
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.90      1.00      0.95        27
          1       1.00      0.88      0.94        26

avg / total       0.95      0.94      0.94        53

[27  0  3 23]
svc Accuracy:  0.9433962264150944
svc F1:  0.9430719656283566
             precision    recall  f1-score   support

          0       0.90      1.00      0.95        27
          1       1.00      0.88      0.94        26

avg / total       0.95      0.94      0.94        53

[27  0  3 23]
LR Accuracy:  0.9433962264150944
LR F1:  0.9430719656283566
For name:  j_king
total sample size before apply threshold:  75
Counter({'0000-0003-0596-4506': 21, '0000-0002-8174-9173': 20, '0000-0003-4530-9987': 20, '0000-0002-6048-8277': 7, '0000-0003-2171-8321': 5, '0000-0003-4947-0241': 1, '0000-0003-0494-153X': 1})
['0000-0002-8174-9173', '0000-0003-4530-9987', '0000-0003-0596-4506']
Total sample size after apply threshold:  61
TfidfVectorizer(analyzer='word', binary=False,

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.85      0.92        20
          1       0.77      1.00      0.87        20
          2       1.00      0.86      0.92        21

avg / total       0.92      0.90      0.90        61

[17  3  0  0 20  0  0  3 18]
svc Accuracy:  0.9016393442622951
svc F1:  0.9038536864623822
             precision    recall  f1-score   support

          0       1.00      0.95      0.97        20
          1       0.79      0.75      0.77        20
          2       0.78      0.86      0.82        21

avg / total       0.86      0.85      0.85        61

[19  1  0  0 15  5  0  3 18]
LR Accuracy:  0.8524590163934426
LR F1:  0.853923853923854
For name:  b_shen
total sample size before apply threshold:  36
Counter({'0000-0003-2899-1531': 29, '0000-0002-5237-6144': 4, '0000-0003-3287-9438': 2, '0000-0001-9687-9010': 1})
['0000-0003-2899-1531']
Total sample size after apply threshold:  29
For name:  s_mishra
total sample size

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      1.00      0.97        29
          1       1.00      0.88      0.93        16
          2       1.00      1.00      1.00        16
          3       1.00      1.00      1.00        24

avg / total       0.98      0.98      0.98        85

[29  0  0  0  2 14  0  0  0  0 16  0  0  0  0 24]
svc Accuracy:  0.9764705882352941
svc F1:  0.975
             precision    recall  f1-score   support

          0       0.81      1.00      0.89        29
          1       1.00      0.62      0.77        16
          2       1.00      1.00      1.00        16
          3       0.96      0.92      0.94        24

avg / total       0.92      0.91      0.90        85

[29  0  0  0  5 10  0  1  0  0 16  0  2  0  0 22]
LR Accuracy:  0.9058823529411765
LR F1:  0.8994271685761048
For name:  c_o'connor
total sample size before apply threshold:  10
Counter({'0000-0001-8134-075X': 4, '0000-0002-3541-708X': 2, '0000-0002-7638-9804

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(137, 311)
(0, 0)
(0, 0)
1
137
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       111
          1       0.96      0.88      0.92        26

avg / total       0.97      0.97      0.97       137

[110   1   3  23]
MNB Accuracy:  0.9708029197080292
MNB F1:  0.9510714285714286
             precision    recall  f1-score   support

          0       0.97      1.00      0.99       111
          1       1.00      0.88      0.94        26

avg / total       0.98      0.98      0.98       137

[111   0   3  2

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.82      0.90        17
          1       0.92      1.00      0.96        34

avg / total       0.95      0.94      0.94        51

[14  3  0 34]
svc Accuracy:  0.9411764705882353
svc F1:  0.9304861426624262
             precision    recall  f1-score   support

          0       1.00      0.65      0.79        17
          1       0.85      1.00      0.92        34

avg / total       0.90      0.88      0.87        51

[11  6  0 34]
LR Accuracy:  0.8823529411764706
LR F1:  0.8523166023166023
For name:  t_jackson
total sample size before apply threshold:  47
Counter({'0000-0001-6351-2773': 23, '0000-0001-6749-9959': 9, '0000-0003-1669-6666': 6, '0000-0003-3214-3973': 3, '0000-0001-8404-4251': 2, '0000-0002-0248-2627': 2, '0000-0002-5489-6020': 1, '0000-0003-2387-6411': 1})
['0000-0001-6351-2773']
Total sample size after apply threshold:  23
For name:  m_romero
total sample size before apply threshold:  29

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.54      0.70        13
          1       0.91      1.00      0.95       125
          2       1.00      0.85      0.92        41

avg / total       0.94      0.93      0.93       179

[  7   6   0   0 125   0   0   6  35]
svc Accuracy:  0.9329608938547486
svc F1:  0.8584170349537966
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        13
          1       0.81      1.00      0.90       125
          2       1.00      0.61      0.76        41

avg / total       0.80      0.84      0.80       179

[  0  13   0   0 125   0   0  16  25]
LR Accuracy:  0.8379888268156425
LR F1:  0.5512110350820028
For name:  c_guo
total sample size before apply threshold:  6
Counter({'0000-0001-9253-3469': 2, '0000-0002-0432-8121': 2, '0000-0002-4000-8141': 1, '0000-0003-2182-3287': 1})
[]
Total sample size after apply threshold:  0
For name:  m_hansen
total sample size befor

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.64      1.00      0.78        55
          1       1.00      0.76      0.87        17
          2       1.00      0.73      0.84        11
          3       1.00      0.79      0.88        24
          4       1.00      0.83      0.91        29
          5       1.00      0.81      0.90        27
          6       1.00      0.88      0.93        16
          7       1.00      0.82      0.90        40

avg / total       0.91      0.86      0.87       219

[55  0  0  0  0  0  0  0  4 13  0  0  0  0  0  0  3  0  8  0  0  0  0  0
  5  0  0 19  0  0  0  0  5  0  0  0 24  0  0  0  5  0  0  0  0 22  0  0
  2  0  0  0  0  0 14  0  7  0  0  0  0  0  0 33]
svc Accuracy:  0.8584474885844748
svc F1:  0.8767121484293925
             precision    recall  f1-score   support

          0       0.52      1.00      0.69        55
          1       1.00      0.59      0.74        17
          2       1.00      0.09      0.17       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       1.00      0.47      0.64        15
          2       0.75      1.00      0.85       141
          3       0.00      0.00      0.00        16
          4       0.98      0.99      0.98        98
          5       0.00      0.00      0.00        15

avg / total       0.73      0.83      0.77       295

[  0   0  10   0   0   0   0   7   8   0   0   0   0   0 141   0   0   0
   0   0  16   0   0   0   0   0   1   0  97   0   0   0  13   0   2   0]
MNB Accuracy:  0.8305084745762712
MNB F1:  0.4126134440855253
             precision    recall  f1-score   support

          0       1.00      0.20      0.33        10
          1       1.00      0.87      0.93        15
          2       0.85      1.00      0.92       141
          3       1.00      0.44      0.61        16
          4       1.00      0.98      0.99        98
          5       1.00      0.73      0.85   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.97      0.99      0.98        77
          1       0.98      0.97      0.97        60

avg / total       0.98      0.98      0.98       137

[76  1  2 58]
MNB Accuracy:  0.9781021897810219
MNB F1:  0.9777175386283545
             precision    recall  f1-score   support

          0       0.97      0.99      0.98        77
          1       0.98      0.97      0.97        60

avg / total       0.98      0.98      0.98       137

[76  1  2 58]
svc Accuracy:  0.9781021897810219
svc F1:  0.9777175386283545
             precision    recall  f1-score   support

          0       0.93      1.00      0.96        77
          1       1.00      0.90      0.95        60

avg / total       0.96      0.96      0.96       137

[77  0  6 54]
LR Accuracy:  0.9562043795620438
LR F1:  0.9549342105263159
For name:  b_zhou
total sample size before apply threshold:  20
Counter({'0000-0002-1535-6283': 13, '0000-0003-2846-1813': 2, '00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.78      0.88        18
          1       0.92      0.79      0.85        14
          2       0.00      0.00      0.00        12
          3       0.71      1.00      0.83        44

avg / total       0.71      0.78      0.73        88

[14  1  0  3  0 11  0  3  0  0  0 12  0  0  0 44]
MNB Accuracy:  0.7840909090909091
MNB F1:  0.6378356313497824
             precision    recall  f1-score   support

          0       1.00      0.83      0.91        18
          1       1.00      0.86      0.92        14
          2       0.89      0.67      0.76        12
          3       0.83      0.98      0.90        44

avg / total       0.90      0.89      0.88        88

[15  0  0  3  0 12  0  2  0  0  8  4  0  0  1 43]
svc Accuracy:  0.8863636363636364
svc F1:  0.8724764818514819
             precision    recall  f1-score   support

          0       1.00      0.83      0.91        18
          1       1.00     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.93      0.88      0.90        16
          1       1.00      0.54      0.70        13
          2       1.00      0.92      0.96        13
          3       0.75      0.21      0.33        14
          4       0.79      0.96      0.87       244
          5       0.95      0.70      0.81        30
          6       0.75      0.30      0.43        10
          7       1.00      0.42      0.59        12
          8       0.63      0.79      0.70        67
          9       0.75      0.82      0.78        22
         10       1.00      0.90      0.95        20
         11       0.88      0.83      0.86        18
         12       1.00      0.70      0.82        10
         13       0.96      0.88      0.92        26
         14       0.87      0.67      0.75        30
         15       0.96      0.85      0.90        26
         16       1.00      0.71      0.83        21
         17       0.75      0.68      0.71   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


31
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(31, 77)
(0, 0)
(0, 0)
1
31
             precision    recall  f1-score   support

          0       0.78      1.00      0.88        18
          1       1.00      0.62      0.76        13

avg / total       0.87      0.84      0.83        31

[18  0  5  8]
MNB Accuracy:  0.8387096774193549
MNB F1:  0.8199767711962835
             precision    recall  f1-score   support

          0       1.00      0.94      0.97        18
          1       0.93      1.00      0.96        13

avg / total       0.97      0.97      0.97        31

[17  1  0 13]
svc A

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.80      0.88      0.84        41
          1       1.00      0.50      0.67        10
          2       1.00      0.52      0.68        25
          3       0.94      0.89      0.92        37
          4       0.97      0.97      0.97        38
          5       0.68      1.00      0.81        68
          6       1.00      0.12      0.21        17
          7       1.00      0.91      0.95        22

avg / total       0.87      0.83      0.81       258

[36  0  0  0  1  4  0  0  0  5  0  0  0  5  0  0  5  0 13  0  0  7  0  0
  1  0  0 33  0  3  0  0  0  0  0  0 37  1  0  0  0  0  0  0  0 68  0  0
  2  0  0  2  0 11  2  0  1  0  0  0  0  1  0 20]
MNB Accuracy:  0.8294573643410853
MNB F1:  0.756358556274407
             precision    recall  f1-score   support

          0       0.80      0.88      0.84        41
          1       1.00      0.70      0.82        10
          2       0.94      0.68      0.79        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.85      1.00      0.92        34
          1       1.00      0.78      0.88        27

avg / total       0.92      0.90      0.90        61

[34  0  6 21]
svc Accuracy:  0.9016393442622951
svc F1:  0.8969594594594594
             precision    recall  f1-score   support

          0       0.85      1.00      0.92        34
          1       1.00      0.78      0.88        27

avg / total       0.92      0.90      0.90        61

[34  0  6 21]
LR Accuracy:  0.9016393442622951
LR F1:  0.8969594594594594
For name:  g_dias
total sample size before apply threshold:  9
Counter({'0000-0002-3774-6661': 5, '0000-0001-8548-1146': 2, '0000-0001-7291-6569': 1, '0000-0002-0524-1239': 1})
[]
Total sample size after apply threshold:  0
For name:  h_yoshida
total sample size before apply threshold:  72
Counter({'0000-0001-6890-4397': 38, '0000-0002-2540-0225': 19, '0000-0001-6360-5988': 13, '0000-0002-7283-8617': 2})
['0000-0002-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(352, 565)
(0, 0)
(0, 0)
1
352
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        13
          1       0.00      0.00      0.00        11
          2       0.84      0.90      0.87        40
          3       0.97      0.73      0.84        45
          4       0.00      0.00      0.00        10
          5       0.77      1.00      0.87        34
          6       1.00      0.50      0.67        18
          7       0.72      0.95      0.82        44
          8       0.50      0.97      0.66     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.08      0.14        13
          1       0.82      0.82      0.82        11
          2       0.74      0.93      0.82        40
          3       0.80      0.73      0.77        45
          4       0.89      0.80      0.84        10
          5       1.00      0.94      0.97        34
          6       1.00      0.67      0.80        18
          7       0.95      0.86      0.90        44
          8       0.59      0.94      0.73        65
          9       1.00      0.97      0.98        31
         10       1.00      0.33      0.50        15
         11       1.00      0.91      0.95        11
         12       1.00      0.53      0.70        15

avg / total       0.86      0.81      0.80       352

[ 1  0  0  4  1  0  0  1  6  0  0  0  0  0  9  0  0  0  0  0  1  1  0  0
  0  0  0  0 37  0  0  0  0  0  3  0  0  0  0  0  1  0 33  0  0  0  0 11
  0  0  0  0  0  0  0  0  8  0  0  0  2  0  0  0  0  0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.60      0.75        10
          1       0.81      0.95      0.88        22
          2       0.96      0.96      0.96        24

avg / total       0.91      0.89      0.89        56

[ 6  4  0  0 21  1  0  1 23]
MNB Accuracy:  0.8928571428571429
MNB F1:  0.8611111111111112
             precision    recall  f1-score   support

          0       1.00      0.80      0.89        10
          1       0.88      1.00      0.94        22
          2       1.00      0.96      0.98        24

avg / total       0.95      0.95      0.95        56

[ 8  2  0  0 22  0  0  1 23]
svc Accuracy:  0.9464285714285714
svc F1:  0.9345941686367217
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        10
          1       0.81      1.00      0.90        22
          2       0.96      0.96      0.96        24

avg / total       0.91      0.89      0.88        56

[ 5  4  1  0 2

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      0.73      0.82        44
          1       0.79      0.98      0.88        58
          2       1.00      0.80      0.89        15
          3       1.00      0.95      0.97        19

avg / total       0.89      0.88      0.87       136

[32 12  0  0  1 57  0  0  1  2 12  0  0  1  0 18]
svc Accuracy:  0.875
svc F1:  0.8898244398244399
             precision    recall  f1-score   support

          0       1.00      0.64      0.78        44
          1       0.68      1.00      0.81        58
          2       1.00      0.60      0.75        15
          3       1.00      0.74      0.85        19

avg / total       0.86      0.80      0.80       136

[28 16  0  0  0 58  0  0  0  6  9  0  0  5  0 14]
LR Accuracy:  0.8014705882352942
LR F1:  0.7968628593628593
For name:  s_brooks
total sample size before apply threshold:  58
Counter({'0000-0002-8437-9788': 32, '0000-0002-4592-4974': 16, '0000-0002-5701-0125

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(201, 1355)
(0, 0)
(0, 0)
1
201
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       133
          1       0.98      0.94      0.96        68

avg / total       0.98      0.98      0.97       201

[132   1   4  64]
MNB Accuracy:  0.9751243781094527
MNB F1:  0.9719093272213992
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       133
          1       1.00      1.00      1.00        68

avg / total       1.00      1.00      1.00       201

[133   0   0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.72      0.84        18
          1       1.00      0.92      0.96        49
          2       0.80      1.00      0.89        37

avg / total       0.93      0.91      0.91       104

[13  0  5  0 45  4  0  0 37]
svc Accuracy:  0.9134615384615384
svc F1:  0.8959075836634113
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        18
          1       1.00      0.92      0.96        49
          2       0.74      1.00      0.85        37

avg / total       0.91      0.88      0.87       104

[ 9  0  9  0 45  4  0  0 37]
LR Accuracy:  0.875
LR F1:  0.8248960626069944
For name:  a_hudson
total sample size before apply threshold:  129
Counter({'0000-0003-1105-7646': 86, '0000-0002-0192-776X': 15, '0000-0003-1849-9666': 13, '0000-0001-7292-5406': 6, '0000-0001-6436-2025': 5, '0000-0001-9016-6917': 4})
['0000-0002-0192-776X', '0000-0003-1849-9666', '0000-0003-110

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.80      0.89        15
          1       1.00      0.54      0.70        13
          2       0.91      1.00      0.95        86

avg / total       0.93      0.92      0.91       114

[12  0  3  0  7  6  0  0 86]
svc Accuracy:  0.9210526315789473
svc F1:  0.846388377327604
             precision    recall  f1-score   support

          0       1.00      0.60      0.75        15
          1       0.00      0.00      0.00        13
          2       0.82      1.00      0.90        86

avg / total       0.75      0.83      0.78       114

[ 9  0  6  0  0 13  0  0 86]
LR Accuracy:  0.8333333333333334
LR F1:  0.5501745200698079
For name:  d_thomas
total sample size before apply threshold:  62
Counter({'0000-0001-8832-5907': 17, '0000-0002-8141-3362': 11, '0000-0002-8278-5934': 10, '0000-0002-1307-6042': 6, '0000-0002-7976-4956': 6, '0000-0002-1053-129X': 5, '0000-0001-9415-5991': 4, '0000-0001-6867-5504': 2,

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       1.00      0.64      0.78        11
          2       1.00      0.39      0.56        36
          3       0.00      0.00      0.00        14
          4       0.66      1.00      0.79       105
          5       0.98      0.93      0.95        55

avg / total       0.73      0.76      0.71       233

[  0   0   0   0  12   0   0   7   0   0   4   0   0   0  14   0  21   1
   0   0   0   0  14   0   0   0   0   0 105   0   0   0   0   0   4  51]
MNB Accuracy:  0.759656652360515
MNB F1:  0.5139169393339733
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        12
          1       1.00      0.64      0.78        11
          2       1.00      0.78      0.88        36
          3       1.00      0.79      0.88        14
          4       0.81      1.00      0.89       105
          5       1.00      0.93      0.96    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.89      0.94        37
          1       0.87      1.00      0.93        46
          2       1.00      0.77      0.87        13

avg / total       0.94      0.93      0.93        96

[33  4  0  0 46  0  0  3 10]
svc Accuracy:  0.9270833333333334
svc F1:  0.9139050965137923
             precision    recall  f1-score   support

          0       1.00      0.81      0.90        37
          1       0.72      1.00      0.84        46
          2       1.00      0.15      0.27        13

avg / total       0.87      0.81      0.78        96

[30  7  0  0 46  0  0 11  2]
LR Accuracy:  0.8125
LR F1:  0.6661842303633349
For name:  g_huang
total sample size before apply threshold:  160
Counter({'0000-0001-7004-826X': 52, '0000-0003-2965-0341': 31, '0000-0002-0001-888X': 22, '0000-0002-8391-4013': 17, '0000-0003-2170-0084': 16, '0000-0002-2249-1248': 9, '0000-0003-2518-8145': 6, '0000-0003-1695-1153': 2, '0000-00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.86      1.00      0.93        31
          1       1.00      0.75      0.86        16
          2       0.96      1.00      0.98        52
          3       1.00      0.91      0.95        22
          4       1.00      0.94      0.97        17

avg / total       0.95      0.95      0.95       138

[31  0  0  0  0  2 12  2  0  0  0  0 52  0  0  2  0  0 20  0  1  0  0  0
 16]
svc Accuracy:  0.9492753623188406
svc F1:  0.9371451978041672
             precision    recall  f1-score   support

          0       0.82      1.00      0.90        31
          1       1.00      0.62      0.77        16
          2       0.87      1.00      0.93        52
          3       1.00      0.73      0.84        22
          4       1.00      0.82      0.90        17

avg / total       0.91      0.89      0.89       138

[31  0  0  0  0  2 10  4  0  0  0  0 52  0  0  2  0  4 16  0  3  0  0  0
 14]
LR Accuracy:  0.8913043478260869
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.82      1.00      0.90        59
          1       1.00      0.64      0.78        14
          2       1.00      0.62      0.76        21

avg / total       0.89      0.86      0.85        94

[59  0  0  5  9  0  8  0 13]
svc Accuracy:  0.8617021276595744
svc F1:  0.8160259789279136
             precision    recall  f1-score   support

          0       0.69      1.00      0.82        59
          1       0.00      0.00      0.00        14
          2       1.00      0.43      0.60        21

avg / total       0.66      0.72      0.65        94

[59  0  0 14  0  0 12  0  9]
LR Accuracy:  0.723404255319149
LR F1:  0.4731481481481481
For name:  j_qin
total sample size before apply threshold:  96
Counter({'0000-0002-8559-616X': 48, '0000-0003-2448-8058': 38, '0000-0002-8186-5705': 4, '0000-0001-6271-068X': 3, '0000-0002-9166-3533': 3})
['0000-0003-2448-8058', '0000-0002-8559-616X']
Total sample size after apply thr

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.29      0.45        17
          1       1.00      0.80      0.89        49
          2       0.80      1.00      0.89        87

avg / total       0.89      0.86      0.84       153

[ 5  0 12  0 39 10  0  0 87]
MNB Accuracy:  0.8562091503267973
MNB F1:  0.7428880643166358
             precision    recall  f1-score   support

          0       1.00      0.47      0.64        17
          1       1.00      0.78      0.87        49
          2       0.81      1.00      0.90        87

avg / total       0.89      0.87      0.86       153

[ 8  0  9  0 38 11  0  0 87]
svc Accuracy:  0.869281045751634
svc F1:  0.8034901449618833
             precision    recall  f1-score   support

          0       1.00      0.18      0.30        17
          1       1.00      0.61      0.76        49
          2       0.72      1.00      0.84        87

avg / total       0.84      0.78      0.75       153

[ 3  0 14  0 30

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        80
          1       1.00      1.00      1.00        19

avg / total       1.00      1.00      1.00        99

[80  0  0 19]
svc Accuracy:  1.0
svc F1:  1.0
             precision    recall  f1-score   support

          0       0.92      1.00      0.96        80
          1       1.00      0.63      0.77        19

avg / total       0.93      0.93      0.92        99

[80  0  7 12]
LR Accuracy:  0.9292929292929293
LR F1:  0.866138690361213
For name:  e_law
total sample size before apply threshold:  12
Counter({'0000-0002-4021-2150': 5, '0000-0001-5089-6341': 3, '0000-0003-4456-1259': 3, '0000-0001-5591-7316': 1})
[]
Total sample size after apply threshold:  0
For name:  m_ribeiro
total sample size before apply threshold:  134
Counter({'0000-0001-8906-0189': 25, '0000-0002-5964-5001': 17, '0000-0001-5693-7861': 16, '0000-0001-6422-3279': 13, '0000-0001-9365-6057': 12, '0000-0001-6357

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       1.00      0.70      0.82        10
          2       0.62      1.00      0.77        25
          3       1.00      0.65      0.79        17
          4       0.92      0.75      0.83        16
          5       1.00      0.75      0.86        12

avg / total       0.89      0.83      0.83        93

[13  0  0  0  0  0  0  7  3  0  0  0  0  0 25  0  0  0  0  0  6 11  0  0
  0  0  4  0 12  0  0  0  2  0  1  9]
svc Accuracy:  0.8279569892473119
svc F1:  0.8438672551248616
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        13
          1       1.00      0.60      0.75        10
          2       0.54      1.00      0.70        25
          3       1.00      0.59      0.74        17
          4       1.00      0.75      0.86        16
          5       1.00      0.58      0.74        12

avg / total       0.88     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(211, 308)
(0, 0)
(0, 0)
1
211
             precision    recall  f1-score   support

          0       0.95      0.99      0.97       142
          1       1.00      0.73      0.85        15
          2       0.96      0.94      0.95        54

avg / total       0.96      0.96      0.96       211

[140   0   2   4  11   0   3   0  51]
MNB Accuracy:  0.957345971563981
MNB F1:  0.9227610018930394
             precision    recall  f1-score   support

          0       0.95      1.00      0.97       142
          1       1.00      0.73      0.85        1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.81      0.87      0.84        30
          1       1.00      0.90      0.95        21
          2       1.00      0.95      0.98        22
          3       0.89      0.91      0.90        44

avg / total       0.91      0.91      0.91       117

[26  0  0  4  1 19  0  1  1  0 21  0  4  0  0 40]
svc Accuracy:  0.905982905982906
svc F1:  0.9160825669900622
             precision    recall  f1-score   support

          0       1.00      0.70      0.82        30
          1       1.00      0.81      0.89        21
          2       1.00      0.95      0.98        22
          3       0.76      1.00      0.86        44

avg / total       0.91      0.88      0.88       117

[21  0  0  9  0 17  0  4  0  0 21  1  0  0  0 44]
LR Accuracy:  0.8803418803418803
LR F1:  0.889438884488924
For name:  f_yu
total sample size before apply threshold:  78
Counter({'0000-0001-9306-1731': 30, '0000-0003-0268-199X': 23, '0000-0002-52

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 331
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(331, 1340)
(0, 0)
(0, 0)
1
331
             precision    recall  f1-score   support

          0       1.00      0.85      0.92        26
          1       1.00      0.92      0.96        53
          2       1.00      0.89      0.94        44
          3       0.96      1.00      0.98        92
          4       1.00      0.76      0.86        25
          5       0.85      0.99      0.91        91

avg / total       0.95      0.94      0.94       331

[22  0  0  1  0  3  0 49  0  1  0  3  0  0 39  1  0  4  0  0  0 92  0  0
  0  0  0  0 19  6 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.84      0.97      0.90        37
          1       0.88      0.50      0.64        14

avg / total       0.85      0.84      0.83        51

[36  1  7  7]
svc Accuracy:  0.8431372549019608
svc F1:  0.7681818181818182
             precision    recall  f1-score   support

          0       0.74      1.00      0.85        37
          1       1.00      0.07      0.13        14

avg / total       0.81      0.75      0.65        51

[37  0 13  1]
LR Accuracy:  0.7450980392156863
LR F1:  0.4919540229885057
For name:  s_yang
total sample size before apply threshold:  611
Counter({'0000-0002-6469-8415': 108, '0000-0003-1301-3030': 94, '0000-0002-8835-5302': 43, '0000-0001-6795-8879': 36, '0000-0003-1751-4975': 33, '0000-0002-8572-4977': 31, '0000-0002-9394-9148': 26, '0000-0002-9879-0164': 25, '0000-0001-7892-7648': 21, '0000-0002-1726-0576': 20, '0000-0001-5684-6388': 19, '0000-0002-6888-7993': 17, '0000-0001-9170-2566'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        17
          1       0.00      0.00      0.00        10
          2       0.68      0.64      0.66        33
          3       0.39      0.97      0.56       108
          4       0.00      0.00      0.00        12
          5       1.00      0.07      0.13        14
          6       0.65      0.99      0.78        94
          7       1.00      0.43      0.60        14
          8       0.90      0.29      0.44        31
          9       1.00      0.23      0.38        26
         10       0.00      0.00      0.00        10
         11       1.00      0.79      0.88        19
         12       0.58      0.67      0.62        21
         13       1.00      0.29      0.45        17
         14       0.00      0.00      0.00        12
         15       1.00      0.33      0.50        36
         16       1.00      0.20      0.33        25
         17       1.00      0.67      0.81   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.96      0.92      0.94        26
          1       1.00      0.88      0.93        16
          2       0.96      0.99      0.97        98

avg / total       0.96      0.96      0.96       140

[24  0  2  0 14  2  1  0 97]
svc Accuracy:  0.9642857142857143
svc F1:  0.9497947252602884
             precision    recall  f1-score   support

          0       1.00      0.35      0.51        26
          1       1.00      0.38      0.55        16
          2       0.78      1.00      0.88        98

avg / total       0.85      0.81      0.77       140

[ 9  0 17  0  6 10  0  0 98]
LR Accuracy:  0.8071428571428572
LR F1:  0.6462213421854677
For name:  a_santoro
total sample size before apply threshold:  189
Counter({'0000-0002-0798-6816': 83, '0000-0003-1709-9492': 58, '0000-0002-5086-1453': 21, '0000-0003-2503-8219': 10, '0000-0002-1014-197X': 9, '0000-0002-6193-2050': 8})
['0000-0003-1709-9492', '0000-0002-5086-1453',

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.90      0.98      0.94        58
          1       1.00      1.00      1.00        21
          2       1.00      0.60      0.75        10
          3       0.99      0.98      0.98        83

avg / total       0.96      0.96      0.96       172

[57  0  0  1  0 21  0  0  4  0  6  0  2  0  0 81]
svc Accuracy:  0.9593023255813954
svc F1:  0.9184917355371901
             precision    recall  f1-score   support

          0       0.98      0.86      0.92        58
          1       1.00      1.00      1.00        21
          2       1.00      0.20      0.33        10
          3       0.85      1.00      0.92        83

avg / total       0.92      0.91      0.89       172

[50  0  0  8  0 21  0  0  1  0  2  7  0  0  0 83]
LR Accuracy:  0.9069767441860465
LR F1:  0.7919728994542721
For name:  q_lu
total sample size before apply threshold:  35
Counter({'0000-0002-2804-0827': 22, '0000-0002-4261-5121': 5, '0000-0002-4

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.84      0.91        19
          1       1.00      0.57      0.73        14
          2       0.76      1.00      0.87       130
          3       0.89      0.64      0.74        25
          4       1.00      0.79      0.88        42
          5       1.00      0.86      0.92        21
          6       1.00      0.69      0.82        13
          7       1.00      0.67      0.80        18
          8       1.00      0.85      0.92        13

avg / total       0.89      0.86      0.85       295

[ 16   0   3   0   0   0   0   0   0   0   8   5   1   0   0   0   0   0
   0   0 130   0   0   0   0   0   0   0   0   9  16   0   0   0   0   0
   0   0   9   0  33   0   0   0   0   0   0   2   1   0  18   0   0   0
   0   0   4   0   0   0   9   0   0   0   0   6   0   0   0   0  12   0
   0   0   2   0   0   0   0   0  11]
svc Accuracy:  0.8576271186440678
svc F1:  0.8433707291846827
             precision

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      1.00      0.94        47
          1       1.00      1.00      1.00        41
          2       1.00      0.83      0.91        36

avg / total       0.96      0.95      0.95       124

[47  0  0  0 41  0  6  0 30]
svc Accuracy:  0.9516129032258065
svc F1:  0.9496969696969697
             precision    recall  f1-score   support

          0       0.87      1.00      0.93        47
          1       1.00      1.00      1.00        41
          2       1.00      0.81      0.89        36

avg / total       0.95      0.94      0.94       124

[47  0  0  0 41  0  7  0 29]
LR Accuracy:  0.9435483870967742
LR F1:  0.941000253871541
For name:  t_han
total sample size before apply threshold:  53
Counter({'0000-0002-9063-4052': 42, '0000-0002-3095-7714': 8, '0000-0003-3535-8582': 2, '0000-0003-1404-1578': 1})
['0000-0002-9063-4052']
Total sample size after apply threshold:  42
For name:  m_sandberg
total sample siz

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.40      0.57        10
          1       1.00      0.09      0.17        11
          2       1.00      0.69      0.81        16
          3       0.57      1.00      0.73        28

avg / total       0.82      0.68      0.63        65

[ 4  0  0  6  0  1  0 10  0  0 11  5  0  0  0 28]
LR Accuracy:  0.676923076923077
LR F1:  0.5700456950456951
For name:  f_bianchi
total sample size before apply threshold:  131
Counter({'0000-0002-3459-9301': 54, '0000-0001-7880-5624': 37, '0000-0002-2863-1598': 16, '0000-0003-2996-3604': 12, '0000-0001-5197-5279': 11, '0000-0002-7145-3846': 1})
['0000-0001-7880-5624', '0000-0002-2863-1598', '0000-0003-2996-3604', '0000-0002-3459-9301', '0000-0001-5197-5279']
Total sample size after apply threshold:  130
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



(606, 1534)
(0, 0)
(0, 0)
1
606
             precision    recall  f1-score   support

          0       1.00      0.29      0.44        14
          1       1.00      0.32      0.48        25
          2       0.66      0.86      0.75        58
          3       0.00      0.00      0.00        11
          4       1.00      0.19      0.32        16
          5       0.00      0.00      0.00        19
          6       1.00      0.24      0.39        29
          7       0.00      0.00      0.00        17
          8       0.97      0.76      0.85        46
          9       0.25      0.97      0.40        63
         10       0.00      0.00      0.00        13
         11       0.00      0.00      0.00        11
         12       1.00      0.42      0.59        12
         13       0.00      0.00      0.00        18
         14       0.51      0.82      0.63        51
         15       0.67      0.07      0.13        27
         16       0.00      0.00      0.00        10
         17 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(87, 148)
(0, 0)
(0, 0)
1
87
             precision    recall  f1-score   support

          0       0.81      0.91      0.86        23
          1       0.50      0.07      0.12        14
          2       0.85      1.00      0.92        50

avg / total       0.78      0.83      0.77        87

[21  1  1  5  1  8  0  0 50]
MNB Accuracy:  0.8275862068965517
MNB F1:  0.6331913499344691
             precision    recall  f1-score   support

          0       0.87      0.87      0.87        23
          1       0.92      0.79      0.85        14
        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.73      1.00      0.84        32
          1       1.00      0.14      0.25        14

avg / total       0.81      0.74      0.66        46

[32  0 12  2]
svc Accuracy:  0.7391304347826086
svc F1:  0.5460526315789473
             precision    recall  f1-score   support

          0       0.70      1.00      0.82        32
          1       0.00      0.00      0.00        14

avg / total       0.48      0.70      0.57        46

[32  0 14  0]
LR Accuracy:  0.6956521739130435
LR F1:  0.41025641025641024
For name:  b_cao
total sample size before apply threshold:  58
Counter({'0000-0002-9462-496X': 39, '0000-0003-3588-972X': 14, '0000-0003-3401-6900': 4, '0000-0003-4443-2326': 1})
['0000-0002-9462-496X', '0000-0003-3588-972X']
Total sample size after apply threshold:  53
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.84      0.99      0.91       110
          1       0.99      0.77      0.86        90

avg / total       0.90      0.89      0.89       200

[109   1  21  69]
MNB Accuracy:  0.89
MNB F1:  0.8854166666666667
             precision    recall  f1-score   support

          0       0.99      0.91      0.95       110
          1       0.90      0.99      0.94        90

avg / total       0.95      0.94      0.95       200

[100  10   1  89]
svc Accuracy:  0.945
svc F1:  0.9448331201885705
             precision    recall  f1-score   support

          0       0.87      0.93      0.90       110
          1       0.90      0.83      0.87        90

avg / total       0.89      0.89      0.88       200

[102   8  15  75]
LR Accuracy:  0.885
LR F1:  0.8828652186091518
For name:  a_sharma
total sample size before apply threshold:  223
Counter({'0000-0002-2653-0806': 85, '0000-0003-3349-4417': 23, '0000-0002-7668-3501': 14, 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.83      0.90        23
          1       1.00      0.42      0.59        12
          2       1.00      0.93      0.96        14
          3       0.86      1.00      0.92        85
          4       1.00      0.80      0.89        10

avg / total       0.92      0.90      0.89       144

[19  0  0  4  0  0  5  0  7  0  0  0 13  1  0  0  0  0 85  0  0  0  0  2
  8]
svc Accuracy:  0.9027777777777778
svc F1:  0.853752418841933
             precision    recall  f1-score   support

          0       1.00      0.52      0.69        23
          1       0.00      0.00      0.00        12
          2       1.00      0.86      0.92        14
          3       0.71      1.00      0.83        85
          4       1.00      0.10      0.18        10

avg / total       0.75      0.76      0.70       144

[12  0  0 11  0  0  0  0 12  0  0  0 12  2  0  0  0  0 85  0  0  0  0  9
  1]
LR Accuracy:  0.7638888888888888
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(36, 83)
(0, 0)
(0, 0)
1
36
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        13
          1       0.96      1.00      0.98        23

avg / total       0.97      0.97      0.97        36

[12  1  0 23]
MNB Accuracy:  0.9722222222222222
MNB F1:  0.9693617021276596
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        13
          1       0.96      1.00      0.98        23

avg / total       0.97      0.97      0.97        36

[12  1  0 23]
svc Accu

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.70      0.82        23
          1       1.00      0.77      0.87        30
          2       1.00      0.83      0.90        23
          3       1.00      0.82      0.90        11
          4       0.71      1.00      0.83        48

avg / total       0.90      0.85      0.85       135

[16  0  0  0  7  0 23  0  0  7  0  0 19  0  4  0  0  0  9  2  0  0  0  0
 48]
svc Accuracy:  0.8518518518518519
svc F1:  0.8641570920946327
             precision    recall  f1-score   support

          0       1.00      0.52      0.69        23
          1       1.00      0.77      0.87        30
          2       1.00      0.83      0.90        23
          3       1.00      0.09      0.17        11
          4       0.60      1.00      0.75        48

avg / total       0.86      0.76      0.74       135

[12  0  0  0 11  0 23  0  0  7  0  0 19  0  4  0  0  0  1 10  0  0  0  0
 48]
LR Accuracy:  0.762962962962963
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.74      0.85        19
          1       1.00      0.40      0.57        10
          2       0.99      0.89      0.94        85
          3       1.00      0.71      0.83        14
          4       0.77      1.00      0.87        86
          5       1.00      0.88      0.93        24

avg / total       0.91      0.89      0.88       238

[14  0  0  0  5  0  0  4  0  0  6  0  0  0 76  0  9  0  0  0  1 10  3  0
  0  0  0  0 86  0  0  0  0  0  3 21]
svc Accuracy:  0.8865546218487395
svc F1:  0.8322564267008711
             precision    recall  f1-score   support

          0       1.00      0.68      0.81        19
          1       1.00      0.20      0.33        10
          2       1.00      0.89      0.94        85
          3       1.00      0.36      0.53        14
          4       0.68      1.00      0.81        86
          5       1.00      0.67      0.80        24

avg / total       0.89     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      1.00      0.97        17
          1       1.00      0.94      0.97        16

avg / total       0.97      0.97      0.97        33

[17  0  1 15]
LR Accuracy:  0.9696969696969697
LR F1:  0.9695852534562213
For name:  m_kelly
total sample size before apply threshold:  97
Counter({'0000-0002-6380-1150': 19, '0000-0002-1735-3342': 17, '0000-0001-7963-2139': 16, '0000-0001-6221-7406': 12, '0000-0003-3114-8780': 11, '0000-0003-1799-055X': 10, '0000-0003-3210-0295': 4, '0000-0002-6541-2992': 3, '0000-0002-2029-5841': 2, '0000-0003-2882-4450': 1, '0000-0003-0900-0691': 1, '0000-0002-0995-2425': 1})
['0000-0001-7963-2139', '0000-0003-3114-8780', '0000-0003-1799-055X', '0000-0001-6221-7406', '0000-0002-1735-3342', '0000-0002-6380-1150']
Total sample size after apply threshold:  85
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='co

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.84      0.95      0.89        60
          1       1.00      0.92      0.96        25
          2       0.95      0.87      0.91        71

avg / total       0.92      0.91      0.91       156

[57  0  3  2 23  0  9  0 62]
svc Accuracy:  0.9102564102564102
svc F1:  0.9202410130718954
             precision    recall  f1-score   support

          0       0.81      0.90      0.85        60
          1       1.00      0.72      0.84        25
          2       0.92      0.92      0.92        71

avg / total       0.89      0.88      0.88       156

[54  0  6  7 18  0  6  0 65]
LR Accuracy:  0.8782051282051282
LR F1:  0.8676986536198207
For name:  s_yun
total sample size before apply threshold:  102
Counter({'0000-0001-7737-4746': 76, '0000-0002-1498-3701': 24, '0000-0002-3774-0622': 1, '0000-0002-9510-5133': 1})
['0000-0001-7737-4746', '0000-0002-1498-3701']
Total sample size after apply threshold:  100
TfidfVector

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  76
Counter({'0000-0002-3174-8965': 57, '0000-0002-3566-5649': 9, '0000-0003-3670-1952': 4, '0000-0003-4864-8175': 1, '0000-0002-8196-1748': 1, '0000-0001-7266-1084': 1, '0000-0002-3884-4335': 1, '0000-0002-5194-7339': 1, '0000-0001-6389-2315': 1})
['0000-0002-3174-8965']
Total sample size after apply threshold:  57
For name:  e_gomes
total sample size before apply threshold:  40
Counter({'0000-0002-6941-4872': 20, '0000-0001-6378-6942': 8, '0000-0002-4238-3738': 8, '0000-0001-8528-8741': 2, '0000-0002-0636-6041': 2})
['0000-0002-6941-4872']
Total sample size after apply threshold:  20
For name:  t_yamaguchi
total sample size before apply threshold:  62
Counter({'0000-0003-4590-8592': 30, '0000-0001-9043-4408': 15, '0000-0003-0214-4983': 7, '0000-0002-7533-430X': 6, '0000-0002-5063-9924': 2, '0000-0001-8454-1995': 1, '0000-0001-5341-4184': 1})
['0000-0001-9043-4408', '0000-0003-4590-8592']
Total sample size after apply threshold:  45
TfidfVecto

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Total sample size after apply threshold:  323
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(323, 693)
(0, 0)
(0, 0)
1
323
             precision    recall  f1-score   support

          0       1.00      0.25      0.40        16
          1       0.78      0.98      0.87        50
          2       0.72      0.97      0.83        71
          3       1.00      0.65      0.79        20
          4       0.78      0.92      0.84        71
          5       0.94      0.82      0.87        55
          6       1.00      0.40      0.57        40

avg / total       0.85      0.81      0.79       323


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       1.00      1.00      1.00        19
          2       0.96      1.00      0.98        22

avg / total       0.98      0.98      0.98        52

[10  0  1  0 19  0  0  0 22]
svc Accuracy:  0.9807692307692307
svc F1:  0.9767195767195767
             precision    recall  f1-score   support

          0       1.00      0.82      0.90        11
          1       1.00      1.00      1.00        19
          2       0.92      1.00      0.96        22

avg / total       0.96      0.96      0.96        52

[ 9  0  2  0 19  0  0  0 22]
LR Accuracy:  0.9615384615384616
LR F1:  0.9521739130434783
For name:  l_stevens
total sample size before apply threshold:  77
Counter({'0000-0003-3372-3419': 49, '0000-0003-3847-5979': 26, '0000-0002-6075-8273': 1, '0000-0002-1345-6520': 1})
['0000-0003-3372-3419', '0000-0003-3847-5979']
Total sample size after apply threshold:  75
TfidfVect

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.84      0.91        19
          1       0.87      0.82      0.85        74
          2       0.84      0.92      0.87        72

avg / total       0.87      0.87      0.87       165

[16  3  0  0 61 13  0  6 66]
svc Accuracy:  0.8666666666666667
svc F1:  0.8785600406461334
             precision    recall  f1-score   support

          0       1.00      0.63      0.77        19
          1       0.85      0.82      0.84        74
          2       0.80      0.90      0.85        72

avg / total       0.85      0.84      0.83       165

[12  4  3  0 61 13  0  7 65]
LR Accuracy:  0.8363636363636363
LR F1:  0.8198277297858801
For name:  l_song
total sample size before apply threshold:  58
Counter({'0000-0003-0585-8519': 38, '0000-0003-1691-9583': 15, '0000-0002-0400-8283': 3, '0000-0003-2454-1576': 1, '0000-0002-7299-5719': 1})
['0000-0003-1691-9583', '0000-0003-0585-8519']
Total sample size after apply t

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(111, 583)
(0, 0)
(0, 0)
1
111
             precision    recall  f1-score   support

          0       0.93      0.54      0.68        26
          1       0.88      0.99      0.93        85

avg / total       0.89      0.88      0.87       111

[14 12  1 84]
MNB Accuracy:  0.8828828828828829
MNB F1:  0.8055518124242016
             precision    recall  f1-score   support

          0       1.00      0.81      0.89        26
          1       0.94      1.00      0.97        85

avg / total       0.96      0.95      0.95       111

[21  5  0 85]
svc A

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.89      0.94        28
          1       1.00      0.71      0.83        14
          2       0.69      1.00      0.82        98
          3       1.00      0.36      0.53        11
          4       1.00      0.69      0.81        16
          5       1.00      0.80      0.89        55
          6       1.00      0.67      0.80        12
          7       1.00      0.88      0.93        65
          8       1.00      0.88      0.94        17

avg / total       0.90      0.86      0.86       316

[25  0  3  0  0  0  0  0  0  0 10  4  0  0  0  0  0  0  0  0 98  0  0  0
  0  0  0  0  0  7  4  0  0  0  0  0  0  0  5  0 11  0  0  0  0  0  0 11
  0  0 44  0  0  0  0  0  4  0  0  0  8  0  0  0  0  8  0  0  0  0 57  0
  0  0  2  0  0  0  0  0 15]
svc Accuracy:  0.8607594936708861
svc F1:  0.8335954992178143
             precision    recall  f1-score   support

          0       1.00      0.82      0.90        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.74      1.00      0.85        14
          1       0.88      0.70      0.78        10
          2       1.00      0.82      0.90        17

avg / total       0.88      0.85      0.85        41

[14  0  0  3  7  0  2  1 14]
svc Accuracy:  0.8536585365853658
svc F1:  0.8431628109047463
             precision    recall  f1-score   support

          0       0.74      1.00      0.85        14
          1       1.00      0.50      0.67        10
          2       0.88      0.88      0.88        17

avg / total       0.86      0.83      0.82        41

[14  0  0  3  5  2  2  0 15]
LR Accuracy:  0.8292682926829268
LR F1:  0.7991681521093286
For name:  j_sullivan
total sample size before apply threshold:  79
Counter({'0000-0003-1457-2950': 26, '0000-0001-5445-708X': 17, '0000-0003-4489-4926': 14, '0000-0003-3209-0218': 9, '0000-0001-6732-0699': 7, '0000-0002-5952-3805': 2, '0000-0003-2906-2232': 1, '0000-0002-7279-4319':

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
(0, 0)
1
428
             precision    recall  f1-score   support

          0       1.00      0.10      0.18        20
          1       1.00      0.32      0.48        25
          2       0.85      0.52      0.64        54
          3       1.00      0.39      0.56        31
          4       0.51      1.00      0.68       138
          5       0.99      0.85      0.91        78
          6       1.00      0.59      0.74        22
          7       0.00      0.00      0.00        22
          8       1.00      0.63      0.77        38

avg / total       0.77      0.68      0.65       428

[  2   0   0   0  18   0   0   0   0   0   8   0   0  17   0   0   0   0
   0   0  28   0  26   0   0   0   0   0   0   0  12  19   0   0   0   0
   0   0   0   0 138   0   0   0   0   0   0   0   0  12  66   0   0   0
   0   0   0   0   9   0  13   0   0   0   0   5   0  17   0   0   0   0
   0   0   0   0  13   1   0   0  24]
MNB Accuracy:  0.6799065420560748
MNB F1:  0.552668062159228
   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(146, 358)
(0, 0)
(0, 0)
1
146
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        15
          1       1.00      0.62      0.77        29
          2       1.00      0.29      0.45        17
          3       0.63      1.00      0.77        71
          4       1.00      0.71      0.83        14

avg / total       0.72      0.71      0.66       146

[ 0  0  0 15  0  0 18  0 11  0  0  0  5 12  0  0  0  0 71  0  0  0  0  4
 10]
MNB Accuracy:  0.7123287671232876
MNB F1:  0.5651150730244161
             precision    recall  f1-score   support

          0       1.00      0.33      0.50        15
          1       1.00      0.69      0.82        29
          2       1.00      0.47      0.64        17
          3       0.70      1.00      0.83        71
          4       1.00      0.86      0.92        14

avg / total       0.86      0.79      0.78       146

[ 5  0  0 10  0  0 20  0  9  0  0  0  8  9  0  0  0  0 71  0  0  0  0  2
 12]
svc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.68      0.81        28
          1       1.00      0.68      0.81        22
          2       0.81      1.00      0.90       140
          3       1.00      0.78      0.88        37
          4       1.00      0.78      0.88        37

avg / total       0.90      0.88      0.88       264

[ 19   0   9   0   0   0  15   7   0   0   0   0 140   0   0   0   0   8
  29   0   0   0   8   0  29]
svc Accuracy:  0.8787878787878788
svc F1:  0.8548666208240677
             precision    recall  f1-score   support

          0       1.00      0.43      0.60        28
          1       1.00      0.05      0.09        22
          2       0.65      1.00      0.78       140
          3       1.00      0.41      0.58        37
          4       1.00      0.51      0.68        37

avg / total       0.81      0.71      0.66       264

[ 12   0  16   0   0   0   1  21   0   0   0   0 140   0   0   0   0  22
  15   0   0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.82      1.00      0.90        53
          1       1.00      0.85      0.92        13
          2       1.00      0.67      0.80        12
          3       1.00      1.00      1.00        16
          4       1.00      0.86      0.93        22
          5       1.00      0.75      0.86        12

avg / total       0.92      0.91      0.90       128

[53  0  0  0  0  0  2 11  0  0  0  0  4  0  8  0  0  0  0  0  0 16  0  0
  3  0  0  0 19  0  3  0  0  0  0  9]
svc Accuracy:  0.90625
svc F1:  0.8998239794746615
             precision    recall  f1-score   support

          0       0.63      1.00      0.77        53
          1       1.00      0.54      0.70        13
          2       1.00      0.50      0.67        12
          3       1.00      1.00      1.00        16
          4       1.00      0.55      0.71        22
          5       1.00      0.25      0.40        12

avg / total       0.85      0.76      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(104, 184)
(0, 0)
(0, 0)
1
104
             precision    recall  f1-score   support

          0       1.00      0.80      0.89        15
          1       0.78      0.33      0.47        21
          2       1.00      0.62      0.76        13
          3       1.00      0.23      0.38        13
          4       0.58      1.00      0.74        42

avg / total       0.79      0.69      0.66       104

[12  0  0  0  3  0  7  0  0 14  0  0  8  0  5  0  2  0  3  8  0  0  0  0
 42]
MNB Accuracy:  0.6923076923076923
MNB F1:  0.6458604845446951
           

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.75      0.86        12
          1       0.89      0.85      0.87        20
          2       1.00      0.71      0.83        17
          3       0.62      1.00      0.77        35
          4       0.88      0.39      0.54        18
          5       0.97      0.93      0.95        40

avg / total       0.87      0.82      0.82       142

[ 9  0  0  2  0  1  0 17  0  3  0  0  0  1 12  4  0  0  0  0  0 35  0  0
  0  1  0 10  7  0  0  0  0  2  1 37]
svc Accuracy:  0.823943661971831
svc F1:  0.8021556987074229
             precision    recall  f1-score   support

          0       1.00      0.58      0.74        12
          1       0.89      0.80      0.84        20
          2       1.00      0.53      0.69        17
          3       0.56      0.94      0.70        35
          4       1.00      0.28      0.43        18
          5       0.84      0.93      0.88        40

avg / total       0.83      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 77
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(77, 191)
(0, 0)
(0, 0)
1
77
             precision    recall  f1-score   support

          0       1.00      0.96      0.98        28
          1       0.98      1.00      0.99        49

avg / total       0.99      0.99      0.99        77

[27  1  0 49]
MNB Accuracy:  0.987012987012987
MNB F1:  0.9858585858585859
             precision    recall  f1-score   support

          0       1.00      0.93      0.96        28
          1       0.96      1.00      0.98        49

avg / total       0.98      0.97      0.97        77

[26  2  0 49]
svc 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.86      0.99      0.92        76
          1       0.92      0.48      0.63        23

avg / total       0.87      0.87      0.85        99

[75  1 12 11]
MNB Accuracy:  0.8686868686868687
MNB F1:  0.7744084136722174
             precision    recall  f1-score   support

          0       0.97      1.00      0.99        76
          1       1.00      0.91      0.95        23

avg / total       0.98      0.98      0.98        99

[76  0  2 21]
svc Accuracy:  0.9797979797979798
svc F1:  0.9707792207792207
             precision    recall  f1-score   support

          0       0.82      1.00      0.90        76
          1       1.00      0.26      0.41        23

avg / total       0.86      0.83      0.79        99

[76  0 17  6]
LR Accuracy:  0.8282828282828283
LR F1:  0.6566006937359722
For name:  m_schneider
total sample size before apply threshold:  367
Counter({'0000-0001-9645-1938': 110, '0000-0002-9570-3491':

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(354, 1357)
(0, 0)
(0, 0)
1
354
             precision    recall  f1-score   support

          0       0.86      0.86      0.86        91
          1       1.00      0.71      0.83        56
          2       0.50      0.97      0.66       110
          3       1.00      0.06      0.11        34
          4       0.00      0.00      0.00        10
          5       0.00      0.00      0.00        10
          6       1.00      0.21      0.34        29
          7       0.00      0.00      0.00        14

avg / total       0.71      0.66      0.60   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.93      1.00      0.96        26
          1       0.95      1.00      0.98        21
          2       1.00      0.84      0.91        19

avg / total       0.96      0.95      0.95        66

[26  0  0  0 21  0  2  1 16]
MNB Accuracy:  0.9545454545454546
MNB F1:  0.9513309544317297
             precision    recall  f1-score   support

          0       0.96      0.96      0.96        26
          1       0.95      1.00      0.98        21
          2       0.94      0.89      0.92        19

avg / total       0.95      0.95      0.95        66

[25  0  1  0 21  0  1  1 17]
svc Accuracy:  0.9545454545454546
svc F1:  0.952400522167964
             precision    recall  f1-score   support

          0       0.93      1.00      0.96        26
          1       0.95      1.00      0.98        21
          2       1.00      0.84      0.91        19

avg / total       0.96      0.95      0.95        66

[26  0  0  0 21

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.96      0.98        51
          1       0.96      1.00      0.98        46

avg / total       0.98      0.98      0.98        97

[49  2  0 46]
svc Accuracy:  0.979381443298969
svc F1:  0.9793617021276595
             precision    recall  f1-score   support

          0       0.98      0.98      0.98        51
          1       0.98      0.98      0.98        46

avg / total       0.98      0.98      0.98        97

[50  1  1 45]
LR Accuracy:  0.979381443298969
LR F1:  0.9793265132139812
For name:  m_acosta
total sample size before apply threshold:  47
Counter({'0000-0002-5018-339X': 24, '0000-0003-4827-7271': 17, '0000-0003-0611-6672': 4, '0000-0001-9504-883X': 2})
['0000-0003-4827-7271', '0000-0002-5018-339X']
Total sample size after apply threshold:  41
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(228, 512)
(0, 0)
(0, 0)
1
228
             precision    recall  f1-score   support

          0       0.96      1.00      0.98       218
          1       0.00      0.00      0.00        10

avg / total       0.91      0.96      0.93       228

[218   0  10   0]
MNB Accuracy:  0.956140350877193
MNB F1:  0.4887892376681614


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.97      1.00      0.99       218
          1       1.00      0.40      0.57        10

avg / total       0.97      0.97      0.97       228

[218   0   6   4]
svc Accuracy:  0.9736842105263158
svc F1:  0.7789269553975438
             precision    recall  f1-score   support

          0       0.96      1.00      0.98       218
          1       0.00      0.00      0.00        10

avg / total       0.91      0.96      0.93       228

[218   0  10   0]
LR Accuracy:  0.956140350877193
LR F1:  0.4887892376681614
For name:  p_kelly
total sample size before apply threshold:  55
Counter({'0000-0003-0500-1865': 27, '0000-0001-9040-1868': 11, '0000-0001-8933-2367': 6, '0000-0003-4338-6225': 5, '0000-0002-7490-5772': 5, '0000-0002-8813-8877': 1})
['0000-0001-9040-1868', '0000-0003-0500-1865']
Total sample size after apply threshold:  38
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<cla

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(60, 187)
(0, 0)
(0, 0)
1
60
             precision    recall  f1-score   support

          0       0.58      1.00      0.74        35
          1       0.00      0.00      0.00        15
          2       0.00      0.00      0.00        10

avg / total       0.34      0.58      0.43        60

[35  0  0 15  0  0 10  0  0]
MNB Accuracy:  0.5833333333333334
MNB F1:  0.24561403508771928
             precision    recall  f1-score   support

          0       0.64      1.00      0.78        35
          1       1.00      0.33      0.50        15
       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.88      1.00      0.93        64
          1       1.00      0.20      0.33        10
          2       1.00      0.95      0.97        19

avg / total       0.92      0.90      0.88        93

[64  0  0  8  2  0  1  0 18]
MNB Accuracy:  0.9032258064516129
MNB F1:  0.7468709585497907
             precision    recall  f1-score   support

          0       0.94      1.00      0.97        64
          1       1.00      0.70      0.82        10
          2       1.00      0.95      0.97        19

avg / total       0.96      0.96      0.95        93

[64  0  0  3  7  0  1  0 18]
svc Accuracy:  0.956989247311828
svc F1:  0.9220664514782162
             precision    recall  f1-score   support

          0       0.77      1.00      0.87        64
          1       0.00      0.00      0.00        10
          2       1.00      0.53      0.69        19

avg / total       0.73      0.80      0.74        93

[64  0  0 10  0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.79      0.88        24
          1       0.76      1.00      0.86        62
          2       1.00      0.18      0.31        11
          3       1.00      0.50      0.67        12

avg / total       0.86      0.82      0.79       109

[19  5  0  0  0 62  0  0  0  9  2  0  0  6  0  6]
MNB Accuracy:  0.8165137614678899
MNB F1:  0.6797977539256609
             precision    recall  f1-score   support

          0       0.91      0.83      0.87        24
          1       0.85      0.97      0.90        62
          2       1.00      0.73      0.84        11
          3       0.88      0.58      0.70        12

avg / total       0.88      0.87      0.87       109

[20  4  0  0  2 60  0  0  0  2  8  1  0  5  0  7]
svc Accuracy:  0.8715596330275229
svc F1:  0.8284815299117358
             precision    recall  f1-score   support

          0       1.00      0.58      0.74        24
          1       0.72     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.73      0.33      0.46        33
          1       1.00      0.30      0.47        23
          2       0.00      0.00      0.00        11
          3       0.54      0.98      0.70        66
          4       1.00      0.40      0.57        15

avg / total       0.66      0.60      0.54       148

[11  0  0 22  0  0  7  0 16  0  3  0  0  8  0  1  0  0 65  0  0  0  0  9
  6]
MNB Accuracy:  0.6013513513513513
MNB F1:  0.4390706605222735
             precision    recall  f1-score   support

          0       0.71      0.52      0.60        33
          1       0.81      0.74      0.77        23
          2       1.00      0.36      0.53        11
          3       0.70      0.94      0.81        66
          4       1.00      0.73      0.85        15

avg / total       0.77      0.75      0.74       148

[17  1  0 15  0  0 17  0  6  0  5  0  4  2  0  2  2  0 62  0  0  1  0  3
 11]
svc Accuracy:  0.75
svc F1:  0.710

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.40      0.57        10
          1       0.51      1.00      0.68        25
          2       1.00      0.64      0.78        14
          3       1.00      0.18      0.31        11
          4       0.88      0.58      0.70        12

avg / total       0.81      0.65      0.63        72

[ 4  6  0  0  0  0 25  0  0  0  0  4  9  0  1  0  9  0  2  0  0  5  0  0
  7]
MNB Accuracy:  0.6527777777777778
MNB F1:  0.6074810500897458
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        10
          1       0.93      1.00      0.96        25
          2       1.00      0.93      0.96        14
          3       1.00      1.00      1.00        11
          4       1.00      0.92      0.96        12

avg / total       0.97      0.97      0.97        72

[10  0  0  0  0  0 25  0  0  0  0  1 13  0  0  0  0  0 11  0  0  1  0  0
 11]
svc Accuracy:  0.9722222222222222


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.85      1.00      0.92        66
          1       1.00      0.75      0.86        16
          2       1.00      0.79      0.88        38

avg / total       0.92      0.90      0.90       120

[66  0  0  4 12  0  8  0 30]
svc Accuracy:  0.9
svc F1:  0.8853874883286648
             precision    recall  f1-score   support

          0       0.70      1.00      0.82        66
          1       1.00      0.12      0.22        16
          2       1.00      0.63      0.77        38

avg / total       0.84      0.77      0.73       120

[66  0  0 14  2  0 14  0 24]
LR Accuracy:  0.7666666666666667
LR F1:  0.6071385902031062
For name:  h_chen
total sample size before apply threshold:  986
Counter({'0000-0001-5108-8338': 147, '0000-0002-5799-6705': 93, '0000-0003-0708-6073': 73, '0000-0001-6758-1995': 49, '0000-0001-5051-9896': 40, '0000-0003-0676-4610': 40, '0000-0002-7748-4440': 39, '0000-0001-6883-3752': 36, '0000-00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.50      0.44      0.47        16
          1       0.61      0.70      0.65        20
          2       0.85      0.61      0.71        18
          3       1.00      0.58      0.74        12
          4       0.94      0.79      0.86        39
          5       1.00      0.89      0.94        36
          6       0.91      0.50      0.65        20
          7       0.54      0.60      0.57        25
          8       1.00      0.20      0.33        10
          9       0.89      0.89      0.89        19
         10       0.67      0.57      0.62        28
         11       0.70      0.70      0.70        40
         12       0.93      0.93      0.93        73
         13       0.89      0.85      0.87        40
         14       1.00      0.73      0.85        15
         15       0.58      0.83      0.68        93
         16       0.92      0.86      0.89        14
         17       0.90      0.75      0.82   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.72      0.81      0.76        36
          1       1.00      0.92      0.96        13
          2       1.00      0.80      0.89        10
          3       1.00      0.80      0.89        15
          4       0.77      0.96      0.85       117
          5       0.91      0.81      0.86        48
          6       0.86      0.35      0.50        17
          7       0.78      1.00      0.88        14
          8       1.00      0.29      0.44        14
          9       0.96      0.79      0.87        33

avg / total       0.85      0.83      0.82       317

[ 29   0   0   0   7   0   0   0   0   0   0  12   0   0   1   0   0   0
   0   0   0   0   8   0   1   0   1   0   0   0   0   0   0  12   1   0
   0   1   0   1   4   0   0   0 112   1   0   0   0   0   2   0   0   0
   7  39   0   0   0   0   2   0   0   0   7   2   6   0   0   0   0   0
   0   0   0   0   0  14   0   0   3   0   0   0   6   1   0   0   4 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.96      1.00      0.98        24
          1       1.00      0.92      0.96        12

avg / total       0.97      0.97      0.97        36

[24  0  1 11]
svc Accuracy:  0.9722222222222222
svc F1:  0.9680567879325643
             precision    recall  f1-score   support

          0       0.83      1.00      0.91        24
          1       1.00      0.58      0.74        12

avg / total       0.89      0.86      0.85        36

[24  0  5  7]
LR Accuracy:  0.8611111111111112
LR F1:  0.8212512413108242
For name:  z_wu
total sample size before apply threshold:  221
Counter({'0000-0003-0807-7195': 52, '0000-0002-9596-9134': 35, '0000-0002-2982-2177': 31, '0000-0002-4468-3240': 25, '0000-0002-0708-6770': 14, '0000-0002-4004-9728': 11, '0000-0002-3719-406X': 9, '0000-0002-6424-6777': 8, '0000-0003-1660-0724': 6, '0000-0002-1824-9563': 5, '0000-0002-2463-242X': 5, '0000-0003-2009-991X': 5, '0000-0002-9383-1270': 4, '000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.95      1.00      0.97        96
          1       1.00      0.80      0.89        25

avg / total       0.96      0.96      0.96       121

[96  0  5 20]
svc Accuracy:  0.9586776859504132
svc F1:  0.9317540891144952
             precision    recall  f1-score   support

          0       0.85      1.00      0.92        96
          1       1.00      0.32      0.48        25

avg / total       0.88      0.86      0.83       121

[96  0 17  8]
LR Accuracy:  0.859504132231405
LR F1:  0.7017543859649122
For name:  k_nomura
total sample size before apply threshold:  38
Counter({'0000-0003-3661-6328': 32, '0000-0002-6425-4574': 3, '0000-0003-0625-1778': 1, '0000-0002-5912-074X': 1, '0000-0001-7891-9795': 1})
['0000-0003-3661-6328']
Total sample size after apply threshold:  32
For name:  m_wu
total sample size before apply threshold:  658
Counter({'0000-0002-1940-6428': 219, '0000-0002-7074-8087': 194, '0000-0002-1674-4

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.07      0.13        29
          1       1.00      0.88      0.94        33
          2       1.00      0.55      0.71        11
          3       1.00      0.90      0.95        10
          4       0.81      0.69      0.74        42
          5       0.91      0.77      0.83        13
          6       0.67      0.18      0.29        22
          7       0.83      0.77      0.80        56
          8       0.76      0.90      0.82       219
          9       0.76      0.85      0.80       194

avg / total       0.80      0.79      0.76       629

[  2   0   0   0   0   0   0   3  18   6   0  29   0   0   1   0   0   1
   1   1   0   0   6   0   0   0   0   0   0   5   0   0   0   9   0   0
   0   0   0   1   0   0   0   0  29   0   0   0   4   9   0   0   0   0
   0  10   0   0   1   2   0   0   0   0   1   0   4   1  12   4   0   0
   0   0   1   1   1  43   5   5   0   0   0   0   1   0   0   1 198 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.70      0.82        10
          1       1.00      0.95      0.97        40
          2       0.73      0.92      0.81        48
          3       0.82      0.56      0.67        16
          4       0.61      0.73      0.67        48
          5       0.50      0.24      0.33        29
          6       0.81      0.85      0.83        81

avg / total       0.76      0.77      0.76       272

[ 7  0  3  0  0  0  0  0 38  2  0  0  0  0  0  0 44  0  0  1  3  0  0  0
  9  3  0  4  0  0  1  1 35  6  5  0  0  7  1 10  7  4  0  0  3  0  9  0
 69]
svc Accuracy:  0.7683823529411765
svc F1:  0.7289918901179263
             precision    recall  f1-score   support

          0       1.00      0.30      0.46        10
          1       1.00      0.93      0.96        40
          2       0.65      0.92      0.76        48
          3       0.86      0.38      0.52        16
          4       0.65      0.71      0.6

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.97      0.99        34
          1       1.00      0.65      0.79        17
          2       1.00      0.83      0.91        29
          3       1.00      0.60      0.75        10
          4       0.61      1.00      0.76        37
          5       1.00      0.50      0.67        16

avg / total       0.90      0.83      0.83       143

[33  0  0  0  1  0  0 11  0  0  6  0  0  0 24  0  5  0  0  0  0  6  4  0
  0  0  0  0 37  0  0  0  0  0  8  8]
svc Accuracy:  0.8321678321678322
svc F1:  0.8080363329035736
             precision    recall  f1-score   support

          0       1.00      0.97      0.99        34
          1       1.00      0.35      0.52        17
          2       1.00      0.79      0.88        29
          3       1.00      0.60      0.75        10
          4       0.51      1.00      0.68        37
          5       1.00      0.19      0.32        16

avg / total       0.87     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.82      1.00      0.90        54
          1       1.00      0.80      0.89        35
          2       1.00      0.55      0.71        11

avg / total       0.90      0.88      0.87       100

[54  0  0  7 28  0  5  0  6]
svc Accuracy:  0.88
svc F1:  0.8315904139433551
             precision    recall  f1-score   support

          0       0.74      1.00      0.85        54
          1       1.00      0.77      0.87        35
          2       0.00      0.00      0.00        11

avg / total       0.75      0.81      0.76       100

[54  0  0  8 27  0 11  0  0]
LR Accuracy:  0.81
LR F1:  0.5737871475742952
For name:  s_thompson
total sample size before apply threshold:  45
Counter({'0000-0003-0327-7155': 36, '0000-0003-4784-8386': 3, '0000-0001-9689-1490': 2, '0000-0001-9637-2041': 2, '0000-0002-6847-0397': 1, '0000-0002-0457-6926': 1})
['0000-0003-0327-7155']
Total sample size after apply threshold:  36
For name

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(167, 539)
(0, 0)
(0, 0)
1
167
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        14
          1       0.83      1.00      0.91       134
          2       1.00      0.26      0.42        19

avg / total       0.78      0.83      0.77       167

[  0  14   0   0 134   0   0  14   5]
MNB Accuracy:  0.8323353293413174
MNB F1:  0.44069069069069067


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.36      0.53        14
          1       0.89      1.00      0.94       134
          2       1.00      0.58      0.73        19

avg / total       0.91      0.90      0.88       167

[  5   9   0   0 134   0   0   8  11]
svc Accuracy:  0.8982035928143712
svc F1:  0.7333333333333334
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        14
          1       0.83      1.00      0.91       134
          2       1.00      0.26      0.42        19

avg / total       0.78      0.83      0.77       167

[  0  14   0   0 134   0   0  14   5]
LR Accuracy:  0.8323353293413174
LR F1:  0.44069069069069067
For name:  l_rocha
total sample size before apply threshold:  81
Counter({'0000-0001-9402-887X': 24, '0000-0002-4345-6994': 20, '0000-0002-5469-0911': 11, '0000-0001-7832-058X': 8, '0000-0001-8184-8801': 6, '0000-0003-2146-9708': 5, '0000-0002-7219-1518': 5, '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(23, 62)
(0, 0)
(0, 0)
1
23
             precision    recall  f1-score   support

          0       1.00      0.82      0.90        11
          1       0.86      1.00      0.92        12

avg / total       0.93      0.91      0.91        23

[ 9  2  0 12]
MNB Accuracy:  0.9130434782608695
MNB F1:  0.9115384615384615
             precision    recall  f1-score   support

          0       0.92      1.00      0.96        11
          1       1.00      0.92      0.96        12

avg / total       0.96      0.96      0.96        23

[11  0  1 11]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.65      0.79        26
          1       1.00      0.91      0.95        22
          2       0.85      1.00      0.92        63

avg / total       0.92      0.90      0.90       111

[17  0  9  0 20  2  0  0 63]
svc Accuracy:  0.9009009009009009
svc F1:  0.8875955519988791
             precision    recall  f1-score   support

          0       0.94      0.62      0.74        26
          1       1.00      0.68      0.81        22
          2       0.78      0.98      0.87        63

avg / total       0.86      0.84      0.83       111

[16  0 10  0 15  7  1  0 62]
LR Accuracy:  0.8378378378378378
LR F1:  0.809412097980719
For name:  m_rodriguez
total sample size before apply threshold:  214
Counter({'0000-0001-6328-6497': 195, '0000-0001-8926-2987': 8, '0000-0002-9380-6614': 4, '0000-0002-4476-004X': 3, '0000-0001-6778-1663': 2, '0000-0002-4452-7627': 1, '0000-0002-2640-5888': 1})
['0000-0001-6328-6497

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.27      0.43        11
          1       0.70      1.00      0.82        21
          2       1.00      0.90      0.95        10

avg / total       0.85      0.79      0.75        42

[ 3  8  0  0 21  0  0  1  9]
LR Accuracy:  0.7857142857142857
LR F1:  0.7331564204629221
For name:  v_pinto
total sample size before apply threshold:  48
Counter({'0000-0002-6600-1781': 29, '0000-0002-1152-1667': 11, '0000-0003-3871-9152': 7, '0000-0003-3395-1251': 1})
['0000-0002-6600-1781', '0000-0002-1152-1667']
Total sample size after apply threshold:  40
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(298, 558)
(0, 0)
(0, 0)
1
298
             precision    recall  f1-score   support

          0       1.00      0.57      0.72        23
          1       0.00      0.00      0.00        11
          2       1.00      0.42      0.59        31
          3       0.00      0.00      0.00        11
          4       1.00      0.71      0.83        28
          5       0.77      1.00      0.87       194

avg / total       0.78      0.81      0.76       298

[ 13   0   0   0   0  10   0   0   0   0   0  11   0   0  13   0   0  18
   0   0   0   0   0  11 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.78      0.88        23
          1       1.00      0.91      0.95        11
          2       1.00      0.74      0.85        31
          3       1.00      0.82      0.90        11
          4       1.00      0.89      0.94        28
          5       0.91      1.00      0.95       194

avg / total       0.94      0.94      0.93       298

[ 18   0   0   0   0   5   0  10   0   0   0   1   0   0  23   0   0   8
   0   0   0   9   0   2   0   0   0   0  25   3   0   0   0   0   0 194]
svc Accuracy:  0.9362416107382551
svc F1:  0.9131657940754428
             precision    recall  f1-score   support

          0       1.00      0.57      0.72        23
          1       1.00      0.18      0.31        11
          2       1.00      0.29      0.45        31
          3       0.00      0.00      0.00        11
          4       1.00      0.46      0.63        28
          5       0.74      1.00      0.85   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       0.86      0.95      0.90        44
          2       0.94      1.00      0.97        29
          3       1.00      0.75      0.86        12
          4       0.88      0.75      0.81        20

avg / total       0.91      0.91      0.90       116

[10  0  1  0  0  0 42  1  0  1  0  0 29  0  0  0  2  0  9  1  0  5  0  0
 15]
svc Accuracy:  0.9051724137931034
svc F1:  0.89804541869058
             precision    recall  f1-score   support

          0       1.00      0.73      0.84        11
          1       0.78      0.98      0.87        44
          2       0.88      1.00      0.94        29
          3       1.00      0.50      0.67        12
          4       0.93      0.65      0.76        20

avg / total       0.87      0.85      0.84       116

[ 8  0  3  0  0  0 43  1  0  0  0  0 29  0  0  0  5  0  6  1  0  7  0  0
 13]
LR Accuracy:  0.853448275862069
LR F

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.75      0.86        24
          1       1.00      0.76      0.86        25
          2       0.85      1.00      0.92        78
          3       0.95      0.90      0.93        42

avg / total       0.92      0.91      0.90       169

[18  0  4  2  0 19  6  0  0  0 78  0  0  0  4 38]
svc Accuracy:  0.9053254437869822
svc F1:  0.8913138869738583
             precision    recall  f1-score   support

          0       1.00      0.29      0.45        24
          1       1.00      0.60      0.75        25
          2       0.70      1.00      0.83        78
          3       1.00      0.86      0.92        42

avg / total       0.86      0.80      0.79       169

[ 7  0 17  0  0 15 10  0  0  0 78  0  0  0  6 36]
LR Accuracy:  0.8047337278106509
LR F1:  0.7375216629248886
For name:  j_xavier
total sample size before apply threshold:  22
Counter({'0000-0002-0702-6700': 12, '0000-0003-1386-4492': 7, '0000-00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.95      0.99      0.97       163
          1       0.92      0.58      0.71        19

avg / total       0.95      0.95      0.95       182

[162   1   8  11]
svc Accuracy:  0.9505494505494505
svc F1:  0.8413251961639059
             precision    recall  f1-score   support

          0       0.90      1.00      0.94       163
          1       0.00      0.00      0.00        19

avg / total       0.80      0.90      0.85       182

[163   0  19   0]
LR Accuracy:  0.8956043956043956
LR F1:  0.47246376811594204
For name:  r_ellis
total sample size before apply threshold:  176
Counter({'0000-0003-4931-752X': 158, '0000-0001-7691-5205': 16, '0000-0002-9755-9913': 1, '0000-0003-2355-5407': 1})
['0000-0001-7691-5205', '0000-0003-4931-752X']
Total sample size after apply threshold:  174
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='con

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.81      0.90        16
          1       0.98      1.00      0.99       158

avg / total       0.98      0.98      0.98       174

[ 13   3   0 158]
svc Accuracy:  0.9827586206896551
svc F1:  0.9435736677115988
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        16
          1       0.91      1.00      0.95       158

avg / total       0.82      0.91      0.86       174

[  0  16   0 158]
LR Accuracy:  0.9080459770114943
LR F1:  0.4759036144578313
For name:  v_saini
total sample size before apply threshold:  18
Counter({'0000-0002-0258-2871': 11, '0000-0002-9944-0262': 5, '0000-0003-2734-0120': 1, '0000-0002-6796-5881': 1})
['0000-0002-0258-2871']
Total sample size after apply threshold:  11
For name:  a_ellis
total sample size before apply threshold:  168
Counter({'0000-0001-7456-9214': 47, '0000-0002-0725-2353': 41, '0000-0002-0417-0547': 40, '0000-0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.97      0.99        40
          1       1.00      0.96      0.98        47
          2       0.82      1.00      0.90        41
          3       1.00      0.84      0.91        37

avg / total       0.96      0.95      0.95       165

[39  0  1  0  0 45  2  0  0  0 41  0  0  0  6 31]
svc Accuracy:  0.9454545454545454
svc F1:  0.9446165621745926
             precision    recall  f1-score   support

          0       0.93      0.97      0.95        40
          1       0.79      0.96      0.87        47
          2       0.89      0.78      0.83        41
          3       1.00      0.81      0.90        37

avg / total       0.90      0.88      0.88       165

[39  1  0  0  0 45  2  0  2  7 32  0  1  4  2 30]
LR Accuracy:  0.8848484848484849
LR F1:  0.8858238367020674
For name:  f_reis
total sample size before apply threshold:  222
Counter({'0000-0002-9258-7472': 111, '0000-0003-3401-9554': 92, '0000-0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.92      0.96        12
          1       0.94      0.99      0.96       111
          2       0.99      0.93      0.96        92

avg / total       0.96      0.96      0.96       215

[ 11   1   0   0 110   1   0   6  86]
svc Accuracy:  0.9627906976744186
svc F1:  0.9607759581935976
             precision    recall  f1-score   support

          0       1.00      0.33      0.50        12
          1       0.86      0.97      0.91       111
          2       0.96      0.89      0.93        92

avg / total       0.91      0.90      0.89       215

[  4   8   0   0 108   3   0  10  82]
LR Accuracy:  0.9023255813953488
LR F1:  0.7793153591265584
For name:  j_gray
total sample size before apply threshold:  112
Counter({'0000-0001-6380-2324': 55, '0000-0003-4146-7902': 24, '0000-0003-2338-0301': 17, '0000-0002-7287-0748': 8, '0000-0001-5863-6835': 3, '0000-0001-9972-5156': 2, '0000-0001-6668-5899': 1, '0000-0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.88      0.93        24
          1       0.81      1.00      0.89        55
          2       1.00      0.41      0.58        17

avg / total       0.89      0.86      0.85        96

[21  3  0  0 55  0  0 10  7]
svc Accuracy:  0.8645833333333334
svc F1:  0.8036585365853659
             precision    recall  f1-score   support

          0       1.00      0.54      0.70        24
          1       0.66      1.00      0.80        55
          2       0.00      0.00      0.00        17

avg / total       0.63      0.71      0.63        96

[13 11  0  0 55  0  0 17  0]
LR Accuracy:  0.7083333333333334
LR F1:  0.4999347173260216
For name:  r_hughes
total sample size before apply threshold:  57
Counter({'0000-0001-9910-6566': 30, '0000-0002-4465-4212': 18, '0000-0002-6307-4432': 7, '0000-0002-2875-2103': 2})
['0000-0002-4465-4212', '0000-0001-9910-6566']
Total sample size after apply threshold:  48
TfidfVecto

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(153, 338)
(0, 0)
(0, 0)
1
153
             precision    recall  f1-score   support

          0       1.00      0.79      0.89        39
          1       0.84      1.00      0.91        79
          2       1.00      0.80      0.89        35

avg / total       0.92      0.90      0.90       153

[31  8  0  0 79  0  0  7 28]
MNB Accuracy:  0.9019607843137255
MNB F1:  0.8959659907636787
             precision    recall  f1-score   support

          0       1.00      0.77      0.87        39
          1       0.83      1.00      0.91        79
      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.66      1.00      0.79        38
          1       1.00      0.88      0.93        16
          2       1.00      0.09      0.17        11
          3       1.00      0.67      0.80        18
          4       0.93      0.82      0.87        17

avg / total       0.86      0.79      0.76       100

[38  0  0  0  0  2 14  0  0  0  9  0  1  0  1  6  0  0 12  0  3  0  0  0
 14]
MNB Accuracy:  0.79
MNB F1:  0.7133333333333334
             precision    recall  f1-score   support

          0       0.79      1.00      0.88        38
          1       1.00      1.00      1.00        16
          2       1.00      0.73      0.84        11
          3       1.00      0.83      0.91        18
          4       1.00      0.76      0.87        17

avg / total       0.92      0.90      0.90       100

[38  0  0  0  0  0 16  0  0  0  3  0  8  0  0  3  0  0 15  0  4  0  0  0
 13]
svc Accuracy:  0.9
svc F1:  0.9003167538296057
 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.96      0.98        25
          1       0.84      0.98      0.90        42
          2       0.93      0.98      0.95        43
          3       1.00      0.72      0.84        29

avg / total       0.93      0.92      0.92       139

[24  1  0  0  0 41  1  0  0  1 42  0  0  6  2 21]
MNB Accuracy:  0.920863309352518
MNB F1:  0.9188090480947624
             precision    recall  f1-score   support

          0       1.00      0.88      0.94        25
          1       0.88      0.90      0.89        42
          2       0.81      0.98      0.88        43
          3       1.00      0.76      0.86        29

avg / total       0.91      0.89      0.89       139

[22  2  1  0  0 38  4  0  0  1 42  0  0  2  5 22]
svc Accuracy:  0.8920863309352518
svc F1:  0.8943108710449466
             precision    recall  f1-score   support

          0       1.00      0.84      0.91        25
          1       1.00      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(24, 78)
(0, 0)
(0, 0)
1
24
             precision    recall  f1-score   support

          0       0.90      0.75      0.82        12
          1       0.79      0.92      0.85        12

avg / total       0.84      0.83      0.83        24

[ 9  3  1 11]
MNB Accuracy:  0.8333333333333334
MNB F1:  0.8321678321678322
             precision    recall  f1-score   support

          0       0.83      0.83      0.83        12
          1       0.83      0.83      0.83        12

avg / total       0.83      0.83      0.83        24

[10  2  2 10]
svc Accu

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(48, 65)
(0, 0)
(0, 0)
1
48
             precision    recall  f1-score   support

          0       0.82      1.00      0.90        18
          1       1.00      0.95      0.97        19
          2       1.00      0.73      0.84        11

avg / total       0.93      0.92      0.92        48

[18  0  0  1 18  0  3  0  8]
MNB Accuracy:  0.9166666666666666
MNB F1:  0.9050260787102893
             precision    recall  f1-score   support

          0       0.95      1.00      0.97        18
          1       0.90      1.00      0.95        19
         

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.86      1.00      0.93        38
          1       1.00      0.62      0.77        16

avg / total       0.90      0.89      0.88        54

[38  0  6 10]
svc Accuracy:  0.8888888888888888
svc F1:  0.848030018761726
             precision    recall  f1-score   support

          0       0.73      1.00      0.84        38
          1       1.00      0.12      0.22        16

avg / total       0.81      0.74      0.66        54

[38  0 14  2]
LR Accuracy:  0.7407407407407407
LR F1:  0.5333333333333332
For name:  d_ghosh
total sample size before apply threshold:  23
Counter({'0000-0002-6571-304X': 9, '0000-0003-0256-1998': 6, '0000-0003-3266-9262': 6, '0000-0001-9691-1498': 1, '0000-0001-8222-5737': 1})
[]
Total sample size after apply threshold:  0
For name:  r_morgan
total sample size before apply threshold:  15
Counter({'0000-0003-1664-5316': 8, '0000-0003-0194-0304': 4, '0000-0002-3881-7257': 1, '0000-0002-2842-

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.82      0.82      0.82        11
          1       1.00      0.53      0.70        15
          2       1.00      0.64      0.78        14
          3       0.72      0.94      0.82        36
          4       0.76      0.95      0.84        20
          5       1.00      0.86      0.92        14
          6       0.91      0.97      0.94        33
          7       0.94      0.75      0.83        20

avg / total       0.87      0.85      0.84       163

[ 9  0  0  0  1  0  1  0  0  8  0  7  0  0  0  0  0  0  9  3  2  0  0  0
  0  0  0 34  2  0  0  0  0  0  0  0 19  0  0  1  0  0  0  0  1 12  1  0
  1  0  0  0  0  0 32  0  1  0  0  3  0  0  1 15]
svc Accuracy:  0.8466257668711656
svc F1:  0.8322188709529633
             precision    recall  f1-score   support

          0       0.90      0.82      0.86        11
          1       1.00      0.27      0.42        15
          2       1.00      0.29      0.44       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.61      0.76        28
          1       0.90      0.47      0.62        19
          2       1.00      0.20      0.33        10
          3       1.00      0.94      0.97        16
          4       1.00      0.83      0.91        12
          5       0.83      0.69      0.75        29
          6       0.50      0.15      0.24        13
          7       0.60      0.94      0.74       194
          8       0.89      0.62      0.73        39
          9       1.00      0.53      0.70        30
         10       1.00      0.57      0.73        14
         11       1.00      0.81      0.90        27
         12       1.00      0.80      0.89        15
         13       0.71      0.78      0.75       101
         14       0.85      0.52      0.65        21
         15       1.00      0.15      0.27        13
         16       1.00      0.79      0.88        19
         17       0.68      0.62      0.65   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.99      1.00      1.00       356
          1       1.00      0.88      0.94        17

avg / total       0.99      0.99      0.99       373

[356   0   2  15]
svc Accuracy:  0.9946380697050938
svc F1:  0.9673494397759104
             precision    recall  f1-score   support

          0       0.95      1.00      0.98       356
          1       0.00      0.00      0.00        17

avg / total       0.91      0.95      0.93       373

[356   0  17   0]
LR Accuracy:  0.9544235924932976
LR F1:  0.4883401920438957
For name:  k_yamamoto
total sample size before apply threshold:  106
Counter({'0000-0002-7935-7015': 93, '0000-0003-0866-3207': 4, '0000-0002-7590-3568': 4, '0000-0001-6642-7961': 2, '0000-0002-6831-5346': 2, '0000-0002-1619-4407': 1})
['0000-0002-7935-7015']
Total sample size after apply threshold:  93
For name:  j_silva
total sample size before apply threshold:  268
Counter({'0000-0001-9523-9441': 128, '000

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.93      0.93      0.93        14
          1       1.00      0.77      0.87        13
          2       1.00      0.96      0.98        28
          3       1.00      0.68      0.81        22
          4       0.92      1.00      0.96       128

avg / total       0.95      0.94      0.94       205

[ 13   0   0   0   1   0  10   0   0   3   0   0  27   0   1   1   0   0
  15   6   0   0   0   0 128]
svc Accuracy:  0.9414634146341463
svc F1:  0.9099134273438134
             precision    recall  f1-score   support

          0       0.92      0.79      0.85        14
          1       1.00      0.62      0.76        13
          2       1.00      0.75      0.86        28
          3       1.00      0.36      0.53        22
          4       0.82      1.00      0.90       128

avg / total       0.88      0.86      0.84       205

[ 11   0   0   0   3   0   8   0   0   5   0   0  21   0   7   1   0   0
   8  13   0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.07      0.13        14
          1       0.55      1.00      0.71        31
          2       1.00      0.20      0.33        15

avg / total       0.77      0.58      0.48        60

[ 1 13  0  0 31  0  0 12  3]
LR Accuracy:  0.5833333333333334
LR F1:  0.393103448275862
For name:  s_kwon
total sample size before apply threshold:  51
Counter({'0000-0002-8490-9101': 17, '0000-0002-0679-1523': 15, '0000-0002-1857-3515': 9, '0000-0002-8215-442X': 5, '0000-0001-5265-862X': 1, '0000-0002-9121-3954': 1, '0000-0003-0249-4190': 1, '0000-0001-9287-4490': 1, '0000-0003-1147-8037': 1})
['0000-0002-0679-1523', '0000-0002-8490-9101']
Total sample size after apply threshold:  32
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(98, 377)
(0, 0)
(0, 0)
1
98
             precision    recall  f1-score   support

          0       1.00      0.85      0.92        27
          1       1.00      0.73      0.84        11
          2       1.00      0.83      0.91        12
          3       1.00      0.08      0.15        12
          4       0.64      1.00      0.78        36

avg / total       0.87      0.80      0.77        98

[23  0  0  0  4  0  8  0  0  3  0  0 10  0  2  0  0  0  1 11  0  0  0  0
 36]
MNB Accuracy:  0.7959183673469388
MNB F1:  0.7215302043494263
             

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(58, 192)
(0, 0)
(0, 0)
1
58
             precision    recall  f1-score   support

          0       1.00      0.31      0.48        16
          1       0.79      1.00      0.88        42

avg / total       0.85      0.81      0.77        58

[ 5 11  0 42]
MNB Accuracy:  0.8103448275862069
MNB F1:  0.6802005012531329
             precision    recall  f1-score   support

          0       1.00      0.75      0.86        16
          1       0.91      1.00      0.95        42

avg / total       0.94      0.93      0.93        58

[12  4  0 42]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 0.873202614379085
             precision    recall  f1-score   support

          0       1.00      0.80      0.89        10
          1       1.00      0.84      0.91        38
          2       0.91      1.00      0.95        81

avg / total       0.94      0.94      0.94       129

[ 8  0  2  0 32  6  0  0 81]
svc Accuracy:  0.937984496124031
svc F1:  0.9187052598817305
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        10
          1       1.00      0.61      0.75        38
          2       0.80      1.00      0.89        81

avg / total       0.88      0.84      0.83       129

[ 5  0  5  0 23 15  0  0 81]
LR Accuracy:  0.8449612403100775
LR F1:  0.7702916391440983
For name:  m_ferreira
total sample size before apply threshold:  253
Counter({'0000-0002-5293-9090': 80, '0000-0002-6814-6773': 33, '0000-0002-9459-8167': 23, '0000-0001-8362-0819': 16, '0000-0003-2098-066X': 15, '0000-0002-0856-9811': 14, '0000-0002-0075-2400': 10,

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.20      0.33        10
          1       1.00      0.93      0.97        15
          2       1.00      0.70      0.82        10
          3       0.67      1.00      0.80        80
          4       1.00      0.21      0.35        14
          5       1.00      0.75      0.86        16
          6       1.00      0.83      0.90        23
          7       0.96      0.76      0.85        33
          8       1.00      0.80      0.89        10

avg / total       0.87      0.81      0.79       211

[ 2  0  0  8  0  0  0  0  0  0 14  0  1  0  0  0  0  0  0  0  7  3  0  0
  0  0  0  0  0  0 80  0  0  0  0  0  0  0  0 11  3  0  0  0  0  0  0  0
  3  0 12  0  1  0  0  0  0  4  0  0 19  0  0  0  0  0  8  0  0  0 25  0
  0  0  0  2  0  0  0  0  8]
svc Accuracy:  0.8056872037914692
svc F1:  0.7526191600955815
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.25      0.40        20
          1       0.00      0.00      0.00        15
          2       0.82      1.00      0.90       251
          3       0.00      0.00      0.00        26

avg / total       0.72      0.82      0.75       312

[  5   0  15   0   0   0  15   0   0   0 250   1   0   0  26   0]
MNB Accuracy:  0.8173076923076923
MNB F1:  0.32441651705565533
             precision    recall  f1-score   support

          0       1.00      0.55      0.71        20
          1       1.00      0.80      0.89        15
          2       0.89      1.00      0.94       251
          3       1.00      0.31      0.47        26

avg / total       0.91      0.90      0.89       312

[ 11   0   9   0   0  12   3   0   0   0 251   0   0   0  18   8]
svc Accuracy:  0.9038461538461539
svc F1:  0.7531908915235591
             precision    recall  f1-score   support

          0       1.00      0.05      0.10     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        21
          1       1.00      0.70      0.82        10
          2       0.88      1.00      0.93        21

avg / total       0.95      0.94      0.94        52

[21  0  0  0  7  3  0  0 21]
LR Accuracy:  0.9423076923076923
LR F1:  0.918954248366013
For name:  g_miller
total sample size before apply threshold:  76
Counter({'0000-0002-4743-8187': 26, '0000-0001-8984-1284': 23, '0000-0001-6533-3306': 13, '0000-0003-4527-3814': 11, '0000-0002-1108-0654': 3})
['0000-0001-6533-3306', '0000-0001-8984-1284', '0000-0003-4527-3814', '0000-0002-4743-8187']
Total sample size after apply threshold:  73
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_w

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(44, 48)
(0, 0)
(0, 0)
1
44
             precision    recall  f1-score   support

          0       0.75      0.30      0.43        10
          1       0.82      0.97      0.89        34

avg / total       0.81      0.82      0.79        44

[ 3  7  1 33]
MNB Accuracy:  0.8181818181818182
MNB F1:  0.6602316602316602
             precision    recall  f1-score   support

          0       0.88      0.70      0.78        10
          1       0.92      0.97      0.94        34

avg / total       0.91      0.91      0.91        44

[ 7  3  1 33]
svc Accu

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.88      0.93        16
          1       0.86      0.40      0.55        15
          2       0.84      0.96      0.90        28
          3       0.89      0.89      0.89        18
          4       0.78      0.94      0.85        31

avg / total       0.86      0.85      0.84       108

[14  1  0  0  1  0  6  4  0  5  0  0 27  1  0  0  0  0 16  2  0  0  1  1
 29]
LR Accuracy:  0.8518518518518519
LR F1:  0.8241235888294712
For name:  j_chin
total sample size before apply threshold:  27
Counter({'0000-0003-3932-8639': 13, '0000-0002-1840-325X': 9, '0000-0002-2878-8544': 3, '0000-0001-9809-6976': 1, '0000-0001-7626-6778': 1})
['0000-0003-3932-8639']
Total sample size after apply threshold:  13
For name:  h_kwon
total sample size before apply threshold:  35
Counter({'0000-0003-4979-8749': 13, '0000-0002-6919-833X': 7, '0000-0002-0960-0198': 5, '0000-0001-6941-4808': 3, '0000-0003-4026-4572': 3, '0000-0002

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(348, 909)
(0, 0)
(0, 0)
1
348
             precision    recall  f1-score   support

          0       0.95      0.94      0.95       102
          1       1.00      0.39      0.56        46
          2       0.00      0.00      0.00        15
          3       0.76      1.00      0.86       173
          4       0.00      0.00      0.00        12

avg / total       0.79      0.82      0.78       348

[ 96   0   0   6   0   4  18   0  24   0   0   0   0  15   0   0   0   0
 173   0   1   0   0  11   0]
MNB Accuracy:  0.8247126436781609
MNB F1:  0.473

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.99      0.94      0.96       102
          1       1.00      0.70      0.82        46
          2       1.00      0.60      0.75        15
          3       0.86      1.00      0.93       173
          4       1.00      0.75      0.86        12

avg / total       0.93      0.92      0.91       348

[ 96   0   0   6   0   0  32   0  14   0   0   0   9   6   0   0   0   0
 173   0   1   0   0   2   9]
svc Accuracy:  0.9166666666666666
svc F1:  0.863522697619653
             precision    recall  f1-score   support

          0       0.99      0.84      0.91       102
          1       1.00      0.33      0.49        46
          2       1.00      0.07      0.12        15
          3       0.71      1.00      0.83       173
          4       1.00      0.25      0.40        12

avg / total       0.85      0.80      0.77       348

[ 86   0   0  16   0   1  15   0  30   0   0   0   1  14   0   0   0   0
 173   0   0   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.88      0.93      0.90        40
          1       0.92      0.87      0.89        39

avg / total       0.90      0.90      0.90        79

[37  3  5 34]
svc Accuracy:  0.8987341772151899
svc F1:  0.8985879332477535
             precision    recall  f1-score   support

          0       0.92      0.90      0.91        40
          1       0.90      0.92      0.91        39

avg / total       0.91      0.91      0.91        79

[36  4  3 36]
LR Accuracy:  0.9113924050632911
LR F1:  0.9113924050632911
For name:  m_adams
total sample size before apply threshold:  190
Counter({'0000-0003-0435-8651': 59, '0000-0001-8989-508X': 46, '0000-0001-6310-1472': 30, '0000-0002-7743-4515': 29, '0000-0003-2849-9096': 12, '0000-0002-5277-5487': 7, '0000-0002-3878-7684': 5, '0000-0002-3602-6849': 1, '0000-0002-4645-2593': 1})
['0000-0002-7743-4515', '0000-0001-6310-1472', '0000-0003-2849-9096', '0000-0003-0435-8651', '0000-0001-8

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(45, 89)
(0, 0)
(0, 0)
1
45
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        10
          1       1.00      0.73      0.84        11
          2       0.63      0.92      0.75        13
          3       0.77      0.91      0.83        11

avg / total       0.84      0.78      0.77        45

[ 5  0  3  2  0  8  3  0  0  0 12  1  0  0  1 10]
MNB Accuracy:  0.7777777777777778
MNB F1:  0.7730263157894737
             precision    recall  f1-score   support

          0       0.82      0.90      0.

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.94      0.97        53
          1       1.00      0.93      0.96        14
          2       1.00      0.50      0.67        10
          3       0.88      1.00      0.93        64

avg / total       0.94      0.94      0.93       141

[50  0  0  3  0 13  0  1  0  0  5  5  0  0  0 64]
svc Accuracy:  0.9361702127659575
svc F1:  0.8837024963451157
             precision    recall  f1-score   support

          0       1.00      0.89      0.94        53
          1       1.00      0.79      0.88        14
          2       0.00      0.00      0.00        10
          3       0.77      1.00      0.87        64

avg / total       0.83      0.87      0.84       141

[47  0  0  6  0 11  0  3  0  0  0 10  0  0  0 64]
LR Accuracy:  0.8652482269503546
LR F1:  0.6726870748299321
For name:  c_scott
total sample size before apply threshold:  162
Counter({'0000-0003-1340-0647': 98, '0000-0001-6110-6982': 39, '0000-0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.57      0.73        21
          1       1.00      0.85      0.92        39
          2       0.87      1.00      0.93        98

avg / total       0.92      0.91      0.90       158

[12  0  9  0 33  6  0  0 98]
svc Accuracy:  0.9050632911392406
svc F1:  0.8576164488486763
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        21
          1       1.00      0.62      0.76        39
          2       0.73      1.00      0.84        98

avg / total       0.70      0.77      0.71       158

[ 0  0 21  0 24 15  0  0 98]
LR Accuracy:  0.7721518987341772
LR F1:  0.5355774493705528
For name:  m_mukherjee
total sample size before apply threshold:  16
Counter({'0000-0003-3706-406X': 4, '0000-0002-7924-7211': 4, '0000-0002-3083-436X': 3, '0000-0003-0376-8173': 2, '0000-0002-3615-7574': 2, '0000-0001-9653-0556': 1})
[]
Total sample size after apply threshold:  0
Fo

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(117, 517)
(0, 0)
(0, 0)
1
117
             precision    recall  f1-score   support

          0       1.00      0.81      0.90        16
          1       1.00      0.62      0.76        13
          2       1.00      0.62      0.77        16
          3       0.90      1.00      0.95        35
          4       0.77      0.97      0.86        37

avg / total       0.90      0.87      0.87       117

[13  0  0  1  2  0  8  0  2  3  0  0 10  0  6  0  0  0 35  0  0  0  0  1
 36]
MNB Accuracy:  0.8717948717948718
MNB F1:  0.846155211672453
            

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



Total sample size after apply threshold:  67
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(67, 329)
(0, 0)
(0, 0)
1
67
             precision    recall  f1-score   support

          0       0.87      0.87      0.87        23
          1       0.89      0.89      0.89        27
          2       1.00      1.00      1.00        17

avg / total       0.91      0.91      0.91        67

[20  3  0  3 24  0  0  0 17]
MNB Accuracy:  0.9104477611940298
MNB F1:  0.9194847020933977
             precision    recall  f1-score   support

          0       0.86      0.78      0.82        23
          1    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



103
             precision    recall  f1-score   support

          0       1.00      0.54      0.70        28
          1       0.85      1.00      0.92        75

avg / total       0.89      0.87      0.86       103

[15 13  0 75]
MNB Accuracy:  0.8737864077669902
MNB F1:  0.8089599086888286
             precision    recall  f1-score   support

          0       1.00      0.68      0.81        28
          1       0.89      1.00      0.94        75

avg / total       0.92      0.91      0.91       103

[19  9  0 75]
svc Accuracy:  0.912621359223301
svc F1:  0.8759534323564835
             precision    recall  f1-score   support

          0       1.00      0.25      0.40        28
          1       0.78      1.00      0.88        75

avg / total       0.84      0.80      0.75       103

[ 7 21  0 75]
LR Accuracy:  0.7961165048543689
LR F1:  0.6385964912280702
For name:  a_vincent
total sample size before apply threshold:  79
Counter({'0000-0002-4185-3267': 39, '0000-0001-6446-3846':

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.63      0.77        35
          1       0.85      1.00      0.92        76

avg / total       0.90      0.88      0.87       111

[22 13  0 76]
svc Accuracy:  0.8828828828828829
svc F1:  0.8465709728867623
             precision    recall  f1-score   support

          0       1.00      0.23      0.37        35
          1       0.74      1.00      0.85        76

avg / total       0.82      0.76      0.70       111

[ 8 27  0 76]
LR Accuracy:  0.7567567567567568
LR F1:  0.6106275172144991
For name:  d_park
total sample size before apply threshold:  156
Counter({'0000-0003-2307-8575': 95, '0000-0002-6001-4223': 17, '0000-0001-9209-0493': 14, '0000-0003-0147-2424': 13, '0000-0002-7507-1175': 9, '0000-0002-7325-5480': 2, '0000-0001-9675-7179': 2, '0000-0002-5560-873X': 1, '0000-0003-4991-5247': 1, '0000-0002-1007-8595': 1, '0000-0001-9969-3051': 1})
['0000-0001-9209-0493', '0000-0002-6001-4223', '0000-00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.64      0.78        14
          1       0.92      0.65      0.76        17
          2       1.00      0.77      0.87        13
          3       0.88      1.00      0.94        95

avg / total       0.91      0.90      0.89       139

[ 9  1  0  4  0 11  0  6  0  0 10  3  0  0  0 95]
svc Accuracy:  0.8992805755395683
svc F1:  0.836688798457914
             precision    recall  f1-score   support

          0       1.00      0.07      0.13        14
          1       1.00      0.47      0.64        17
          2       1.00      0.31      0.47        13
          3       0.75      1.00      0.86        95

avg / total       0.83      0.78      0.72       139

[ 1  0  0 13  0  8  0  9  0  0  4  9  0  0  0 95]
LR Accuracy:  0.7769784172661871
LR F1:  0.5259125188536953
For name:  d_gao
total sample size before apply threshold:  23
Counter({'0000-0003-1821-2741': 14, '0000-0002-9391-1756': 7, '0000-0001-8

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.89      0.94        18
          1       0.76      1.00      0.86        22
          2       1.00      0.67      0.80        12
          3       1.00      0.93      0.96        14

avg / total       0.92      0.89      0.89        66

[16  2  0  0  0 22  0  0  0  4  8  0  0  1  0 13]
svc Accuracy:  0.8939393939393939
svc F1:  0.8917211328976034
             precision    recall  f1-score   support

          0       1.00      0.89      0.94        18
          1       0.69      1.00      0.81        22
          2       1.00      0.50      0.67        12
          3       1.00      0.86      0.92        14

avg / total       0.90      0.85      0.85        66

[16  2  0  0  0 22  0  0  0  6  6  0  0  2  0 12]
LR Accuracy:  0.8484848484848485
LR F1:  0.8364337187866598
For name:  j_peters
total sample size before apply threshold:  154
Counter({'0000-0001-8503-1452': 57, '0000-0002-6725-2814': 36, '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(55, 239)
(0, 0)
(0, 0)
1
55
             precision    recall  f1-score   support

          0       0.93      0.93      0.93        15
          1       1.00      0.81      0.90        16
          2       0.89      1.00      0.94        24

avg / total       0.93      0.93      0.93        55

[14  0  1  1 13  2  0  0 24]
MNB Accuracy:  0.9272727272727272
MNB F1:  0.9236871760198332
             precision    recall  f1-score   support

          0       1.00      0.93      0.97        15
          1       0.88      0.94      0.91        16
          2       0.92      0.92      0.92        24

avg / total       0.93      0.93      0.93        55

[14  0  1  0 15  1  0  2 22]
svc Accuracy:  0.9272727272727272
svc F1:  0.9304249390456287
             precision    recall  f1-score   support

          0       1.00      0.93      0.97        15
          1       1.00      0.75      0.86        16
          2       0.83      1.00      0.91        24

avg / total       0.92      0.91      0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.83      1.00      0.91       120
          1       1.00      0.72      0.84        25
          2       1.00      0.46      0.63        13
          3       1.00      0.92      0.96        52
          4       1.00      0.40      0.57        10

avg / total       0.91      0.89      0.88       220

[120   0   0   0   0   7  18   0   0   0   7   0   6   0   0   4   0   0
  48   0   6   0   0   0   4]
svc Accuracy:  0.8909090909090909
svc F1:  0.7818615460426966
             precision    recall  f1-score   support

          0       0.71      1.00      0.83       120
          1       1.00      0.40      0.57        25
          2       0.00      0.00      0.00        13
          3       1.00      0.81      0.89        52
          4       0.00      0.00      0.00        10

avg / total       0.74      0.78      0.73       220

[120   0   0   0   0  15  10   0   0   0  13   0   0   0   0  10   0   0
  42   0  10  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.10      0.18        10
          1       0.74      0.83      0.78        24
          2       0.55      0.97      0.70        34
          3       0.83      0.50      0.62        20
          4       0.00      0.00      0.00        13

avg / total       0.63      0.63      0.56       101

[ 1  3  6  0  0  0 20  3  1  0  0  0 33  0  1  0  4  6 10  0  0  0 12  1
  0]
MNB Accuracy:  0.6336633663366337
MNB F1:  0.4586519133765692
             precision    recall  f1-score   support

          0       0.89      0.80      0.84        10
          1       0.95      0.83      0.89        24
          2       1.00      0.94      0.97        34
          3       0.65      0.85      0.74        20
          4       0.69      0.69      0.69        13

avg / total       0.87      0.85      0.86       101

[ 8  0  0  2  0  1 20  0  2  1  0  0 32  1  1  0  1  0 17  2  0  0  0  4
  9]
svc Accuracy:  0.8514851485148515


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.95      0.68      0.79       113
          1       1.00      0.10      0.17        21
          2       1.00      0.11      0.20        27
          3       0.00      0.00      0.00        10
          4       0.83      1.00      0.91       423

avg / total       0.85      0.85      0.81       594

[ 77   0   0   0  36   3   2   0   0  16   1   0   3   0  23   0   0   0
   0  10   0   0   0   0 423]
MNB Accuracy:  0.8501683501683501
MNB F1:  0.41528555974042164
             precision    recall  f1-score   support

          0       0.96      0.83      0.89       113
          1       1.00      0.57      0.73        21
          2       1.00      0.52      0.68        27
          3       1.00      0.70      0.82        10
          4       0.91      1.00      0.95       423

avg / total       0.93      0.93      0.92       594

[ 94   0   0   0  19   1  12   0   0   8   1   0  14   0  12   2   0   0
   7   1   0 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.88      0.93        16
          1       0.76      1.00      0.86        22
          2       1.00      1.00      1.00        14
          3       1.00      0.50      0.67        10

avg / total       0.91      0.89      0.88        62

[14  2  0  0  0 22  0  0  0  0 14  0  0  5  0  5]
MNB Accuracy:  0.8870967741935484
MNB F1:  0.8656862745098038
             precision    recall  f1-score   support

          0       1.00      0.81      0.90        16
          1       0.88      1.00      0.94        22
          2       1.00      1.00      1.00        14
          3       1.00      1.00      1.00        10

avg / total       0.96      0.95      0.95        62

[13  3  0  0  0 22  0  0  0  0 14  0  0  0  0 10]
svc Accuracy:  0.9516129032258065
svc F1:  0.9581804842259721
             precision    recall  f1-score   support

          0       1.00      0.81      0.90        16
          1       0.73     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.50      0.67        16
          1       0.69      1.00      0.82        40
          2       1.00      0.69      0.81        32

avg / total       0.86      0.80      0.79        88

[ 8  8  0  0 40  0  0 10 22]
LR Accuracy:  0.7954545454545454
LR F1:  0.7659360040312421
For name:  y_yuan
total sample size before apply threshold:  67
Counter({'0000-0003-1376-0028': 17, '0000-0001-7094-4419': 11, '0000-0003-4284-3973': 10, '0000-0003-4706-7897': 10, '0000-0002-7577-3257': 9, '0000-0003-3020-0700': 4, '0000-0002-1761-9040': 3, '0000-0002-6719-2567': 1, '0000-0002-1823-3174': 1, '0000-0002-2292-7339': 1})
['0000-0003-1376-0028', '0000-0003-4284-3973', '0000-0003-4706-7897', '0000-0001-7094-4419']
Total sample size after apply threshold:  48
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, m

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.36      0.53        14
          1       1.00      0.92      0.96        13
          2       0.91      0.83      0.87        24
          3       0.94      1.00      0.97        17
          4       0.63      0.97      0.76        35
          5       1.00      0.30      0.46        10
          6       1.00      0.87      0.93        15
          7       1.00      0.73      0.84        11
          8       0.82      0.92      0.87        36

avg / total       0.87      0.83      0.82       175

[ 5  0  0  0  8  0  0  0  1  0 12  0  0  1  0  0  0  0  0  0 20  0  4  0
  0  0  0  0  0  0 17  0  0  0  0  0  0  0  0  1 34  0  0  0  0  0  0  1
  0  3  3  0  0  3  0  0  0  0  1  0 13  0  1  0  0  1  0  0  0  0  8  2
  0  0  0  0  3  0  0  0 33]
svc Accuracy:  0.8285714285714286
svc F1:  0.7991100808903498
             precision    recall  f1-score   support

          0       1.00      0.21      0.35        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(49, 186)
(0, 0)
(0, 0)
1
49
             precision    recall  f1-score   support

          0       1.00      0.36      0.53        11
          1       0.84      1.00      0.92        38

avg / total       0.88      0.86      0.83        49

[ 4  7  0 38]
MNB Accuracy:  0.8571428571428571
MNB F1:  0.7244979919678715
             precision    recall  f1-score   support

          0       1.00      0.73      0.84        11
          1       0.93      1.00      0.96        38

avg / total       0.94      0.94      0.94        49

[ 8  3  0 38]
svc Acc

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(33, 82)
(0, 0)
(0, 0)
1
33
             precision    recall  f1-score   support

          0       0.93      0.81      0.87        16
          1       0.84      0.94      0.89        17

avg / total       0.88      0.88      0.88        33

[13  3  1 16]
MNB Accuracy:  0.8787878787878788
MNB F1:  0.8777777777777778
             precision    recall  f1-score   support

          0       1.00      0.88      0.93        16
          1       0.89      1.00      0.94        17

avg / total       0.95      0.94      0.94        33

[14  2  0 17]
svc Accu

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.87      1.00      0.93       102
          1       1.00      0.09      0.17        11
          2       1.00      0.88      0.94        42

avg / total       0.92      0.90      0.88       155

[102   0   0  10   1   0   5   0  37]
svc Accuracy:  0.9032258064516129
svc F1:  0.6782941255804097
             precision    recall  f1-score   support

          0       0.78      1.00      0.88       102
          1       0.00      0.00      0.00        11
          2       1.00      0.60      0.75        42

avg / total       0.79      0.82      0.78       155

[102   0   0  11   0   0  17   0  25]
LR Accuracy:  0.8193548387096774
LR F1:  0.5418596671813347
For name:  m_kang
total sample size before apply threshold:  131
Counter({'0000-0003-1595-1717': 38, '0000-0003-3245-144X': 19, '0000-0002-2039-4866': 18, '0000-0002-4778-8240': 13, '0000-0002-1530-7254': 12, '0000-0003-2140-4234': 10, '0000-0002-8795-2973': 8, '000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.68      1.00      0.81        38
          1       1.00      0.77      0.87        13
          2       1.00      0.83      0.91        12
          3       0.00      0.00      0.00        10
          4       1.00      0.83      0.91        18
          5       0.95      0.95      0.95        19

avg / total       0.79      0.83      0.79       110

[38  0  0  0  0  0  2 10  0  0  0  1  2  0 10  0  0  0 10  0  0  0  0  0
  3  0  0  0 15  0  1  0  0  0  0 18]
LR Accuracy:  0.8272727272727273
LR F1:  0.7406043491539377
For name:  z_shi
total sample size before apply threshold:  180
Counter({'0000-0002-3099-3299': 94, '0000-0002-9624-4960': 25, '0000-0003-2388-6695': 22, '0000-0001-5357-1171': 13, '0000-0002-3928-2960': 12, '0000-0002-3865-0098': 9, '0000-0001-9922-3957': 2, '0000-0002-7798-1121': 1, '0000-0002-8328-0305': 1, '0000-0002-5828-1904': 1})
['0000-0002-3099-3299', '0000-0001-5357-1171', '0000-0002-3928-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.75      1.00      0.86       174
          1       1.00      0.05      0.09        21
          2       1.00      0.06      0.11        17
          3       1.00      0.71      0.83        76

avg / total       0.85      0.80      0.75       288

[174   0   0   0  20   1   0   0  16   0   1   0  22   0   0  54]
MNB Accuracy:  0.7986111111111112
MNB F1:  0.47248307248307253
             precision    recall  f1-score   support

          0       0.85      0.99      0.92       174
          1       1.00      0.52      0.69        21
          2       0.92      0.65      0.76        17
          3       0.98      0.79      0.88        76

avg / total       0.90      0.89      0.88       288

[173   0   1   0   9  11   0   1   6   0  11   0  16   0   0  60]
svc Accuracy:  0.8854166666666666
svc F1:  0.809344253439553
             precision    recall  f1-score   support

          0       0.75      1.00      0.86      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(88, 170)
(0, 0)
(0, 0)
1
88
             precision    recall  f1-score   support

          0       0.92      1.00      0.96        60
          1       1.00      0.82      0.90        28

avg / total       0.95      0.94      0.94        88

[60  0  5 23]
MNB Accuracy:  0.9431818181818182
MNB F1:  0.9309803921568628
             precision    recall  f1-score   support

          0       0.92      1.00      0.96        60
          1       1.00      0.82      0.90        28

avg / total       0.95      0.94      0.94        88

[60  0  5 23]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 1.0
MNB F1:  1.0
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        19
          1       1.00      1.00      1.00        14

avg / total       1.00      1.00      1.00        33

[19  0  0 14]
svc Accuracy:  1.0
svc F1:  1.0
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        19
          1       1.00      1.00      1.00        14

avg / total       1.00      1.00      1.00        33

[19  0  0 14]
LR Accuracy:  1.0
LR F1:  1.0
For name:  m_hsieh
total sample size before apply threshold:  35
Counter({'0000-0003-3636-6250': 15, '0000-0001-5254-1341': 10, '0000-0002-3396-8427': 5, '0000-0002-7833-847X': 4, '0000-0002-3706-6615': 1})
['0000-0003-3636-6250', '0000-0001-5254-1341']
Total sample size after apply threshold:  25
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=Tru

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.96      1.00      0.98        22
          1       1.00      0.93      0.96        14

avg / total       0.97      0.97      0.97        36

[22  0  1 13]
MNB Accuracy:  0.9722222222222222
MNB F1:  0.9703703703703703
             precision    recall  f1-score   support

          0       0.96      1.00      0.98        22
          1       1.00      0.93      0.96        14

avg / total       0.97      0.97      0.97        36

[22  0  1 13]
svc Accuracy:  0.9722222222222222
svc F1:  0.9703703703703703
             precision    recall  f1-score   support

          0       0.88      1.00      0.94        22
          1       1.00      0.79      0.88        14

avg / total       0.93      0.92      0.91        36

[22  0  3 11]
LR Accuracy:  0.9166666666666666
LR F1:  0.9080851063829787
For name:  a_mccarthy
total sample size before apply threshold:  88
Counter({'0000-0001-7195-6366': 56, '0000-0002-8979-2926': 26

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(82, 335)
(0, 0)
(0, 0)
1
82
             precision    recall  f1-score   support

          0       0.96      0.88      0.92        26
          1       0.95      0.98      0.96        56

avg / total       0.95      0.95      0.95        82

[23  3  1 55]
MNB Accuracy:  0.9512195121951219
MNB F1:  0.9424561403508771
             precision    recall  f1-score   support

          0       1.00      0.73      0.84        26
          1       0.89      1.00      0.94        56

avg / total       0.92      0.91      0.91        82

[19  7  0 56]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.86      0.92        14
          1       0.25      0.24      0.24        17
          2       1.00      0.62      0.77        16
          3       0.83      0.67      0.74        15
          4       1.00      0.64      0.78        11
          5       0.62      0.83      0.71        29
          6       0.68      0.79      0.73        38

avg / total       0.73      0.69      0.69       140

[12  0  0  0  0  2  0  0  4  0  0  0  6  7  0  2 10  0  0  2  2  0  3  0
 10  0  0  2  0  0  0  0  7  3  1  0  3  0  0  0 24  2  0  4  0  2  0  2
 30]
svc Accuracy:  0.6928571428571428
svc F1:  0.6986914461806857
             precision    recall  f1-score   support

          0       1.00      0.86      0.92        14
          1       0.80      0.24      0.36        17
          2       1.00      0.62      0.77        16
          3       0.92      0.73      0.81        15
          4       1.00      0.55      0.7

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



['0000-0002-7078-5937', '0000-0001-6677-7961', '0000-0002-8320-8725', '0000-0002-5247-1678', '0000-0002-2245-4893', '0000-0002-4638-0788', '0000-0002-6535-6169', '0000-0002-7968-0162', '0000-0003-4439-8818', '0000-0003-1590-0995', '0000-0002-4423-6045', '0000-0003-0802-3832', '0000-0002-7135-723X', '0000-0002-2253-3698', '0000-0002-2885-1670', '0000-0001-8404-3806', '0000-0003-0180-4142', '0000-0002-1282-4897', '0000-0001-9636-990X', '0000-0001-5293-5930', '0000-0003-4150-1111', '0000-0001-5304-3459', '0000-0002-4300-4349', '0000-0001-6222-5641', '0000-0001-8181-1080', '0000-0002-8417-2488', '0000-0002-6388-9674', '0000-0003-1112-4255', '0000-0001-5198-3674', '0000-0002-1862-3121', '0000-0002-3961-2691', '0000-0001-8327-3108', '0000-0003-1278-7114', '0000-0002-5880-8649']
Total sample size after apply threshold:  788
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.30      0.41      0.35        41
          1       0.92      0.79      0.85        28
          2       0.00      0.00      0.00        14
          3       0.74      0.89      0.81        38
          4       0.51      0.86      0.64        42
          5       0.00      0.00      0.00        17
          6       1.00      0.29      0.44        21
          7       0.96      0.73      0.83        37
          8       1.00      0.08      0.15        12
          9       0.00      0.00      0.00        22
         10       0.42      0.68      0.52        50
         11       1.00      0.33      0.50        27
         12       1.00      0.06      0.11        18
         13       1.00      0.25      0.40        16
         14       1.00      0.17      0.29        18
         15       0.00      0.00      0.00        11
         16       1.00      0.13      0.24        15
         17       1.00      0.46      0.63   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(133, 265)
(0, 0)
(0, 0)
1
133
             precision    recall  f1-score   support

          0       1.00      0.33      0.50        12
          1       1.00      0.90      0.95        40
          2       1.00      0.81      0.89        36
          3       0.70      1.00      0.83        45

avg / total       0.90      0.86      0.85       133

[ 4  0  0  8  0 36  0  4  0  0 29  7  0  0  0 45]
MNB Accuracy:  0.8571428571428571
MNB F1:  0.7913410466887049
             precision    recall  f1-score   support

          0       1.00      0.50      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.59      0.75        32
          1       0.76      0.99      0.86       100
          2       0.00      0.00      0.00        13
          3       0.80      0.74      0.76        53

avg / total       0.76      0.79      0.76       198

[19 10  0  3  0 99  0  1  0  7  0  6  0 14  0 39]
MNB Accuracy:  0.7929292929292929
MNB F1:  0.5926683716965047
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        32
          1       1.00      0.96      0.98       100
          2       1.00      0.38      0.56        13
          3       0.78      1.00      0.88        53

avg / total       0.94      0.92      0.92       198

[29  0  0  3  0 96  0  4  0  0  5  8  0  0  0 53]
svc Accuracy:  0.9242424242424242
svc F1:  0.8405000305681591
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        32
          1       0.77     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



svc Accuracy:  0.9357798165137615
svc F1:  0.9339794064203513
             precision    recall  f1-score   support

          0       0.88      1.00      0.94        61
          1       1.00      0.83      0.91        48

avg / total       0.94      0.93      0.93       109

[61  0  8 40]
LR Accuracy:  0.926605504587156
LR F1:  0.9237762237762238
For name:  j_dias
total sample size before apply threshold:  31
Counter({'0000-0002-7613-6241': 9, '0000-0002-1150-4357': 9, '0000-0003-3732-7122': 5, '0000-0003-2517-7905': 3, '0000-0002-0966-0537': 3, '0000-0003-4732-7230': 1, '0000-0002-6271-6501': 1})
[]
Total sample size after apply threshold:  0
For name:  p_nunes
total sample size before apply threshold:  36
Counter({'0000-0002-4598-685X': 19, '0000-0003-4740-8268': 12, '0000-0002-4641-8846': 4, '0000-0003-1693-1267': 1})
['0000-0002-4598-685X', '0000-0003-4740-8268']
Total sample size after apply threshold:  31
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


a_das
total sample size before apply threshold:  74
Counter({'0000-0002-0883-1816': 14, '0000-0002-7033-1441': 10, '0000-0003-0740-8140': 8, '0000-0001-5924-4235': 6, '0000-0001-7383-9606': 5, '0000-0002-5196-9589': 5, '0000-0002-7510-1805': 5, '0000-0003-1801-7487': 4, '0000-0002-1733-626X': 3, '0000-0003-0616-9715': 3, '0000-0002-7473-6139': 2, '0000-0003-4305-6007': 2, '0000-0002-2101-9056': 2, '0000-0003-0921-8877': 2, '0000-0001-5884-0852': 1, '0000-0002-0445-0012': 1, '0000-0002-0141-0963': 1})
['0000-0002-7033-1441', '0000-0002-0883-1816']
Total sample size after apply threshold:  24
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=Non

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(95, 389)
(0, 0)
(0, 0)
1
95
             precision    recall  f1-score   support

          0       1.00      0.41      0.58        17
          1       1.00      0.09      0.17        11
          2       0.64      0.97      0.77        40
          3       0.96      0.93      0.94        27

avg / total       0.84      0.76      0.72        95

[ 7  0 10  0  0  1 10  0  0  0 39  1  0  0  2 25]
MNB Accuracy:  0.7578947368421053
MNB F1:  0.6164183635344667


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.82      0.90        17
          1       1.00      0.73      0.84        11
          2       0.83      0.97      0.90        40
          3       0.96      0.93      0.94        27

avg / total       0.92      0.91      0.90        95

[14  0  3  0  0  8  3  0  0  0 39  1  0  0  2 25]
svc Accuracy:  0.9052631578947369
svc F1:  0.8963197550406332
             precision    recall  f1-score   support

          0       1.00      0.41      0.58        17
          1       1.00      0.27      0.43        11
          2       0.62      0.97      0.76        40
          3       0.95      0.78      0.86        27

avg / total       0.83      0.74      0.72        95

[ 7  0 10  0  0  3  8  0  0  0 39  1  0  0  6 21]
LR Accuracy:  0.7368421052631579
LR F1:  0.6565822931114194
For name:  k_zhu
total sample size before apply threshold:  6
Counter({'0000-0001-7664-7204': 3, '0000-0003-4361-1138': 1, '0000-0003-27

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.69      0.98      0.81        54
          1       0.84      0.87      0.85        30
          2       0.69      0.36      0.47        25
          3       1.00      0.57      0.73        14
          4       0.90      0.56      0.69        16

avg / total       0.78      0.76      0.74       139

[53  0  1  0  0  4 26  0  0  0 12  3  9  0  1  3  1  2  8  0  5  1  1  0
  9]
svc Accuracy:  0.7553956834532374
svc F1:  0.7109767903687378
             precision    recall  f1-score   support

          0       0.51      0.98      0.67        54
          1       0.82      0.60      0.69        30
          2       1.00      0.24      0.39        25
          3       1.00      0.43      0.60        14
          4       1.00      0.06      0.12        16

avg / total       0.77      0.60      0.55       139

[53  1  0  0  0 12 18  0  0  0 17  2  6  0  0  8  0  0  6  0 14  1  0  0
  1]
LR Accuracy:  0.60431654676259
LR 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(33, 175)
(0, 0)
(0, 0)
1
33
             precision    recall  f1-score   support

          0       0.83      0.86      0.84        22
          1       0.70      0.64      0.67        11

avg / total       0.78      0.79      0.79        33

[19  3  4  7]
MNB Accuracy:  0.7878787878787878
MNB F1:  0.7555555555555555
             precision    recall  f1-score   support

          0       0.92      1.00      0.96        22
          1       1.00      0.82      0.90        11

avg / total       0.94      0.94      0.94        33

[22  0  2  9]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(119, 6019)
(0, 0)
(0, 0)
1
119
             precision    recall  f1-score   support

          0       1.00      0.68      0.81        22
          1       0.74      0.97      0.84        33
          2       1.00      0.93      0.97        15
          3       0.98      0.94      0.96        49

avg / total       0.92      0.90      0.90       119

[15  7  0  0  0 32  0  1  0  1 14  0  0  3  0 46]
MNB Accuracy:  0.8991596638655462
MNB F1:  0.8941916621703374
             precision    recall  f1-score   support

          0       0.95      0.86    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.86      0.93        22
          1       0.88      0.98      0.93        52
          2       0.94      0.81      0.87        21
          3       1.00      1.00      1.00        21

avg / total       0.94      0.93      0.93       116

[19  3  0  0  0 51  1  0  0  4 17  0  0  0  0 21]
svc Accuracy:  0.9310344827586207
svc F1:  0.9314742168400705
             precision    recall  f1-score   support

          0       1.00      0.41      0.58        22
          1       0.62      0.98      0.76        52
          2       1.00      0.29      0.44        21
          3       0.95      0.86      0.90        21

avg / total       0.82      0.72      0.69       116

[ 9 13  0  0  0 51  0  1  0 15  6  0  0  3  0 18]
LR Accuracy:  0.7241379310344828
LR F1:  0.6715709088963784
For name:  a_bhattacharyya
total sample size before apply threshold:  14
Counter({'0000-0002-1646-709X': 8, '0000-0002-5948-3364': 3, '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.94      0.49      0.64        35
          1       0.65      0.98      0.78        63
          2       1.00      0.77      0.87        13
          3       1.00      0.27      0.43        11
          4       1.00      0.83      0.91        30

avg / total       0.84      0.77      0.76       152

[17 18  0  0  0  1 62  0  0  0  0  3 10  0  0  0  8  0  3  0  0  5  0  0
 25]
svc Accuracy:  0.7697368421052632
svc F1:  0.7257222405704769
             precision    recall  f1-score   support

          0       1.00      0.46      0.63        35
          1       0.59      1.00      0.74        63
          2       1.00      0.46      0.63        13
          3       0.00      0.00      0.00        11
          4       1.00      0.77      0.87        30

avg / total       0.76      0.71      0.68       152

[16 19  0  0  0  0 63  0  0  0  0  7  6  0  0  0 11  0  0  0  0  7  0  0
 23]
LR Accuracy:  0.7105263157894737
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(46, 234)
(0, 0)
(0, 0)
1
46
             precision    recall  f1-score   support

          0       0.97      0.93      0.95        30
          1       0.88      0.94      0.91        16

avg / total       0.94      0.93      0.94        46

[28  2  1 15]
MNB Accuracy:  0.9347826086956522
MNB F1:  0.9291217257318953
             precision    recall  f1-score   support

          0       0.97      1.00      0.98        30
          1       1.00      0.94      0.97        16

avg / total       0.98      0.98      0.98        46

[30  0  1 15]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.70      0.82        10
          1       0.80      1.00      0.89        12

avg / total       0.89      0.86      0.86        22

[ 7  3  0 12]
LR Accuracy:  0.8636363636363636
LR F1:  0.8562091503267975
For name:  k_shimizu
total sample size before apply threshold:  103
Counter({'0000-0002-0229-6541': 44, '0000-0003-2454-1795': 37, '0000-0003-1574-5526': 10, '0000-0001-8261-8098': 8, '0000-0002-2796-8666': 4})
['0000-0002-0229-6541', '0000-0003-1574-5526', '0000-0003-2454-1795']
Total sample size after apply threshold:  91
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.90      1.00      0.95        61
          1       1.00      0.96      0.98        57
          2       1.00      0.69      0.81        16

avg / total       0.95      0.95      0.95       134

[61  0  0  2 55  0  5  0 11]
svc Accuracy:  0.9477611940298507
svc F1:  0.914231368688733
             precision    recall  f1-score   support

          0       0.78      1.00      0.88        61
          1       1.00      0.93      0.96        57
          2       1.00      0.19      0.32        16

avg / total       0.90      0.87      0.85       134

[61  0  0  4 53  0 13  0  3]
LR Accuracy:  0.8731343283582089
LR F1:  0.7190412263490643
For name:  a_sinclair
total sample size before apply threshold:  109
Counter({'0000-0003-2741-7992': 64, '0000-0001-8510-8691': 31, '0000-0002-2628-1686': 9, '0000-0002-5602-5958': 5})
['0000-0003-2741-7992', '0000-0001-8510-8691']
Total sample size after apply threshold:  95
TfidfVec

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.79      1.00      0.88        64
          1       1.00      0.45      0.62        31

avg / total       0.86      0.82      0.80        95

[64  0 17 14]
LR Accuracy:  0.8210526315789474
LR F1:  0.7524904214559387
For name:  y_pan
total sample size before apply threshold:  46
Counter({'0000-0001-7709-0508': 15, '0000-0002-6311-2945': 14, '0000-0002-8587-6065': 7, '0000-0002-5547-0849': 3, '0000-0002-1173-1074': 2, '0000-0001-5133-1342': 1, '0000-0002-3945-6377': 1, '0000-0002-0090-1285': 1, '0000-0002-6894-7271': 1, '0000-0002-9195-3776': 1})
['0000-0002-6311-2945', '0000-0001-7709-0508']
Total sample size after apply threshold:  29
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Total sample size after apply threshold:  137
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(137, 112)
(0, 0)
(0, 0)
1
137
             precision    recall  f1-score   support

          0       0.82      0.96      0.89        83
          1       0.92      0.92      0.92        38
          2       1.00      0.12      0.22        16

avg / total       0.87      0.85      0.82       137

[80  3  0  3 35  0 14  0  2]
MNB Accuracy:  0.8540145985401459
MNB F1:  0.6773879142300195
             precision    recall  f1-score   support

          0       0.96      0.99      0.98        83
          1  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(261, 987)
(0, 0)
(0, 0)
1
261
             precision    recall  f1-score   support

          0       0.96      0.56      0.71        39
          1       0.84      1.00      0.91       202
          2       0.00      0.00      0.00        20

avg / total       0.80      0.85      0.81       261

[ 22  17   0   1 201   0   0  20   0]
MNB Accuracy:  0.8544061302681992
MNB F1:  0.5411045943304008


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.67      0.80        39
          1       0.89      1.00      0.94       202
          2       1.00      0.35      0.52        20

avg / total       0.91      0.90      0.89       261

[ 26  13   0   0 202   0   0  13   7]
svc Accuracy:  0.9003831417624522
svc F1:  0.7526844674131495
             precision    recall  f1-score   support

          0       1.00      0.23      0.38        39
          1       0.80      1.00      0.89       202
          2       0.00      0.00      0.00        20

avg / total       0.77      0.81      0.74       261

[  9  30   0   0 202   0   0  20   0]
LR Accuracy:  0.8084291187739464
LR F1:  0.4216226138032306
For name:  s_may
total sample size before apply threshold:  115
Counter({'0000-0003-1813-7745': 59, '0000-0001-5282-3250': 47, '0000-0002-7228-8440': 7, '0000-0001-6762-7500': 2})
['0000-0003-1813-7745', '0000-0001-5282-3250']
Total sample size after apply threshold

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.86      0.83      0.84        59
          1       0.80      0.83      0.81        47

avg / total       0.83      0.83      0.83       106

[49 10  8 39]
svc Accuracy:  0.8301886792452831
svc F1:  0.8286637931034483
             precision    recall  f1-score   support

          0       0.75      0.98      0.85        59
          1       0.97      0.60      0.74        47

avg / total       0.85      0.81      0.80       106

[58  1 19 28]
LR Accuracy:  0.8113207547169812
LR F1:  0.7948916408668731
For name:  z_cai
total sample size before apply threshold:  244
Counter({'0000-0002-8724-7684': 200, '0000-0002-8937-4943': 27, '0000-0002-9180-675X': 11, '0000-0003-2884-1429': 6})
['0000-0002-9180-675X', '0000-0002-8724-7684', '0000-0002-8937-4943']
Total sample size after apply threshold:  238
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.55      0.71        11
          1       0.97      1.00      0.98       200
          2       1.00      0.93      0.96        27

avg / total       0.97      0.97      0.97       238

[  6   5   0   0 200   0   0   2  25]
svc Accuracy:  0.9705882352941176
svc F1:  0.8834072657602068
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       0.89      1.00      0.94       200
          2       1.00      0.48      0.65        27

avg / total       0.86      0.89      0.86       238

[  0  11   0   0 200   0   0  14  13]
LR Accuracy:  0.8949579831932774
LR F1:  0.5303921568627451
For name:  a_pereira
total sample size before apply threshold:  205
Counter({'0000-0003-1378-4273': 47, '0000-0001-9980-441X': 19, '0000-0002-3478-4718': 15, '0000-0002-1053-8715': 14, '0000-0002-7392-2255': 9, '0000-0001-9430-9399': 7, '0000-0002-8587-262X': 7, '00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.71      0.83        14
          1       0.89      1.00      0.94        47
          2       1.00      0.95      0.97        19
          3       0.93      0.87      0.90        15

avg / total       0.93      0.93      0.92        95

[10  4  0  0  0 47  0  0  0  0 18  1  0  2  0 13]
svc Accuracy:  0.9263157894736842
svc F1:  0.9107145076110593
             precision    recall  f1-score   support

          0       1.00      0.43      0.60        14
          1       0.70      1.00      0.82        47
          2       1.00      0.95      0.97        19
          3       1.00      0.27      0.42        15

avg / total       0.85      0.79      0.76        95

[ 6  8  0  0  0 47  0  0  0  1 18  0  0 11  0  4]
LR Accuracy:  0.7894736842105263
LR F1:  0.704646752015173
For name:  d_patel
total sample size before apply threshold:  33
Counter({'0000-0002-1154-3444': 9, '0000-0002-5744-568X': 8, '0000-0002-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.91      1.00      0.96        64
          1       1.00      1.00      1.00        29
          2       1.00      0.84      0.91        37

avg / total       0.96      0.95      0.95       130

[64  0  0  0 29  0  6  0 31]
svc Accuracy:  0.9538461538461539
svc F1:  0.9556628621597895
             precision    recall  f1-score   support

          0       0.71      1.00      0.83        64
          1       1.00      0.59      0.74        29
          2       1.00      0.62      0.77        37

avg / total       0.86      0.80      0.79       130

[64  0  0 12 17  0 14  0 23]
LR Accuracy:  0.8
LR F1:  0.7789886442060355
For name:  c_cao
total sample size before apply threshold:  74
Counter({'0000-0003-2139-1648': 25, '0000-0003-2830-4383': 20, '0000-0001-8621-8403': 19, '0000-0002-0320-1110': 5, '0000-0002-3407-7837': 4, '0000-0001-6909-5739': 1})
['0000-0003-2830-4383', '0000-0003-2139-1648', '0000-0001-8621-8403

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(335, 1160)
(0, 0)
(0, 0)
1
335
             precision    recall  f1-score   support

          0       0.47      1.00      0.64        85
          1       0.00      0.00      0.00        11
          2       1.00      0.86      0.92        49
          3       0.97      0.68      0.80        44
          4       1.00      0.29      0.44        28
          5       0.00      0.00      0.00        13
          6       1.00      0.48      0.65        33
          7       0.91      0.83      0.87        60
          8       1.00      0.33      0.50    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.54      0.99      0.70        85
          1       1.00      0.36      0.53        11
          2       1.00      0.90      0.95        49
          3       1.00      0.73      0.84        44
          4       1.00      0.57      0.73        28
          5       1.00      0.23      0.38        13
          6       1.00      0.73      0.84        33
          7       0.94      0.83      0.88        60
          8       1.00      0.33      0.50        12

avg / total       0.87      0.78      0.78       335

[84  0  0  0  0  0  0  1  0  6  4  0  0  0  0  0  1  0  5  0 44  0  0  0
  0  0  0 12  0  0 32  0  0  0  0  0 12  0  0  0 16  0  0  0  0  9  0  0
  0  0  3  0  1  0  9  0  0  0  0  0 24  0  0 10  0  0  0  0  0  0 50  0
  8  0  0  0  0  0  0  0  4]
svc Accuracy:  0.7791044776119403
svc F1:  0.7056676553637805
             precision    recall  f1-score   support

          0       0.47      1.00      0.64        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.96      0.98      0.97        54
          1       0.97      0.94      0.96        36

avg / total       0.97      0.97      0.97        90

[53  1  2 34]
MNB Accuracy:  0.9666666666666667
MNB F1:  0.9651117715467115
             precision    recall  f1-score   support

          0       0.95      1.00      0.97        54
          1       1.00      0.92      0.96        36

avg / total       0.97      0.97      0.97        90

[54  0  3 33]
svc Accuracy:  0.9666666666666667
svc F1:  0.9647473560517039
             precision    recall  f1-score   support

          0       0.90      1.00      0.95        54
          1       1.00      0.83      0.91        36

avg / total       0.94      0.93      0.93        90

[54  0  6 30]
LR Accuracy:  0.9333333333333333
LR F1:  0.9282296650717703
For name:  m_jeong
total sample size before apply threshold:  41
Counter({'0000-0002-7019-8089': 34, '0000-0003-0669-1386': 3, '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



(209, 398)
(0, 0)
(0, 0)
1
209
             precision    recall  f1-score   support

          0       1.00      0.66      0.80        59
          1       1.00      0.24      0.38        21
          2       0.78      1.00      0.88       129

avg / total       0.87      0.83      0.80       209

[ 39   0  20   0   5  16   0   0 129]
MNB Accuracy:  0.8277511961722488
MNB F1:  0.6860282574568289
             precision    recall  f1-score   support

          0       1.00      0.69      0.82        59
          1       1.00      0.38      0.55        21
          2       0.81      1.00      0.89       129

avg / total       0.88      0.85      0.84       209

[ 41   0  18   0   8  13   0   0 129]
svc Accuracy:  0.8516746411483254
svc F1:  0.7548192339816251
             precision    recall  f1-score   support

          0       1.00      0.49      0.66        59
          1       0.00      0.00      0.00        21
          2       0.72      1.00      0.83       129

avg / total       

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


62
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(62, 127)
(0, 0)
(0, 0)
1
62
             precision    recall  f1-score   support

          0       0.78      1.00      0.87        38
          1       1.00      0.55      0.71        11
          2       1.00      0.54      0.70        13

avg / total       0.86      0.82      0.81        62

[38  0  0  5  6  0  6  0  7]
MNB Accuracy:  0.8225806451612904
MNB F1:  0.7598151904439937
             precision    recall  f1-score   support

          0       0.79      1.00      0.88        38
          1       1.00      0.55      0.71        11
     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(536, 770)
(0, 0)
(0, 0)
1
536
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        22
          1       0.00      0.00      0.00        17
          2       1.00      0.32      0.48        22
          3       1.00      0.07      0.12        15
          4       0.94      0.47      0.63        34
          5       1.00      0.09      0.16        23
          6       0.00      0.00      0.00        14
          7       0.00      0.00      0.00        11
          8       1.00      0.41      0.58     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.67      0.45      0.54        22
          1       0.71      0.71      0.71        17
          2       1.00      0.50      0.67        22
          3       0.92      0.73      0.81        15
          4       0.81      0.76      0.79        34
          5       0.90      0.39      0.55        23
          6       0.42      0.36      0.38        14
          7       0.80      0.73      0.76        11
          8       1.00      0.78      0.88        27
          9       0.75      0.82      0.78        95
         10       0.91      0.94      0.93       109
         11       0.76      0.83      0.79        23
         12       0.43      0.92      0.58        48
         13       0.67      0.20      0.31        10
         14       0.85      0.41      0.55        27
         15       0.69      0.62      0.65        39

avg / total       0.78      0.74      0.73       536

[ 10   1   0   0   0   0   1   0   0   1   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.95      1.00      0.97        54
          1       1.00      0.85      0.92        20

avg / total       0.96      0.96      0.96        74

[54  0  3 17]
svc Accuracy:  0.9594594594594594
svc F1:  0.9459459459459459
             precision    recall  f1-score   support

          0       0.84      1.00      0.92        54
          1       1.00      0.50      0.67        20

avg / total       0.89      0.86      0.85        74

[54  0 10 10]
LR Accuracy:  0.8648648648648649
LR F1:  0.7909604519774012
For name:  j_walker
total sample size before apply threshold:  253
Counter({'0000-0002-8922-083X': 71, '0000-0002-5349-1689': 70, '0000-0002-2050-1641': 64, '0000-0002-2995-0398': 17, '0000-0002-8683-0026': 15, '0000-0001-6034-7514': 9, '0000-0002-9732-5738': 4, '0000-0001-5151-1693': 1, '0000-0003-1349-2633': 1, '0000-0002-8241-9424': 1})
['0000-0002-2995-0398', '0000-0002-2050-1641', '0000-0002-8922-083X', '0000-00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.71      0.83        17
          1       1.00      0.81      0.90        64
          2       0.74      1.00      0.85        71
          3       1.00      0.87      0.93        15
          4       1.00      0.91      0.96        70

avg / total       0.92      0.89      0.90       237

[12  0  5  0  0  0 52 12  0  0  0  0 71  0  0  0  0  2 13  0  0  0  6  0
 64]
svc Accuracy:  0.8945147679324894
svc F1:  0.8916465282801062
             precision    recall  f1-score   support

          0       1.00      0.47      0.64        17
          1       0.96      0.81      0.88        64
          2       0.69      0.99      0.81        71
          3       1.00      0.67      0.80        15
          4       1.00      0.90      0.95        70

avg / total       0.90      0.86      0.86       237

[ 8  1  8  0  0  0 52 12  0  0  0  1 70  0  0  0  0  5 10  0  0  0  7  0
 63]
LR Accuracy:  0.8565400843881856
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(125, 136)
(0, 0)
(0, 0)
1
125
             precision    recall  f1-score   support

          0       0.72      0.72      0.72        47
          1       1.00      0.67      0.80        12
          2       1.00      0.20      0.33        10
          3       0.74      0.89      0.81        56

avg / total       0.78      0.75      0.74       125

[34  0  0 13  3  8  0  1  4  0  2  4  6  0  0 50]
MNB Accuracy:  0.752
MNB F1:  0.6657973003889271
             precision    recall  f1-score   support

          0       0.76      0.89      0.82        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.92      1.00      0.96        34
          1       1.00      0.82      0.90        17

avg / total       0.95      0.94      0.94        51

[34  0  3 14]
MNB Accuracy:  0.9411764705882353
MNB F1:  0.9304861426624262
             precision    recall  f1-score   support

          0       0.92      1.00      0.96        34
          1       1.00      0.82      0.90        17

avg / total       0.95      0.94      0.94        51

[34  0  3 14]
svc Accuracy:  0.9411764705882353
svc F1:  0.9304861426624262
             precision    recall  f1-score   support

          0       0.79      1.00      0.88        34
          1       1.00      0.47      0.64        17

avg / total       0.86      0.82      0.80        51

[34  0  9  8]
LR Accuracy:  0.8235294117647058
LR F1:  0.7615584415584415
For name:  a_norman
total sample size before apply threshold:  28
Counter({'0000-0002-1282-394X': 16, '0000-0002-4208-2708': 4, '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.50      0.67        10
          1       1.00      0.47      0.64        15
          2       0.58      0.91      0.71        47
          3       0.97      0.95      0.96        66
          4       1.00      0.45      0.62        11
          5       0.68      0.76      0.72        25
          6       0.90      0.64      0.75        14
          7       1.00      1.00      1.00        27
          8       1.00      0.80      0.89        10
          9       0.75      0.62      0.68        24

avg / total       0.85      0.81      0.81       249

[ 5  0  2  0  0  1  1  0  0  1  0  7  7  0  0  0  0  0  0  1  0  0 43  0
  0  4  0  0  0  0  0  0  2 63  0  0  0  0  0  1  0  0  6  0  5  0  0  0
  0  0  0  0  5  0  0 19  0  0  0  1  0  0  3  1  0  1  9  0  0  0  0  0
  0  0  0  0  0 27  0  0  0  0  1  0  0  0  0  0  8  1  0  0  5  1  0  3
  0  0  0 15]
svc Accuracy:  0.8072289156626506
svc F1:  0.7638294368

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.74      1.00      0.85        28
          1       1.00      0.09      0.17        11

avg / total       0.81      0.74      0.66        39

[28  0 10  1]
LR Accuracy:  0.7435897435897436
LR F1:  0.5075757575757576
For name:  d_morgan
total sample size before apply threshold:  86
Counter({'0000-0002-2291-1740': 50, '0000-0002-7410-6591': 27, '0000-0001-8725-9477': 7, '0000-0001-7403-4586': 1, '0000-0002-4911-0046': 1})
['0000-0002-2291-1740', '0000-0002-7410-6591']
Total sample size after apply threshold:  77
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_id

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.31      0.47        13
          1       1.00      0.84      0.91        19
          2       0.91      1.00      0.95       587
          3       1.00      0.80      0.89        10
          4       1.00      0.58      0.74        43
          5       1.00      0.69      0.82        13
          6       1.00      0.68      0.81        19
          7       1.00      0.39      0.56        31

avg / total       0.92      0.92      0.91       735

[  4   0   9   0   0   0   0   0   0  16   3   0   0   0   0   0   0   0
 587   0   0   0   0   0   0   0   2   8   0   0   0   0   0   0  18   0
  25   0   0   0   0   0   4   0   0   9   0   0   0   0   6   0   0   0
  13   0   0   0  19   0   0   0   0  12]
svc Accuracy:  0.9170068027210885
svc F1:  0.768560699578839
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        13
          1       1.00      0.32      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(40, 80)
(0, 0)
(0, 0)
1
40
             precision    recall  f1-score   support

          0       1.00      0.75      0.86        12
          1       0.90      1.00      0.95        28

avg / total       0.93      0.93      0.92        40

[ 9  3  0 28]
MNB Accuracy:  0.925
MNB F1:  0.9031476997578691
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        12
          1       1.00      1.00      1.00        28

avg / total       1.00      1.00      1.00        40

[12  0  0 28]
svc Accuracy:  1.0
sv

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(131, 599)
(0, 0)
(0, 0)
1
131
             precision    recall  f1-score   support

          0       1.00      0.70      0.82        10
          1       0.66      1.00      0.80        59
          2       1.00      0.59      0.74        17
          3       1.00      0.50      0.67        28
          4       0.91      0.59      0.71        17

avg / total       0.84      0.76      0.75       131

[ 7  3  0  0  0  0 59  0  0  0  0  7 10  0  0  0 13  0 14  1  0  7  0  0
 10]
MNB Accuracy:  0.7633587786259542
MNB F1:  0.7485039661510249
           

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.92      0.73      0.81        15
          1       1.00      0.73      0.84        11
          2       1.00      0.70      0.82        10
          3       1.00      0.94      0.97        16
          4       0.89      1.00      0.94        83

avg / total       0.92      0.92      0.91       135

[11  0  0  0  4  1  8  0  0  2  0  0  7  0  3  0  0  0 15  1  0  0  0  0
 83]
svc Accuracy:  0.9185185185185185
svc F1:  0.8782746486806209
             precision    recall  f1-score   support

          0       1.00      0.47      0.64        15
          1       0.00      0.00      0.00        11
          2       1.00      0.20      0.33        10
          3       1.00      0.38      0.55        16
          4       0.69      1.00      0.82        83

avg / total       0.73      0.73      0.66       135

[ 7  0  0  0  8  0  0  0  0 11  0  0  2  0  8  0  0  0  6 10  0  0  0  0
 83]
LR Accuracy:  0.725925925925926
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.30      0.46        10
          1       0.72      1.00      0.84        49
          2       1.00      0.67      0.80        36

avg / total       0.86      0.80      0.78        95

[ 3  7  0  0 49  0  0 12 24]
LR Accuracy:  0.8
LR F1:  0.6997150997150996
For name:  a_cattaneo
total sample size before apply threshold:  196
Counter({'0000-0002-6975-8923': 127, '0000-0002-9963-848X': 31, '0000-0002-2962-7259': 18, '0000-0002-4500-6540': 12, '0000-0001-5685-3684': 8})
['0000-0002-6975-8923', '0000-0002-2962-7259', '0000-0002-4500-6540', '0000-0002-9963-848X']
Total sample size after apply threshold:  188
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.91      1.00      0.95       127
          1       1.00      0.78      0.88        18
          2       1.00      0.58      0.74        12
          3       1.00      0.90      0.95        31

avg / total       0.94      0.94      0.93       188

[127   0   0   0   4  14   0   0   5   0   7   0   3   0   0  28]
svc Accuracy:  0.9361702127659575
svc F1:  0.878970466420288
             precision    recall  f1-score   support

          0       0.74      1.00      0.85       127
          1       1.00      0.11      0.20        18
          2       0.00      0.00      0.00        12
          3       1.00      0.48      0.65        31

avg / total       0.76      0.77      0.70       188

[127   0   0   0  16   2   0   0  12   0   0   0  16   0   0  15]
LR Accuracy:  0.7659574468085106
LR F1:  0.4261307265830172
For name:  a_ferrari
total sample size before apply threshold:  114
Counter({'0000-0001-9536-3995': 49, '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.88      0.94        17
          1       1.00      1.00      1.00        17
          2       1.00      0.67      0.80        18
          3       0.86      1.00      0.92        49

avg / total       0.93      0.92      0.92       101

[15  0  0  2  0 17  0  0  0  0 12  6  0  0  0 49]
svc Accuracy:  0.9207920792079208
svc F1:  0.9155070754716981
             precision    recall  f1-score   support

          0       1.00      0.35      0.52        17
          1       1.00      1.00      1.00        17
          2       1.00      0.56      0.71        18
          3       0.72      1.00      0.84        49

avg / total       0.86      0.81      0.79       101

[ 6  0  0 11  0 17  0  0  0  0 10  8  0  0  0 49]
LR Accuracy:  0.8118811881188119
LR F1:  0.7684079205818337
For name:  a_murphy
total sample size before apply threshold:  178
Counter({'0000-0003-4152-4081': 81, '0000-0002-5222-9902': 61, '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.98      0.99        81
          1       1.00      0.65      0.79        20
          2       0.87      1.00      0.93        61

avg / total       0.95      0.94      0.94       162

[79  0  2  0 13  7  0  0 61]
svc Accuracy:  0.9444444444444444
svc F1:  0.902225499267484
             precision    recall  f1-score   support

          0       0.82      0.99      0.90        81
          1       1.00      0.55      0.71        20
          2       0.96      0.85      0.90        61

avg / total       0.90      0.88      0.88       162

[80  0  1  8 11  1  9  0 52]
LR Accuracy:  0.8827160493827161
LR F1:  0.837633883312059
For name:  f_hong
total sample size before apply threshold:  41
Counter({'0000-0003-1318-2635': 23, '0000-0001-5120-3519': 14, '0000-0003-0060-2063': 2, '0000-0002-4167-6037': 2})
['0000-0003-1318-2635', '0000-0001-5120-3519']
Total sample size after apply threshold:  37
TfidfVectorize

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(133, 389)
(0, 0)
(0, 0)
1
133
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        22
          1       0.87      1.00      0.93        74
          2       1.00      0.58      0.74        12
          3       1.00      0.76      0.86        25

avg / total       0.93      0.92      0.91       133

[22  0  0  0  0 74  0  0  0  5  7  0  0  6  0 19]
MNB Accuracy:  0.9172932330827067
MNB F1:  0.8828240197406036
             precision    recall  f1-score   support

          0       1.00      1.00      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.73      0.85        15
          1       0.96      0.93      0.95        28
          2       0.83      0.36      0.50        14
          3       0.79      0.99      0.88        79
          4       0.88      0.47      0.61        15

avg / total       0.85      0.84      0.82       151

[11  0  0  4  0  0 26  0  2  0  0  1  5  7  1  0  0  1 78  0  0  0  0  8
  7]
svc Accuracy:  0.8410596026490066
svc F1:  0.7553417076328655
             precision    recall  f1-score   support

          0       1.00      0.67      0.80        15
          1       1.00      0.86      0.92        28
          2       0.00      0.00      0.00        14
          3       0.70      1.00      0.82        79
          4       1.00      0.27      0.42        15

avg / total       0.75      0.77      0.72       151

[10  0  0  5  0  0 24  0  4  0  0  0  0 14  0  0  0  0 79  0  0  0  0 11
  4]
LR Accuracy:  0.7748344370860927
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



(21, 309)
(0, 0)
(0, 0)
1
21
             precision    recall  f1-score   support

          0       1.00      0.55      0.71        11
          1       0.67      1.00      0.80        10

avg / total       0.84      0.76      0.75        21

[ 6  5  0 10]
MNB Accuracy:  0.7619047619047619
MNB F1:  0.7529411764705882
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       0.91      1.00      0.95        10

avg / total       0.96      0.95      0.95        21

[10  1  0 10]
svc Accuracy:  0.9523809523809523
svc F1:  0.9523809523809523
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       0.91      1.00      0.95        10

avg / total       0.96      0.95      0.95        21

[10  1  0 10]
LR Accuracy:  0.9523809523809523
LR F1:  0.9523809523809523
For name:  d_kuo
total sample size before apply threshold:  34
Counter({'0000-0002-6461-2562': 17, 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(32, 48)
(0, 0)
(0, 0)
1
32
             precision    recall  f1-score   support

          0       1.00      0.94      0.97        18
          1       0.93      1.00      0.97        14

avg / total       0.97      0.97      0.97        32

[17  1  0 14]
MNB Accuracy:  0.96875
MNB F1:  0.9684729064039409
             precision    recall  f1-score   support

          0       1.00      0.94      0.97        18
          1       0.93      1.00      0.97        14

avg / total       0.97      0.97      0.97        32

[17  1  0 14]
svc Accuracy:  0.9

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(105, 387)
(0, 0)
(0, 0)
1
105
             precision    recall  f1-score   support

          0       0.96      1.00      0.98        75
          1       1.00      0.90      0.95        30

avg / total       0.97      0.97      0.97       105

[75  0  3 27]
MNB Accuracy:  0.9714285714285714
MNB F1:  0.9638802889576883
             precision    recall  f1-score   support

          0       0.99      1.00      0.99        75
          1       1.00      0.97      0.98        30

avg / total       0.99      0.99      0.99       105

[75  0  1 29]
svc A

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.08      0.15        25
          1       0.50      0.03      0.05        38
          2       0.00      0.00      0.00        22
          3       0.97      0.90      0.93       180
          4       0.65      1.00      0.79       265
          5       0.98      0.73      0.84        71
          6       0.00      0.00      0.00        29

avg / total       0.73      0.77      0.70       630

[  2   0   0   2  21   0   0   0   1   0   0  37   0   0   0   0   0   2
  19   1   0   0   0   0 162  18   0   0   0   0   0   0 265   0   0   0
   0   0   0  19  52   0   0   1   0   1  27   0   0]
MNB Accuracy:  0.765079365079365
MNB F1:  0.39434875380732975
             precision    recall  f1-score   support

          0       1.00      0.76      0.86        25
          1       0.94      0.79      0.86        38
          2       0.92      0.50      0.65        22
          3       0.83      0.99      0.90   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.94      1.00      0.97       268
          1       1.00      0.70      0.83        44
          2       1.00      0.60      0.75        10

avg / total       0.95      0.95      0.94       322

[268   0   0  13  31   0   4   0   6]
svc Accuracy:  0.9472049689440993
svc F1:  0.8486417520594736
             precision    recall  f1-score   support

          0       0.87      1.00      0.93       268
          1       1.00      0.34      0.51        44
          2       0.00      0.00      0.00        10

avg / total       0.86      0.88      0.85       322

[268   0   0  29  15   0  10   0   0]
LR Accuracy:  0.8788819875776398
LR F1:  0.48021616310488824
For name:  i_carvalho
total sample size before apply threshold:  39
Counter({'0000-0002-2028-777X': 24, '0000-0002-7882-3555': 4, '0000-0002-7569-2019': 3, '0000-0001-7981-4442': 3, '0000-0001-5823-1520': 3, '0000-0002-1811-0588': 2})
['0000-0002-2028-777X']
Total 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Total sample size after apply threshold:  248
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(248, 342)
(0, 0)
(0, 0)
1
248
             precision    recall  f1-score   support

          0       0.97      1.00      0.98       212
          1       1.00      0.81      0.89        36

avg / total       0.97      0.97      0.97       248

[212   0   7  29]
MNB Accuracy:  0.9717741935483871
MNB F1:  0.938033196501874
             precision    recall  f1-score   support

          0       0.96      1.00      0.98       212
          1       1.00      0.78      0.88        36

avg / total       0.97  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      1.00      0.97        48
          1       1.00      0.96      0.98        23
          2       1.00      1.00      1.00        25
          3       1.00      0.88      0.94        17

avg / total       0.98      0.97      0.97       113

[48  0  0  0  1 22  0  0  0  0 25  0  2  0  0 15]
svc Accuracy:  0.9734513274336283
svc F1:  0.9712436868686869
             precision    recall  f1-score   support

          0       0.86      1.00      0.92        48
          1       1.00      0.96      0.98        23
          2       1.00      0.96      0.98        25
          3       1.00      0.65      0.79        17

avg / total       0.94      0.93      0.93       113

[48  0  0  0  1 22  0  0  1  0 24  0  6  0  0 11]
LR Accuracy:  0.9292035398230089
LR F1:  0.91654020582592
For name:  h_kang
total sample size before apply threshold:  47
Counter({'0000-0003-3431-0827': 25, '0000-0001-9671-0944': 6, '0000-0001-8

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(86, 146)
(0, 0)
(0, 0)
1
86
             precision    recall  f1-score   support

          0       1.00      0.24      0.38        17
          1       0.00      0.00      0.00        10
          2       0.58      1.00      0.73        44
          3       1.00      0.40      0.57        15

avg / total       0.67      0.63      0.55        86

[ 4  0 13  0  0  0 10  0  0  0 44  0  0  0  9  6]
MNB Accuracy:  0.627906976744186
MNB F1:  0.4214285714285715
             precision    recall  f1-score   support

          0       0.91      0.59      0.7

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.87      1.00      0.93        46
          1       0.98      1.00      0.99        57
          2       1.00      0.73      0.85        30

avg / total       0.95      0.94      0.94       133

[46  0  0  0 57  0  7  1 22]
MNB Accuracy:  0.9398496240601504
MNB F1:  0.9222503744242875
             precision    recall  f1-score   support

          0       0.94      1.00      0.97        46
          1       0.97      1.00      0.98        57
          2       1.00      0.83      0.91        30

avg / total       0.96      0.96      0.96       133

[46  0  0  0 57  0  3  2 25]
svc Accuracy:  0.9624060150375939
svc F1:  0.9534235274707145
             precision    recall  f1-score   support

          0       0.88      1.00      0.94        46
          1       0.95      1.00      0.97        57
          2       1.00      0.70      0.82        30

avg / total       0.94      0.93      0.93       133

[46  0  0  0 5

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



98
             precision    recall  f1-score   support

          0       0.81      1.00      0.90        43
          1       1.00      0.70      0.82        20
          2       1.00      0.88      0.93        16
          3       1.00      0.89      0.94        19

avg / total       0.92      0.90      0.90        98

[43  0  0  0  6 14  0  0  2  0 14  0  2  0  0 17]
MNB Accuracy:  0.8979591836734694
MNB F1:  0.8992851307189542
             precision    recall  f1-score   support

          0       0.84      1.00      0.91        43
          1       1.00      0.70      0.82        20
          2       1.00      0.88      0.93        16
          3       1.00      1.00      1.00        19

avg / total       0.93      0.92      0.92        98

[43  0  0  0  6 14  0  0  2  0 14  0  0  0  0 19]
svc Accuracy:  0.9183673469387755
svc F1:  0.917939090529829
             precision    recall  f1-score   support

          0       0.64      1.00      0.78        43
          1       1.00  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.95      1.00      0.97       110
          1       1.00      0.65      0.79        17

avg / total       0.96      0.95      0.95       127

[110   0   6  11]
MNB Accuracy:  0.952755905511811
MNB F1:  0.879582806573957
             precision    recall  f1-score   support

          0       0.96      1.00      0.98       110
          1       1.00      0.76      0.87        17

avg / total       0.97      0.97      0.97       127

[110   0   4  13]
svc Accuracy:  0.968503937007874
svc F1:  0.924404761904762
             precision    recall  f1-score   support

          0       0.89      1.00      0.94       110
          1       1.00      0.24      0.38        17

avg / total       0.91      0.90      0.87       127

[110   0  13   4]
LR Accuracy:  0.8976377952755905
LR F1:  0.662579194768036
For name:  b_white
total sample size before apply threshold:  47
Counter({'0000-0002-4293-6128': 29, '0000-0002-0684-5210'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(26, 99)
(0, 0)
(0, 0)
1
26
             precision    recall  f1-score   support

          0       0.90      0.82      0.86        11
          1       0.88      0.93      0.90        15

avg / total       0.89      0.88      0.88        26

[ 9  2  1 14]
MNB Accuracy:  0.8846153846153846
MNB F1:  0.8801843317972351
             precision    recall  f1-score   support

          0       1.00      0.82      0.90        11
          1       0.88      1.00      0.94        15

avg / total       0.93      0.92      0.92        26

[ 9  2  0 15]
svc Accu

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.00      0.00      0.00        29
          2       1.00      0.33      0.50        27
          3       0.00      0.00      0.00        16
          4       0.00      0.00      0.00        10
          5       1.00      0.21      0.35        56
          6       0.00      0.00      0.00        14
          7       0.00      0.00      0.00        12
          8       0.00      0.00      0.00        17
          9       1.00      0.25      0.40        24
         10       0.45      0.65      0.53       124
         11       0.00      0.00      0.00        11
         12       1.00      0.06      0.12        31
         13       0.00      0.00      0.00        32
         14       0.68      0.41      0.51        73
         15       0.00      0.00      0.00        32
         16       0.00      0.00      0.00        18
         17       1.00      0.13      0.23   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.00      0.00      0.00        19
          2       0.00      0.00      0.00        13
          3       1.00      0.47      0.64        30
          4       1.00      0.44      0.62        36
          5       0.62      1.00      0.77       194
          6       1.00      0.18      0.30        34
          7       0.00      0.00      0.00        10

avg / total       0.63      0.66      0.58       348

[  0   0   0   0   0  12   0   0   0   0   0   0   0  19   0   0   0   0
   0   0   0  13   0   0   0   0   0  14   0  16   0   0   0   0   0   0
  16  20   0   0   0   0   0   0   0 194   0   0   0   0   0   0   0  28
   6   0   0   0   0   0   0  10   0   0]
MNB Accuracy:  0.6609195402298851
MNB F1:  0.289818333840073
             precision    recall  f1-score   support

          0       1.00      0.75      0.86        12
          1       1.00      0.47      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        26
          1       0.91      1.00      0.95        81
          2       1.00      0.27      0.43        11

avg / total       0.94      0.93      0.91       118

[26  0  0  0 81  0  0  8  3]
svc Accuracy:  0.9322033898305084
svc F1:  0.7938375350140056
             precision    recall  f1-score   support

          0       1.00      0.65      0.79        26
          1       0.80      1.00      0.89        81
          2       0.00      0.00      0.00        11

avg / total       0.77      0.83      0.79       118

[17  9  0  0 81  0  0 11  0]
LR Accuracy:  0.8305084745762712
LR F1:  0.5602691881761649
For name:  a_reynolds
total sample size before apply threshold:  40
Counter({'0000-0002-0836-746X': 23, '0000-0001-9534-8699': 7, '0000-0002-6768-5716': 5, '0000-0002-9919-4161': 3, '0000-0003-0554-8107': 1, '0000-0002-6364-6250': 1})
['0000-0002-0836-746X']
Total sample size after a

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.63      0.77        19
          1       0.68      1.00      0.81        36
          2       0.00      0.00      0.00        10

avg / total       0.67      0.74      0.67        65

[12  7  0  0 36  0  0 10  0]
LR Accuracy:  0.7384615384615385
LR F1:  0.5277274374773469
For name:  a_baranov
total sample size before apply threshold:  42
Counter({'0000-0002-9976-8532': 20, '0000-0002-9112-0838': 14, '0000-0003-3987-8112': 7, '0000-0001-8810-9972': 1})
['0000-0002-9112-0838', '0000-0002-9976-8532']
Total sample size after apply threshold:  34
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern=

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(41, 117)
(0, 0)
(0, 0)
1
41
             precision    recall  f1-score   support

          0       1.00      0.77      0.87        13
          1       0.90      1.00      0.95        28

avg / total       0.93      0.93      0.92        41

[10  3  0 28]
MNB Accuracy:  0.926829268292683
MNB F1:  0.9093588798820929
             precision    recall  f1-score   support

          0       1.00      0.69      0.82        13
          1       0.88      1.00      0.93        28

avg / total       0.91      0.90      0.90        41

[ 9  4  0 28]
svc Accu

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        19
          1       0.62      0.99      0.77       119
          2       0.93      0.83      0.88        48
          3       1.00      0.12      0.21        17
          4       1.00      0.06      0.11        17
          5       0.50      0.04      0.08        23
          6       0.67      0.86      0.75        85
          7       0.90      0.53      0.67        36
          8       1.00      0.89      0.94        28

avg / total       0.72      0.71      0.65       392

[  0  18   1   0   0   0   0   0   0   0 118   1   0   0   0   0   0   0
   0   7  40   0   0   1   0   0   0   0   4   0   2   0   0  11   0   0
   0   4   0   0   1   0  11   1   0   0  21   1   0   0   1   0   0   0
   0  11   0   0   0   0  73   1   0   0   3   0   0   0   0  14  19   0
   0   3   0   0   0   0   0   0  25]
MNB Accuracy:  0.7117346938775511
MNB F1:  0.4899591427694022
             precision

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.45      0.62        11
          1       1.00      0.69      0.82        13
          2       0.77      1.00      0.87        33

avg / total       0.87      0.82      0.81        57

[ 5  0  6  0  9  4  0  0 33]
svc Accuracy:  0.8245614035087719
svc F1:  0.7705342902711324
             precision    recall  f1-score   support

          0       1.00      0.09      0.17        11
          1       1.00      0.15      0.27        13
          2       0.61      1.00      0.76        33

avg / total       0.77      0.63      0.53        57

[ 1  0 10  0  2 11  0  0 33]
LR Accuracy:  0.631578947368421
LR F1:  0.3973180076628353
For name:  h_lu
total sample size before apply threshold:  108
Counter({'0000-0003-1720-6526': 20, '0000-0002-8340-2739': 19, '0000-0003-2180-3091': 17, '0000-0003-4025-3160': 9, '0000-0001-9732-0833': 6, '0000-0002-1440-9902': 6, '0000-0002-3940-3283': 5, '0000-0002-0017-4276': 5, '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.90      0.95        10
          1       0.94      1.00      0.97       252
          2       1.00      0.77      0.87        26
          3       1.00      0.50      0.67        16
          4       1.00      0.91      0.95        11

avg / total       0.95      0.95      0.94       315

[  9   1   0   0   0   0 252   0   0   0   0   6  20   0   0   0   8   0
   8   0   0   1   0   0  10]
svc Accuracy:  0.9492063492063492
svc F1:  0.8810424053444648
             precision    recall  f1-score   support

          0       1.00      0.10      0.18        10
          1       0.87      1.00      0.93       252
          2       1.00      0.62      0.76        26
          3       1.00      0.06      0.12        16
          4       1.00      0.55      0.71        11

avg / total       0.89      0.88      0.84       315

[  1   9   0   0   0   0 252   0   0   0   0  10  16   0   0   0  15   0
   1   0   0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(191, 459)
(0, 0)
(0, 0)
1
191
             precision    recall  f1-score   support

          0       0.96      0.96      0.96        25
          1       0.79      1.00      0.88        55
          2       1.00      0.77      0.87        26
          3       0.87      0.90      0.88        29
          4       1.00      0.71      0.83        14
          5       1.00      0.09      0.17        11
          6       0.89      1.00      0.94        31

avg / total       0.89      0.87      0.85       191

[24  0  0  0  0  0  1  0 55  0  0  0  0  0  0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



(729, 2302)
(0, 0)
(0, 0)
1
729
             precision    recall  f1-score   support

          0       0.21      0.99      0.35        91
          1       0.00      0.00      0.00        14
          2       1.00      0.30      0.47        23
          3       1.00      0.12      0.21        25
          4       0.86      0.24      0.38        25
          5       0.00      0.00      0.00        19
          6       0.71      0.33      0.45        30
          7       0.00      0.00      0.00        11
          8       0.00      0.00      0.00        11
          9       1.00      0.80      0.89        15
         10       0.00      0.00      0.00        10
         11       1.00      0.58      0.73        31
         12       1.00      0.62      0.77        24
         13       1.00      0.12      0.21        26
         14       1.00      0.33      0.50        18
         15       0.93      0.47      0.62        30
         16       0.00      0.00      0.00        10
         17 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.93      0.97        60
          1       1.00      0.72      0.84        18
          2       0.91      1.00      0.95        97
          3       1.00      0.94      0.97        18

avg / total       0.95      0.95      0.95       193

[56  0  4  0  0 13  5  0  0  0 97  0  0  0  1 17]
svc Accuracy:  0.9481865284974094
svc F1:  0.9316589705960248
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        60
          1       0.00      0.00      0.00        18
          2       0.76      1.00      0.86        97
          3       1.00      0.61      0.76        18

avg / total       0.79      0.84      0.80       193

[54  0  6  0  0  0 18  0  0  0 97  0  0  0  7 11]
LR Accuracy:  0.8393782383419689
LR F1:  0.6420528332325066
For name:  m_aguilar
total sample size before apply threshold:  108
Counter({'0000-0002-1935-6619': 59, '0000-0001-7395-5754': 18, '0000

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.84      1.00      0.91        59
          1       1.00      0.50      0.67        18
          2       1.00      0.86      0.92        14

avg / total       0.90      0.88      0.87        91

[59  0  0  9  9  0  2  0 12]
LR Accuracy:  0.8791208791208791
LR F1:  0.8348240906380441
For name:  a_bianchi
total sample size before apply threshold:  73
Counter({'0000-0002-1082-3911': 38, '0000-0001-6583-1671': 23, '0000-0001-9340-6971': 8, '0000-0002-4571-0511': 2, '0000-0003-4925-5269': 2})
['0000-0001-6583-1671', '0000-0002-1082-3911']
Total sample size after apply threshold:  61
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=Tr

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(191, 732)
(0, 0)
(0, 0)
1
191
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        36
          1       0.93      1.00      0.96       116
          2       1.00      0.85      0.92        39

avg / total       0.96      0.95      0.95       191

[ 33   3   0   0 116   0   0   6  33]
MNB Accuracy:  0.9528795811518325
MNB F1:  0.9452813358189509
             precision    recall  f1-score   support

          0       1.00      0.97      0.99        36
          1       0.95      1.00      0.97       1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.60      0.75        10
          1       0.91      0.62      0.74        16
          2       0.83      0.71      0.77        35
          3       0.77      0.79      0.78        43
          4       1.00      0.55      0.71        11
          5       0.87      0.88      0.87        97
          6       0.69      0.62      0.65        39
          7       1.00      0.58      0.74        12
          8       0.87      0.94      0.90       115
          9       0.91      0.88      0.90        73
         10       0.65      0.92      0.76        50
         11       0.96      0.92      0.94        26

avg / total       0.84      0.83      0.83       527

[  6   0   1   0   0   0   1   0   1   0   0   1   0  10   0   1   0   1
   2   0   0   0   2   0   0   0  25   1   0   6   0   0   1   2   0   0
   0   0   0  34   0   0   2   0   0   0   7   0   0   0   0   0   6   1
   1   0   0   0   3   0   0   1   1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



171
             precision    recall  f1-score   support

          0       1.00      0.84      0.92        58
          1       0.93      1.00      0.96       113

avg / total       0.95      0.95      0.95       171

[ 49   9   0 113]
MNB Accuracy:  0.9473684210526315
MNB F1:  0.9387949890634322
             precision    recall  f1-score   support

          0       1.00      0.86      0.93        58
          1       0.93      1.00      0.97       113

avg / total       0.96      0.95      0.95       171

[ 50   8   0 113]
svc Accuracy:  0.9532163742690059
svc F1:  0.9458689458689459
             precision    recall  f1-score   support

          0       1.00      0.57      0.73        58
          1       0.82      1.00      0.90       113

avg / total       0.88      0.85      0.84       171

[ 33  25   0 113]
LR Accuracy:  0.8538011695906432
LR F1:  0.8128365658246136
For name:  l_wang
total sample size before apply threshold:  828
Counter({'0000-0001-9783-4383': 98, '0000-0003-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.31      0.48        16
          1       0.00      0.00      0.00        27
          2       0.00      0.00      0.00        19
          3       0.00      0.00      0.00        15
          4       1.00      0.71      0.83        17
          5       1.00      0.59      0.74        22
          6       1.00      0.35      0.52        17
          7       0.56      0.12      0.20        40
          8       1.00      0.74      0.85        53
          9       0.97      0.94      0.95        64
         10       1.00      0.20      0.33        25
         11       0.96      0.77      0.86        31
         12       0.59      0.96      0.73        56
         13       0.36      0.95      0.52        98
         14       0.00      0.00      0.00        17
         15       0.00      0.00      0.00        14
         16       1.00      0.23      0.38        30
         17       0.59      0.89      0.71   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(33, 60)
(0, 0)
(0, 0)
1
33
             precision    recall  f1-score   support

          0       1.00      0.85      0.92        13
          1       0.91      1.00      0.95        20

avg / total       0.94      0.94      0.94        33

[11  2  0 20]
MNB Accuracy:  0.9393939393939394
MNB F1:  0.9345238095238095
             precision    recall  f1-score   support

          0       1.00      0.69      0.82        13
          1       0.83      1.00      0.91        20

avg / total       0.90      0.88      0.87        33

[ 9  4  0 20]
svc Accu

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      1.00      0.97        15
          1       1.00      0.90      0.95        10

avg / total       0.96      0.96      0.96        25

[15  0  1  9]
LR Accuracy:  0.96
LR F1:  0.9575551782682513
For name:  n_hall
total sample size before apply threshold:  115
Counter({'0000-0003-2808-0009': 102, '0000-0003-0100-0291': 5, '0000-0003-1503-5989': 4, '0000-0001-7465-5470': 2, '0000-0001-7082-1523': 1, '0000-0002-0216-512X': 1})
['0000-0003-2808-0009']
Total sample size after apply threshold:  102
For name:  d_schneider
total sample size before apply threshold:  93
Counter({'0000-0002-2124-8385': 40, '0000-0001-9659-6731': 33, '0000-0002-2867-2613': 12, '0000-0002-0163-6137': 5, '0000-0002-5276-3304': 3})
['0000-0002-2867-2613', '0000-0001-9659-6731', '0000-0002-2124-8385']
Total sample size after apply threshold:  85
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'num

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       0.92      1.00      0.96        12

avg / total       0.96      0.96      0.96        23

[10  1  0 12]
MNB Accuracy:  0.9565217391304348
MNB F1:  0.9561904761904763
             precision    recall  f1-score   support

          0       1.00      0.82      0.90        11
          1       0.86      1.00      0.92        12

avg / total       0.93      0.91      0.91        23

[ 9  2  0 12]
svc Accuracy:  0.9130434782608695
svc F1:  0.9115384615384615
             precision    recall  f1-score   support

          0       1.00      0.73      0.84        11
          1       0.80      1.00      0.89        12

avg / total       0.90      0.87      0.87        23

[ 8  3  0 12]
LR Accuracy:  0.8695652173913043
LR F1:  0.8654970760233919
For name:  j_qiu
total sample size before apply threshold:  58
Counter({'0000-0002-1541-9627': 41, '0000-0002-7633-6227': 8, '000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        14
          1       0.00      0.00      0.00        11
          2       1.00      0.44      0.61        39
          3       0.00      0.00      0.00        21
          4       0.00      0.00      0.00        11
          5       0.90      0.64      0.75        58
          6       0.69      1.00      0.82       222

avg / total       0.65      0.73      0.66       376

[  0   0   0   0   0   0  14   0   0   0   0   0   0  11   0   0  17   0
   0   3  19   0   0   0   0   0   0  21   0   0   0   0   0   0  11   0
   0   0   0   0  37  21   0   0   0   0   0   1 221]
MNB Accuracy:  0.7313829787234043
MNB F1:  0.3104480175908747
             precision    recall  f1-score   support

          0       1.00      0.79      0.88        14
          1       1.00      0.45      0.62        11
          2       1.00      0.77      0.87        39
          3       0.91      0.48      0.62   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.93      1.00      0.96       267
          1       1.00      0.17      0.29        24

avg / total       0.94      0.93      0.91       291

[267   0  20   4]
MNB Accuracy:  0.9312714776632303
MNB F1:  0.6248066013408973
             precision    recall  f1-score   support

          0       0.96      1.00      0.98       267
          1       1.00      0.50      0.67        24

avg / total       0.96      0.96      0.95       291

[267   0  12  12]
svc Accuracy:  0.9587628865979382
svc F1:  0.8223443223443223
             precision    recall  f1-score   support

          0       0.92      1.00      0.96       267
          1       1.00      0.08      0.15        24

avg / total       0.93      0.92      0.89       291

[267   0  22   2]
LR Accuracy:  0.9243986254295533
LR F1:  0.5571389042612065
For name:  m_hartmann
total sample size before apply threshold:  88
Counter({'0000-0001-8069-5284': 28, '0000-0001-69

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.84      0.91        25
          1       0.76      1.00      0.86        28
          2       1.00      0.90      0.95        21
          3       1.00      0.70      0.82        10

avg / total       0.92      0.89      0.89        84

[21  4  0  0  0 28  0  0  0  2 19  0  0  3  0  7]
svc Accuracy:  0.8928571428571429
svc F1:  0.8870278378910093
             precision    recall  f1-score   support

          0       1.00      0.84      0.91        25
          1       0.67      1.00      0.80        28
          2       1.00      0.71      0.83        21
          3       1.00      0.60      0.75        10

avg / total       0.89      0.83      0.84        84

[21  4  0  0  0 28  0  0  0  6 15  0  0  4  0  6]
LR Accuracy:  0.8333333333333334
LR F1:  0.8240942028985507
For name:  k_nielsen
total sample size before apply threshold:  194
Counter({'0000-0002-5848-0911': 89, '0000-0002-7217-2114': 59, '0000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.83      0.45      0.59        11
          1       1.00      0.58      0.74        12
          2       1.00      0.75      0.86        20
          3       0.78      0.99      0.87        89
          4       1.00      0.85      0.92        59

avg / total       0.89      0.86      0.86       191

[ 5  0  0  6  0  0  7  0  5  0  0  0 15  5  0  1  0  0 88  0  0  0  0  9
 50]
svc Accuracy:  0.8638743455497382
svc F1:  0.7941877155794167
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       1.00      0.33      0.50        12
          2       1.00      0.35      0.52        20
          3       0.65      0.99      0.79        89
          4       0.98      0.75      0.85        59

avg / total       0.77      0.75      0.71       191

[ 0  0  0 11  0  0  4  0  8  0  0  0  7 13  0  0  0  0 88  1  0  0  0 15
 44]
LR Accuracy:  0.7486910994764397
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.91      1.00      0.96       117
          1       1.00      0.82      0.90        28
          2       1.00      0.79      0.88        28

avg / total       0.94      0.94      0.93       173

[117   0   0   5  23   0   6   0  22]
svc Accuracy:  0.9364161849710982
svc F1:  0.9123542750433508
             precision    recall  f1-score   support

          0       0.82      1.00      0.90       117
          1       1.00      0.54      0.70        28
          2       1.00      0.54      0.70        28

avg / total       0.88      0.85      0.83       173

[117   0   0  13  15   0  13   0  15]
LR Accuracy:  0.8497109826589595
LR F1:  0.7651162790697675
For name:  a_coelho
total sample size before apply threshold:  128
Counter({'0000-0002-6143-4203': 72, '0000-0003-2780-5821': 15, '0000-0002-7196-4179': 11, '0000-0002-3286-0262': 11, '0000-0003-3077-3859': 6, '0000-0002-7277-2267': 5, '0000-0002-2883-415X': 4, '000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.40      0.57        15
          1       1.00      0.64      0.78        11
          2       0.84      1.00      0.91        72
          3       1.00      0.91      0.95        11

avg / total       0.89      0.87      0.86       109

[ 6  0  9  0  0  7  4  0  0  0 72  0  0  0  1 10]
svc Accuracy:  0.8715596330275229
svc F1:  0.8032449266626482
             precision    recall  f1-score   support

          0       1.00      0.07      0.12        15
          1       1.00      0.09      0.17        11
          2       0.71      1.00      0.83        72
          3       1.00      0.55      0.71        11

avg / total       0.81      0.73      0.66       109

[ 1  0 14  0  0  1 10  0  0  0 72  0  0  0  5  6]
LR Accuracy:  0.7339449541284404
LR F1:  0.45747974045109374
For name:  r_sanz
total sample size before apply threshold:  42
Counter({'0000-0003-2830-0892': 30, '0000-0001-6626-4146': 8, '0000-000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(269, 526)
(0, 0)
(0, 0)
1
269
             precision    recall  f1-score   support

          0       1.00      0.84      0.91        44
          1       1.00      0.95      0.98        21
          2       0.00      0.00      0.00        13
          3       1.00      0.27      0.43        11
          4       0.91      0.86      0.89        72
          5       0.76      0.99      0.86       108

avg / total       0.83      0.85      0.83       269

[ 37   0   0   0   0   7   0  20   0   0   0   1   0   0   0   0   0  13
   0   0   0   3   5   3 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.89      0.94        44
          1       1.00      0.90      0.95        21
          2       1.00      0.62      0.76        13
          3       1.00      0.73      0.84        11
          4       0.97      0.88      0.92        72
          5       0.83      1.00      0.91       108

avg / total       0.92      0.91      0.91       269

[ 39   0   0   0   0   5   0  19   0   0   0   2   0   0   8   0   0   5
   0   0   0   8   2   1   0   0   0   0  63   9   0   0   0   0   0 108]
svc Accuracy:  0.9107806691449815
svc F1:  0.8868400192690666
             precision    recall  f1-score   support

          0       1.00      0.82      0.90        44
          1       1.00      0.81      0.89        21
          2       1.00      0.23      0.38        13
          3       1.00      0.36      0.53        11
          4       0.98      0.79      0.88        72
          5       0.71      0.99      0.83   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.90      0.95        20
          1       0.97      1.00      0.98        61

avg / total       0.98      0.98      0.97        81

[18  2  0 61]
svc Accuracy:  0.9753086419753086
svc F1:  0.9656196943972835
             precision    recall  f1-score   support

          0       1.00      0.35      0.52        20
          1       0.82      1.00      0.90        61

avg / total       0.87      0.84      0.81        81

[ 7 13  0 61]
LR Accuracy:  0.8395061728395061
LR F1:  0.711111111111111
For name:  y_wu
total sample size before apply threshold:  612
Counter({'0000-0002-3611-0258': 120, '0000-0002-1720-7863': 65, '0000-0003-3456-3373': 43, '0000-0002-2573-8736': 39, '0000-0002-1751-461X': 35, '0000-0001-5579-2197': 33, '0000-0002-2985-219X': 23, '0000-0002-8621-4098': 23, '0000-0003-0365-5590': 23, '0000-0001-9359-1863': 23, '0000-0003-0253-1625': 17, '0000-0001-9142-456X': 14, '0000-0002-0833-1205': 1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.50      0.67        14
          1       0.89      0.24      0.38        33
          2       0.38      1.00      0.55       120
          3       1.00      0.36      0.53        11
          4       0.00      0.00      0.00        10
          5       1.00      0.09      0.16        23
          6       0.00      0.00      0.00        12
          7       0.00      0.00      0.00        23
          8       0.00      0.00      0.00        10
          9       1.00      0.14      0.24        43
         10       0.89      0.89      0.89        35
         11       0.66      0.85      0.74        65
         12       1.00      0.26      0.41        23
         13       0.00      0.00      0.00        23
         14       1.00      0.64      0.78        39
         15       1.00      0.29      0.45        17

avg / total       0.64      0.54      0.47       501

[  7   0   7   0   0   0   0   0   0   0   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      1.00      0.94        31
          1       1.00      0.76      0.87        17

avg / total       0.93      0.92      0.91        48

[31  0  4 13]
svc Accuracy:  0.9166666666666666
svc F1:  0.903030303030303
             precision    recall  f1-score   support

          0       0.74      1.00      0.85        31
          1       1.00      0.35      0.52        17

avg / total       0.83      0.77      0.73        48

[31  0 11  6]
LR Accuracy:  0.7708333333333334
LR F1:  0.6855270994639666
For name:  d_sharma
total sample size before apply threshold:  60
Counter({'0000-0001-7612-3486': 32, '0000-0002-0082-1285': 16, '0000-0003-4463-1480': 4, '0000-0001-7379-4233': 4, '0000-0001-5818-025X': 2, '0000-0002-2971-5013': 1, '0000-0001-5557-9388': 1})
['0000-0001-7612-3486', '0000-0002-0082-1285']
Total sample size after apply threshold:  48
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(217, 629)
(0, 0)
(0, 0)
1
217
             precision    recall  f1-score   support

          0       0.97      0.77      0.86        48
          1       0.52      0.97      0.67        61
          2       0.00      0.00      0.00        10
          3       0.94      0.83      0.88        36
          4       1.00      0.51      0.68        35
          5       1.00      0.56      0.71        27

avg / total       0.80      0.73      0.72       217

[37 11  0  0  0  0  1 59  0  1  0  0  0 10  0  0  0  0  0  6  0 30  0  0
  0 16  0  1 18  0  0 12 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.91      0.83      0.87        48
          1       0.58      0.95      0.72        61
          2       1.00      0.40      0.57        10
          3       0.97      0.83      0.90        36
          4       1.00      0.57      0.73        35
          5       1.00      0.67      0.80        27

avg / total       0.86      0.78      0.79       217

[40  8  0  0  0  0  3 58  0  0  0  0  0  6  4  0  0  0  0  6  0 30  0  0
  1 13  0  1 20  0  0  9  0  0  0 18]
svc Accuracy:  0.783410138248848
svc F1:  0.764047633093707
             precision    recall  f1-score   support

          0       0.84      0.79      0.82        48
          1       0.51      0.93      0.66        61
          2       0.00      0.00      0.00        10
          3       0.97      0.83      0.90        36
          4       1.00      0.49      0.65        35
          5       1.00      0.48      0.65        27

avg / total       0.78      0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       1.00      0.42      0.59        12
          2       0.66      1.00      0.80        37

avg / total       0.60      0.69      0.60        61

[ 0  0 12  0  5  7  0  0 37]
LR Accuracy:  0.6885245901639344
LR F1:  0.4613114062829433
For name:  h_tsai
total sample size before apply threshold:  93
Counter({'0000-0002-9393-7155': 36, '0000-0003-1310-9980': 15, '0000-0002-4070-0058': 14, '0000-0002-9661-5848': 8, '0000-0002-7395-1603': 7, '0000-0003-2097-0170': 6, '0000-0003-1174-5473': 1, '0000-0001-8242-4939': 1, '0000-0001-6444-8814': 1, '0000-0002-4480-0240': 1, '0000-0001-8972-7174': 1, '0000-0003-3467-0507': 1, '0000-0003-3840-7853': 1})
['0000-0002-4070-0058', '0000-0003-1310-9980', '0000-0002-9393-7155']
Total sample size after apply threshold:  65
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encodi

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.80      1.00      0.89        43
          1       1.00      0.45      0.62        20

avg / total       0.86      0.83      0.80        63

[43  0 11  9]
LR Accuracy:  0.8253968253968254
LR F1:  0.7536437966583718
For name:  c_peng
total sample size before apply threshold:  103
Counter({'0000-0003-3666-9833': 79, '0000-0003-3332-184X': 10, '0000-0001-7943-9873': 7, '0000-0001-6090-2944': 5, '0000-0002-0800-1417': 2})
['0000-0003-3666-9833', '0000-0003-3332-184X']
Total sample size after apply threshold:  89
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


52
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(52, 186)
(0, 0)
(0, 0)
1
52
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       0.79      1.00      0.89        31
          2       1.00      0.30      0.46        10

avg / total       0.88      0.85      0.82        52

[10  1  0  0 31  0  0  7  3]
MNB Accuracy:  0.8461538461538461
MNB F1:  0.7665445665445666
             precision    recall  f1-score   support

          0       1.00      0.82      0.90        11
          1       0.84      1.00      0.91        31
     

In [22]:
# accuracy: number of retained scores per classifier, then the mean of each
from statistics import mean

# Keep only the float entries of each result list; anything else is dropped
# before averaging.
cleaned_mnb_accuracy = list(filter(lambda score: isinstance(score, float), all_mnb_accuracy))
cleaned_svcLinear_accuracy = list(filter(lambda score: isinstance(score, float), all_svcLinear_accuracy))
cleaned_lr_accuracy = list(filter(lambda score: isinstance(score, float), all_LR_accuracy))

# Report order is fixed: MNB, linear SVC, LR.
_accuracy_lists = (cleaned_mnb_accuracy, cleaned_svcLinear_accuracy, cleaned_lr_accuracy)
# All three counts are printed first ...
for _scores in _accuracy_lists:
    print(len(_scores))
# ... followed by the three means, in the same order.
for _scores in _accuracy_lists:
    print(mean(_scores))

784
784
784
0.8431078927279118
0.9095801418648304
0.8294931099749167


In [23]:
# f1: number of retained scores per classifier, then the mean of each
from statistics import mean

# Remove strings (and any other non-float entries) from the result lists
# so that only numeric F1 scores are averaged.
cleaned_mnb_f1 = list(filter(lambda score: isinstance(score, float), all_mnb_f1))
cleaned_svcLinear_f1 = list(filter(lambda score: isinstance(score, float), all_svcLinear_f1))
cleaned_lr_f1 = list(filter(lambda score: isinstance(score, float), all_LR_f1))

# Report order is fixed: MNB, linear SVC, LR.
_f1_lists = (cleaned_mnb_f1, cleaned_svcLinear_f1, cleaned_lr_f1)
# All three counts are printed first ...
for _scores in _f1_lists:
    print(len(_scores))
# ... followed by the three means, in the same order.
for _scores in _f1_lists:
    print(mean(_scores))

784
784
784
0.7449507529794926
0.8770878958001022
0.7213154764960485


In [None]:
# IPython line magic: clear the interactive namespace once the analysis is done.
%reset

In [None]:
# IPython line magic: list the names currently defined in the interactive namespace.
%who