In [1]:
import com_func

# Name of the dataset folder; used to build both the input directory
# ("../Data/<Dataset>/canopies_labeled/") and the result directory.
Dataset = "pubmed"

# parameters
# Minimum number of papers an author must have in a name group to be kept as a class.
threshold = 10
# Passed as `min_df` to the text vectorizers (terms in fewer documents are dropped).
cutoff = 3

# Embedding type per feature source; the vectorizer helpers below accept
# "tf", "tf_idf" or "off" (raw text additionally accepts "lsa").
coauthor_emb_type = "tf_idf"
venue_emb_type = "off"
year_emb_type = "off"
# NOTE(review): the two settings below are not referenced by the code visible
# in this notebook section - presumably used by other experiments; confirm.
pp_textual_emb_type = "off"
citation_emb_type = "off"

In [16]:
def dummy(doc):
    """Identity function: hand back ``doc`` untouched.

    Passed as ``tokenizer``/``preprocessor`` to the scikit-learn vectorizers
    in this notebook so that already-tokenized documents are used as-is.
    """
    return doc
def read_labeled_file(infile):
    """Read one tab-separated labeled file into a DataFrame.

    Only lines with exactly 12 or 13 tab-separated fields are kept; for any
    other line the field count is printed and the line is skipped.

    Parameters
    ----------
    infile : str
        Path to the UTF-8 encoded, tab-separated file.

    Returns
    -------
    pandas.DataFrame
        One row per accepted line with the string columns ``paperID``
        (field 0), ``authorID`` (field 1), ``co-author`` (field 5),
        ``venue_id`` (field 7) and ``publish_year`` (field 10). The columns
        are always present, even when no line was accepted, so callers can
        index them without a ``KeyError``.
    """
    columns = ["paperID", "authorID", "co-author", "venue_id", "publish_year"]
    LabeledRecords_original = []
    # The context manager closes the file; no explicit close() is needed.
    with open(infile, 'r', encoding = 'utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # get rid of badly formatted lines
            if len(read_data) in (12, 13):
                paper_detail = {"paperID": read_data[0], "authorID": read_data[1],
                                "co-author": read_data[5], "venue_id": read_data[7],
                                "publish_year": read_data[10]}
                LabeledRecords_original.append(paper_detail)
            else:
                print(len(read_data))
    # Explicit columns keep the schema stable for empty input.
    return pd.DataFrame(LabeledRecords_original, columns=columns)

In [3]:
def LSA(cleaned_token, dim=100):
    """Latent semantic analysis: TF-IDF followed by truncated SVD.

    Parameters
    ----------
    cleaned_token : iterable
        Pre-tokenized documents; they are passed to the vectorizer unchanged
        because ``dummy`` is used as both tokenizer and preprocessor.
    dim : int, default 100
        Requested number of SVD components. It is reduced to
        ``n_features - 1`` whenever the TF-IDF vocabulary has ``dim`` or
        fewer terms, so ``n_components`` is always strictly smaller than the
        number of features.

    Returns
    -------
    numpy.ndarray
        Matrix of shape ``(n_documents, n_components)``.

    Raises
    ------
    ValueError
        If the TF-IDF vocabulary has fewer than two terms, in which case no
        valid number of components exists.
    """
    # Tf-idf Transformation (terms must occur in at least `cutoff` documents)
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True, tokenizer=dummy,
                                       preprocessor=dummy, stop_words=None, min_df=cutoff)
    tfidfMatrix = tfidf_vectorizer.fit_transform(cleaned_token).toarray()
    n_features = tfidfMatrix.shape[1]
    if n_features < 2:
        raise ValueError("LSA needs at least 2 tf-idf features, got %d" % n_features)
    # `>=` (not `>`): n_components == n_features is rejected by some
    # scikit-learn versions, so clamp in that case as well.
    if dim >= n_features:
        dim = n_features - 1
    # tf-idf + svd
    svd = TruncatedSVD(n_components=dim)
    final_lsa_Matrix = svd.fit_transform(tfidfMatrix)
    # Report how much variance the retained components explain.
    print(svd.explained_variance_ratio_.sum())
    return final_lsa_Matrix

In [4]:
# co-author relation to frequence count
def co_author_to_vector(raw_co_author_data, emb_type="off"):
    """Turn raw co-author strings into a document-term feature matrix.

    ``emb_type`` selects the representation: ``"tf"`` (raw counts),
    ``"tf_idf"`` (sub-linear, smoothed tf-idf) or ``"off"``. Any other value
    prints a notice and is treated as ``"off"``.

    Returns a dense array for ``"tf"``/``"tf_idf"`` and an empty
    ``pandas.DataFrame`` when the embedding is off.
    """
    # Unknown settings fall back to the default ("off").
    if emb_type not in ("tf", "tf_idf", "off"):
        print("Embedding type not available, selecting default setting")
        emb_type = "off"

    if emb_type == "off":
        return pd.DataFrame()

    if emb_type == "tf":
        vectorizer = CountVectorizer()
    else:
        vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True,
                                     stop_words = None)
    print(vectorizer)
    return vectorizer.fit_transform(raw_co_author_data).toarray()

In [5]:
# venue relation with author
def venue_to_vector(raw_venue_id, emb_type="off"):
    """Turn raw venue ids into a document-term feature matrix.

    Parameters
    ----------
    raw_venue_id : iterable of str
        One venue-id string per paper.
    emb_type : {"tf", "tf_idf", "off"}, default "off"
        ``"tf"`` gives raw counts, ``"tf_idf"`` gives sub-linear smoothed
        tf-idf, ``"off"`` disables the feature. Any other value prints a
        notice and is treated as ``"off"``.

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        Dense matrix for ``"tf"``/``"tf_idf"``; an empty DataFrame when off.
    """
    # Unknown settings fall back to the default ("off").
    if emb_type not in ("tf", "tf_idf", "off"):
        print("Embedding type not available, selecting default setting")
        emb_type = "off"

    if emb_type == "off":
        return pd.DataFrame()

    if emb_type == "tf":
        vectorizer = CountVectorizer()
    else:
        vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True,
                                     stop_words = None)
    print(vectorizer)
    # Fit on this function's own argument (the tf-idf branch previously
    # referenced the co-author variable, which is not defined here).
    return vectorizer.fit_transform(raw_venue_id).toarray()

In [6]:
# author-year relation to emb
def year_to_vector(raw_year, emb_type="off"):
    """Turn raw publication-year strings into a document-term feature matrix.

    Parameters
    ----------
    raw_year : iterable of str
        One publication-year string per paper.
    emb_type : {"tf", "tf_idf", "off"}, default "off"
        ``"tf"`` gives raw counts, ``"tf_idf"`` gives sub-linear smoothed
        tf-idf, ``"off"`` disables the feature. Any other value prints a
        notice and is treated as the default, ``"off"`` (matching the
        co-author and venue helpers).

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        Dense matrix for ``"tf"``/``"tf_idf"``; an empty DataFrame when off.
    """
    # Unknown settings fall back to the declared default ("off"), as the
    # printed message says.
    if emb_type not in ("tf", "tf_idf", "off"):
        print("Embedding type not available, selecting default setting")
        emb_type = "off"

    if emb_type == "off":
        return pd.DataFrame()

    if emb_type == "tf":
        count_vectorizer = CountVectorizer()
        return count_vectorizer.fit_transform(raw_year).toarray()

    tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True,
                                       stop_words = None)
    print(tfidf_vectorizer)
    # Fit on this function's own argument (the tf-idf branch previously
    # referenced the co-author variable, which is not defined here).
    return tfidf_vectorizer.fit_transform(raw_year).toarray()

In [7]:
# document relation wrt textual content
# convert raw text to numerical feature vectors
# bow(Bags of words) are used with uni-gram setting
def raw_text_to_vector(raw_textual_content, emb_type="off", stopword=True):
    """Vectorize raw text with a uni-gram bag-of-words representation.

    The text is first cleaned/tokenized by ``com_func.clean_batch_of_raw``
    (always, even when the embedding is off), and the smallest and largest
    per-document sample sizes are printed.

    ``emb_type`` may be ``"tf_idf"``, ``"tf"``, ``"lsa"`` or ``"off"``; any
    other value prints a notice and is treated as ``"off"``.

    Returns a ``(feature_matrix, average_sample_size)`` tuple, where the
    matrix is an empty ``pandas.DataFrame`` when the embedding is off.
    """
    cleaned_token, sample_size = com_func.clean_batch_of_raw(raw_textual_content, stopword=stopword)
    average_sample_size = sum(sample_size)/len(sample_size)
    print("Minimal sample size: ", min(sample_size))
    print("maximal sample size: ", max(sample_size))

    # Unknown settings fall back to the default ("off").
    if emb_type not in ("tf_idf", "tf", "lsa", "off"):
        print("Embedding type not available, selecting default setting")
        emb_type = "off"

    if emb_type == "tf_idf":
        # tf-idf weighted document-term matrix
        tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True, tokenizer=dummy,
                                           preprocessor=dummy, stop_words = None,min_df=cutoff)
        print(tfidf_vectorizer)
        features = tfidf_vectorizer.fit_transform(cleaned_token).toarray()
    elif emb_type == "tf":
        # raw document-term frequency matrix
        count_vectorizer = CountVectorizer(tokenizer=dummy,preprocessor=dummy, min_df=cutoff)
        features = count_vectorizer.fit_transform(cleaned_token).toarray()
    elif emb_type == "lsa":
        # tf-idf followed by truncated SVD
        features = LSA(cleaned_token, dim=100)
    else:
        features = pd.DataFrame()
    return features, average_sample_size

In [8]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score
# cross validation
def k_fold_cv(data, label, clf, k=10):
    """Evaluate ``clf`` with un-shuffled k-fold cross validation.

    The classifier is re-fitted on each training split; predictions from all
    test splits are pooled and scored once. A classification report and the
    flattened confusion matrix of the pooled predictions are printed.

    Parameters
    ----------
    data : array-like supporting integer-array row indexing
        Feature matrix, one row per sample.
    label : array-like
        Class label per sample. Converted to a NumPy array so that fold
        indices are applied positionally even when a pandas Series with a
        non-default index is passed.
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    k : int, default 10
        Number of folds.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1)`` computed over the pooled predictions.
    """
    # Imported locally so the function does not depend on another cell
    # having imported numpy first.
    import numpy as np

    # KFold yields positional indices; Series[...] would look them up by label.
    label = np.asarray(label)
    kf = KFold(n_splits=k, shuffle=False)
    allTrueLabel = []
    allPredLabel = []
    for train_index, test_index in kf.split(data):
        # split train and test
        data_train, data_test = data[train_index], data[test_index]
        label_train, label_test = label[train_index], label[test_index]
        # fit data to clf
        clf.fit(data_train, label_train)
        # get predicted label
        label_pred = clf.predict(data_test)
        allTrueLabel.extend(label_test)
        allPredLabel.extend(label_pred)

    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel,average='macro')

    print(metrics.classification_report(allTrueLabel, allPredLabel))
    print(metrics.confusion_matrix(allTrueLabel, allPredLabel).ravel())

    return accuracy, f1

In [None]:
# load the file
import sys
import io
import os
import collections
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


# Seed for the per-group shuffle so that CV folds (and therefore the reported
# scores) are reproducible across runs.
RANDOM_SEED = 42

fileDir = "../Data/"+Dataset+"/canopies_labeled/"
listfiles = os.listdir(fileDir)
# collect statistic to output
allname = []
num_class = []
per_class_count = []
average_textual_size = []

all_mnb_accuracy = []
all_mnb_f1 = []
all_svcLinear_accuracy = []
all_svcLinear_f1 = []
all_LR_accuracy = []
all_LR_f1 = []


def _mark_group_not_applicable():
    """Record a placeholder in every metric list so all output columns keep one entry per group."""
    for metric_list in (average_textual_size,
                        all_mnb_accuracy, all_mnb_f1,
                        all_svcLinear_accuracy, all_svcLinear_f1,
                        all_LR_accuracy, all_LR_f1):
        metric_list.append("Not applicable")


# read all file in labeled group
for file in listfiles:
    # group name is built from the second and the last "_"-separated parts of the file name
    name_parts = file.split("_")
    name = name_parts[1]+"_"+name_parts[-1]
    print("For name: ",name)
    allname.append(name)
    # read needed content in labeled file
    labeled_data = read_labeled_file(fileDir+file)
    print("total sample size before apply threshold: ",len(labeled_data))
    # count number of paper each author write based on author ID
    paperCounter = collections.Counter(labeled_data["authorID"])
    print(paperCounter)
    # keep only authors with at least `threshold` papers
    for k in list(paperCounter):
        if paperCounter[k] < threshold:
            del paperCounter[k]
    kept_author_ids = list(paperCounter.keys())
    print(kept_author_ids)
    per_class_count.append(paperCounter)
    num_class.append(len(paperCounter))
    # remove samples that are smaller than threshold
    labeled_data = labeled_data[labeled_data.authorID.isin(kept_author_ids)]
    print("Total sample size after apply threshold: ",len(labeled_data))
    # if only have one class or no class pass the threshold, not applicable
    if len(paperCounter) < 2:
        _mark_group_not_applicable()
        continue

    # convert author id to an integer class label (position in kept_author_ids);
    # assign() builds a new frame, so the filtered slice above is not mutated
    author_to_label = {author_id: idx for idx, author_id in enumerate(kept_author_ids)}
    labeled_data = labeled_data.assign(label=labeled_data["authorID"].map(author_to_label))
    # shuffle the data (k_fold_cv itself does not shuffle)
    labeled_data = labeled_data.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
    # extract true label
    label = labeled_data["label"]
    # list of different data field
    part_collection = []
    # data part 1, co-author matrix
    data_part_co_author = co_author_to_vector(labeled_data["co-author"], emb_type=coauthor_emb_type)
    print(data_part_co_author.shape)
    part_collection.append(data_part_co_author)
    # data part 2.1, venue_id that author attend
    data_part_venue = venue_to_vector(labeled_data["venue_id"], emb_type=venue_emb_type)
    print(data_part_venue.shape)
    part_collection.append(data_part_venue)
    # data part 2.2 year that author attend
    data_part_year = year_to_vector(labeled_data["publish_year"], emb_type=year_emb_type)
    print(data_part_year.shape)
    part_collection.append(data_part_year)
    # merge different part of data together by concatenating them
    # remove empty emb (when emb set off)
    part_collection = [part for part in part_collection if len(part)!=0]
    print(len(part_collection))
    if len(part_collection)>1:
        combinedata = np.concatenate(part_collection,axis=1)
    elif len(part_collection)==1:
        if isinstance(part_collection[0], pd.DataFrame):
            combinedata = part_collection[0].values
        else:
            combinedata = part_collection[0]
    else:
        # No feature source is enabled for this group: record placeholders and
        # move on, so the remaining groups are still processed and every
        # result list keeps one entry per group.
        print("No data available")
        _mark_group_not_applicable()
        continue
    print(len(combinedata))
    # using converted feature vector to train classifiers, each scored with 10 fold cv
    classifiers = [
        # Multinomial naive bayes
        ("MNB", MultinomialNB(), all_mnb_accuracy, all_mnb_f1),
        # SVM with linear kernel
        ("svc", SVC(decision_function_shape='ovr', kernel='linear'),
         all_svcLinear_accuracy, all_svcLinear_f1),
        # logistic regression
        ("LR", LogisticRegression(multi_class='ovr'), all_LR_accuracy, all_LR_f1),
    ]
    for tag, clf, accuracy_list, f1_list in classifiers:
        accuracy, macro_f1 = k_fold_cv(combinedata, label, clf, k=10)
        print(tag+" Accuracy: ",accuracy)
        print(tag+" F1: ", macro_f1)
        accuracy_list.append(accuracy)
        f1_list.append(macro_f1)
# write evaluation result to csv
output = pd.DataFrame({'Name Group':allname,"Class number":num_class,"per_class_size":per_class_count, 
                       "svc(linear) accuracy":all_svcLinear_accuracy, "svc(linear) macro f1": all_svcLinear_f1, 
                       "mnb accuracy":all_mnb_accuracy, "mnb macro f1": all_mnb_f1,
                       "logistic regression accuracy":all_LR_accuracy, "logistic regression macro f1": all_LR_f1})

savePath = "../result/"+Dataset+"/coauthor_only/"
os.makedirs(savePath, exist_ok=True)
filename = "2004_coauthor_only_"+coauthor_emb_type+"_threshold="+str(threshold)+".csv"
output.to_csv(savePath+filename, encoding='utf-8',index=False)
print("Done")

For name:  j_read
total sample size before apply threshold:  136
Counter({'0000-0002-5159-1192': 57, '0000-0002-9029-5185': 39, '0000-0002-9697-0962': 31, '0000-0002-4739-9245': 3, '0000-0003-0605-5259': 3, '0000-0003-4316-7006': 1, '0000-0002-0784-0091': 1, '0000-0002-3888-6631': 1})
['0000-0002-9697-0962', '0000-0002-9029-5185', '0000-0002-5159-1192']
Total sample size after apply threshold:  127
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(127, 263)
(0, 0)
(0, 0)
1
127
             precision    recall  f1-score   support

          0       1.00      0.77      0.87        31
          1     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.55      0.71        31
          1       1.00      0.54      0.70        39
          2       0.64      1.00      0.78        57

avg / total       0.84      0.75      0.74       127

[17  0 14  0 21 18  0  0 57]
LR Accuracy:  0.7480314960629921
LR F1:  0.7297184170471841
For name:  f_esteves
total sample size before apply threshold:  34
Counter({'0000-0002-3046-1313': 18, '0000-0002-5403-0091': 12, '0000-0003-0589-0746': 3, '0000-0003-3172-6253': 1})
['0000-0002-5403-0091', '0000-0002-3046-1313']
Total sample size after apply threshold:  30
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern=

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  252
Counter({'0000-0003-4341-1283': 51, '0000-0002-3989-7973': 40, '0000-0002-3813-1706': 39, '0000-0003-2772-9531': 27, '0000-0001-6082-9273': 22, '0000-0002-2601-4422': 22, '0000-0002-9448-8144': 19, '0000-0001-8628-4902': 15, '0000-0002-2936-7717': 6, '0000-0003-3898-9734': 6, '0000-0002-5074-6914': 2, '0000-0003-4266-6700': 1, '0000-0002-9286-9787': 1, '0000-0002-0821-0892': 1})
['0000-0003-4341-1283', '0000-0002-9448-8144', '0000-0003-2772-9531', '0000-0001-6082-9273', '0000-0002-3813-1706', '0000-0001-8628-4902', '0000-0002-3989-7973', '0000-0002-2601-4422']
Total sample size after apply threshold:  235
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.48      1.00      0.65        51
          1       1.00      0.42      0.59        19
          2       1.00      0.70      0.83        27
          3       0.91      0.45      0.61        22
          4       0.86      0.95      0.90        39
          5       1.00      0.33      0.50        15
          6       0.97      0.78      0.86        40
          7       1.00      0.45      0.62        22

avg / total       0.85      0.73      0.73       235

[51  0  0  0  0  0  0  0  9  8  0  1  1  0  0  0  8  0 19  0  0  0  0  0
 12  0  0 10  0  0  0  0  1  0  0  0 37  0  1  0  7  0  0  0  3  5  0  0
  7  0  0  0  2  0 31  0 12  0  0  0  0  0  0 10]
MNB Accuracy:  0.7276595744680852
MNB F1:  0.6948574888661821
             precision    recall  f1-score   support

          0       0.54      1.00      0.70        51
          1       1.00      0.63      0.77        19
          2       1.00      0.67      0.80       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.40      0.57        10
          1       0.90      1.00      0.95        69
          2       1.00      0.91      0.95        22

avg / total       0.93      0.92      0.91       101

[ 4  6  0  0 69  0  0  2 20]
svc Accuracy:  0.9207920792079208
svc F1:  0.8230050010871928
             precision    recall  f1-score   support

          0       1.00      0.30      0.46        10
          1       0.79      1.00      0.88        69
          2       1.00      0.50      0.67        22

avg / total       0.86      0.82      0.80       101

[ 3  7  0  0 69  0  0 11 11]
LR Accuracy:  0.8217821782178217
LR F1:  0.6709401709401709
For name:  a_vega
total sample size before apply threshold:  20
Counter({'0000-0002-8207-9925': 10, '0000-0002-2178-2780': 8, '0000-0002-8148-5702': 1, '0000-0003-1082-0961': 1})
['0000-0002-8207-9925']
Total sample size after apply threshold:  10
For name:  k_smith
total sample size

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.87      0.93        15
          1       1.00      0.93      0.96        29
          2       1.00      0.38      0.56        13
          3       0.80      0.99      0.89       133
          4       1.00      0.64      0.78        14
          5       1.00      0.96      0.98        23
          6       0.98      0.87      0.92        75
          7       1.00      0.79      0.88        19

avg / total       0.92      0.90      0.89       321

[ 13   0   0   2   0   0   0   0   0  27   0   2   0   0   0   0   0   0
   5   8   0   0   0   0   0   0   0 132   0   0   1   0   0   0   0   5
   9   0   0   0   0   0   0   1   0  22   0   0   0   0   0  10   0   0
  65   0   0   0   0   4   0   0   0  15]
svc Accuracy:  0.897196261682243
svc F1:  0.8627533521888557
             precision    recall  f1-score   support

          0       1.00      0.60      0.75        15
          1       1.00      0.93      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


[ 0  9  1  0 40  6  0 13 30]
MNB Accuracy:  0.7070707070707071
MNB F1:  0.49691358024691357
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.75      0.85      0.80        46
          2       0.81      0.88      0.84        43

avg / total       0.70      0.78      0.74        99

[ 0  8  2  0 39  7  0  5 38]
svc Accuracy:  0.7777777777777778
svc F1:  0.5467876039304611
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.71      0.85      0.77        46
          2       0.82      0.84      0.83        43

avg / total       0.68      0.76      0.72        99

[ 0  9  1  0 39  7  0  7 36]
LR Accuracy:  0.7575757575757576
LR F1:  0.5332878115397747
For name:  j_qian
total sample size before apply threshold:  17
Counter({'0000-0002-8793-9330': 6, '0000-0001-6145-045X': 6, '0000-0003-3162-2913': 1, '0000-0002-9522-6445': 1, '0000-0002-1325

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  625
Counter({'0000-0001-5188-7957': 141, '0000-0002-6063-7615': 82, '0000-0001-6665-6596': 79, '0000-0002-4688-3000': 66, '0000-0001-7152-765X': 51, '0000-0001-8251-4176': 28, '0000-0003-1235-5186': 26, '0000-0002-8883-7838': 25, '0000-0001-8331-3181': 20, '0000-0001-8377-5175': 15, '0000-0002-8861-0596': 14, '0000-0002-3804-2594': 14, '0000-0003-3815-0891': 14, '0000-0002-4497-4961': 10, '0000-0002-9801-9580': 9, '0000-0003-4400-5180': 5, '0000-0002-3500-914X': 5, '0000-0002-0195-6771': 4, '0000-0001-6105-0296': 3, '0000-0002-4681-3360': 3, '0000-0003-0161-0532': 3, '0000-0002-6511-1284': 3, '0000-0002-0195-5509': 2, '0000-0003-0500-1961': 2, '0000-0002-5355-3210': 1})
['0000-0003-1235-5186', '0000-0002-8861-0596', '0000-0001-7152-765X', '0000-0001-6665-6596', '0000-0002-4497-4961', '0000-0002-6063-7615', '0000-0001-5188-7957', '0000-0002-3804-2594', '0000-0002-8883-7838', '0000-0001-8251-4176', '0000-0003-3815-0891', '0000-0001-8331-3181', '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.31      0.47        26
          1       1.00      0.86      0.92        14
          2       0.96      0.84      0.90        51
          3       0.99      0.89      0.93        79
          4       0.00      0.00      0.00        10
          5       0.88      0.73      0.80        82
          6       0.44      0.99      0.61       141
          7       0.00      0.00      0.00        14
          8       1.00      0.12      0.21        25
          9       1.00      0.18      0.30        28
         10       0.00      0.00      0.00        14
         11       1.00      0.25      0.40        20
         12       0.98      0.79      0.87        66
         13       0.00      0.00      0.00        15

avg / total       0.75      0.68      0.64       585

[  8   0   0   0   0   1  17   0   0   0   0   0   0   0   0  12   0   0
   0   0   2   0   0   0   0   0   0   0   0   0  43   0   0   0   8   0
   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  1111
Counter({'0000-0002-6929-5359': 211, '0000-0001-9498-284X': 154, '0000-0002-5878-8895': 139, '0000-0002-1864-3392': 92, '0000-0002-7045-8004': 57, '0000-0001-7896-6751': 57, '0000-0002-7991-9428': 55, '0000-0002-4010-1063': 45, '0000-0002-2186-3484': 28, '0000-0002-4899-1929': 25, '0000-0003-0487-4242': 24, '0000-0002-3642-1486': 22, '0000-0001-9965-3535': 17, '0000-0002-4168-757X': 17, '0000-0001-6525-3744': 14, '0000-0002-3897-0278': 14, '0000-0002-1181-5112': 12, '0000-0003-1447-9385': 11, '0000-0002-7305-8786': 11, '0000-0002-2655-7806': 10, '0000-0003-3466-5353': 9, '0000-0002-7359-663X': 8, '0000-0003-4600-8668': 6, '0000-0002-1382-7088': 5, '0000-0002-9505-4882': 5, '0000-0003-3667-9900': 4, '0000-0001-9714-6038': 4, '0000-0002-4760-0228': 3, '0000-0003-4188-7915': 3, '0000-0001-9454-0427': 3, '0000-0002-0333-6808': 3, '0000-0003-2134-4964': 3, '0000-0002-6658-047X': 3, '0000-0003-1273-379X': 3, '0000-0002-7047-3183': 3, '0000-0002

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       0.00      0.00      0.00        14
          2       0.39      0.84      0.53       154
          3       0.00      0.00      0.00        22
          4       0.00      0.00      0.00        17
          5       0.00      0.00      0.00        11
          6       0.00      0.00      0.00        25
          7       0.51      0.88      0.64       211
          8       0.00      0.00      0.00        57
          9       0.00      0.00      0.00        28
         10       0.53      0.88      0.66       139
         11       1.00      0.16      0.27        45
         12       0.00      0.00      0.00        24
         13       0.00      0.00      0.00        57
         14       0.00      0.00      0.00        14
         15       0.76      0.45      0.57        55
         16       0.81      0.32      0.45        92
         17       1.00      0.12      0.21   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Counter({'0000-0002-6694-4130': 41, '0000-0002-3050-7262': 16, '0000-0001-9570-135X': 7, '0000-0001-5680-2641': 2})
['0000-0002-6694-4130', '0000-0002-3050-7262']
Total sample size after apply threshold:  57
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(57, 173)
(0, 0)
(0, 0)
1
57
             precision    recall  f1-score   support

          0       1.00      0.98      0.99        41
          1       0.94      1.00      0.97        16

avg / total       0.98      0.98      0.98        57

[40  1  0 16]
MNB Accuracy:  0.9824561403508771
MNB F1:  0.978675645342312
             precision    rec

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


199
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.00      0.00      0.00        12
          2       0.76      1.00      0.87       151
          3       1.00      0.04      0.08        24

avg / total       0.70      0.76      0.67       199

[  0   0  12   0   0   0  12   0   0   0 151   0   0   0  23   1]
MNB Accuracy:  0.7638190954773869
MNB F1:  0.2363323782234957
             precision    recall  f1-score   support

          0       1.00      0.17      0.29        12
          1       1.00      0.33      0.50        12
          2       0.83      1.00      0.91       151
          3       1.00      0.46      0.63        24

avg / total       0.87      0.84      0.81       199

[  2   0  10   0   0   4   8   0   0   0 151   0   0   0  13  11]
svc Accuracy:  0.8442211055276382
svc F1:  0.5802981552981553
             precision    recall  f1-score   support

          0       0.00      0.00      0.00  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



Total sample size after apply threshold:  25
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(25, 42)
(0, 0)
(0, 0)
1
25
             precision    recall  f1-score   support

          0       0.83      0.50      0.62        10
          1       0.74      0.93      0.82        15

avg / total       0.78      0.76      0.74        25

[ 5  5  1 14]
MNB Accuracy:  0.76
MNB F1:  0.7242647058823529
             precision    recall  f1-score   support

          0       0.91      1.00      0.95        10
          1       1.00      0.93      0.97        15

avg / total       0.96      0.96      0.96  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.93      1.00      0.96       123
          1       1.00      0.76      0.86        37

avg / total       0.95      0.94      0.94       160

[123   0   9  28]
svc Accuracy:  0.94375
svc F1:  0.9131221719457014
             precision    recall  f1-score   support

          0       0.84      1.00      0.91       123
          1       1.00      0.38      0.55        37

avg / total       0.88      0.86      0.83       160

[123   0  23  14]
LR Accuracy:  0.85625
LR F1:  0.7317588745535388
For name:  k_xu
total sample size before apply threshold:  37
Counter({'0000-0002-2788-194X': 19, '0000-0003-2036-3469': 14, '0000-0002-3985-739X': 3, '0000-0001-7851-2629': 1})
['0000-0003-2036-3469', '0000-0002-2788-194X']
Total sample size after apply threshold:  33
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.86      0.92        14
          1       0.90      1.00      0.95        19

avg / total       0.95      0.94      0.94        33

[12  2  0 19]
LR Accuracy:  0.9393939393939394
LR F1:  0.9365384615384615
For name:  s_antunes
total sample size before apply threshold:  54
Counter({'0000-0002-6686-9919': 35, '0000-0002-5512-9093': 12, '0000-0003-3218-3924': 4, '0000-0002-2264-3774': 3})
['0000-0002-5512-9093', '0000-0002-6686-9919']
Total sample size after apply threshold:  47
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabular

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



(0, 0)
(0, 0)
1
42
             precision    recall  f1-score   support

          0       1.00      0.06      0.12        16
          1       0.52      1.00      0.68        16
          2       1.00      1.00      1.00        10

avg / total       0.82      0.64      0.54        42

[ 1 15  0  0 16  0  0  0 10]
MNB Accuracy:  0.6428571428571429
MNB F1:  0.5994993742177722
             precision    recall  f1-score   support

          0       0.83      0.94      0.88        16
          1       1.00      0.81      0.90        16
          2       0.91      1.00      0.95        10

avg / total       0.91      0.90      0.90        42

[15  0  1  3 13  0  0  0 10]
svc Accuracy:  0.9047619047619048
svc F1:  0.9104285392317847
             precision    recall  f1-score   support

          0       0.79      0.94      0.86        16
          1       0.93      0.81      0.87        16
          2       1.00      0.90      0.95        10

avg / total       0.89      0.88      0.88      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.74      0.85        27
          1       1.00      0.72      0.84        25
          2       0.95      1.00      0.97        19
          3       0.00      0.00      0.00        10
          4       1.00      0.09      0.17        11
          5       1.00      0.60      0.75        20
          6       1.00      0.78      0.88        18
          7       0.00      0.00      0.00        13
          8       0.48      1.00      0.65        61
          9       1.00      0.78      0.88        32

avg / total       0.77      0.72      0.69       236

[20  0  1  0  0  0  0  0  6  0  0 18  0  0  0  0  0  0  7  0  0  0 19  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0 10  0  0  0  0  0  1  0  0  0
 10  0  0  0  0  0  0 12  0  0  8  0  0  0  0  0  0  0 14  0  4  0  0  0
  0  0  0  0  0  0 13  0  0  0  0  0  0  0  0  0 61  0  0  0  0  0  0  0
  0  0  7 25]
MNB Accuracy:  0.7203389830508474
MNB F1:  0.5983898172

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.69      1.00      0.82        40
          1       1.00      0.62      0.76        13
          2       1.00      0.36      0.53        14
          3       1.00      0.87      0.93        30

avg / total       0.87      0.81      0.80        97

[40  0  0  0  5  8  0  0  9  0  5  0  4  0  0 26]
LR Accuracy:  0.8144329896907216
LR F1:  0.7582796276405299
For name:  g_guidi
total sample size before apply threshold:  37
Counter({'0000-0002-3061-9870': 15, '0000-0003-3199-6624': 11, '0000-0001-9535-9152': 5, '0000-0002-1393-326X': 4, '0000-0002-8857-0096': 2})
['0000-0003-3199-6624', '0000-0002-3061-9870']
Total sample size after apply threshold:  26
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smoot

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(134, 375)
(0, 0)
(0, 0)
1
134
             precision    recall  f1-score   support

          0       1.00      0.10      0.18        10
          1       1.00      0.81      0.89        26
          2       0.88      1.00      0.94        29
          3       0.97      1.00      0.99        33
          4       0.80      1.00      0.89        36

avg / total       0.91      0.90      0.87       134

[ 1  0  1  1  7  0 21  3  0  2  0  0 29  0  0  0  0  0 33  0  0  0  0  0
 36]
MNB Accuracy:  0.8955223880597015
MNB F1:  0.776976517963416
            

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(180, 623)
(0, 0)
(0, 0)
1
180
             precision    recall  f1-score   support

          0       1.00      0.53      0.69        19
          1       1.00      0.08      0.15        12
          2       0.73      0.94      0.82        96
          3       0.00      0.00      0.00        13
          4       1.00      0.82      0.90        17
          5       0.74      1.00      0.85        23

avg / total       0.75      0.77      0.71       180

[10  0  9  0  0  0  0  1 11  0  0  0  0  0 90  0  0  6  0  0 11  0  0  2
  0  0  3  0 14  0  0  0 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.74      0.85        19
          1       1.00      0.42      0.59        12
          2       0.81      0.94      0.87        96
          3       0.83      0.38      0.53        13
          4       1.00      0.88      0.94        17
          5       0.79      1.00      0.88        23

avg / total       0.86      0.84      0.83       180

[14  0  5  0  0  0  0  5  7  0  0  0  0  0 90  1  0  5  0  0  7  5  0  1
  0  0  2  0 15  0  0  0  0  0  0 23]
svc Accuracy:  0.8444444444444444
svc F1:  0.7757860890138114
             precision    recall  f1-score   support

          0       1.00      0.42      0.59        19
          1       0.00      0.00      0.00        12
          2       0.71      0.96      0.81        96
          3       1.00      0.08      0.14        13
          4       1.00      0.88      0.94        17
          5       0.81      0.91      0.86        23

avg / total       0.75     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      1.00      0.97        17
          1       1.00      0.92      0.96        12
          2       1.00      1.00      1.00        19

avg / total       0.98      0.98      0.98        48

[17  0  0  1 11  0  0  0 19]
LR Accuracy:  0.9791666666666666
LR F1:  0.975983436853002
For name:  x_fu
total sample size before apply threshold:  16
Counter({'0000-0001-6928-4396': 8, '0000-0001-9295-6314': 6, '0000-0002-8012-4753': 1, '0000-0002-4305-6624': 1})
[]
Total sample size after apply threshold:  0
For name:  f_ortega
total sample size before apply threshold:  368
Counter({'0000-0003-2001-1121': 205, '0000-0003-2111-769X': 86, '0000-0002-4730-9270': 38, '0000-0002-3172-2095': 22, '0000-0002-7431-354X': 9, '0000-0001-7850-2105': 7, '0000-0003-0231-2051': 1})
['0000-0002-3172-2095', '0000-0003-2001-1121', '0000-0002-4730-9270', '0000-0003-2111-769X']
Total sample size after apply threshold:  351
TfidfVectorizer(an

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.77      0.87        44
          1       1.00      0.86      0.92        21
          2       0.94      0.88      0.91        73
          3       1.00      0.80      0.89        10
          4       0.82      0.95      0.88       107
          5       1.00      0.70      0.82        20
          6       1.00      0.76      0.87        51
          7       0.66      0.92      0.77        59
          8       1.00      0.79      0.88        14

avg / total       0.89      0.86      0.87       399

[ 34   0   0   0   5   0   0   5   0   0  18   0   0   1   0   0   2   0
   0   0  64   0   3   0   0   6   0   0   0   0   8   2   0   0   0   0
   0   0   2   0 102   0   0   3   0   0   0   1   0   4  14   0   1   0
   0   0   0   0   3   0  39   9   0   0   0   1   0   4   0   0  54   0
   0   0   0   0   1   0   0   2  11]
svc Accuracy:  0.8621553884711779
svc F1:  0.8674473302519855
             precision

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      1.00      0.94       101
          1       1.00      0.43      0.60        21

avg / total       0.91      0.90      0.88       122

[101   0  12   9]
MNB Accuracy:  0.9016393442622951
MNB F1:  0.7719626168224298
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       101
          1       0.95      0.86      0.90        21

avg / total       0.97      0.97      0.97       122

[100   1   3  18]
svc Accuracy:  0.9672131147540983
svc F1:  0.9401960784313725
             precision    recall  f1-score   support

          0       0.83      1.00      0.91       101
          1       1.00      0.05      0.09        21

avg / total       0.86      0.84      0.77       122

[101   0  20   1]
LR Accuracy:  0.8360655737704918
LR F1:  0.5004095004095004
For name:  h_song
total sample size before apply threshold:  210
Counter({'0000-0001-5684-4059': 88, '0000-0001-5553-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(181, 1094)
(0, 0)
(0, 0)
1
181
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        29
          1       0.00      0.00      0.00        14
          2       0.69      1.00      0.82        88
          3       0.85      0.57      0.68        30
          4       1.00      0.40      0.57        20

avg / total       0.75      0.77      0.73       181

[26  0  1  2  0  0  0 14  0  0  0  0 88  0  0  0  0 13 17  0  0  0 11  1
  8]
MNB Accuracy:  0.7679558011049724
MNB F1:  0.6030975536091816
             precision    recall  f1-score   support

          0       1.00      0.97      0.98        29
          1       1.00      0.36      0.53        14
          2       0.88      1.00      0.94        88
          3       0.87      0.90      0.89        30
          4       0.94      0.80      0.86        20

avg / total       0.91      0.91      0.90       181

[28  0  0  1  0  0  5  9  0  0  0  0 88  0  0  0  0  2 27  1  0  0  1  3
 16]
sv

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.64      0.78        14
          1       1.00      0.25      0.40        12
          2       0.67      1.00      0.80        24
          3       0.92      1.00      0.96        23

avg / total       0.87      0.81      0.78        73

[ 9  0  4  1  0  3  8  1  0  0 24  0  0  0  0 23]
MNB Accuracy:  0.8082191780821918
MNB F1:  0.7352355072463769
             precision    recall  f1-score   support

          0       0.92      0.79      0.85        14
          1       0.91      0.83      0.87        12
          2       0.82      0.96      0.88        24
          3       1.00      0.96      0.98        23

avg / total       0.91      0.90      0.90        73

[11  1  2  0  0 10  2  0  1  0 23  0  0  0  1 22]
svc Accuracy:  0.9041095890410958
svc F1:  0.8945280564845782
             precision    recall  f1-score   support

          0       1.00      0.64      0.78        14
          1       1.00     

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.75      0.30      0.43        10
          1       0.95      0.99      0.97       128

avg / total       0.93      0.94      0.93       138

[  3   7   1 127]
svc Accuracy:  0.9420289855072463
svc F1:  0.6990185387131951
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.93      1.00      0.96       128

avg / total       0.86      0.93      0.89       138

[  0  10   0 128]
LR Accuracy:  0.927536231884058
LR F1:  0.48120300751879697
For name:  a_simon
total sample size before apply threshold:  117
Counter({'0000-0002-6141-7921': 60, '0000-0002-0151-0120': 19, '0000-0002-6509-4541': 14, '0000-0001-6023-6427': 14, '0000-0002-1879-5628': 5, '0000-0002-3286-5776': 4, '0000-0003-4641-6186': 1})
['0000-0002-6141-7921', '0000-0002-6509-4541', '0000-0002-0151-0120', '0000-0001-6023-6427']
Total sample size after apply threshold:  107
TfidfVector

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.77      0.97      0.86       149
          1       0.88      0.84      0.86        98
          2       0.00      0.00      0.00        36

avg / total       0.71      0.80      0.75       283

[145   4   0  15  82   1  29   7   0]
MNB Accuracy:  0.8021201413427562
MNB F1:  0.5722089697119902
             precision    recall  f1-score   support

          0       0.85      0.93      0.89       149
          1       0.86      0.93      0.89        98
          2       0.79      0.31      0.44        36

avg / total       0.85      0.85      0.83       283

[139   8   2   6  91   1  18   7  11]
svc Accuracy:  0.8515901060070671
svc F1:  0.7410608345902464
             precision    recall  f1-score   support

          0       0.78      0.95      0.86       149
          1       0.89      0.86      0.88        98
          2       0.83      0.14      0.24        36

avg / total       0.82      0.82      0.78       2

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



(403, 752)
(0, 0)
(0, 0)
1
403
             precision    recall  f1-score   support

          0       1.00      0.43      0.60        21
          1       0.44      1.00      0.61        68
          2       0.50      1.00      0.66        69
          3       0.00      0.00      0.00        17
          4       0.00      0.00      0.00        14
          5       1.00      0.68      0.81        25
          6       0.00      0.00      0.00        10
          7       1.00      0.38      0.55        32
          8       0.00      0.00      0.00        12
          9       0.00      0.00      0.00        15
         10       1.00      0.23      0.37        22
         11       1.00      0.22      0.36        18
         12       0.00      0.00      0.00        12
         13       0.92      0.85      0.89        68

avg / total       0.61      0.60      0.53       403

[ 9 10  2  0  0  0  0  0  0  0  0  0  0  0  0 68  0  0  0  0  0  0  0  0
  0  0  0  0  0  0 69  0  0  0  0  0  0  0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.96      0.98      0.97        66
          1       1.00      0.57      0.72        30
          2       0.87      1.00      0.93        72

avg / total       0.93      0.92      0.91       168

[65  0  1  3 17 10  0  0 72]
MNB Accuracy:  0.9166666666666666
MNB F1:  0.8741952557050027
             precision    recall  f1-score   support

          0       0.98      0.98      0.98        66
          1       1.00      0.70      0.82        30
          2       0.89      1.00      0.94        72

avg / total       0.95      0.94      0.94       168

[65  0  1  1 21  8  0  0 72]
svc Accuracy:  0.9404761904761905
svc F1:  0.9165181224004754
             precision    recall  f1-score   support

          0       1.00      0.85      0.92        66
          1       1.00      0.43      0.60        30
          2       0.73      1.00      0.84        72

avg / total       0.88      0.84      0.83       168

[56  0 10  0 1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.62      0.92      0.74        85
          1       1.00      0.73      0.84        22
          2       0.00      0.00      0.00        11
          3       0.00      0.00      0.00        12
          4       1.00      0.44      0.62        18
          5       0.00      0.00      0.00        12
          6       1.00      0.84      0.91        31
          7       1.00      0.25      0.40        16
          8       0.59      0.98      0.74        54
          9       1.00      0.40      0.57        15

avg / total       0.68      0.69      0.64       276

[78  0  0  0  0  0  0  0  7  0  6 16  0  0  0  0  0  0  0  0 10  0  0  0
  0  0  0  0  1  0 12  0  0  0  0  0  0  0  0  0  3  0  0  0  8  0  0  0
  7  0  2  0  0  0  0  0  0  0 10  0  3  0  0  0  0  0 26  0  2  0  2  0
  0  0  0  0  0  4 10  0  1  0  0  0  0  0  0  0 53  0  9  0  0  0  0  0
  0  0  0  6]
MNB Accuracy:  0.6920289855072463
MNB F1:  0.4816646755

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.88      1.00      0.94        15
          1       1.00      0.90      0.95        10
          2       1.00      0.92      0.96        12

avg / total       0.95      0.95      0.95        37

[15  0  0  1  9  0  1  0 11]
svc Accuracy:  0.9459459459459459
svc F1:  0.9471300533943555
             precision    recall  f1-score   support

          0       0.79      1.00      0.88        15
          1       1.00      0.70      0.82        10
          2       1.00      0.92      0.96        12

avg / total       0.91      0.89      0.89        37

[15  0  0  3  7  0  1  0 11]
LR Accuracy:  0.8918918918918919
LR F1:  0.887468030690537
For name:  j_matthews
total sample size before apply threshold:  65
Counter({'0000-0002-9815-8636': 46, '0000-0001-6184-1813': 7, '0000-0002-5993-7610': 5, '0000-0002-1832-4420': 4, '0000-0002-7282-8929': 1, '0000-0002-6888-9438': 1, '0000-0002-3968-8282': 1})
['0000-0002-9815-8636']


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.95      1.00      0.98       146
          1       1.00      0.30      0.46        10

avg / total       0.96      0.96      0.94       156

[146   0   7   3]
svc Accuracy:  0.9551282051282052
svc F1:  0.7190635451505016
             precision    recall  f1-score   support

          0       0.94      1.00      0.97       146
          1       0.00      0.00      0.00        10

avg / total       0.88      0.94      0.90       156

[146   0  10   0]
LR Accuracy:  0.9358974358974359
LR F1:  0.48344370860927155
For name:  r_harris
total sample size before apply threshold:  50
Counter({'0000-0002-4377-5063': 26, '0000-0002-7943-5650': 8, '0000-0002-2636-1520': 6, '0000-0003-1787-7784': 3, '0000-0002-9247-0768': 3, '0000-0003-0954-1981': 2, '0000-0003-3322-1371': 2})
['0000-0002-4377-5063']
Total sample size after apply threshold:  26
For name:  c_vaughan
total sample size before apply threshold:  83
Counter({'0000-0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(34, 160)
(0, 0)
(0, 0)
1
34
             precision    recall  f1-score   support

          0       0.74      1.00      0.85        20
          1       1.00      0.50      0.67        14

avg / total       0.85      0.79      0.78        34

[20  0  7  7]
MNB Accuracy:  0.7941176470588235
MNB F1:  0.7588652482269503
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        20
          1       0.88      1.00      0.93        14

avg / total       0.95      0.94      0.94        34

[18  2  0 14]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        17
          1       0.96      0.88      0.91        49
          2       0.90      0.96      0.93        57

avg / total       0.94      0.93      0.93       123

[17  0  0  0 43  6  0  2 55]
svc Accuracy:  0.9349593495934959
svc F1:  0.9490323356172617
             precision    recall  f1-score   support

          0       1.00      0.24      0.38        17
          1       0.95      0.84      0.89        49
          2       0.72      0.96      0.83        57

avg / total       0.85      0.81      0.79       123

[ 4  0 13  0 41  8  0  2 55]
LR Accuracy:  0.8130081300813008
LR F1:  0.6997747993171334
For name:  p_teixeira
total sample size before apply threshold:  213
Counter({'0000-0002-7258-7977': 60, '0000-0002-6296-5137': 55, '0000-0001-7202-0527': 48, '0000-0003-2315-2261': 26, '0000-0003-2735-6608': 22, '0000-0002-7596-9735': 1, '0000-0002-1593-8064': 1})
['0000-0001-7202-0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.83      1.00      0.91        48
          1       1.00      0.98      0.99        60
          2       0.95      0.73      0.83        26
          3       1.00      0.86      0.93        22
          4       1.00      1.00      1.00        55

avg / total       0.95      0.95      0.95       211

[48  0  0  0  0  1 59  0  0  0  7  0 19  0  0  2  0  1 19  0  0  0  0  0
 55]
svc Accuracy:  0.9478672985781991
svc F1:  0.9300346481656749
             precision    recall  f1-score   support

          0       1.00      0.83      0.91        48
          1       1.00      1.00      1.00        60
          2       1.00      0.62      0.76        26
          3       1.00      0.91      0.95        22
          4       0.73      1.00      0.85        55

avg / total       0.93      0.91      0.90       211

[40  0  0  0  8  0 60  0  0  0  0  0 16  0 10  0  0  0 20  2  0  0  0  0
 55]
LR Accuracy:  0.9052132701421801
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



['0000-0002-8214-1696', '0000-0002-7231-0185', '0000-0003-3399-055X']
Total sample size after apply threshold:  201
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(201, 228)
(0, 0)
(0, 0)
1
201
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       1.00      0.62      0.76        65
          2       0.77      1.00      0.87       124

avg / total       0.80      0.82      0.78       201

[  0   0  12   0  40  25   0   0 124]
MNB Accuracy:  0.8159203980099502
MNB F1:  0.5440267335004177
             precision    recall  f1-sco

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(146, 1028)
(0, 0)
(0, 0)
1
146
             precision    recall  f1-score   support

          0       0.83      0.99      0.90        84
          1       1.00      0.79      0.88        33
          2       0.95      0.66      0.78        29

avg / total       0.89      0.88      0.87       146

[83  0  1  7 26  0 10  0 19]
MNB Accuracy:  0.8767123287671232
MNB F1:  0.8530133497761669


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      1.00      0.94        84
          1       1.00      0.85      0.92        33
          2       1.00      0.83      0.91        29

avg / total       0.94      0.93      0.93       146

[84  0  0  5 28  0  5  0 24]
svc Accuracy:  0.9315068493150684
svc F1:  0.9225044629876124
             precision    recall  f1-score   support

          0       0.78      1.00      0.88        84
          1       1.00      0.70      0.82        33
          2       1.00      0.52      0.68        29

avg / total       0.87      0.84      0.82       146

[84  0  0 10 23  0 14  0 15]
LR Accuracy:  0.8356164383561644
LR F1:  0.7927489177489179
For name:  d_parsons
total sample size before apply threshold:  30
Counter({'0000-0002-3956-6031': 26, '0000-0002-1393-8431': 2, '0000-0002-9121-7859': 1, '0000-0002-5142-4466': 1})
['0000-0002-3956-6031']
Total sample size after apply threshold:  26
For name:  a_choudhury
total samp

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



[18  6  0 28]
LR Accuracy:  0.8846153846153846
LR F1:  0.880184331797235
For name:  c_richter
total sample size before apply threshold:  11
Counter({'0000-0002-5658-6173': 4, '0000-0002-6591-1118': 4, '0000-0001-6017-1520': 2, '0000-0002-6839-7994': 1})
[]
Total sample size after apply threshold:  0
For name:  m_hossain
total sample size before apply threshold:  102
Counter({'0000-0003-1408-2273': 26, '0000-0002-1878-8145': 17, '0000-0003-3967-2544': 10, '0000-0003-3399-581X': 9, '0000-0003-3303-5755': 7, '0000-0003-1271-1515': 7, '0000-0003-4733-0018': 6, '0000-0002-9953-586X': 5, '0000-0001-8019-843X': 4, '0000-0001-7996-9233': 3, '0000-0002-1917-8701': 1, '0000-0002-0984-984X': 1, '0000-0002-7673-8410': 1, '0000-0002-0977-4593': 1, '0000-0003-2970-2324': 1, '0000-0001-6753-4216': 1, '0000-0002-3929-6211': 1, '0000-0002-6621-8737': 1})
['0000-0003-3967-2544', '0000-0003-1408-2273', '0000-0002-1878-8145']
Total sample size after apply threshold:  53
TfidfVectorizer(analyzer='word', b

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.80      0.89        45
          1       0.93      1.00      0.96       122

avg / total       0.95      0.95      0.94       167

[ 36   9   0 122]
svc Accuracy:  0.9461077844311377
svc F1:  0.9266578831796224
             precision    recall  f1-score   support

          0       1.00      0.27      0.42        45
          1       0.79      1.00      0.88       122

avg / total       0.84      0.80      0.76       167

[ 12  33   0 122]
LR Accuracy:  0.8023952095808383
LR F1:  0.6509595287858636
For name:  m_soares
total sample size before apply threshold:  247
Counter({'0000-0001-9701-836X': 75, '0000-0002-9314-4833': 68, '0000-0001-6071-0272': 44, '0000-0003-1579-8513': 32, '0000-0002-5213-2377': 10, '0000-0001-8860-0470': 7, '0000-0003-4227-4141': 4, '0000-0002-7181-1906': 3, '0000-0002-4614-8209': 2, '0000-0002-8059-7067': 1, '0000-0002-9013-2570': 1})
['0000-0002-5213-2377', '0000-0001-6071-0272

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.70      0.82        10
          1       1.00      0.91      0.95        44
          2       1.00      0.75      0.86        32
          3       0.89      0.93      0.91        68
          4       0.83      0.96      0.89        75

avg / total       0.91      0.90      0.90       229

[ 7  0  0  0  3  0 40  0  2  2  0  0 24  3  5  0  0  0 63  5  0  0  0  3
 72]
svc Accuracy:  0.8995633187772926
svc F1:  0.8856833860642579
             precision    recall  f1-score   support

          0       1.00      0.40      0.57        10
          1       1.00      0.86      0.93        44
          2       1.00      0.62      0.77        32
          3       0.95      0.91      0.93        68
          4       0.73      0.99      0.84        75

avg / total       0.90      0.86      0.86       229

[ 4  0  0  0  6  0 38  0  1  5  0  0 20  1 11  0  0  0 62  6  0  0  0  1
 74]
LR Accuracy:  0.8646288209606987
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.83      0.91        29
          1       1.00      0.92      0.96        61
          2       0.81      1.00      0.89        42

avg / total       0.94      0.92      0.93       132

[24  0  5  0 56  5  0  0 42]
svc Accuracy:  0.9242424242424242
svc F1:  0.918847451966681
             precision    recall  f1-score   support

          0       1.00      0.86      0.93        29
          1       0.75      0.95      0.84        61
          2       0.87      0.62      0.72        42

avg / total       0.84      0.83      0.82       132

[25  3  1  0 58  3  0 16 26]
LR Accuracy:  0.8257575757575758
LR F1:  0.8295759527643586
For name:  a_rao
total sample size before apply threshold:  93
Counter({'0000-0002-2676-2762': 36, '0000-0003-0320-2962': 20, '0000-0002-2550-6097': 11, '0000-0001-6440-1274': 8, '0000-0003-2319-6539': 5, '0000-0002-2474-5010': 5, '0000-0003-4480-3190': 3, '0000-0003-4879-1123': 2, '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


 46
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(46, 154)
(0, 0)
(0, 0)
1
46
             precision    recall  f1-score   support

          0       0.91      1.00      0.95        29
          1       1.00      0.82      0.90        17

avg / total       0.94      0.93      0.93        46

[29  0  3 14]
MNB Accuracy:  0.9347826086956522
MNB F1:  0.9270227392913801
             precision    recall  f1-score   support

          0       0.91      1.00      0.95        29
          1       1.00      0.82      0.90        17

avg / total       0.94      0.93      0.93        46

[29  0  3 14]
svc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(37, 170)
(0, 0)
(0, 0)
1
37
             precision    recall  f1-score   support

          0       0.56      1.00      0.71        15
          1       1.00      0.45      0.62        11
          2       1.00      0.45      0.62        11

avg / total       0.82      0.68      0.66        37

[15  0  0  6  5  0  6  0  5]
MNB Accuracy:  0.6756756756756757
MNB F1:  0.6547619047619048
             precision    recall  f1-score   support

          0       0.60      0.80      0.69        15
          1       0.88      0.64      0.74        11
       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(21, 55)
(0, 0)
(0, 0)
1
21
             precision    recall  f1-score   support

          0       0.92      1.00      0.96        11
          1       1.00      0.90      0.95        10

avg / total       0.96      0.95      0.95        21

[11  0  1  9]
MNB Accuracy:  0.9523809523809523
MNB F1:  0.9519450800915332
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       0.91      1.00      0.95        10

avg / total       0.96      0.95      0.95        21

[10  1  0 10]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(104, 161)
(0, 0)
(0, 0)
1
104
             precision    recall  f1-score   support

          0       0.86      0.75      0.80        16
          1       0.94      0.88      0.91        17
          2       0.96      0.92      0.94        25
          3       0.77      1.00      0.87        20
          4       0.96      0.88      0.92        26

avg / total       0.90      0.89      0.89       104

[12  1  0  3  0  0 15  0  2  0  1  0 23  0  1  0  0  0 20  0  1  0  1  1
 23]
MNB Accuracy:  0.8942307692307693
MNB F1:  0.887486327337259
            

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(183, 329)
(0, 0)
(0, 0)
1
183
             precision    recall  f1-score   support

          0       0.88      1.00      0.93        63
          1       0.92      0.95      0.94        38
          2       0.92      0.85      0.88        27
          3       0.97      0.97      0.97        33
          4       1.00      0.64      0.78        22

avg / total       0.92      0.92      0.91       183

[63  0  0  0  0  1 36  1  0  0  1  2 23  1  0  0  1  0 32  0  7  0  1  0
 14]
MNB Accuracy:  0.9180327868852459
MNB F1:  0.9000976800976801


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.82      1.00      0.90        63
          1       0.94      0.89      0.92        38
          2       0.95      0.78      0.86        27
          3       1.00      0.97      0.98        33
          4       1.00      0.73      0.84        22

avg / total       0.92      0.91      0.91       183

[63  0  0  0  0  3 34  1  0  0  4  2 21  0  0  1  0  0 32  0  6  0  0  0
 16]
svc Accuracy:  0.907103825136612
svc F1:  0.9005564847670111
             precision    recall  f1-score   support

          0       0.75      1.00      0.86        63
          1       0.94      0.87      0.90        38
          2       1.00      0.74      0.85        27
          3       1.00      0.97      0.98        33
          4       1.00      0.55      0.71        22

avg / total       0.90      0.87      0.87       183

[63  0  0  0  0  5 33  0  0  0  5  2 20  0  0  1  0  0 32  0 10  0  0  0
 12]
LR Accuracy:  0.8743169398907104
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.55      0.71        11
          1       0.75      1.00      0.86        15

avg / total       0.86      0.81      0.79        26

[ 6  5  0 15]
LR Accuracy:  0.8076923076923077
LR F1:  0.7815126050420167
For name:  d_zhang
total sample size before apply threshold:  94
Counter({'0000-0002-4175-5982': 17, '0000-0002-7665-2182': 12, '0000-0003-0779-6438': 11, '0000-0003-4280-0068': 8, '0000-0001-9295-4992': 7, '0000-0001-9508-8209': 7, '0000-0001-6930-5994': 6, '0000-0001-9478-5344': 6, '0000-0001-5809-0027': 5, '0000-0002-4149-4938': 4, '0000-0002-1581-2357': 4, '0000-0001-5956-4618': 2, '0000-0001-7063-7742': 2, '0000-0002-2541-837X': 1, '0000-0001-6259-7082': 1, '0000-0002-4515-2070': 1})
['0000-0002-4175-5982', '0000-0002-7665-2182', '0000-0003-0779-6438']
Total sample size after apply threshold:  40
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int6

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(28, 145)
(0, 0)
(0, 0)
1
28
             precision    recall  f1-score   support

          0       0.73      1.00      0.84        16
          1       1.00      0.50      0.67        12

avg / total       0.84      0.79      0.77        28

[16  0  6  6]
MNB Accuracy:  0.7857142857142857
MNB F1:  0.7543859649122807
             precision    recall  f1-score   support

          0       1.00      0.88      0.93        16
          1       0.86      1.00      0.92        12

avg / total       0.94      0.93      0.93        28

[14  2  0 12]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(206, 440)
(0, 0)
(0, 0)
1
206
             precision    recall  f1-score   support

          0       0.58      0.99      0.73       102
          1       0.93      0.54      0.68        46
          2       0.00      0.00      0.00        14
          3       1.00      0.18      0.30        17
          4       0.00      0.00      0.00        17
          5       0.00      0.00      0.00        10

avg / total       0.58      0.63      0.54       206

[101   0   0   0   1   0  21  25   0   0   0   0  14   0   0   0   0   0
  13   1   0   3   0   0 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.79      0.97      0.87       102
          1       0.90      0.80      0.85        46
          2       0.88      0.50      0.64        14
          3       1.00      0.82      0.90        17
          4       1.00      0.35      0.52        17
          5       0.83      1.00      0.91        10

avg / total       0.86      0.84      0.83       206

[99  3  0  0  0  0  9 37  0  0  0  0  5  0  7  0  0  2  2  0  1 14  0  0
 10  1  0  0  6  0  0  0  0  0  0 10]
svc Accuracy:  0.8398058252427184
svc F1:  0.7822068151699769
             precision    recall  f1-score   support

          0       0.66      0.99      0.79       102
          1       0.94      0.70      0.80        46
          2       1.00      0.14      0.25        14
          3       1.00      0.29      0.45        17
          4       1.00      0.18      0.30        17
          5       0.89      0.80      0.84        10

avg / total       0.81     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.79      0.88        14
          1       0.91      0.83      0.87        12
          2       1.00      0.93      0.96        27
          3       1.00      0.89      0.94        18
          4       0.63      0.97      0.76        30
          5       1.00      0.36      0.53        11
          6       1.00      0.96      0.98        28

avg / total       0.91      0.87      0.87       140

[11  0  0  0  3  0  0  0 10  0  0  2  0  0  0  0 25  0  2  0  0  0  0  0
 16  2  0  0  0  1  0  0 29  0  0  0  0  0  0  7  4  0  0  0  0  0  1  0
 27]
svc Accuracy:  0.8714285714285714
svc F1:  0.847227079915194
             precision    recall  f1-score   support

          0       1.00      0.21      0.35        14
          1       1.00      0.67      0.80        12
          2       1.00      0.96      0.98        27
          3       1.00      0.89      0.94        18
          4       0.54      1.00      0.70

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.82      0.90      0.86        41
          1       0.92      0.87      0.89        53
          2       1.00      0.92      0.96        13

avg / total       0.89      0.89      0.89       107

[37  4  0  7 46  0  1  0 12]
svc Accuracy:  0.8878504672897196
svc F1:  0.9045563332580718
             precision    recall  f1-score   support

          0       0.91      0.76      0.83        41
          1       0.82      0.94      0.88        53
          2       1.00      0.92      0.96        13

avg / total       0.88      0.87      0.87       107

[31 10  0  3 50  0  0  1 12]
LR Accuracy:  0.8691588785046729
LR F1:  0.887953216374269
For name:  m_viana
total sample size before apply threshold:  139
Counter({'0000-0002-0464-4845': 34, '0000-0003-4356-8109': 31, '0000-0002-4073-3802': 29, '0000-0001-9665-2115': 26, '0000-0001-9288-2108': 13, '0000-0002-3074-767X': 5, '0000-0002-5657-5570': 1})
['0000-0001-9665-2115'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.76      0.96      0.85        26
          1       1.00      0.94      0.97        31
          2       1.00      0.93      0.96        29
          3       0.91      0.77      0.83        13
          4       1.00      0.97      0.99        34

avg / total       0.94      0.93      0.93       133

[25  0  0  1  0  2 29  0  0  0  2  0 27  0  0  3  0  0 10  0  1  0  0  0
 33]
svc Accuracy:  0.9323308270676691
svc F1:  0.919363593654006
             precision    recall  f1-score   support

          0       0.80      0.92      0.86        26
          1       0.91      0.94      0.92        31
          2       0.93      0.97      0.95        29
          3       0.88      0.54      0.67        13
          4       1.00      0.97      0.99        34

avg / total       0.91      0.91      0.91       133

[24  1  0  1  0  1 29  1  0  0  0  1 28  0  0  4  1  1  7  0  1  0  0  0
 33]
LR Accuracy:  0.9097744360902256
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      0.97      0.95        60
          1       0.91      0.84      0.87        25

avg / total       0.93      0.93      0.93        85

[58  2  4 21]
svc Accuracy:  0.9294117647058824
svc F1:  0.9129098360655737
             precision    recall  f1-score   support

          0       0.77      1.00      0.87        60
          1       1.00      0.28      0.44        25

avg / total       0.84      0.79      0.74        85

[60  0 18  7]
LR Accuracy:  0.788235294117647
LR F1:  0.6535326086956522
For name:  c_liao
total sample size before apply threshold:  35
Counter({'0000-0002-1324-9644': 11, '0000-0001-5168-6493': 11, '0000-0001-9777-3701': 6, '0000-0003-3459-1913': 6, '0000-0003-4156-0912': 1})
['0000-0002-1324-9644', '0000-0001-5168-6493']
Total sample size after apply threshold:  22
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(139, 378)
(0, 0)
(0, 0)
1
139
             precision    recall  f1-score   support

          0       1.00      0.93      0.96        68
          1       1.00      0.67      0.80        18
          2       0.83      1.00      0.91        53

avg / total       0.93      0.92      0.92       139

[63  0  5  0 12  6  0  0 53]
MNB Accuracy:  0.920863309352518
MNB F1:  0.8892716556838695
             precision    recall  f1-score   support

          0       0.96      1.00      0.98        68
          1       1.00      0.83      0.91        18
       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 27
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(27, 93)
(0, 0)
(0, 0)
1
27
             precision    recall  f1-score   support

          0       0.94      1.00      0.97        17
          1       1.00      0.90      0.95        10

avg / total       0.97      0.96      0.96        27

[17  0  1  9]
MNB Accuracy:  0.9629629629629629
MNB F1:  0.9593984962406015
             precision    recall  f1-score   support

          0       0.94      0.94      0.94        17
          1       0.90      0.90      0.90        10

avg / total       0.93      0.93      0.93        27

[16  1  1  9]
svc 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(100, 351)
(0, 0)
(0, 0)
1
100
             precision    recall  f1-score   support

          0       0.97      0.97      0.97        35
          1       0.98      0.98      0.98        65

avg / total       0.98      0.98      0.98       100

[34  1  1 64]
MNB Accuracy:  0.98
MNB F1:  0.9780219780219781
             precision    recall  f1-score   support

          0       1.00      0.94      0.97        35
          1       0.97      1.00      0.98        65

avg / total       0.98      0.98      0.98       100

[33  2  0 65]
svc Accuracy:  0.98

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.98      1.00      0.99       238
          1       1.00      0.81      0.89        31

avg / total       0.98      0.98      0.98       269

[238   0   6  25]
svc Accuracy:  0.9776951672862454
svc F1:  0.9402045050385299
             precision    recall  f1-score   support

          0       0.90      1.00      0.95       238
          1       1.00      0.13      0.23        31

avg / total       0.91      0.90      0.86       269

[238   0  27   4]
LR Accuracy:  0.8996282527881041
LR F1:  0.587446748082931
For name:  k_jacobsen
total sample size before apply threshold:  113
Counter({'0000-0002-4198-6246': 93, '0000-0002-1121-2979': 17, '0000-0002-3450-0850': 2, '0000-0003-0135-0988': 1})
['0000-0002-4198-6246', '0000-0002-1121-2979']
Total sample size after apply threshold:  110
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='con

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.93      1.00      0.96        93
          1       1.00      0.59      0.74        17

avg / total       0.94      0.94      0.93       110

[93  0  7 10]
svc Accuracy:  0.9363636363636364
svc F1:  0.8522356553444637
             precision    recall  f1-score   support

          0       0.86      1.00      0.93        93
          1       1.00      0.12      0.21        17

avg / total       0.88      0.86      0.81       110

[93  0 15  2]
LR Accuracy:  0.8636363636363636
LR F1:  0.5679497250589159
For name:  s_kelly
total sample size before apply threshold:  102
Counter({'0000-0003-4002-048X': 31, '0000-0001-8583-5362': 26, '0000-0002-8245-0181': 20, '0000-0003-3533-5268': 12, '0000-0002-0375-1040': 11, '0000-0002-3078-8404': 2})
['0000-0002-8245-0181', '0000-0001-8583-5362', '0000-0003-3533-5268', '0000-0002-0375-1040', '0000-0003-4002-048X']
Total sample size after apply threshold:  100
TfidfVectorizer(analy

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        20
          1       0.81      0.85      0.83        26
          2       1.00      0.75      0.86        12
          3       1.00      0.55      0.71        11
          4       0.74      0.90      0.81        31

avg / total       0.87      0.85      0.85       100

[20  0  0  0  0  0 22  0  0  4  0  1  9  0  2  0  1  0  6  4  0  3  0  0
 28]
svc Accuracy:  0.85
svc F1:  0.8409616184455734
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        20
          1       1.00      0.54      0.70        26
          2       1.00      0.75      0.86        12
          3       1.00      0.18      0.31        11
          4       0.56      1.00      0.72        31

avg / total       0.86      0.76      0.74       100

[20  0  0  0  0  0 14  0  0 12  0  0  9  0  3  0  0  0  2  9  0  0  0  0
 31]
LR Accuracy:  0.76
LR F1:  0.7171530794786609
Fo

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(65, 172)
(0, 0)
(0, 0)
1
65
             precision    recall  f1-score   support

          0       1.00      0.77      0.87        26
          1       0.87      1.00      0.93        39

avg / total       0.92      0.91      0.90        65

[20  6  0 39]
MNB Accuracy:  0.9076923076923077
MNB F1:  0.8990683229813665
             precision    recall  f1-score   support

          0       1.00      0.81      0.89        26
          1       0.89      1.00      0.94        39

avg / total       0.93      0.92      0.92        65

[21  5  0 39]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.29      0.45        17
          1       0.00      0.00      0.00        11
          2       1.00      0.62      0.76        13
          3       0.84      1.00      0.91       144

avg / total       0.81      0.85      0.80       185

[  5   0   0  12   0   0   0  11   0   0   8   5   0   0   0 144]
MNB Accuracy:  0.8486486486486486
MNB F1:  0.5319606553783769
             precision    recall  f1-score   support

          0       0.94      0.88      0.91        17
          1       0.88      0.64      0.74        11
          2       1.00      0.77      0.87        13
          3       0.94      0.99      0.96       144

avg / total       0.94      0.94      0.94       185

[ 15   0   0   2   0   7   0   4   0   0  10   3   1   1   0 142]
svc Accuracy:  0.9405405405405406
svc F1:  0.8695525240380377
             precision    recall  f1-score   support

          0       1.00      0.29      0.45      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(297, 1721)
(0, 0)
(0, 0)
1
297
             precision    recall  f1-score   support

          0       1.00      0.10      0.18        20
          1       1.00      0.05      0.10        19
          2       1.00      0.54      0.70        35
          3       1.00      0.77      0.87        69
          4       0.54      1.00      0.70       112
          5       1.00      0.20      0.33        15
          6       1.00      0.37      0.54        27

avg / total       0.82      0.67      0.63       297

[  2   0   0   0  18   0   0   0   1   0   0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.66      0.94      0.77        87
          1       0.98      0.95      0.97       117
          2       0.97      0.79      0.87        38
          3       1.00      0.40      0.57        10
          4       1.00      0.58      0.74        24
          5       0.90      0.76      0.82        70

avg / total       0.88      0.85      0.85       346

[ 82   0   1   0   0   4   5 111   0   0   0   1   8   0  30   0   0   0
   4   2   0   4   0   0   9   0   0   0  14   1  17   0   0   0   0  53]
svc Accuracy:  0.8497109826589595
svc F1:  0.7897239362340579
             precision    recall  f1-score   support

          0       0.65      0.93      0.76        87
          1       0.85      0.97      0.90       117
          2       0.96      0.71      0.82        38
          3       0.00      0.00      0.00        10
          4       1.00      0.25      0.40        24
          5       1.00      0.77      0.87   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.75      1.00      0.86        15
          1       1.00      0.62      0.76        13

avg / total       0.87      0.82      0.81        28

[15  0  5  8]
LR Accuracy:  0.8214285714285714
LR F1:  0.8095238095238095
For name:  b_kang
total sample size before apply threshold:  20
Counter({'0000-0001-5902-0549': 10, '0000-0001-6946-2279': 5, '0000-0003-2637-4695': 2, '0000-0003-0901-4903': 1, '0000-0002-4299-2170': 1, '0000-0002-1690-7753': 1})
['0000-0001-5902-0549']
Total sample size after apply threshold:  10
For name:  s_carter
total sample size before apply threshold:  205
Counter({'0000-0002-3585-9400': 124, '0000-0003-2617-8694': 44, '0000-0002-9080-519X': 15, '0000-0002-4670-0884': 12, '0000-0002-9817-0029': 5, '0000-0002-3619-8640': 2, '0000-0002-8169-4483': 2, '0000-0002-2907-9651': 1})
['0000-0002-3585-9400', '0000-0002-4670-0884', '0000-0003-2617-8694', '0000-0002-9080-519X']
Total sample size after appl

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.40      0.57        15
          1       0.74      1.00      0.85        56
          2       1.00      0.27      0.42        15

avg / total       0.83      0.77      0.73        86

[ 6  9  0  0 56  0  0 11  4]
svc Accuracy:  0.7674418604651163
svc F1:  0.6136553504974557
             precision    recall  f1-score   support

          0       1.00      0.07      0.12        15
          1       0.66      1.00      0.79        56
          2       0.00      0.00      0.00        15

avg / total       0.60      0.66      0.54        86

[ 1 14  0  0 56  0  0 15  0]
LR Accuracy:  0.6627906976744186
LR F1:  0.3064420803782506
For name:  m_gutierrez
total sample size before apply threshold:  32
Counter({'0000-0003-3199-0337': 30, '0000-0003-0964-6222': 2})
['0000-0003-3199-0337']
Total sample size after apply threshold:  30
For name:  s_moon
total sample size before apply threshold:  85
Counter({'0000-0001

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.50      0.67        14
          1       1.00      0.86      0.92        14
          2       0.96      0.86      0.91        29
          3       0.95      0.95      0.95        74
          4       0.73      0.95      0.82        39

avg / total       0.91      0.89      0.89       170

[ 7  0  0  1  6  0 12  0  1  1  0  0 25  1  3  0  0  0 70  4  0  0  1  1
 37]
svc Accuracy:  0.888235294117647
svc F1:  0.8534005334005335
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        14
          1       1.00      0.79      0.88        14
          2       0.96      0.83      0.89        29
          3       0.83      0.96      0.89        74
          4       0.76      0.79      0.77        39

avg / total       0.86      0.85      0.84       170

[ 7  0  0  3  4  0 11  0  2  1  0  0 24  3  2  0  0  0 71  3  0  0  1  7
 31]
LR Accuracy:  0.8470588235294118
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.70      0.83        27
          1       0.89      1.00      0.94        70
          2       1.00      0.94      0.97        17

avg / total       0.93      0.92      0.92       114

[19  8  0  0 70  0  0  1 16]
svc Accuracy:  0.9210526315789473
svc F1:  0.9117937472183169
             precision    recall  f1-score   support

          0       1.00      0.07      0.14        27
          1       0.72      1.00      0.84        70
          2       1.00      0.88      0.94        17

avg / total       0.83      0.76      0.69       114

[ 2 25  0  0 70  0  0  2 15]
LR Accuracy:  0.7631578947368421
LR F1:  0.6379181292587239
For name:  j_conde
total sample size before apply threshold:  84
Counter({'0000-0001-8422-6792': 35, '0000-0002-2187-479X': 29, '0000-0002-5677-3024': 19, '0000-0001-8739-6893': 1})
['0000-0001-8422-6792', '0000-0002-5677-3024', '0000-0002-2187-479X']
Total sample size after apply th

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.95      1.00      0.97        35
          1       1.00      0.89      0.94        19
          2       1.00      1.00      1.00        29

avg / total       0.98      0.98      0.98        83

[35  0  0  2 17  0  0  0 29]
svc Accuracy:  0.9759036144578314
svc F1:  0.9722222222222222
             precision    recall  f1-score   support

          0       0.95      1.00      0.97        35
          1       1.00      0.89      0.94        19
          2       1.00      1.00      1.00        29

avg / total       0.98      0.98      0.98        83

[35  0  0  2 17  0  0  0 29]
LR Accuracy:  0.9759036144578314
LR F1:  0.9722222222222222
For name:  k_wright
total sample size before apply threshold:  59
Counter({'0000-0003-0040-9247': 18, '0000-0002-9020-1572': 15, '0000-0003-3865-9743': 12, '0000-0002-0387-3048': 7, '0000-0001-6202-1737': 6, '0000-0003-0700-6010': 1})
['0000-0003-0040-9247', '0000-0002-9020-1572', '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.93      1.00      0.96       232
          1       1.00      0.55      0.71        11
          2       1.00      0.62      0.76        13
          3       1.00      0.30      0.46        10

avg / total       0.94      0.94      0.93       266

[232   0   0   0   5   6   0   0   5   0   8   0   7   0   0   3]
svc Accuracy:  0.9360902255639098
svc F1:  0.7234956352603412
             precision    recall  f1-score   support

          0       0.87      1.00      0.93       232
          1       0.00      0.00      0.00        11
          2       0.00      0.00      0.00        13
          3       0.00      0.00      0.00        10

avg / total       0.76      0.87      0.81       266

[232   0   0   0  11   0   0   0  13   0   0   0  10   0   0   0]
LR Accuracy:  0.8721804511278195
LR F1:  0.2329317269076305
For name:  h_huang
total sample size before apply threshold:  224
Counter({'0000-0002-3386-0934': 87, '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.88      0.78      0.82        18
          1       0.80      0.83      0.82        24
          2       1.00      0.79      0.88        14
          3       1.00      0.88      0.93        16
          4       0.89      0.98      0.93        87
          5       0.85      0.69      0.76        16

avg / total       0.89      0.89      0.88       175

[14  3  0  0  1  0  2 20  0  0  2  0  0  1 11  0  2  0  0  0  0 14  1  1
  0  1  0  0 85  1  0  0  0  0  5 11]
svc Accuracy:  0.8857142857142857
svc F1:  0.856795285666556
             precision    recall  f1-score   support

          0       0.92      0.67      0.77        18
          1       1.00      0.42      0.59        24
          2       1.00      0.57      0.73        14
          3       1.00      0.56      0.72        16
          4       0.67      1.00      0.81        87
          5       1.00      0.38      0.55        16

avg / total       0.83      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.10      0.18        20
          1       0.66      1.00      0.80        35

avg / total       0.78      0.67      0.57        55

[ 2 18  0 35]
LR Accuracy:  0.6727272727272727
LR F1:  0.48863636363636365
For name:  y_xu
total sample size before apply threshold:  137
Counter({'0000-0002-2195-1695': 47, '0000-0002-6689-7768': 19, '0000-0002-6406-7832': 17, '0000-0001-6643-3173': 9, '0000-0002-0763-9953': 8, '0000-0002-4479-6157': 8, '0000-0001-7429-4724': 5, '0000-0002-5578-4960': 4, '0000-0002-1887-0632': 4, '0000-0002-9834-3006': 3, '0000-0002-9945-3514': 3, '0000-0001-8488-0399': 2, '0000-0001-9106-0049': 1, '0000-0003-4549-6110': 1, '0000-0002-2341-7971': 1, '0000-0003-4420-6353': 1, '0000-0002-7963-6890': 1, '0000-0002-7962-6668': 1, '0000-0003-1355-0055': 1, '0000-0002-1563-8811': 1})
['0000-0002-6406-7832', '0000-0002-2195-1695', '0000-0002-6689-7768']
Total sample size after apply threshold:  83

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.95      0.98        22
          1       0.97      1.00      0.99       216
          2       1.00      0.55      0.71        11

avg / total       0.98      0.98      0.97       249

[ 21   1   0   0 216   0   0   5   6]
MNB Accuracy:  0.9759036144578314
MNB F1:  0.8896426362835673
             precision    recall  f1-score   support

          0       1.00      0.86      0.93        22
          1       0.99      1.00      0.99       216
          2       1.00      1.00      1.00        11

avg / total       0.99      0.99      0.99       249

[ 19   3   0   0 216   0   0   0  11]
svc Accuracy:  0.9879518072289156
svc F1:  0.9733109055228484
             precision    recall  f1-score   support

          0       1.00      0.32      0.48        22
          1       0.90      1.00      0.95       216
          2       1.00      0.27      0.43        11

avg / total       0.92      0.91      0.89       2

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.92      1.00      0.96       110
          1       1.00      0.89      0.94        57
          2       1.00      0.60      0.75        10

avg / total       0.95      0.94      0.94       177

[110   0   0   6  51   0   4   0   6]
svc Accuracy:  0.943502824858757
svc F1:  0.8836553945249598
             precision    recall  f1-score   support

          0       0.87      1.00      0.93       110
          1       1.00      0.82      0.90        57
          2       1.00      0.30      0.46        10

avg / total       0.92      0.90      0.89       177

[110   0   0  10  47   0   7   0   3]
LR Accuracy:  0.903954802259887
LR F1:  0.7645515525262362
For name:  p_robinson
total sample size before apply threshold:  275
Counter({'0000-0002-7878-0313': 133, '0000-0002-0736-9199': 119, '0000-0002-3156-3418': 19, '0000-0002-0577-3147': 4})
['0000-0002-0736-9199', '0000-0002-7878-0313', '0000-0002-3156-3418']
Total samp

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.95      0.97       119
          1       0.90      1.00      0.95       133
          2       1.00      0.53      0.69        19

avg / total       0.95      0.94      0.94       271

[113   6   0   0 133   0   0   9  10]
svc Accuracy:  0.9446494464944649
svc F1:  0.8701374401767089
             precision    recall  f1-score   support

          0       1.00      0.91      0.95       119
          1       0.83      1.00      0.91       133
          2       1.00      0.16      0.27        19

avg / total       0.92      0.90      0.88       271

[108  11   0   0 133   0   0  16   3]
LR Accuracy:  0.9003690036900369
LR F1:  0.7107063174330243
For name:  c_zou
total sample size before apply threshold:  32
Counter({'0000-0003-2484-7292': 22, '0000-0001-8569-3747': 8, '0000-0003-4305-5055': 1, '0000-0002-9712-4282': 1})
['0000-0003-2484-7292']
Total sample size after apply threshold:  22
For name:  s_rana
t

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.45      0.62        11
          1       0.75      1.00      0.86        18

avg / total       0.84      0.79      0.77        29

[ 5  6  0 18]
LR Accuracy:  0.7931034482758621
LR F1:  0.7410714285714286
For name:  s_jeong
total sample size before apply threshold:  93
Counter({'0000-0001-6178-8338': 33, '0000-0002-1958-8436': 21, '0000-0002-6376-7001': 13, '0000-0002-6480-7685': 7, '0000-0002-9084-5183': 6, '0000-0001-8995-3497': 5, '0000-0002-8370-3566': 1, '0000-0002-4004-3510': 1, '0000-0001-9175-9642': 1, '0000-0001-9197-1184': 1, '0000-0002-9868-621X': 1, '0000-0002-3309-0693': 1, '0000-0001-9575-0354': 1, '0000-0001-9588-1928': 1})
['0000-0002-6376-7001', '0000-0002-1958-8436', '0000-0001-6178-8338']
Total sample size after apply threshold:  67
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowe

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.74      0.85        35
          1       0.95      1.00      0.97       167

avg / total       0.96      0.96      0.95       202

[ 26   9   0 167]
svc Accuracy:  0.9554455445544554
svc F1:  0.9131099746690245
             precision    recall  f1-score   support

          0       1.00      0.11      0.21        35
          1       0.84      1.00      0.92       167

avg / total       0.87      0.85      0.79       202

[  4  31   0 167]
LR Accuracy:  0.8465346534653465
LR F1:  0.560098349139445
For name:  m_reilly
total sample size before apply threshold:  20
Counter({'0000-0001-8029-0084': 17, '0000-0002-5526-8245': 1, '0000-0001-8746-3224': 1, '0000-0003-2506-3190': 1})
['0000-0001-8029-0084']
Total sample size after apply threshold:  17
For name:  d_nguyen
total sample size before apply threshold:  25
Counter({'0000-0002-4997-555X': 8, '0000-0002-3283-3504': 7, '0000-0001-6420-7308': 3, '0000-0002

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        33
          1       0.88      0.64      0.74        11
          2       0.92      0.85      0.88        13
          3       0.95      0.90      0.92        20
          4       0.74      1.00      0.85        29
          5       1.00      0.81      0.90        16
          6       1.00      0.85      0.92        13

avg / total       0.92      0.90      0.90       135

[33  0  0  0  0  0  0  0  7  1  1  2  0  0  0  0 11  0  2  0  0  0  1  0
 18  1  0  0  0  0  0  0 29  0  0  0  0  0  0  3 13  0  0  0  0  0  2  0
 11]
svc Accuracy:  0.9037037037037037
svc F1:  0.8865826565164666
             precision    recall  f1-score   support

          0       0.97      1.00      0.99        33
          1       1.00      0.64      0.78        11
          2       0.89      0.62      0.73        13
          3       1.00      0.90      0.95        20
          4       0.64      0.97      0.7

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.84      1.00      0.92       125
          1       1.00      0.20      0.33        15
          2       1.00      0.94      0.97        18
          3       1.00      0.92      0.96        12
          4       0.86      0.75      0.80        16
          5       1.00      0.61      0.76        18

avg / total       0.89      0.88      0.86       204

[125   0   0   0   0   0  12   3   0   0   0   0   1   0  17   0   0   0
   1   0   0  11   0   0   4   0   0   0  12   0   5   0   0   0   2  11]
svc Accuracy:  0.8774509803921569
svc F1:  0.7892758748830713
             precision    recall  f1-score   support

          0       0.71      1.00      0.83       125
          1       0.00      0.00      0.00        15
          2       1.00      0.72      0.84        18
          3       1.00      0.58      0.74        12
          4       1.00      0.31      0.48        16
          5       1.00      0.22      0.36   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.96      0.98        28
          1       0.96      1.00      0.98        24

avg / total       0.98      0.98      0.98        52

[27  1  0 24]
LR Accuracy:  0.9807692307692307
LR F1:  0.9807050092764378
For name:  d_collins
total sample size before apply threshold:  31
Counter({'0000-0001-6754-9290': 8, '0000-0002-6248-9644': 7, '0000-0002-3283-0733': 6, '0000-0003-2274-0889': 5, '0000-0003-2484-1640': 2, '0000-0002-8432-7021': 1, '0000-0001-8891-1893': 1, '0000-0002-7981-3586': 1})
[]
Total sample size after apply threshold:  0
For name:  l_davies
total sample size before apply threshold:  96
Counter({'0000-0001-8801-3559': 62, '0000-0002-0451-8670': 19, '0000-0002-4876-6270': 11, '0000-0002-2986-705X': 4})
['0000-0001-8801-3559', '0000-0002-4876-6270', '0000-0002-0451-8670']
Total sample size after apply threshold:  92
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dty

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.36      0.53        22
          1       0.88      1.00      0.94       104

avg / total       0.90      0.89      0.87       126

[  8  14   0 104]
svc Accuracy:  0.8888888888888888
svc F1:  0.7351351351351352
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        22
          1       0.83      1.00      0.90       104

avg / total       0.68      0.83      0.75       126

[  0  22   0 104]
LR Accuracy:  0.8253968253968254
LR F1:  0.45217391304347826
For name:  a_fontana
total sample size before apply threshold:  203
Counter({'0000-0002-6660-5315': 65, '0000-0002-5453-461X': 59, '0000-0002-5391-7520': 44, '0000-0002-8481-1219': 16, '0000-0002-4791-8746': 14, '0000-0003-3820-2823': 3, '0000-0003-1556-2770': 2})
['0000-0002-5391-7520', '0000-0002-5453-461X', '0000-0002-4791-8746', '0000-0002-6660-5315', '0000-0002-8481-1219']
Total sample size after apply 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.95      0.98        44
          1       0.88      0.98      0.93        59
          2       1.00      0.64      0.78        14
          3       0.98      0.98      0.98        65
          4       1.00      1.00      1.00        16

avg / total       0.96      0.95      0.95       198

[42  2  0  0  0  0 58  0  1  0  0  5  9  0  0  0  1  0 64  0  0  0  0  0
 16]
svc Accuracy:  0.9545454545454546
svc F1:  0.934393653262814
             precision    recall  f1-score   support

          0       0.97      0.89      0.93        44
          1       0.91      0.85      0.88        59
          2       1.00      0.50      0.67        14
          3       0.77      1.00      0.87        65
          4       1.00      0.75      0.86        16

avg / total       0.89      0.87      0.87       198

[39  1  0  4  0  1 50  0  8  0  0  3  7  4  0  0  0  0 65  0  0  1  0  3
 12]
LR Accuracy:  0.8737373737373737
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.68      0.84      0.75        32
          1       0.00      0.00      0.00        11
          2       0.63      0.84      0.72        32
          3       0.83      1.00      0.91       179
          4       1.00      0.45      0.62        20
          5       1.00      0.21      0.34        24
          6       0.90      0.43      0.58        21
          7       0.90      0.82      0.86        34

avg / total       0.81      0.80      0.77       353

[ 27   0   3   1   0   0   1   0   0   0   1  10   0   0   0   0   3   0
  27   2   0   0   0   0   0   0   0 179   0   0   0   0   0   0   0   8
   9   0   0   3   4   0   6   9   0   5   0   0   5   0   5   2   0   0
   9   0   1   0   1   4   0   0   0  28]
MNB Accuracy:  0.8045325779036827
MNB F1:  0.5982912882290575
             precision    recall  f1-score   support

          0       0.82      0.88      0.85        32
          1       1.00      0.55     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.64      0.78        11
          1       0.80      1.00      0.89        43
          2       1.00      0.36      0.53        11

avg / total       0.87      0.83      0.81        65

[ 7  4  0  0 43  0  0  7  4]
svc Accuracy:  0.8307692307692308
svc F1:  0.7325696830851469
             precision    recall  f1-score   support

          0       1.00      0.36      0.53        11
          1       0.70      1.00      0.83        43
          2       0.00      0.00      0.00        11

avg / total       0.64      0.72      0.64        65

[ 4  7  0  0 43  0  0 11  0]
LR Accuracy:  0.7230769230769231
LR F1:  0.4534188034188034
For name:  t_smith
total sample size before apply threshold:  603
Counter({'0000-0002-3650-9381': 154, '0000-0003-1673-2954': 113, '0000-0002-2120-2766': 85, '0000-0002-6279-9685': 84, '0000-0003-3528-6793': 65, '0000-0003-4453-9713': 32, '0000-0002-5197-5030': 26, '0000-0002-3945-63

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       1.00      0.09      0.17        32
          2       0.92      0.71      0.80        65
          3       0.93      0.68      0.79        84
          4       0.88      0.93      0.90       113
          5       0.66      1.00      0.79       154
          6       0.96      0.92      0.94        85
          7       1.00      0.73      0.84        26

avg / total       0.84      0.81      0.79       569

[  0   0   0   1   2   7   0   0   0   3   2   1   9  17   0   0   0   0
  46   1   2  13   3   0   0   0   2  57   2  23   0   0   0   0   0   0
 105   8   0   0   0   0   0   0   0 154   0   0   0   0   0   0   0   7
  78   0   0   0   0   1   0   6   0  19]
MNB Accuracy:  0.81195079086116
MNB F1:  0.654362535142212
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       1.00      0.50      0.

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.46      0.63        13
          1       1.00      1.00      1.00        10
          2       0.72      1.00      0.84        18

avg / total       0.88      0.83      0.81        41

[ 6  0  7  0 10  0  0  0 18]
MNB Accuracy:  0.8292682926829268
MNB F1:  0.8229294165646674
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        13
          1       1.00      1.00      1.00        10
          2       0.95      1.00      0.97        18

avg / total       0.98      0.98      0.98        41

[12  0  1  0 10  0  0  0 18]
svc Accuracy:  0.975609756097561
svc F1:  0.9776576576576576
             precision    recall  f1-score   support

          0       1.00      0.46      0.63        13
          1       1.00      1.00      1.00        10
          2       0.72      1.00      0.84        18

avg / total       0.88      0.83      0.81        41

[ 6  0  7  0 10

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



Total sample size after apply threshold:  15
For name:  t_wong
total sample size before apply threshold:  14
Counter({'0000-0002-1045-2698': 9, '0000-0002-5752-7917': 2, '0000-0001-9234-4529': 1, '0000-0001-6187-8851': 1, '0000-0001-8611-4911': 1})
[]
Total sample size after apply threshold:  0
For name:  s_ross
total sample size before apply threshold:  25
Counter({'0000-0002-2302-8415': 17, '0000-0001-7305-3451': 3, '0000-0002-3094-3769': 2, '0000-0003-3512-9579': 1, '0000-0001-5676-4489': 1, '0000-0001-5523-2376': 1})
['0000-0002-2302-8415']
Total sample size after apply threshold:  17
For name:  d_richardson
total sample size before apply threshold:  456
Counter({'0000-0003-0960-6415': 231, '0000-0002-7751-1058': 167, '0000-0002-3992-8610': 22, '0000-0003-0247-9118': 17, '0000-0002-3189-2190': 12, '0000-0002-0054-6850': 7})
['0000-0002-3189-2190', '0000-0003-0960-6415', '0000-0002-7751-1058', '0000-0002-3992-8610', '0000-0003-0247-9118']
Total sample size after apply threshold:  4

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.86      1.00      0.93       231
          2       0.98      0.95      0.96       167
          3       0.92      0.50      0.65        22
          4       1.00      0.47      0.64        17

avg / total       0.89      0.91      0.89       449

[  0  11   1   0   0   0 230   1   0   0   0   8 159   0   0   0  11   0
  11   0   0   6   2   1   8]
MNB Accuracy:  0.9086859688195991
MNB F1:  0.6352497014170584
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        12
          1       0.89      1.00      0.94       231
          2       0.99      0.95      0.97       167
          3       1.00      0.73      0.84        22
          4       1.00      0.53      0.69        17

avg / total       0.94      0.94      0.93       449

[  6   6   0   0   0   0 231   0   0   0   0   9 158   0   0   0   6   0
  16   0   0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.87      1.00      0.93        26
          1       1.00      0.79      0.88        14
          2       0.85      0.81      0.83        21

avg / total       0.89      0.89      0.88        61

[26  0  0  0 11  3  4  0 17]
MNB Accuracy:  0.8852459016393442
MNB F1:  0.8792799070847851
             precision    recall  f1-score   support

          0       0.87      1.00      0.93        26
          1       1.00      0.71      0.83        14
          2       0.86      0.86      0.86        21

avg / total       0.89      0.89      0.88        61

[26  0  0  1 10  3  3  0 18]
svc Accuracy:  0.8852459016393442
svc F1:  0.873015873015873
             precision    recall  f1-score   support

          0       0.74      1.00      0.85        26
          1       1.00      0.71      0.83        14
          2       1.00      0.76      0.86        21

avg / total       0.89      0.85      0.85        61

[26  0  0  4 10

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.84      1.00      0.91        85
          1       1.00      0.70      0.82        10
          2       1.00      0.68      0.81        22
          3       1.00      0.40      0.57        10

avg / total       0.89      0.87      0.86       127

[85  0  0  0  3  7  0  0  7  0 15  0  6  0  0  4]
svc Accuracy:  0.8740157480314961
svc F1:  0.779936822156936
             precision    recall  f1-score   support

          0       0.71      1.00      0.83        85
          1       0.00      0.00      0.00        10
          2       1.00      0.36      0.53        22
          3       0.00      0.00      0.00        10

avg / total       0.65      0.73      0.65       127

[85  0  0  0 10  0  0  0 14  0  8  0 10  0  0  0]
LR Accuracy:  0.7322834645669292
LR F1:  0.3416666666666667
For name:  a_lin
total sample size before apply threshold:  46
Counter({'0000-0003-4236-7233': 27, '0000-0001-6310-9765': 10, '0000-0001-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  70
Counter({'0000-0001-6998-5686': 48, '0000-0001-5807-5820': 11, '0000-0003-3957-6288': 4, '0000-0003-4964-2197': 2, '0000-0002-9066-6935': 2, '0000-0003-4872-0632': 2, '0000-0002-7297-9639': 1})
['0000-0001-5807-5820', '0000-0001-6998-5686']
Total sample size after apply threshold:  59
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(59, 586)
(0, 0)
(0, 0)
1
59
             precision    recall  f1-score   support

          0       0.89      0.73      0.80        11
          1       0.94      0.98      0.96        48

avg / total       0.93      0.93  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


[ 8  3  1 47]
MNB Accuracy:  0.9322033898305084
MNB F1:  0.8795918367346938
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        11
          1       1.00      1.00      1.00        48

avg / total       1.00      1.00      1.00        59

[11  0  0 48]
svc Accuracy:  1.0
svc F1:  1.0
             precision    recall  f1-score   support

          0       1.00      0.18      0.31        11
          1       0.84      1.00      0.91        48

avg / total       0.87      0.85      0.80        59

[ 2  9  0 48]
LR Accuracy:  0.847457627118644
LR F1:  0.610989010989011
For name:  h_vogel
total sample size before apply threshold:  15
Counter({'0000-0001-9821-7731': 5, '0000-0002-9902-8120': 4, '0000-0003-2404-9485': 4, '0000-0003-0072-4239': 2})
[]
Total sample size after apply threshold:  0
For name:  m_campos
total sample size before apply threshold:  148
Counter({'0000-0001-7738-9892': 107, '0000-0003-3217-9001': 12, '0000-0003-4313-706

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      1.00      0.97       107
          1       1.00      0.42      0.59        12

avg / total       0.94      0.94      0.93       119

[107   0   7   5]
LR Accuracy:  0.9411764705882353
LR F1:  0.7782805429864253
For name:  d_stewart
total sample size before apply threshold:  294
Counter({'0000-0002-8157-7746': 210, '0000-0001-7360-8592': 77, '0000-0002-6764-4842': 3, '0000-0002-8499-7105': 1, '0000-0002-4087-5544': 1, '0000-0001-5144-1234': 1, '0000-0002-3690-9844': 1})
['0000-0001-7360-8592', '0000-0002-8157-7746']
Total sample size after apply threshold:  287
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(287, 519)
(0, 0)
(0, 0)
1
287
             precision    recall  f1-score   support

          0       0.97      0.92      0.95        77
          1       0.97      0.99      0.98       210

avg / total       0.97      0.97      0.97       287

[ 71   6   2 208]
MNB Accuracy:  0.9721254355400697
MNB F1:  0.9638993710691823
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        77
          1       0.96      1.00      0.98       210

avg / total       0.97      0.97      0.97       287

[ 69   8   0 210]
svc Accuracy:  0.9721254355400697
svc F1:  0.963256945333504
             precision    recall  f1-score   support

          0       1.00      0.57      0.73        77
          1       0.86      1.00      0.93       210

avg / total       0.90      0.89      0.87       287

[ 44  33   0 210]
LR Accuracy:  0.8850174216027874
LR F1:  0.827212522576761
For name:  j_abrantes
total sample size before apply threshold:  57
Counter({'0000-0002-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



Total sample size after apply threshold:  103
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(103, 412)
(0, 0)
(0, 0)
1
103
             precision    recall  f1-score   support

          0       0.80      0.22      0.35        18
          1       1.00      0.10      0.18        10
          2       0.00      0.00      0.00        10
          3       0.66      0.98      0.79        65

avg / total       0.65      0.67      0.58       103

[ 4  0  0 14  0  1  0  9  0  0  0 10  1  0  0 64]
MNB Accuracy:  0.6699029126213593
MNB F1:  0.32994193139120676
             precision    recall  f1-score  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.75      0.86        36
          1       0.95      1.00      0.97       155

avg / total       0.96      0.95      0.95       191

[ 27   9   0 155]
svc Accuracy:  0.9528795811518325
svc F1:  0.9144648454993283
             precision    recall  f1-score   support

          0       1.00      0.31      0.47        36
          1       0.86      1.00      0.93       155

avg / total       0.89      0.87      0.84       191

[ 11  25   0 155]
LR Accuracy:  0.8691099476439791
LR F1:  0.6967291203556685
For name:  f_campos
total sample size before apply threshold:  49
Counter({'0000-0001-8376-0977': 14, '0000-0002-5948-472X': 12, '0000-0002-1132-3257': 10, '0000-0001-8332-5043': 9, '0000-0001-9826-751X': 2, '0000-0001-5828-2862': 2})
['0000-0001-8376-0977', '0000-0002-5948-472X', '0000-0002-1132-3257']
Total sample size after apply threshold:  36
TfidfVectorizer(analyzer='word', binary=False, decode_error='s

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(418, 1043)
(0, 0)
(0, 0)
1
418
             precision    recall  f1-score   support

          0       1.00      0.10      0.18        20
          1       0.73      1.00      0.84       188
          2       1.00      0.30      0.47        23
          3       1.00      0.91      0.95        65
          4       1.00      0.85      0.92        98
          5       1.00      0.33      0.50        24

avg / total       0.88      0.83      0.80       418

[  2  18   0   0   0   0   0 188   0   0   0   0   0  16   7   0   0   0
   0   6   0  59   0   0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.93      0.50      0.65        28
          1       1.00      0.78      0.88        23
          2       1.00      0.20      0.33        15
          3       0.56      0.98      0.71        41

avg / total       0.82      0.70      0.68       107

[14  0  0 14  0 18  0  5  0  0  3 12  1  0  0 40]
MNB Accuracy:  0.7009345794392523
MNB F1:  0.6442076547011317
             precision    recall  f1-score   support

          0       0.95      0.75      0.84        28
          1       1.00      0.91      0.95        23
          2       1.00      0.27      0.42        15
          3       0.67      0.98      0.79        41

avg / total       0.86      0.80      0.79       107

[21  0  0  7  0 21  0  2  0  0  4 11  1  0  0 40]
svc Accuracy:  0.8037383177570093
svc F1:  0.7519193235112984
             precision    recall  f1-score   support

          0       1.00      0.46      0.63        28
          1       1.00     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



(139, 750)
(0, 0)
(0, 0)
1
139
             precision    recall  f1-score   support

          0       0.92      0.89      0.90        64
          1       0.84      1.00      0.92        65
          2       0.00      0.00      0.00        10

avg / total       0.82      0.88      0.84       139

[57  7  0  0 65  0  5  5  0]
MNB Accuracy:  0.8776978417266187
MNB F1:  0.6067516208361279
             precision    recall  f1-score   support

          0       0.85      1.00      0.92        64
          1       1.00      0.97      0.98        65
          2       1.00      0.10      0.18        10

avg / total       0.93      0.92      0.90       139

[64  0  0  2 63  0  9  0  1]
svc Accuracy:  0.920863309352518
svc F1:  0.6956854970568999
             precision    recall  f1-score   support

          0       0.84      1.00      0.91        64
          1       1.00      0.97      0.98        65
          2       0.00      0.00      0.00        10

avg / total       0.86      0.91     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.68      0.81        22
          1       1.00      0.68      0.81        44
          2       1.00      0.71      0.83        14
          3       0.76      1.00      0.86        78

avg / total       0.88      0.84      0.84       158

[15  0  0  7  0 30  0 14  0  0 10  4  0  0  0 78]
svc Accuracy:  0.8417721518987342
svc F1:  0.8292083519984073
             precision    recall  f1-score   support

          0       1.00      0.32      0.48        22
          1       1.00      0.55      0.71        44
          2       1.00      0.07      0.13        14
          3       0.62      1.00      0.76        78

avg / total       0.81      0.70      0.65       158

[ 7  0  0 15  0 24  0 20  0  0  1 13  0  0  0 78]
LR Accuracy:  0.6962025316455697
LR F1:  0.5216700473292766
For name:  m_king
total sample size before apply threshold:  58
Counter({'0000-0002-2587-9117': 26, '0000-0001-6030-5154': 13, '0000-000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 27
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(27, 105)
(0, 0)
(0, 0)
1
27
             precision    recall  f1-score   support

          0       1.00      0.86      0.92        14
          1       0.87      1.00      0.93        13

avg / total       0.94      0.93      0.93        27

[12  2  0 13]
MNB Accuracy:  0.9259259259259259
MNB F1:  0.9258241758241759
             precision    recall  f1-score   support

          0       0.93      1.00      0.97        14
          1       1.00      0.92      0.96        13

avg / total       0.97      0.96      0.96        27

[14  0  1 12]
svc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.31      0.48        16
          1       0.96      1.00      0.98       279

avg / total       0.96      0.96      0.95       295

[  5  11   0 279]
MNB Accuracy:  0.9627118644067797
MNB F1:  0.7284291572516528
             precision    recall  f1-score   support

          0       1.00      0.31      0.48        16
          1       0.96      1.00      0.98       279

avg / total       0.96      0.96      0.95       295

[  5  11   0 279]
svc Accuracy:  0.9627118644067797
svc F1:  0.7284291572516528


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        16
          1       0.95      1.00      0.97       279

avg / total       0.89      0.95      0.92       295

[  0  16   0 279]
LR Accuracy:  0.9457627118644067
LR F1:  0.48606271777003485
For name:  j_cooper
total sample size before apply threshold:  147
Counter({'0000-0003-1339-4750': 85, '0000-0001-6009-3542': 24, '0000-0001-8163-2306': 19, '0000-0002-9014-4395': 14, '0000-0002-8626-7827': 4, '0000-0002-4932-1740': 1})
['0000-0002-9014-4395', '0000-0001-6009-3542', '0000-0001-8163-2306', '0000-0003-1339-4750']
Total sample size after apply threshold:  142
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sub

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.71      0.83        17
          1       0.91      1.00      0.95        81
          2       1.00      0.83      0.91        18

avg / total       0.94      0.93      0.93       116

[12  5  0  0 81  0  0  3 15]
svc Accuracy:  0.9310344827586207
svc F1:  0.8965394308193497
             precision    recall  f1-score   support

          0       1.00      0.06      0.11        17
          1       0.72      1.00      0.84        81
          2       1.00      0.17      0.29        18

avg / total       0.81      0.73      0.65       116

[ 1 16  0  0 81  0  0 15  3]
LR Accuracy:  0.7327586206896551
LR F1:  0.412067878389122
For name:  s_hussein
total sample size before apply threshold:  33
Counter({'0000-0002-7946-0717': 18, '0000-0002-6305-508X': 9, '0000-0003-3657-7410': 4, '0000-0002-5394-4385': 1, '0000-0002-0139-1483': 1})
['0000-0002-7946-0717']
Total sample size after apply threshold:  18
For name

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



[18  0  0  0 14  0  0  0 22]
LR Accuracy:  1.0
LR F1:  1.0
For name:  f_zhang
total sample size before apply threshold:  103
Counter({'0000-0001-6035-4829': 27, '0000-0001-7434-7339': 23, '0000-0002-0480-7501': 11, '0000-0001-9542-6634': 10, '0000-0003-1298-9795': 9, '0000-0002-1371-266X': 7, '0000-0002-1957-0543': 5, '0000-0002-2822-2049': 4, '0000-0002-9309-9577': 2, '0000-0003-1709-7788': 2, '0000-0001-7550-9483': 1, '0000-0002-8438-7155': 1, '0000-0003-2829-0735': 1})
['0000-0001-7434-7339', '0000-0002-0480-7501', '0000-0001-6035-4829', '0000-0001-9542-6634']
Total sample size after apply threshold:  71
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.7171717171717171
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        13
          1       0.97      1.00      0.98        29

avg / total       0.98      0.98      0.98        42

[12  1  0 29]
svc Accuracy:  0.9761904761904762
svc F1:  0.9715254237288136
             precision    recall  f1-score   support

          0       1.00      0.08      0.14        13
          1       0.71      1.00      0.83        29

avg / total       0.80      0.71      0.62        42

[ 1 12  0 29]
LR Accuracy:  0.7142857142857143
LR F1:  0.4857142857142857
For name:  a_palma
total sample size before apply threshold:  61
Counter({'0000-0003-2099-1297': 34, '0000-0002-8530-4913': 13, '0000-0002-5971-3676': 8, '0000-0003-0420-1785': 3, '0000-0002-1682-7032': 2, '0000-0002-7263-4868': 1})
['0000-0002-8530-4913', '0000-0003-2099-1297']
Total sample size after apply threshold:  47
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       1.00      1.00      1.00        34

avg / total       1.00      1.00      1.00        47

[13  0  0 34]
MNB Accuracy:  1.0
MNB F1:  1.0
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       1.00      1.00      1.00        34

avg / total       1.00      1.00      1.00        47

[13  0  0 34]
svc Accuracy:  1.0
svc F1:  1.0
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        13
          1       0.97      1.00      0.99        34

avg / total       0.98      0.98      0.98        47

[12  1  0 34]
LR Accuracy:  0.9787234042553191
LR F1:  0.9727536231884057
For name:  e_shaw
total sample size before apply threshold:  16
Counter({'0000-0003-1424-7568': 9, '0000-0002-5653-0145': 4, '0000-0002-4148-3526': 2, '0000-0002-4334-1900': 1})
[]
Total sa

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(36, 101)
(0, 0)
(0, 0)
1
36
             precision    recall  f1-score   support

          0       0.95      1.00      0.97        18
          1       1.00      0.94      0.97        18

avg / total       0.97      0.97      0.97        36

[18  0  1 17]
MNB Accuracy:  0.9722222222222222
MNB F1:  0.9722007722007722
             precision    recall  f1-score   support

          0       0.85      0.94      0.89        18
          1       0.94      0.83      0.88        18

avg / total       0.89      0.89      0.89        36

[17  1  3 15]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(55, 176)
(0, 0)
(0, 0)
1
55
             precision    recall  f1-score   support

          0       1.00      0.81      0.90        16
          1       0.79      1.00      0.88        23
          2       1.00      0.81      0.90        16

avg / total       0.91      0.89      0.89        55

[13  3  0  0 23  0  0  3 13]
MNB Accuracy:  0.8909090909090909
MNB F1:  0.8925729442970822
             precision    recall  f1-score   support

          0       1.00      0.81      0.90        16
          1       0.85      1.00      0.92        23
        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.86      0.60      0.71        10
          1       0.76      0.93      0.84        14

avg / total       0.80      0.79      0.78        24

[ 6  4  1 13]
MNB Accuracy:  0.7916666666666666
MNB F1:  0.7722960151802656
             precision    recall  f1-score   support

          0       1.00      0.60      0.75        10
          1       0.78      1.00      0.88        14

avg / total       0.87      0.83      0.82        24

[ 6  4  0 14]
svc Accuracy:  0.8333333333333334
svc F1:  0.8125
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        10
          1       0.74      1.00      0.85        14

avg / total       0.85      0.79      0.77        24

[ 5  5  0 14]
LR Accuracy:  0.7916666666666666
LR F1:  0.7575757575757576
For name:  l_simon
total sample size before apply threshold:  14
Counter({'0000-0003-4321-8539': 7, '0000-0003-4870-1052': 4, '0000-0002-5010

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.96      0.96      0.96        23
          1       0.96      0.96      0.96        23

avg / total       0.96      0.96      0.96        46

[22  1  1 22]
MNB Accuracy:  0.9565217391304348
MNB F1:  0.9565217391304348
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        23
          1       0.92      1.00      0.96        23

avg / total       0.96      0.96      0.96        46

[21  2  0 23]
svc Accuracy:  0.9565217391304348
svc F1:  0.9564393939393939
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        23
          1       0.92      1.00      0.96        23

avg / total       0.96      0.96      0.96        46

[21  2  0 23]
LR Accuracy:  0.9565217391304348
LR F1:  0.9564393939393939
For name:  l_torres
total sample size before apply threshold:  65
Counter({'0000-0002-0194-7875': 56, '0000-0002-4598-1899': 7, '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(123, 377)
(0, 0)
(0, 0)
1
123
             precision    recall  f1-score   support

          0       1.00      0.56      0.72        16
          1       1.00      0.57      0.72        23
          2       1.00      0.47      0.64        19
          3       1.00      0.27      0.42        15
          4       0.57      1.00      0.72        50

avg / total       0.82      0.69      0.67       123

[ 9  0  0  0  7  0 13  0  0 10  0  0  9  0 10  0  0  0  4 11  0  0  0  0
 50]
MNB Accuracy:  0.6910569105691057
MNB F1:  0.6461539355635465
           

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        21
          1       1.00      1.00      1.00        93
          2       1.00      1.00      1.00        10

avg / total       1.00      1.00      1.00       124

[21  0  0  0 93  0  0  0 10]
svc Accuracy:  1.0
svc F1:  1.0
             precision    recall  f1-score   support

          0       1.00      0.43      0.60        21
          1       0.82      1.00      0.90        93
          2       1.00      0.20      0.33        10

avg / total       0.87      0.84      0.81       124

[ 9 12  0  0 93  0  0  8  2]
LR Accuracy:  0.8387096774193549
LR F1:  0.6120819848975189
For name:  r_hu
total sample size before apply threshold:  128
Counter({'0000-0001-6709-031X': 93, '0000-0001-7412-8451': 27, '0000-0001-6893-529X': 4, '0000-0001-5549-3082': 2, '0000-0002-7126-4076': 1, '0000-0001-5921-6891': 1})
['0000-0001-6709-031X', '0000-0001-7412-8451']
Total sample size after apply thresh

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


j_braun
total sample size before apply threshold:  72
Counter({'0000-0002-8886-078X': 37, '0000-0002-4504-6235': 25, '0000-0002-8309-6401': 5, '0000-0002-2491-5788': 5})
['0000-0002-8886-078X', '0000-0002-4504-6235']
Total sample size after apply threshold:  62
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(62, 157)
(0, 0)
(0, 0)
1
62
             precision    recall  f1-score   support

          0       0.88      1.00      0.94        37
          1       1.00      0.80      0.89        25

avg / total       0.93      0.92      0.92        62

[37  0  5 20]
MNB Accuracy:  0.9193548387096774
MN

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


w_lu
total sample size before apply threshold:  138
Counter({'0000-0003-4731-1976': 38, '0000-0001-6722-1527': 33, '0000-0001-5358-305X': 30, '0000-0001-7421-347X': 13, '0000-0002-1405-4806': 6, '0000-0001-9798-8964': 4, '0000-0003-4334-5722': 3, '0000-0002-6570-3044': 3, '0000-0002-5243-5554': 2, '0000-0001-5508-342X': 2, '0000-0002-1398-9933': 1, '0000-0001-6214-4024': 1, '0000-0002-5101-9778': 1, '0000-0002-4528-2246': 1})
['0000-0001-5358-305X', '0000-0003-4731-1976', '0000-0001-7421-347X', '0000-0001-6722-1527']
Total sample size after apply threshold:  114
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        voca

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.81      1.00      0.90        35
          1       1.00      0.43      0.60        14

avg / total       0.87      0.84      0.81        49

[35  0  8  6]
MNB Accuracy:  0.8367346938775511
MNB F1:  0.7487179487179487
             precision    recall  f1-score   support

          0       0.88      1.00      0.93        35
          1       1.00      0.64      0.78        14

avg / total       0.91      0.90      0.89        49

[35  0  5  9]
svc Accuracy:  0.8979591836734694
svc F1:  0.8579710144927537
             precision    recall  f1-score   support

          0       0.78      1.00      0.88        35
          1       1.00      0.29      0.44        14

avg / total       0.84      0.80      0.75        49

[35  0 10  4]
LR Accuracy:  0.7959183673469388
LR F1:  0.6597222222222223
For name:  k_saito
total sample size before apply threshold:  61
Counter({'0000-0003-4663-1134': 26, '0000-0002-2151-6204': 16, '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.24      0.39        25
          1       0.00      0.00      0.00        12
          2       0.00      0.00      0.00        11
          3       1.00      0.11      0.19        28
          4       0.00      0.00      0.00        14
          5       1.00      0.41      0.58        17
          6       1.00      0.83      0.91        35
          7       1.00      0.10      0.18        31
          8       1.00      0.19      0.31        27
          9       0.00      0.00      0.00        12
         10       0.00      0.00      0.00        13
         11       1.00      0.07      0.13        14
         12       1.00      0.41      0.59        29
         13       0.00      0.00      0.00        11
         14       0.00      0.00      0.00        11
         15       1.00      0.18      0.30        17
         16       1.00      0.71      0.83        24
         17       0.75      0.10      0.18   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.96      0.96      0.96        24
          1       0.91      0.71      0.80        14
          2       0.85      0.92      0.88        36
          3       0.77      0.93      0.85        44
          4       0.85      0.88      0.87        26
          5       1.00      0.64      0.78        11
          6       0.90      0.75      0.82        12
          7       1.00      0.92      0.96        12
          8       0.82      0.70      0.76        20

avg / total       0.87      0.86      0.86       199

[23  0  0  1  0  0  0  0  0  0 10  2  2  0  0  0  0  0  0  1 33  1  0  0
  1  0  0  0  0  0 41  2  0  0  0  1  0  0  0  2 23  0  0  0  1  0  0  2
  2  0  7  0  0  0  0  0  1  0  2  0  9  0  0  0  0  0  0  0  0  0 11  1
  1  0  1  4  0  0  0  0 14]
svc Accuracy:  0.8592964824120602
svc F1:  0.8512063086915862
             precision    recall  f1-score   support

          0       1.00      0.88      0.93        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(52, 300)
(0, 0)
(0, 0)
1
52
             precision    recall  f1-score   support

          0       0.79      1.00      0.89        31
          1       1.00      0.62      0.76        21

avg / total       0.88      0.85      0.84        52

[31  0  8 13]
MNB Accuracy:  0.8461538461538461
MNB F1:  0.8252100840336135
             precision    recall  f1-score   support

          0       0.97      0.94      0.95        31
          1       0.91      0.95      0.93        21

avg / total       0.94      0.94      0.94        52

[29  2  1 20]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 130
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(130, 274)
(0, 0)
(0, 0)
1
130
             precision    recall  f1-score   support

          0       0.59      0.98      0.74        57
          1       0.96      0.72      0.82        32
          2       1.00      0.50      0.67        16
          3       0.00      0.00      0.00        11
          4       1.00      0.21      0.35        14

avg / total       0.73      0.69      0.65       130

[56  1  0  0  0  9 23  0  0  0  8  0  8  0  0 11  0  0  0  0 11  0  0  0
  3]
MNB Accuracy:  0.6923076923076923
MNB F1:  0.5155757039657968


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.68      0.98      0.81        57
          1       0.96      0.81      0.88        32
          2       1.00      0.75      0.86        16
          3       1.00      0.27      0.43        11
          4       1.00      0.43      0.60        14

avg / total       0.85      0.79      0.78       130

[56  1  0  0  0  6 26  0  0  0  4  0 12  0  0  8  0  0  3  0  8  0  0  0
  6]
svc Accuracy:  0.7923076923076923
svc F1:  0.7145651227202257
             precision    recall  f1-score   support

          0       0.54      0.98      0.70        57
          1       0.94      0.53      0.68        32
          2       1.00      0.38      0.55        16
          3       0.00      0.00      0.00        11
          4       1.00      0.14      0.25        14

avg / total       0.70      0.62      0.57       130

[56  1  0  0  0 15 17  0  0  0 10  0  6  0  0 11  0  0  0  0 12  0  0  0
  2]
LR Accuracy:  0.6230769230769231
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.91      0.98      0.95        44
          1       1.00      1.00      1.00        32
          2       1.00      0.11      0.20        18
          3       0.95      0.95      0.95        19
          4       1.00      0.54      0.70        13
          5       1.00      0.95      0.97        20
          6       0.00      0.00      0.00        16
          7       0.51      0.95      0.67        42

avg / total       0.80      0.79      0.74       204

[43  0  0  0  0  0  0  1  0 32  0  0  0  0  0  0  0  0  2  0  0  0  0 16
  0  0  0 18  0  0  0  1  0  0  0  0  7  0  0  6  0  0  0  0  0 19  0  1
  3  0  0  0  0  0  0 13  1  0  0  1  0  0  0 40]
MNB Accuracy:  0.7892156862745098
MNB F1:  0.6791811258916522
             precision    recall  f1-score   support

          0       0.73      1.00      0.85        44
          1       1.00      1.00      1.00        32
          2       0.83      0.56      0.67       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(169, 220)
(0, 0)
(0, 0)
1
169
             precision    recall  f1-score   support

          0       1.00      0.08      0.15        12
          1       0.79      1.00      0.88        91
          2       0.96      0.61      0.75        36
          3       0.90      0.90      0.90        30

avg / total       0.86      0.83      0.81       169

[ 1  9  0  2  0 91  0  0  0 13 22  1  0  2  1 27]
MNB Accuracy:  0.834319526627219
MNB F1:  0.6707760028354072
             precision    recall  f1-score   support

          0       0.90      0.75      0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(332, 492)
(0, 0)
(0, 0)
1
332
             precision    recall  f1-score   support

          0       1.00      0.98      0.99        90
          1       0.00      0.00      0.00        19
          2       0.76      1.00      0.86       166
          3       0.00      0.00      0.00        13
          4       1.00      0.57      0.72        44

avg / total       0.78      0.84      0.80       332

[ 88   0   2   0   0   0   0  19   0   0   0   0 166   0   0   0   0  13
   0   0   0   0  19   0  25]
MNB Accuracy:  0.8403614457831325
MNB F1:  0.515

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.98      0.99        90
          1       1.00      0.95      0.97        19
          2       0.93      0.99      0.96       166
          3       1.00      0.85      0.92        13
          4       0.97      0.82      0.89        44

avg / total       0.96      0.96      0.96       332

[ 88   0   2   0   0   0  18   1   0   0   0   0 165   0   1   0   0   2
  11   0   0   0   8   0  36]
svc Accuracy:  0.9578313253012049
svc F1:  0.9453189798107487
             precision    recall  f1-score   support

          0       1.00      0.94      0.97        90
          1       1.00      0.37      0.54        19
          2       0.78      1.00      0.88       166
          3       1.00      0.08      0.14        13
          4       1.00      0.59      0.74        44

avg / total       0.89      0.86      0.84       332

[ 85   0   5   0   0   0   7  12   0   0   0   0 166   0   0   0   0  12
   1   0   0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


[10  0  0 24]
svc Accuracy:  1.0
svc F1:  1.0
             precision    recall  f1-score   support

          0       1.00      0.70      0.82        10
          1       0.89      1.00      0.94        24

avg / total       0.92      0.91      0.91        34

[ 7  3  0 24]
LR Accuracy:  0.9117647058823529
LR F1:  0.8823529411764706
For name:  p_gaspar
total sample size before apply threshold:  93
Counter({'0000-0003-4217-5717': 87, '0000-0001-5967-0584': 3, '0000-0002-4832-8537': 2, '0000-0003-3388-1724': 1})
['0000-0003-4217-5717']
Total sample size after apply threshold:  87
For name:  r_o'connor
total sample size before apply threshold:  82
Counter({'0000-0003-4426-2507': 36, '0000-0002-4643-9794': 27, '0000-0002-6869-7954': 13, '0000-0002-3916-3101': 6})
['0000-0002-6869-7954', '0000-0003-4426-2507', '0000-0002-4643-9794']
Total sample size after apply threshold:  76
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.8153846153846154
MNB F1:  0.8047329637792471
             precision    recall  f1-score   support

          0       1.00      0.86      0.92        14
          1       1.00      0.93      0.97        15
          2       1.00      0.86      0.92        21
          3       1.00      1.00      1.00        13
          4       1.00      0.71      0.83        17
          5       0.82      1.00      0.90        50

avg / total       0.93      0.92      0.91       130

[12  0  0  0  0  2  0 14  0  0  0  1  0  0 18  0  0  3  0  0  0 13  0  0
  0  0  0  0 12  5  0  0  0  0  0 50]
svc Accuracy:  0.9153846153846154
svc F1:  0.9233596992217681
             precision    recall  f1-score   support

          0       1.00      0.36      0.53        14
          1       1.00      0.67      0.80        15
          2       1.00      0.86      0.92        21
          3       1.00      0.62      0.76        13
          4       1.00      0.65      0.79        17
          5       0.64      1.00  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(288, 528)
(0, 0)
(0, 0)
1
288
             precision    recall  f1-score   support

          0       1.00      0.80      0.89        20
          1       1.00      0.20      0.33        15
          2       0.89      1.00      0.94       237
          3       1.00      0.19      0.32        16

avg / total       0.91      0.90      0.87       288

[ 16   0   4   0   0   3  12   0   0   0 237   0   0   0  13   3]
MNB Accuracy:  0.8993055555555556
MNB F1:  0.6200894050899284


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.80      0.89        20
          1       1.00      0.53      0.70        15
          2       0.94      1.00      0.97       237
          3       1.00      0.75      0.86        16

avg / total       0.95      0.95      0.94       288

[ 16   0   4   0   0   8   7   0   0   0 237   0   0   0   4  12]
svc Accuracy:  0.9479166666666666
svc F1:  0.8527522683297557
             precision    recall  f1-score   support

          0       1.00      0.35      0.52        20
          1       0.00      0.00      0.00        15
          2       0.85      1.00      0.92       237
          3       1.00      0.12      0.22        16

avg / total       0.82      0.85      0.80       288

[  7   0  13   0   0   0  15   0   0   0 237   0   0   0  14   2]
LR Accuracy:  0.8541666666666666
LR F1:  0.41483634797588287
For name:  s_rossi
total sample size before apply threshold:  199
Counter({'0000-0003-3257-8248': 86, '

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.86      0.92        14
          1       1.00      0.88      0.94        25
          2       1.00      0.92      0.96        13
          3       0.94      0.91      0.93        34
          4       1.00      0.80      0.89        10
          5       0.91      1.00      0.95        86

avg / total       0.94      0.94      0.94       182

[12  0  0  1  0  1  0 22  0  1  0  2  0  0 12  0  0  1  0  0  0 31  0  3
  0  0  0  0  8  2  0  0  0  0  0 86]
svc Accuracy:  0.9395604395604396
svc F1:  0.9306309003590084
             precision    recall  f1-score   support

          0       1.00      0.43      0.60        14
          1       1.00      0.84      0.91        25
          2       1.00      0.62      0.76        13
          3       1.00      0.79      0.89        34
          4       1.00      0.40      0.57        10
          5       0.74      1.00      0.85        86

avg / total       0.88     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(22, 89)
(0, 0)
(0, 0)
1
22
             precision    recall  f1-score   support

          0       1.00      0.82      0.90        11
          1       0.85      1.00      0.92        11

avg / total       0.92      0.91      0.91        22

[ 9  2  0 11]
MNB Accuracy:  0.9090909090909091
MNB F1:  0.9083333333333333
             precision    recall  f1-score   support

          0       0.92      1.00      0.96        11
          1       1.00      0.91      0.95        11

avg / total       0.96      0.95      0.95        22

[11  0  1 10]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(159, 507)
(0, 0)
(0, 0)
1
159
             precision    recall  f1-score   support

          0       0.88      1.00      0.94       112
          1       1.00      0.68      0.81        47

avg / total       0.92      0.91      0.90       159

[112   0  15  32]
MNB Accuracy:  0.9056603773584906
MNB F1:  0.8736825380011652
             precision    recall  f1-score   support

          0       0.90      1.00      0.95       112
          1       1.00      0.72      0.84        47

avg / total       0.93      0.92      0.91       159

[112   0  13  34]
svc Accuracy:  0.9182389937106918
svc F1:  0.8923269260822004
             precision    recall  f1-score   support

          0       0.81      1.00      0.89       112
          1       1.00      0.43      0.60        47

avg / total       0.86      0.83      0.81       159

[112   0  27  20]
LR Accuracy:  0.8301886792452831
LR F1:  0.7447226021287983
For name:  l_rasmussen
total sample size before apply threshold:  249
Counter({'0000-0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.71      0.83        24
          1       0.97      1.00      0.98       214

avg / total       0.97      0.97      0.97       238

[ 17   7   0 214]
svc Accuracy:  0.9705882352941176
svc F1:  0.9065881693299691
             precision    recall  f1-score   support

          0       1.00      0.25      0.40        24
          1       0.92      1.00      0.96       214

avg / total       0.93      0.92      0.90       238

[  6  18   0 214]
LR Accuracy:  0.9243697478991597
LR F1:  0.6798206278026906
For name:  m_saad
total sample size before apply threshold:  4
Counter({'0000-0003-0458-5942': 1, '0000-0002-8071-2328': 1, '0000-0002-5655-8674': 1, '0000-0003-1291-366X': 1})
[]
Total sample size after apply threshold:  0
For name:  j_carr
total sample size before apply threshold:  271
Counter({'0000-0002-4398-8237': 179, '0000-0002-6445-2992': 42, '0000-0002-5028-2160': 40, '0000-0002-2729-0920': 6, '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.71      0.83        42
          1       1.00      0.68      0.81        40
          2       0.88      1.00      0.93       179

avg / total       0.92      0.90      0.90       261

[ 30   0  12   0  27  13   0   0 179]
svc Accuracy:  0.9042145593869731
svc F1:  0.8580097770503444
             precision    recall  f1-score   support

          0       1.00      0.21      0.35        42
          1       1.00      0.25      0.40        40
          2       0.74      1.00      0.85       179

avg / total       0.82      0.76      0.70       261

[  9   0  33   0  10  30   0   0 179]
LR Accuracy:  0.7586206896551724
LR F1:  0.5344324903358019
For name:  j_fraser
total sample size before apply threshold:  101
Counter({'0000-0002-5080-2859': 38, '0000-0002-6505-1883': 36, '0000-0002-5980-3989': 9, '0000-0003-0111-9137': 6, '0000-0002-8020-2985': 6, '0000-0001-9697-3795': 3, '0000-0003-4941-1997': 3})
['000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.92      0.96        36
          1       0.93      1.00      0.96        38

avg / total       0.96      0.96      0.96        74

[33  3  0 38]
LR Accuracy:  0.9594594594594594
LR F1:  0.9592735277930655
For name:  s_woo
total sample size before apply threshold:  25
Counter({'0000-0003-3692-7169': 22, '0000-0001-8788-2875': 1, '0000-0001-6765-4322': 1, '0000-0001-6902-0315': 1})
['0000-0003-3692-7169']
Total sample size after apply threshold:  22
For name:  s_bartlett
total sample size before apply threshold:  104
Counter({'0000-0001-9755-2490': 80, '0000-0003-4387-670X': 18, '0000-0002-7044-4454': 3, '0000-0003-0699-2250': 3})
['0000-0003-4387-670X', '0000-0001-9755-2490']
Total sample size after apply threshold:  98
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_featu

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



[11  0  0  3  0 14  0  1  0  0 14  5  0  0  0 25]
LR Accuracy:  0.8767123287671232
LR F1:  0.8853649292457008
For name:  w_lee
total sample size before apply threshold:  590
Counter({'0000-0003-3171-7672': 108, '0000-0001-5833-989X': 100, '0000-0003-3231-9764': 82, '0000-0002-1082-7592': 62, '0000-0003-3267-4811': 40, '0000-0001-7805-869X': 36, '0000-0003-2883-0391': 21, '0000-0002-0607-038X': 21, '0000-0002-5461-6770': 16, '0000-0002-3912-6095': 11, '0000-0001-6757-885X': 11, '0000-0001-6408-7668': 10, '0000-0002-9873-1033': 9, '0000-0001-7801-083X': 8, '0000-0001-8430-4797': 7, '0000-0002-2572-7287': 5, '0000-0002-6766-8481': 5, '0000-0001-8706-6026': 4, '0000-0002-0036-2859': 4, '0000-0002-9624-0505': 3, '0000-0002-3413-4029': 3, '0000-0003-1817-8395': 3, '0000-0003-1744-8525': 3, '0000-0001-8052-2420': 2, '0000-0003-0853-8561': 2, '0000-0001-7285-4054': 2, '0000-0001-9645-8179': 2, '0000-0002-4383-756X': 2, '0000-0003-1911-3454': 2, '0000-0003-4333-5444': 1, '0000-0002-7324-5792':

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



[ 7  0  0  0  0 13  0 15  0  1  0  0  0  0  0  0  0 10  0  1  0  0  0  0
  0  0  2  0  0 18  1  0  0  0  0  0  0  0  0  4  0  0  0  5  0  1  0  0
  0  0  0  0  5 34  1  0  0  0  0  0  0  0  0  0  1 98  7  2  0  0  0  0
  0  0  0  0  0 26 56  0  0  0  0  0  0  0  0  0  0  0  0 99  0  1  0  0
  0  0  0  0  0  7  0  0 14  0  0  0  0  0  0  0  0  2  0  7  0 53  0  0
  0  0  0  0  0  2  0  8  0  1  0  0  0  0  0  0  0  1  0 14  0  1  0  0]
MNB Accuracy:  0.6525096525096525
MNB F1:  0.42806809117095507
             precision    recall  f1-score   support

          0       0.96      0.67      0.79        36
          1       1.00      0.09      0.17        11
          2       1.00      0.43      0.60        21
          3       1.00      0.70      0.82        10
          4       0.71      0.68      0.69        40
          5       0.56      0.88      0.68       108
          6       0.87      0.71      0.78        82
          7       0.89      0.98      0.93       100
          8       1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.97      1.00      0.98       343

avg / total       0.93      0.96      0.95       355

[  0  12   1 342]
MNB Accuracy:  0.9633802816901409
MNB F1:  0.49067431850789095
             precision    recall  f1-score   support

          0       1.00      0.92      0.96        12
          1       1.00      1.00      1.00       343

avg / total       1.00      1.00      1.00       355

[ 11   1   0 343]
svc Accuracy:  0.9971830985915493
svc F1:  0.9775330675273717
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.97      1.00      0.98       343

avg / total       0.93      0.97      0.95       355

[  0  12   0 343]
LR Accuracy:  0.9661971830985916
LR F1:  0.49140401146131807
For name:  j_albert
total sample size before apply threshold:  78
Counter({'0000-0002-3420-7371': 40, '0000-0001-65

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.95      1.00      0.98        40
          1       1.00      0.95      0.97        19
          2       1.00      0.92      0.96        13

avg / total       0.97      0.97      0.97        72

[40  0  0  1 18  0  1  0 12]
svc Accuracy:  0.9722222222222222
svc F1:  0.9695275763568447
             precision    recall  f1-score   support

          0       0.78      1.00      0.88        40
          1       1.00      0.84      0.91        19
          2       1.00      0.38      0.56        13

avg / total       0.88      0.85      0.83        72

[40  0  0  3 16  0  8  0  5]
LR Accuracy:  0.8472222222222222
LR F1:  0.782987382987383
For name:  k_goh
total sample size before apply threshold:  42
Counter({'0000-0002-2839-8722': 22, '0000-0002-3623-4891': 5, '0000-0003-0599-9696': 5, '0000-0001-5499-5187': 4, '0000-0002-2367-8303': 3, '0000-0001-5416-9627': 2, '0000-0002-8265-3421': 1})
['0000-0002-2839-8722']
Total

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.97      1.00      0.98       118
          1       1.00      0.67      0.80        12

avg / total       0.97      0.97      0.97       130

[118   0   4   8]
svc Accuracy:  0.9692307692307692
svc F1:  0.8916666666666666
             precision    recall  f1-score   support

          0       0.91      1.00      0.95       118
          1       0.00      0.00      0.00        12

avg / total       0.82      0.91      0.86       130

[118   0  12   0]
LR Accuracy:  0.9076923076923077
LR F1:  0.47580645161290325
For name:  p_pathak
total sample size before apply threshold:  9
Counter({'0000-0003-0118-3235': 4, '0000-0002-1157-5550': 3, '0000-0002-9771-6624': 1, '0000-0003-2152-3938': 1})
[]
Total sample size after apply threshold:  0
For name:  h_zeng
total sample size before apply threshold:  82
Counter({'0000-0002-8246-2000': 42, '0000-0002-0260-1059': 21, '0000-0002-9909-7732': 6, '0000-0002-9150-214X': 6, '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.64      0.78        11
          1       0.78      0.90      0.83        39
          2       1.00      0.76      0.87        17
          3       1.00      0.54      0.70        13
          4       0.80      1.00      0.89       100
          5       1.00      0.92      0.96        26
          6       0.98      0.91      0.94        45
          7       0.88      0.78      0.82        18
          8       0.92      0.92      0.92        39
          9       1.00      0.30      0.46        10
         10       0.96      0.87      0.92        31

avg / total       0.90      0.88      0.87       349

[  7   0   0   0   4   0   0   0   0   0   0   0  35   0   0   2   0   0
   1   1   0   0   0   0  13   0   3   0   0   1   0   0   0   0   0   0
   7   6   0   0   0   0   0   0   0   0   0   0 100   0   0   0   0   0
   0   0   0   0   0   2  24   0   0   0   0   0   0   1   0   0   2   0
  41   0   1   0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.90      1.00      0.95        19
          1       0.86      0.35      0.50        17
          2       0.64      0.94      0.76        17

avg / total       0.80      0.77      0.75        53

[19  0  0  2  6  9  0  1 16]
MNB Accuracy:  0.7735849056603774
MNB F1:  0.7373015873015873
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        19
          1       0.93      0.82      0.87        17
          2       0.84      0.94      0.89        17

avg / total       0.93      0.92      0.92        53

[19  0  0  0 14  3  0  1 16]
svc Accuracy:  0.9245283018867925
svc F1:  0.9212962962962963
             precision    recall  f1-score   support

          0       0.95      1.00      0.97        19
          1       0.88      0.82      0.85        17
          2       0.88      0.88      0.88        17

avg / total       0.90      0.91      0.90        53

[19  0  0  1 1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        40
          1       1.00      1.00      1.00        73

avg / total       1.00      1.00      1.00       113

[40  0  0 73]
svc Accuracy:  1.0
svc F1:  1.0
             precision    recall  f1-score   support

          0       1.00      0.88      0.93        40
          1       0.94      1.00      0.97        73

avg / total       0.96      0.96      0.96       113

[35  5  0 73]
LR Accuracy:  0.9557522123893806
LR F1:  0.9501103752759382
For name:  r_moore
total sample size before apply threshold:  221
Counter({'0000-0002-0776-5861': 75, '0000-0001-7221-6693': 51, '0000-0003-1072-2755': 45, '0000-0003-2027-2428': 44, '0000-0003-4196-1804': 6})
['0000-0003-2027-2428', '0000-0003-1072-2755', '0000-0001-7221-6693', '0000-0002-0776-5861']
Total sample size after apply threshold:  215
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.82      0.90        44
          1       1.00      0.82      0.90        45
          2       1.00      0.96      0.98        51
          3       0.81      1.00      0.89        75

avg / total       0.93      0.92      0.92       215

[36  0  0  8  0 37  0  8  0  0 49  2  0  0  0 75]
svc Accuracy:  0.9162790697674419
svc F1:  0.9188240418118466
             precision    recall  f1-score   support

          0       1.00      0.55      0.71        44
          1       1.00      0.78      0.88        45
          2       1.00      0.96      0.98        51
          3       0.70      1.00      0.82        75

avg / total       0.90      0.85      0.85       215

[24  0  0 20  0 35  0 10  0  0 49  2  0  0  0 75]
LR Accuracy:  0.8511627906976744
LR F1:  0.8462645442792502
For name:  m_thomsen
total sample size before apply threshold:  98
Counter({'0000-0002-2469-6458': 37, '0000-0003-2453-5141': 32, '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.8953488372093024
svc F1:  0.8839863222510608
             precision    recall  f1-score   support

          0       0.94      0.94      0.94        32
          1       0.74      0.95      0.83        37
          2       1.00      0.41      0.58        17

avg / total       0.87      0.84      0.82        86

[30  2  0  2 35  0  0 10  7]
LR Accuracy:  0.8372093023255814
LR F1:  0.7847222222222223
For name:  l_ng
total sample size before apply threshold:  44
Counter({'0000-0003-1905-3586': 37, '0000-0002-6973-9466': 3, '0000-0001-7500-9403': 1, '0000-0001-5988-008X': 1, '0000-0003-3135-244X': 1, '0000-0002-7189-1272': 1})
['0000-0003-1905-3586']
Total sample size after apply threshold:  37
For name:  a_phillips
total sample size before apply threshold:  170
Counter({'0000-0002-5461-0598': 98, '0000-0001-6367-9784': 24, '0000-0001-5599-6499': 24, '0000-0003-4883-0022': 9, '0000-0003-4225-0158': 7, '0000-0003-4473-5108': 4, '0000-0001-6618-0145': 3, '0000-0001-6335-9430': 1})
['0000-0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(55, 148)
(0, 0)
(0, 0)
1
55
             precision    recall  f1-score   support

          0       1.00      0.85      0.92        13
          1       0.95      1.00      0.98        42

avg / total       0.97      0.96      0.96        55

[11  2  0 42]
MNB Accuracy:  0.9636363636363636
MNB F1:  0.9467054263565892
             precision    recall  f1-score   support

          0       1.00      0.85      0.92        13
          1       0.95      1.00      0.98        42

avg / total       0.97      0.96      0.96        55

[11  2  0 42]
svc Acc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.90      0.79      0.84        34
          1       0.90      0.98      0.94        86
          2       1.00      0.84      0.91        19

avg / total       0.92      0.91      0.91       139

[27  7  0  2 84  0  1  2 16]
svc Accuracy:  0.9136690647482014
svc F1:  0.8988610667730779
             precision    recall  f1-score   support

          0       0.87      0.59      0.70        34
          1       0.80      0.98      0.88        86
          2       1.00      0.58      0.73        19

avg / total       0.84      0.83      0.82       139

[20 14  0  2 84  0  1  7 11]
LR Accuracy:  0.8273381294964028
LR F1:  0.7715562903769021
For name:  s_teixeira
total sample size before apply threshold:  36
Counter({'0000-0003-0419-2348': 12, '0000-0001-5845-058X': 11, '0000-0002-2462-8535': 3, '0000-0002-9473-0113': 3, '0000-0002-7464-3944': 3, '0000-0002-6603-7936': 3, '0000-0003-3664-2577': 1})
['0000-0003-0419-2348'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.30      0.46        10
          1       0.88      1.00      0.94        91
          2       1.00      0.55      0.71        11

avg / total       0.91      0.89      0.87       112

[ 3  7  0  0 91  0  0  5  6]
MNB Accuracy:  0.8928571428571429
MNB F1:  0.7018550481255149
             precision    recall  f1-score   support

          0       1.00      0.80      0.89        10
          1       0.93      1.00      0.96        91
          2       1.00      0.55      0.71        11

avg / total       0.94      0.94      0.93       112

[ 8  2  0  0 91  0  0  5  6]
svc Accuracy:  0.9375
svc F1:  0.8525780682643429
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.81      1.00      0.90        91
          2       0.00      0.00      0.00        11

avg / total       0.66      0.81      0.73       112

[ 0 10  0  0 91  0  0 11  

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(164, 224)
(0, 0)
(0, 0)
1
164
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       0.93      0.99      0.96       153

avg / total       0.87      0.92      0.89       164

[  0  11   2 151]
MNB Accuracy:  0.9207317073170732
MNB F1:  0.4793650793650794
             precision    recall  f1-score   support

          0       1.00      0.18      0.31        11
          1       0.94      1.00      0.97       153

avg / total       0.95      0.95      0.93       164

[  2   9   0 15

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.90      0.95        10
          1       0.99      1.00      0.99        92

avg / total       0.99      0.99      0.99       102

[ 9  1  0 92]
MNB Accuracy:  0.9901960784313726
MNB F1:  0.9709815078236131
             precision    recall  f1-score   support

          0       1.00      0.90      0.95        10
          1       0.99      1.00      0.99        92

avg / total       0.99      0.99      0.99       102

[ 9  1  0 92]
svc Accuracy:  0.9901960784313726
svc F1:  0.9709815078236131
             precision    recall  f1-score   support

          0       1.00      0.40      0.57        10
          1       0.94      1.00      0.97        92

avg / total       0.94      0.94      0.93       102

[ 4  6  0 92]
LR Accuracy:  0.9411764705882353
LR F1:  0.7699248120300752
For name:  p_lima
total sample size before apply threshold:  24
Counter({'0000-0002-1252-2565': 8, '0000-0002-9739-0783': 8, '000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      1.00      0.97       174
          1       1.00      0.66      0.79        35

avg / total       0.95      0.94      0.94       209

[174   0  12  23]
MNB Accuracy:  0.9425837320574163
MNB F1:  0.8798850574712643
             precision    recall  f1-score   support

          0       0.96      1.00      0.98       174
          1       1.00      0.77      0.87        35

avg / total       0.96      0.96      0.96       209

[174   0   8  27]
svc Accuracy:  0.9617224880382775
svc F1:  0.9242479159115622
             precision    recall  f1-score   support

          0       0.87      1.00      0.93       174
          1       1.00      0.26      0.41        35

avg / total       0.89      0.88      0.84       209

[174   0  26   9]
LR Accuracy:  0.8755980861244019
LR F1:  0.6697860962566844
For name:  h_moreira
total sample size before apply threshold:  28
Counter({'0000-0002-1487-0539': 13, '0000-0002-548

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(59, 52)
(0, 0)
(0, 0)
1
59
             precision    recall  f1-score   support

          0       0.80      0.97      0.88        29
          1       1.00      0.55      0.71        11
          2       0.89      0.84      0.86        19

avg / total       0.87      0.85      0.84        59

[28  0  1  4  6  1  3  0 16]
MNB Accuracy:  0.847457627118644
MNB F1:  0.8152490726020138
             precision    recall  f1-score   support

          0       1.00      0.86      0.93        29
          1       1.00      0.55      0.71        11
          

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      0.87      0.88        47
          1       1.00      0.17      0.29        18
          2       0.84      0.84      0.84        62
          3       1.00      0.08      0.15        24
          4       1.00      0.77      0.87        26
          5       0.58      1.00      0.74        62

avg / total       0.83      0.75      0.71       239

[41  0  0  0  0  6  1  3  1  0  0 13  0  0 52  0  0 10  3  0  7  2  0 12
  1  0  2  0 20  3  0  0  0  0  0 62]
MNB Accuracy:  0.7531380753138075
MNB F1:  0.6279418337623106
             precision    recall  f1-score   support

          0       0.96      0.91      0.93        47
          1       0.85      0.61      0.71        18
          2       0.85      1.00      0.92        62
          3       0.79      0.79      0.79        24
          4       1.00      0.92      0.96        26
          5       0.98      0.95      0.97        62

avg / total       0.92     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.88      0.94        17
          1       0.75      0.60      0.67        15
          2       1.00      0.93      0.96        14
          3       0.72      0.95      0.82        19
          4       1.00      1.00      1.00        20

avg / total       0.89      0.88      0.88        85

[15  1  0  1  0  0  9  0  6  0  0  1 13  0  0  0  1  0 18  0  0  0  0  0
 20]
svc Accuracy:  0.8823529411764706
svc F1:  0.8770622895622896
             precision    recall  f1-score   support

          0       1.00      0.94      0.97        17
          1       1.00      0.40      0.57        15
          2       1.00      0.79      0.88        14
          3       0.62      0.95      0.75        19
          4       0.87      1.00      0.93        20

avg / total       0.88      0.84      0.83        85

[16  0  0  1  0  0  6  0  8  1  0  0 11  2  1  0  0  0 18  1  0  0  0  0
 20]
LR Accuracy:  0.8352941176470589
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.58      0.98      0.72        43
          1       1.00      0.95      0.97        38
          2       1.00      0.20      0.33        10
          3       0.89      0.53      0.67        15
          4       1.00      0.62      0.77        24
          5       1.00      0.62      0.76        13

avg / total       0.86      0.78      0.77       143

[42  0  0  1  0  0  2 36  0  0  0  0  8  0  2  0  0  0  7  0  0  8  0  0
  9  0  0  0 15  0  5  0  0  0  0  8]
svc Accuracy:  0.7762237762237763
svc F1:  0.7047077391904978
             precision    recall  f1-score   support

          0       0.55      0.98      0.70        43
          1       1.00      0.97      0.99        38
          2       0.00      0.00      0.00        10
          3       0.88      0.47      0.61        15
          4       1.00      0.62      0.77        24
          5       1.00      0.46      0.63        13

avg / total       0.78     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.93      0.96        40
          1       0.95      1.00      0.97        58

avg / total       0.97      0.97      0.97        98

[37  3  0 58]
svc Accuracy:  0.9693877551020408
svc F1:  0.9679144385026738
             precision    recall  f1-score   support

          0       1.00      0.85      0.92        40
          1       0.91      1.00      0.95        58

avg / total       0.94      0.94      0.94        98

[34  6  0 58]
LR Accuracy:  0.9387755102040817
LR F1:  0.9348692955250332
For name:  h_brown
total sample size before apply threshold:  48
Counter({'0000-0001-8578-5510': 17, '0000-0002-0067-991X': 9, '0000-0003-4870-8369': 8, '0000-0001-7418-5536': 6, '0000-0001-6227-5147': 3, '0000-0001-9404-9515': 3, '0000-0003-2292-7766': 2})
['0000-0001-8578-5510']
Total sample size after apply threshold:  17
For name:  s_martins
total sample size before apply threshold:  84
Counter({'0000-0002-9396-5

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(198, 651)
(0, 0)
(0, 0)
1
198
             precision    recall  f1-score   support

          0       0.50      1.00      0.67        78
          1       1.00      0.22      0.36        18
          2       1.00      0.87      0.93        23
          3       0.00      0.00      0.00        11
          4       1.00      0.59      0.75        32
          5       0.00      0.00      0.00        13
          6       0.00      0.00      0.00        23

avg / total       0.57      0.61      0.53       198

[78  0  0  0  0  0  0 14  4  0  0  0  0  0  3

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.64      1.00      0.78        78
          1       1.00      0.56      0.71        18
          2       1.00      0.91      0.95        23
          3       1.00      0.45      0.62        11
          4       1.00      0.69      0.81        32
          5       1.00      0.46      0.63        13
          6       0.92      0.52      0.67        23

avg / total       0.85      0.78      0.77       198

[78  0  0  0  0  0  0  8 10  0  0  0  0  0  2  0 21  0  0  0  0  6  0  0
  5  0  0  0 10  0  0  0 22  0  0  6  0  0  0  0  6  1 11  0  0  0  0  0
 12]
svc Accuracy:  0.7777777777777778
svc F1:  0.7415444565244317
             precision    recall  f1-score   support

          0       0.48      1.00      0.65        78
          1       1.00      0.17      0.29        18
          2       1.00      0.83      0.90        23
          3       0.00      0.00      0.00        11
          4       1.00      0.44      0.6

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        29
          1       1.00      0.83      0.91        12
          2       0.95      1.00      0.97        97
          3       1.00      0.79      0.88        14

avg / total       0.97      0.97      0.97       152

[29  0  0  0  0 10  2  0  0  0 97  0  0  0  3 11]
svc Accuracy:  0.9671052631578947
svc F1:  0.9409913202375514
             precision    recall  f1-score   support

          0       1.00      0.79      0.88        29
          1       1.00      0.83      0.91        12
          2       0.87      1.00      0.93        97
          3       1.00      0.50      0.67        14

avg / total       0.91      0.90      0.89       152

[23  0  6  0  0 10  2  0  0  0 97  0  0  0  7  7]
LR Accuracy:  0.9013157894736842
LR F1:  0.8471506563611827
For name:  m_sahin
total sample size before apply threshold:  48
Counter({'0000-0001-7044-2953': 41, '0000-0002-3490-6009': 3, '0000-000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.97      0.98        30
          1       0.90      1.00      0.95        26
          2       1.00      0.83      0.91        12
          3       1.00      1.00      1.00        12

avg / total       0.97      0.96      0.96        80

[29  1  0  0  0 26  0  0  0  2 10  0  0  0  0 12]
svc Accuracy:  0.9625
svc F1:  0.9593990755007704
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        30
          1       0.93      1.00      0.96        26
          2       1.00      0.83      0.91        12
          3       1.00      1.00      1.00        12

avg / total       0.98      0.97      0.97        80

[30  0  0  0  0 26  0  0  0  2 10  0  0  0  0 12]
LR Accuracy:  0.975
LR F1:  0.968013468013468
For name:  j_coutinho
total sample size before apply threshold:  129
Counter({'0000-0002-3841-743X': 105, '0000-0002-6303-9549': 13, '0000-0002-1562-0099': 8, '00

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        13
          1       0.89      1.00      0.94       105

avg / total       0.79      0.89      0.84       118

[  0  13   0 105]
LR Accuracy:  0.8898305084745762
LR F1:  0.47085201793721976
For name:  s_huber
total sample size before apply threshold:  44
Counter({'0000-0002-4125-159X': 26, '0000-0003-3558-351X': 12, '0000-0002-8271-7835': 3, '0000-0002-5842-5859': 2, '0000-0001-6303-5188': 1})
['0000-0003-3558-351X', '0000-0002-4125-159X']
Total sample size after apply threshold:  38
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, us

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


73
Counter({'0000-0003-3218-7001': 26, '0000-0001-9710-9835': 21, '0000-0003-2165-5519': 12, '0000-0002-4094-7982': 3, '0000-0002-5637-1041': 3, '0000-0001-6528-9034': 3, '0000-0003-4940-6522': 2, '0000-0003-0298-8246': 2, '0000-0001-8679-2886': 1})
['0000-0001-9710-9835', '0000-0003-2165-5519', '0000-0003-3218-7001']
Total sample size after apply threshold:  59
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(59, 108)
(0, 0)
(0, 0)
1
59
             precision    recall  f1-score   support

          0       1.00      0.86      0.92        21
          1       1.00      0.92      0.96        12
  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


 350
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(350, 1067)
(0, 0)
(0, 0)
1
350
             precision    recall  f1-score   support

          0       0.89      0.94      0.91        85
          1       1.00      0.19      0.32        16
          2       0.76      0.99      0.86       108
          3       0.00      0.00      0.00        13
          4       1.00      0.73      0.85        41
          5       0.98      0.98      0.98        87

avg / total       0.86      0.87      0.84       350

[ 80   0   4   0   0   1   5   3   8   0   0   0   0   0 107   0   0   1
   2   0  11   0   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.64      1.00      0.78       155
          1       1.00      0.41      0.58        44
          2       1.00      0.65      0.79        60
          3       1.00      0.44      0.62        36
          4       0.00      0.00      0.00        21

avg / total       0.76      0.72      0.68       316

[155   0   0   0   0  26  18   0   0   0  21   0  39   0   0  20   0   0
  16   0  21   0   0   0   0]
MNB Accuracy:  0.7215189873417721
MNB F1:  0.552560607383107


In [None]:
# accuracy: per-classifier count of usable scores, then per-classifier mean
from statistics import mean


def _is_float(value):
    # Same test the filtering has always used: keep floats, drop everything else.
    return isinstance(value, float)


# Drop every entry that is not a float before averaging.
cleaned_mnb_accuracy = list(filter(_is_float, all_mnb_accuracy))
cleaned_svcLinear_accuracy = list(filter(_is_float, all_svcLinear_accuracy))
cleaned_lr_accuracy = list(filter(_is_float, all_LR_accuracy))

# Fixed reporting order: MNB, linear SVC, LR.
accuracy_by_model = (
    cleaned_mnb_accuracy,
    cleaned_svcLinear_accuracy,
    cleaned_lr_accuracy,
)

# First the number of scores that survived the filter for each classifier ...
for model_scores in accuracy_by_model:
    print(len(model_scores))

# ... then the mean accuracy of each classifier, in the same order.
for model_scores in accuracy_by_model:
    print(mean(model_scores))

In [None]:
# f1: per-classifier count of usable scores, then per-classifier mean
from statistics import mean


def _is_float(value):
    # Same test the filtering has always used: keep floats, drop everything else.
    return isinstance(value, float)


# remove string from result (anything that is not a float is dropped)
cleaned_mnb_f1 = list(filter(_is_float, all_mnb_f1))
cleaned_svcLinear_f1 = list(filter(_is_float, all_svcLinear_f1))
cleaned_lr_f1 = list(filter(_is_float, all_LR_f1))

# Fixed reporting order: MNB, linear SVC, LR.
f1_by_model = (
    cleaned_mnb_f1,
    cleaned_svcLinear_f1,
    cleaned_lr_f1,
)

# First the number of scores that survived the filter for each classifier ...
for model_scores in f1_by_model:
    print(len(model_scores))

# ... then the mean F1 of each classifier, in the same order.
for model_scores in f1_by_model:
    print(mean(model_scores))

In [None]:
# Clear the interactive namespace once the summaries have been printed.
# NOTE(review): without -f, IPython asks for confirmation before deleting variables.
%reset

In [None]:
# List the names currently defined in the interactive namespace.
%who