In [2]:
import com_func

# name of the dataset; used to build the input and result directory paths
Dataset = "pubmed"

# parameters
# minimum number of papers an author needs to be kept as a class
threshold = 10
# passed as min_df to the text vectorizers (minimum document frequency of a term)
cutoff = 3

# embedding type per feature group; the vectorizer helpers accept
# "tf", "tf_idf" or "off" (raw_text_to_vector additionally accepts "lsa")
coauthor_emb_type = "off"
venue_emb_type = "tf_idf"
year_emb_type = "tf_idf"
# NOTE(review): the two settings below are not read by any code visible in this notebook
pp_textual_emb_type = "off"
citation_emb_type = "off"

In [3]:
def dummy(doc):
    # Identity function: handed to the vectorizers as tokenizer/preprocessor
    # so that input which is already tokenized is passed through unchanged.
    return doc
def read_labeled_file(infile):
    """Load one tab-separated labeled file into a DataFrame.

    Only lines that split into 12 or 13 tab-separated fields are kept. For any
    other line the number of fields is printed and the line is skipped.

    Parameters
    ----------
    infile : str
        Path of the UTF-8 encoded, tab-separated file to read.

    Returns
    -------
    pandas.DataFrame
        One row per accepted line with the string columns ``paperID``
        (field 0), ``authorID`` (field 1), ``co-author`` (field 5),
        ``venue_id`` (field 7) and ``publish_year`` (field 10). The columns
        are present even when no line was accepted, so callers can always
        select them by name.
    """
    columns = ["paperID", "authorID", "co-author", "venue_id", "publish_year"]
    LabeledRecords_original = []
    # the with-statement closes the file on exit; no explicit close() is needed
    with open(infile, 'r', encoding = 'utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # get rid of badly formatted lines
            if len(read_data) in (12, 13):
                paper_detail = {"paperID": read_data[0], "authorID":read_data[1],
                                "co-author": read_data[5], "venue_id": read_data[7],
                                "publish_year": read_data[10]}
                LabeledRecords_original.append(paper_detail)
            else:
                print(len(read_data))
    # pin the columns so an empty result still exposes the expected schema
    return pd.DataFrame(LabeledRecords_original, columns=columns)

In [4]:
def LSA(cleaned_token, dim=100, random_state=None):
    """Project tokenized documents into a latent space (TF-IDF followed by truncated SVD).

    Parameters
    ----------
    cleaned_token : iterable
        Documents to embed. They are passed to the vectorizer with the identity
        function ``dummy`` as tokenizer and preprocessor, so each document is
        expected to already be a sequence of tokens.
    dim : int, default 100
        Requested number of SVD components. When the TF-IDF vocabulary has
        fewer than ``dim`` terms, it is reduced to one less than the vocabulary
        size, but never below 1.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``TruncatedSVD`` so a run can be made reproducible.
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        The SVD-transformed TF-IDF matrix, one row per document.

    Notes
    -----
    Prints the summed explained-variance ratio of the fitted SVD.
    """
    # Tf-idf Transformation
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True, tokenizer=dummy,
                                               preprocessor=dummy, stop_words = None,min_df=cutoff)
    tfidfMatrix = tfidf_vectorizer.fit_transform(cleaned_token).toarray()
    if(tfidfMatrix.shape[1]<dim):
        # a one-term vocabulary would otherwise ask the SVD for zero components
        dim = max(1, tfidfMatrix.shape[1] - 1)
    # tf-idf + svd
    svd = TruncatedSVD(n_components=dim, random_state=random_state)
    final_lsa_Matrix = svd.fit_transform(tfidfMatrix)
    print(svd.explained_variance_ratio_.sum())
    return final_lsa_Matrix

In [5]:
# co-author relation to frequency count
def co_author_to_vector(raw_co_author_data, emb_type="off"):
    """Turn raw co-author strings into a feature matrix.

    ``emb_type`` selects the representation:
    "tf" gives term counts (``CountVectorizer``), "tf_idf" gives sublinear,
    smoothed TF-IDF weights (``TfidfVectorizer``), and "off" gives an empty
    ``pandas.DataFrame``. Any other value prints a notice and is treated as
    "off". The chosen vectorizer is printed before it is fitted, and the
    fitted result is returned as a dense array.
    """
    if emb_type not in ("tf", "tf_idf", "off"):
        print("Embedding type not available, selecting default setting")
        emb_type = "off"

    # feature group disabled: hand back an empty placeholder
    if emb_type == "off":
        return pd.DataFrame()

    if emb_type == "tf":
        vectorizer = CountVectorizer()
    else:
        vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True,
                                     stop_words=None)
    print(vectorizer)
    return vectorizer.fit_transform(raw_co_author_data).toarray()

In [11]:
# venue relation with author
def venue_to_vector(raw_venue_id, emb_type="off"):
    """Turn raw venue ids into a feature matrix.

    ``emb_type`` selects the representation:
    "tf" gives term counts (``CountVectorizer``), "tf_idf" gives sublinear,
    smoothed TF-IDF weights (``TfidfVectorizer``), and "off" gives an empty
    ``pandas.DataFrame``. Any other value prints a notice and is treated as
    "off". The chosen vectorizer is printed before it is fitted, and the
    fitted result is returned as a dense array.
    """
    if emb_type not in ("tf", "tf_idf", "off"):
        print("Embedding type not available, selecting default setting")
        emb_type = "off"

    # feature group disabled: hand back an empty placeholder
    if emb_type == "off":
        return pd.DataFrame()

    if emb_type == "tf":
        vectorizer = CountVectorizer()
    else:
        vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True,
                                     stop_words=None)
    print(vectorizer)
    return vectorizer.fit_transform(raw_venue_id).toarray()

In [12]:
# author-year relation to emb
def year_to_vector(raw_year, emb_type="off"):
    """Turn raw publication-year strings into a feature matrix.

    Parameters
    ----------
    raw_year : iterable of str
        One year string per sample.
    emb_type : {"tf", "tf_idf", "off"}, default "off"
        "tf" gives term counts (``CountVectorizer``), "tf_idf" gives sublinear,
        smoothed TF-IDF weights (``TfidfVectorizer``, printed before fitting),
        and "off" disables the feature group. Any other value prints a notice
        and falls back to the default "off", matching the message and the
        co-author and venue helpers. The fallback used to be "tf".

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        Dense feature matrix for "tf" and "tf_idf"; an empty DataFrame for "off".
    """
    if emb_type not in ("tf", "tf_idf", "off"):
        print("Embedding type not available, selecting default setting")
        emb_type = "off"

    if emb_type == "tf":
        count_vectorizer = CountVectorizer()
        result_vector = count_vectorizer.fit_transform(raw_year).toarray()
    elif emb_type == "tf_idf":
        tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True,
                                           stop_words = None)
        print(tfidf_vectorizer)
        result_vector = tfidf_vectorizer.fit_transform(raw_year).toarray()
    else:
        # feature group disabled: hand back an empty placeholder
        result_vector = pd.DataFrame()
    return result_vector

In [8]:
# document relation with respect to textual content:
# converts raw text into numerical feature vectors (uni-gram bag of words)
def raw_text_to_vector(raw_textual_content, emb_type="off", stopword=True):
    """Clean raw text and embed it with the requested representation.

    The text is first cleaned with ``com_func.clean_batch_of_raw``, whose two
    return values are used here as the tokenized documents and the per-sample
    sizes. The smallest and largest sample size are printed.

    ``emb_type`` selects the representation: "tf_idf" (TF-IDF, vectorizer is
    printed), "tf" (term counts), "lsa" (``LSA`` with 100 dimensions) or "off"
    (empty ``pandas.DataFrame``). Any other value prints a notice and is
    treated as "off".

    Returns a tuple ``(result_vector, average_sample_size)``.
    """
    cleaned_token, sample_size = com_func.clean_batch_of_raw(raw_textual_content, stopword=stopword)
    average_sample_size = sum(sample_size)/len(sample_size)
    print("Minimal sample size: ", min(sample_size))
    print("maximal sample size: ", max(sample_size))

    if emb_type not in ("tf_idf", "tf", "lsa", "off"):
        print("Embedding type not available, selecting default setting")
        emb_type = "off"

    if emb_type == "tf_idf":
        # tf-idf weights over the already tokenized documents
        vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True, tokenizer=dummy,
                                     preprocessor=dummy, stop_words=None, min_df=cutoff)
        print(vectorizer)
        result_vector = vectorizer.fit_transform(cleaned_token).toarray()
    elif emb_type == "tf":
        # document-term frequency matrix
        vectorizer = CountVectorizer(tokenizer=dummy, preprocessor=dummy, min_df=cutoff)
        result_vector = vectorizer.fit_transform(cleaned_token).toarray()
    elif emb_type == "lsa":
        result_vector = LSA(cleaned_token, dim=100)
    else:
        # feature group disabled: hand back an empty placeholder
        result_vector = pd.DataFrame()
    return result_vector, average_sample_size

In [9]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score,f1_score,accuracy_score
# cross validation
def k_fold_cv(data, label, clf, k=10):
    """Evaluate ``clf`` with unshuffled k-fold cross validation.

    The classifier is refitted on every training split; the predictions of all
    test splits are pooled and scored once. The classification report and the
    flattened confusion matrix of the pooled predictions are printed.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Feature matrix, one row per sample.
    label : array-like or pandas.Series
        Target label per sample, aligned with ``data`` by position.
    clf : estimator
        Object providing ``fit(X, y)`` and ``predict(X)``.
    k : int, default 10
        Number of folds.

    Returns
    -------
    tuple of float
        ``(accuracy, macro_f1)`` over the pooled predictions.
    """
    # Fold indices are positions, but ``[]`` on pandas objects looks up labels
    # (or columns for a DataFrame). Unwrap to the underlying arrays so the
    # split is positional even when the index is not 0..n-1.
    if hasattr(data, "iloc"):
        data = data.values
    if hasattr(label, "iloc"):
        label = label.values
    kf = KFold(n_splits=k, shuffle=False)
    allTrueLabel = []
    allPredLabel = []
    for train_index, test_index in kf.split(data):
        # split train and test
        data_train, data_test = data[train_index], data[test_index]
        label_train, label_test = label[train_index], label[test_index]
        # fit data to clf
        clf.fit(data_train, label_train)
        # get predicted label
        label_pred = clf.predict(data_test)
        allTrueLabel.extend(label_test)
        allPredLabel.extend(label_pred)

    accuracy = accuracy_score(allTrueLabel, allPredLabel)
    f1 = f1_score(allTrueLabel, allPredLabel,average='macro')

    print(metrics.classification_report(allTrueLabel, allPredLabel))
    print(metrics.confusion_matrix(allTrueLabel, allPredLabel).ravel())

    return accuracy, f1

In [15]:
# load the file
import sys
import io
import os
import collections
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


# directory holding one labeled file per name group; the trailing slash
# matters because file names are appended by plain string concatenation
fileDir = "../Data/"+Dataset+"/canopies_labeled/"
listfiles = os.listdir(fileDir)
# per-name-group statistics collected for the output table;
# each list receives one entry per processed file
allname = []
num_class = []
per_class_count = []
# NOTE(review): only ever receives "Not applicable" entries in the visible code
# and is not written to the output table
average_textual_size = []

# accuracy / macro-F1 per name group for each of the three classifiers
all_mnb_accuracy = []
all_mnb_f1 = []
all_svcLinear_accuracy = []
all_svcLinear_f1 = []
all_LR_accuracy = []
all_LR_f1 = []

# Read every file of the labeled group and evaluate three classifiers on it.
# Every iteration appends exactly one entry to each statistics/result list, so
# the lists stay the same length and can be combined into one table afterwards.
for file in listfiles:
    # group name
    name_parts = file.split("_")
    name = name_parts[1]+"_"+name_parts[-1]
    print("For name: ",name)
    allname.append(name)
    # read needed content in labeled file
    labeled_data = read_labeled_file(fileDir+file)
    print("total sample size before apply threshold: ",len(labeled_data))
    # count number of papers each author wrote, based on author ID
    paperCounter = collections.Counter(labeled_data["authorID"])
    print(paperCounter)
    # collect per class statistic: drop authors with fewer papers than the threshold
    for k in list(paperCounter):
        if paperCounter[k] < threshold:
            del paperCounter[k]
    temp =list(paperCounter.keys())
    print(temp)
    per_class_count.append(paperCounter)
    num_class.append(len(paperCounter))
    # remove samples of authors below the threshold; .copy() so that adding the
    # label column below writes to an independent frame, not to a slice
    labeled_data = labeled_data[labeled_data.authorID.isin(temp)].copy()
    print("Total sample size after apply threshold: ",len(labeled_data))
    # result lists that receive one entry per name group
    result_lists = (all_mnb_accuracy, all_mnb_f1,
                    all_svcLinear_accuracy, all_svcLinear_f1,
                    all_LR_accuracy, all_LR_f1)
    # if only one class or no class passes the threshold, not applicable
    if len(paperCounter) < 2:
        average_textual_size.append("Not applicable")
        for result_list in result_lists:
            result_list.append("Not applicable")
        continue

    # convert author id to an integer label (position of the id in temp)
    label_of_author = {author_id: idx for idx, author_id in enumerate(temp)}
    labeled_data["label"] = labeled_data["authorID"].map(label_of_author)
    # shuffle the data
    # NOTE(review): the shuffle is unseeded, so scores differ between runs
    labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
    # extract true label and pid
    label = labeled_data["label"]
    pid = labeled_data["paperID"]
    # list of different data field
    part_collection = []
    # data part 1, co-author matrix
    data_part_co_author = co_author_to_vector(labeled_data["co-author"], emb_type=coauthor_emb_type)
    print(data_part_co_author.shape)
    part_collection.append(data_part_co_author)
    # data part 2.1, venue_id that author attend
    data_part_venue = venue_to_vector(labeled_data["venue_id"], emb_type=venue_emb_type)
    print(data_part_venue.shape)
    part_collection.append(data_part_venue)
    # data part 2.2 year that author attend
    data_part_year = year_to_vector(labeled_data["publish_year"], emb_type=year_emb_type)
    print(data_part_year.shape)
    part_collection.append(data_part_year)
    # merge the different parts of the data by concatenating them column-wise
    # remove empty emb (when emb set off)
    part_collection = [part for part in part_collection if len(part)!=0]
    print(len(part_collection))
    if len(part_collection)>1:
        combinedata = np.concatenate(part_collection,axis=1)
    elif len(part_collection)==1:
        if isinstance(part_collection[0], pd.DataFrame):
            combinedata = part_collection[0].values
        else:
            combinedata = part_collection[0]
    else:
        # No feature group is enabled. Record the group as not applicable and
        # move on; breaking out here would skip all remaining files and leave
        # the result lists shorter than allname.
        print("No data available")
        average_textual_size.append("Not applicable")
        for result_list in result_lists:
            result_list.append("Not applicable")
        continue
    print(combinedata.shape)
    # using converted feature vector to train classifier
    # using Multinomial naive bayes
    clf = MultinomialNB()
    # use 10 fold cv
    mnbaccuracy, mnbmarcof1 = k_fold_cv(combinedata, label, clf, k=10)
    print("MNB Accuracy: ",mnbaccuracy)
    print("MNB F1: ", mnbmarcof1)
    all_mnb_accuracy.append(mnbaccuracy)
    all_mnb_f1.append(mnbmarcof1)
    # using SVM with linear kernel
    clf = SVC(decision_function_shape='ovr', kernel='linear')
    svcaccuracy, svcmarcof1 = k_fold_cv(combinedata, label, clf, k=10)
    print("svc Accuracy: ",svcaccuracy)
    print("svc F1: ", svcmarcof1)
    all_svcLinear_accuracy.append(svcaccuracy)
    all_svcLinear_f1.append(svcmarcof1)
    # using logistic regression
    clf = LogisticRegression(multi_class='ovr')
    LRaccuracy, LRmarcof1 = k_fold_cv(combinedata, label, clf, k=10)
    print("LR Accuracy: ",LRaccuracy)
    print("LR F1: ", LRmarcof1)
    all_LR_accuracy.append(LRaccuracy)
    all_LR_f1.append(LRmarcof1)
    
# write evaluation result to a CSV file, one row per name group
output = pd.DataFrame({'Name Group':allname,"Class number":num_class,"per_class_size":per_class_count,
                       "svc(linear) accuracy":all_svcLinear_accuracy, "svc(linear) macro f1": all_svcLinear_f1,
                       "mnb accuracy":all_mnb_accuracy, "mnb macro f1": all_mnb_f1,
                       "logistic regression accuracy":all_LR_accuracy, "logistic regression macro f1": all_LR_f1})

savePath = "../result/"+Dataset+"/year_venue_only/"
# exist_ok creates the directory only when missing, without a separate
# existence check that could race with another process
os.makedirs(savePath, exist_ok=True)
filename = "2004_meta_"+venue_emb_type+"_threshold="+str(threshold)+".csv"
output.to_csv(savePath+filename, encoding='utf-8',index=False)
print("Done")

For name:  j_read
total sample size before apply threshold:  136
Counter({'0000-0002-5159-1192': 57, '0000-0002-9029-5185': 39, '0000-0002-9697-0962': 31, '0000-0002-4739-9245': 3, '0000-0003-0605-5259': 3, '0000-0003-4316-7006': 1, '0000-0002-0784-0091': 1, '0000-0002-3888-6631': 1})
['0000-0002-9697-0962', '0000-0002-9029-5185', '0000-0002-5159-1192']
Total sample size after apply threshold:  127
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(127, 79)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='conte

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.56      0.29      0.38        31
          1       0.85      0.59      0.70        39
          2       0.61      0.89      0.72        57

avg / total       0.67      0.65      0.63       127

[ 9  2 20  3 23 13  4  2 51]
LR Accuracy:  0.6535433070866141
LR F1:  0.6011175585643671
For name:  f_esteves
total sample size before apply threshold:  34
Counter({'0000-0002-3046-1313': 18, '0000-0002-5403-0091': 12, '0000-0003-0589-0746': 3, '0000-0003-3172-6253': 1})
['0000-0002-5403-0091', '0000-0002-3046-1313']
Total sample size after apply threshold:  30
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_p

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.70      0.58      0.64        12
          1       0.75      0.83      0.79        18

avg / total       0.73      0.73      0.73        30

[ 7  5  3 15]
svc Accuracy:  0.7333333333333333
svc F1:  0.7129186602870814
             precision    recall  f1-score   support

          0       0.71      0.42      0.53        12
          1       0.70      0.89      0.78        18

avg / total       0.70      0.70      0.68        30

[ 5  7  2 16]
LR Accuracy:  0.7
LR F1:  0.6534017971758665
For name:  c_miller
total sample size before apply threshold:  252
Counter({'0000-0003-4341-1283': 51, '0000-0002-3989-7973': 40, '0000-0002-3813-1706': 39, '0000-0003-2772-9531': 27, '0000-0001-6082-9273': 22, '0000-0002-2601-4422': 22, '0000-0002-9448-8144': 19, '0000-0001-8628-4902': 15, '0000-0002-2936-7717': 6, '0000-0003-3898-9734': 6, '0000-0002-5074-6914': 2, '0000-0003-4266-6700': 1, '0000-0002-9286-9787': 1, '0000-0002-08

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.47      0.88      0.61        51
          1       0.62      0.26      0.37        19
          2       0.95      0.67      0.78        27
          3       1.00      0.77      0.87        22
          4       1.00      0.92      0.96        39
          5       0.83      0.33      0.48        15
          6       0.86      0.80      0.83        40
          7       1.00      0.73      0.84        22

avg / total       0.81      0.74      0.74       235

[45  3  1  0  0  1  1  0 13  5  0  0  0  0  1  0  8  0 18  0  0  0  1  0
  5  0  0 17  0  0  0  0  3  0  0  0 36  0  0  0  9  0  0  0  0  5  1  0
  8  0  0  0  0  0 32  0  5  0  0  0  0  0  1 16]
svc Accuracy:  0.7404255319148936
svc F1:  0.7183104257867252
             precision    recall  f1-score   support

          0       0.44      0.86      0.59        51
          1       0.50      0.05      0.10        19
          2       0.90      0.67      0.77       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.81      1.00      0.90        69
          2       0.93      0.59      0.72        22

avg / total       0.76      0.81      0.77       101

[ 0  9  1  0 69  0  2  7 13]
svc Accuracy:  0.8118811881188119
svc F1:  0.5394420394420395
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.73      1.00      0.84        69
          2       1.00      0.27      0.43        22

avg / total       0.71      0.74      0.67       101

[ 0 10  0  0 69  0  0 16  6]
LR Accuracy:  0.7425742574257426
LR F1:  0.4233449477351916
For name:  a_vega
total sample size before apply threshold:  20
Counter({'0000-0002-8207-9925': 10, '0000-0002-2178-2780': 8, '0000-0002-8148-5702': 1, '0000-0003-1082-0961': 1})
['0000-0002-8207-9925']
Total sample size after apply threshold:  10
For name:  k_smith
total sample size

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        15
          1       0.69      0.76      0.72        29
          2       0.00      0.00      0.00        13
          3       0.84      0.95      0.89       133
          4       0.00      0.00      0.00        14
          5       1.00      0.26      0.41        23
          6       0.54      0.92      0.68        75
          7       1.00      0.16      0.27        19

avg / total       0.67      0.71      0.64       321

[  0   4   0   1   0   0  10   0   0  22   0   3   0   0   4   0   0   1
   0   4   0   0   8   0   0   3   0 127   0   0   3   0   0   0   0   2
   0   0  12   0   0   1   0   5   0   6  11   0   0   1   0   5   0   0
  69   0   0   0   0   5   0   0  11   3]
MNB Accuracy:  0.7071651090342679
MNB F1:  0.372357859678231
             precision    recall  f1-score   support

          0       1.00      0.33      0.50        15
          1       0.57      0.79      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.65      0.87      0.74        46
          2       0.89      0.72      0.79        43

avg / total       0.68      0.72      0.69        99

[ 0 10  0  2 40  4  0 12 31]
svc Accuracy:  0.7171717171717171
svc F1:  0.5118708452041786
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.64      0.83      0.72        46
          2       0.78      0.72      0.75        43

avg / total       0.64      0.70      0.66        99

[ 0  9  1  0 38  8  0 12 31]
LR Accuracy:  0.696969696969697
LR F1:  0.49026582520558426
For name:  j_qian
total sample size before apply threshold:  17
Counter({'0000-0002-8793-9330': 6, '0000-0001-6145-045X': 6, '0000-0003-3162-2913': 1, '0000-0002-9522-6445': 1, '0000-0002-1325-6975': 1, '0000-0002-5438-0833': 1, '0000-0001-5043-020X': 1})
[]
Total sample size after a

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(585, 240)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(585, 35)
2
(585, 275)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.92      0.46      0.62        26
          1       0.14      0.07      0.10        14
          2       0.52      0.67      0.59        51
          3       0.80      0.82      0.81        79
          4       1.00      0.30      0.46        10
          5       0.92      0.82      0.86        82
          6       0.56      0.96      0.70       141
          7       1.00      0.21      0.35        14
          8       0.78      0.28      0.41        25
          9       0.94      0.54      0.68        28
         10       1.00      0.07      0.13        14
         11       1.00      0.60      0.75        20
         12       0.96      0.76      0.85        66
         13       1.00      0.47      0.64        15

avg / total       0.77      0.70      0.69       585

[ 12   0   4   2   0   0   8   0   0   0   0   0   0   0   0   1   5   3
   0   0   5   0   0   0   0   0   0   0   0   0  34   3   0   0  14   0
   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(1015, 331)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(1015, 22)
2
(1015, 353)
             precision    recall  f1-score   support

          0   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       1.00      0.43      0.60        14
          2       0.39      0.86      0.54       154
          3       0.36      0.18      0.24        22
          4       1.00      0.82      0.90        17
          5       0.43      0.27      0.33        11
          6       0.64      0.28      0.39        25
          7       0.94      0.90      0.92       211
          8       0.92      0.61      0.74        57
          9       0.80      0.71      0.75        28
         10       0.80      0.74      0.77       139
         11       1.00      0.58      0.73        45
         12       0.42      0.46      0.44        24
         13       0.86      0.56      0.68        57
         14       0.00      0.00      0.00        14
         15       0.65      0.56      0.60        55
         16       0.83      0.77      0.80        92
         17       1.00      0.12      0.21   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


57
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(57, 24)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(57, 11)
2
(57, 35)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.08      0.15        12
          1       0.00      0.00      0.00        12
          2       0.77      0.99      0.87       151
          3       0.33      0.04      0.07        24

avg / total       0.68      0.76      0.68       199

[  1   0  11   0   0   0  11   1   0   0 150   1   0   0  23   1]
MNB Accuracy:  0.7638190954773869
MNB F1:  0.27374306276040383
             precision    recall  f1-score   support

          0       0.78      0.58      0.67        12
          1       1.00      0.33      0.50        12
          2       0.83      0.99      0.90       151
          3       0.67      0.17      0.27        24

avg / total       0.82      0.82      0.79       199

[  7   0   5   0   0   4   7   1   1   0 149   1   1   0  19   4]
svc Accuracy:  0.8241206030150754
svc F1:  0.5834088620342396
             precision    recall  f1-score   support

          0       1.00      0.25      0.40     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      0.80      0.84        10
          1       0.88      0.93      0.90        15

avg / total       0.88      0.88      0.88        25

[ 8  2  1 14]
LR Accuracy:  0.88
LR F1:  0.8726655348047538
For name:  r_daniel
total sample size before apply threshold:  173
Counter({'0000-0002-8646-7925': 123, '0000-0002-6483-5897': 37, '0000-0001-8835-8047': 8, '0000-0002-1753-6683': 5})
['0000-0002-8646-7925', '0000-0002-6483-5897']
Total sample size after apply threshold:  160
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=Non

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(33, 20)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(33, 11)
2
(33, 31)
             precision    recall  f1-score   support

          0       0.83

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.73      0.94      0.83        35

avg / total       0.55      0.70      0.61        47

[ 0 12  2 33]
LR Accuracy:  0.7021276595744681
LR F1:  0.41250000000000003
For name:  k_cho
total sample size before apply threshold:  126
Counter({'0000-0002-7751-0469': 55, '0000-0001-6586-983X': 47, '0000-0002-5782-6028': 15, '0000-0003-2555-5048': 6, '0000-0003-3818-9403': 1, '0000-0003-1154-4065': 1, '0000-0003-2926-3958': 1})
['0000-0001-6586-983X', '0000-0002-7751-0469', '0000-0002-5782-6028']
Total sample size after apply threshold:  117
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, s

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.73      0.30      0.42        27
          1       0.59      0.64      0.62        25
          2       0.64      0.84      0.73        19
          3       1.00      0.90      0.95        10
          4       0.80      0.36      0.50        11
          5       1.00      0.70      0.82        20
          6       1.00      0.50      0.67        18
          7       1.00      0.15      0.27        13
          8       0.58      0.80      0.67        61
          9       0.41      0.62      0.49        32

avg / total       0.70      0.62      0.61       236

[ 8  3  3  0  0  0  0  0  9  4  1 16  1  0  0  0  0  0  3  4  0  0 16  0
  0  0  0  0  1  2  0  0  0  9  0  0  0  0  1  0  0  0  0  0  4  0  0  0
  0  7  0  2  0  0  1 14  0  0  2  1  0  5  3  0  0  0  9  0  0  1  0  1
  2  0  0  0  0  2  8  0  2  0  0  0  0  0  0  0 49 10  0  0  0  0  0  0
  0  0 12 20]
svc Accuracy:  0.6228813559322034
svc F1:  0.6133001177

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



LR F1:  0.7560490751484541
For name:  g_guidi
total sample size before apply threshold:  37
Counter({'0000-0002-3061-9870': 15, '0000-0003-3199-6624': 11, '0000-0001-9535-9152': 5, '0000-0002-1393-326X': 4, '0000-0002-8857-0096': 2})
['0000-0003-3199-6624', '0000-0002-3061-9870']
Total sample size after apply threshold:  26
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(26, 15)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        10
          1       0.73      0.73      0.73        26
          2       0.71      0.52      0.60        29
          3       0.60      0.82      0.69        33
          4       0.91      0.81      0.85        36

avg / total       0.76      0.75      0.75       134

[10  0  0  0  0  0 19  2  3  2  0  2 15 11  1  0  2  4 27  0  0  3  0  4
 29]
svc Accuracy:  0.746268656716418
svc F1:  0.7752036199095023
             precision    recall  f1-score   support

          0       1.00      0.80      0.89        10
          1       0.67      0.46      0.55        26
          2       0.57      0.41      0.48        29
          3       0.59      0.88      0.71        33
          4       0.71      0.75      0.73        36

avg / total       0.66      0.66      0.65       134

[ 8  1  1  0  0  0 12  4  3  7  0  2 12 12  3  0  1  2 29  1  0  2  2  5
 27]
LR Accuracy:  0.6567164179104478
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.66      0.68      0.67        34
          1       0.93      0.88      0.90        43
          2       0.00      0.00      0.00        12
          3       0.67      0.88      0.76        42

avg / total       0.69      0.75      0.71       131

[23  1  0 10  4 38  0  1  4  1  0  7  4  1  0 37]
LR Accuracy:  0.7480916030534351
LR F1:  0.5835787923416789
For name:  a_maleki
total sample size before apply threshold:  25
Counter({'0000-0001-5490-3350': 15, '0000-0001-8261-8717': 5, '0000-0003-3203-7492': 4, '0000-0001-7888-1985': 1})
['0000-0001-5490-3350']
Total sample size after apply threshold:  15
For name:  j_moon
total sample size before apply threshold:  203
Counter({'0000-0001-8071-1491': 96, '0000-0001-6327-0575': 23, '0000-0001-7776-6889': 19, '0000-0002-9182-5475': 17, '0000-0001-9760-297X': 13, '0000-0002-9274-4554': 12, '0000-0002-8625-6562': 8, '0000-0003-1282-4528': 7, '0000-0003-1569-3068': 2, '0000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



Total sample size after apply threshold:  0
For name:  f_ortega
total sample size before apply threshold:  368
Counter({'0000-0003-2001-1121': 205, '0000-0003-2111-769X': 86, '0000-0002-4730-9270': 38, '0000-0002-3172-2095': 22, '0000-0002-7431-354X': 9, '0000-0001-7850-2105': 7, '0000-0003-0231-2051': 1})
['0000-0002-3172-2095', '0000-0003-2001-1121', '0000-0002-4730-9270', '0000-0003-2111-769X']
Total sample size after apply threshold:  351
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(351, 135)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.82      0.64      0.72        44
          1       0.89      0.38      0.53        21
          2       0.68      0.77      0.72        73
          3       1.00      0.20      0.33        10
          4       0.62      0.97      0.76       107
          5       0.70      0.35      0.47        20
          6       0.73      0.47      0.57        51
          7       0.71      0.68      0.70        59
          8       1.00      0.36      0.53        14

avg / total       0.72      0.69      0.67       399

[ 28   0   1   0   9   1   0   5   0   0   8   1   0  12   0   0   0   0
   2   0  56   0   3   0   9   3   0   2   0   0   2   6   0   0   0   0
   0   0   1   0 104   1   0   1   0   0   1   0   0   9   7   0   3   0
   0   0  21   0   5   0  24   1   0   2   0   2   0  14   1   0  40   0
   0   0   0   0   6   0   0   3   5]
svc Accuracy:  0.6867167919799498
svc F1:  0.5915136519580307
             precision

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.83      0.98      0.90       101
          1       0.33      0.05      0.08        21

avg / total       0.75      0.82      0.76       122

[99  2 20  1]
MNB Accuracy:  0.819672131147541
MNB F1:  0.4916666666666667
             precision    recall  f1-score   support

          0       0.90      0.95      0.92       101
          1       0.67      0.48      0.56        21

avg / total       0.86      0.87      0.86       122

[96  5 11 10]
svc Accuracy:  0.8688524590163934
svc F1:  0.7393162393162394
             precision    recall  f1-score   support

          0       0.83      1.00      0.91       101
          1       0.00      0.00      0.00        21

avg / total       0.69      0.83      0.75       122

[101   0  21   0]
LR Accuracy:  0.8278688524590164
LR F1:  0.45291479820627806
For name:  h_song
total sample size before apply threshold:  210
Counter({'0000-0001-5684-4059': 88, '0000-0001-5553-2539': 3

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.72      0.97      0.82        29
          1       1.00      0.14      0.25        14
          2       0.72      0.94      0.81        88
          3       0.73      0.37      0.49        30
          4       0.89      0.40      0.55        20

avg / total       0.76      0.73      0.69       181

[28  0  1  0  0  2  2  9  1  0  2  0 83  2  1  5  0 14 11  0  2  0  9  1
  8]
MNB Accuracy:  0.7292817679558011
MNB F1:  0.5855735857561416
             precision    recall  f1-score   support

          0       1.00      0.97      0.98        29
          1       1.00      0.71      0.83        14
          2       0.73      0.94      0.82        88
          3       0.65      0.37      0.47        30
          4       0.92      0.55      0.69        20

avg / total       0.80      0.79      0.77       181

[28  0  1  0  0  0 10  2  2  0  0  0 83  4  1  0  0 19 11  0  0  0  9  0
 11]
svc Accuracy:  0.7900552486187845


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(73, 43)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(73, 12)
2
(73, 55)
             precision    recall  f1-score   support

          0       0.43

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(107, 76)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(107, 17)
2
(107, 93)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.62      0.97      0.76        60
          1       0.00      0.00      0.00        14
          2       0.70      0.37      0.48        19
          3       1.00      0.29      0.44        14

avg / total       0.60      0.64      0.57       107

[58  0  2  0 13  0  1  0 12  0  7  0 10  0  0  4]
LR Accuracy:  0.6448598130841121
LR F1:  0.42134324994365563
For name:  r_freitas
total sample size before apply threshold:  73
Counter({'0000-0003-4900-3897': 48, '0000-0001-8605-2925': 6, '0000-0002-0123-7232': 6, '0000-0002-4448-6458': 5, '0000-0001-5811-5255': 5, '0000-0002-1645-4125': 2, '0000-0001-8836-1422': 1})
['0000-0003-4900-3897']
Total sample size after apply threshold:  48
For name:  c_yun
total sample size before apply threshold:  284
Counter({'0000-0002-9466-4531': 149, '0000-0002-0041-2887': 98, '0000-0003-2204-8067': 36, '0000-0002-6747-4628': 1})
['0000-0002-9466-4531', '0000-0002-0041-2887', '0000-0003

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        21
          1       0.34      0.88      0.49        68
          2       0.45      0.65      0.53        69
          3       0.00      0.00      0.00        17
          4       0.00      0.00      0.00        14
          5       0.67      0.08      0.14        25
          6       0.00      0.00      0.00        10
          7       0.80      0.38      0.51        32
          8       0.00      0.00      0.00        12
          9       0.00      0.00      0.00        15
         10       0.80      0.18      0.30        22
         11       0.00      0.00      0.00        18
         12       0.00      0.00      0.00        12
         13       0.56      0.84      0.67        68

avg / total       0.38      0.45      0.35       403

[ 0 10  6  0  0  0  0  0  0  0  0  0  0  5  0 60  4  0  0  0  0  0  0  0
  0  0  0  4  0 15 45  0  0  0  0  1  0  0  0  0  0  8  0 10  1  0  0  0
  0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.60      0.56      0.58        16
          1       0.53      0.67      0.59        24
          2       0.58      0.41      0.48        17

avg / total       0.57      0.56      0.56        57

[ 9  7  0  3 16  5  3  7  7]
MNB Accuracy:  0.5614035087719298
MNB F1:  0.5519987915241901
             precision    recall  f1-score   support

          0       0.56      0.56      0.56        16
          1       0.55      0.67      0.60        24
          2       0.67      0.47      0.55        17

avg / total       0.59      0.58      0.58        57

[ 9  7  0  4 16  4  3  6  8]
svc Accuracy:  0.5789473684210527
svc F1:  0.5726659076122316
             precision    recall  f1-score   support

          0       0.60      0.56      0.58        16
          1       0.53      0.67      0.59        24
          2       0.58      0.41      0.48        17

avg / total       0.57      0.56      0.56        57

[ 9  7  0  3 1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(168, 76)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(168, 22)
2
(168, 98)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.49      0.91      0.63        85
          1       0.50      0.14      0.21        22
          2       0.00      0.00      0.00        11
          3       0.00      0.00      0.00        12
          4       0.33      0.06      0.10        18
          5       0.00      0.00      0.00        12
          6       0.82      0.58      0.68        31
          7       0.00      0.00      0.00        16
          8       0.47      0.76      0.58        54
          9       0.00      0.00      0.00        15

avg / total       0.40      0.51      0.41       276

[77  1  0  0  1  0  2  0  4  0 13  3  0  0  0  0  0  0  6  0  4  0  0  0
  0  0  0  0  7  0  8  0  0  0  0  0  0  0  4  0 14  1  0  0  1  0  0  0
  2  0  8  1  0  0  0  0  0  0  3  0 11  0  0  0  0  0 18  0  2  0  6  0
  0  0  1  0  0  0  9  0 13  0  0  0  0  0  0  0 41  0  4  0  0  0  0  0
  2  0  9  0]
MNB Accuracy:  0.5072463768115942
MNB F1:  0.2204074232

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.70      0.93      0.80        15
          1       1.00      0.60      0.75        10
          2       0.82      0.75      0.78        12

avg / total       0.82      0.78      0.78        37

[14  0  1  3  6  1  3  0  9]
MNB Accuracy:  0.7837837837837838
MNB F1:  0.7775362318840578
             precision    recall  f1-score   support

          0       0.65      1.00      0.79        15
          1       1.00      0.70      0.82        10
          2       1.00      0.58      0.74        12

avg / total       0.86      0.78      0.78        37

[15  0  0  3  7  0  5  0  7]
svc Accuracy:  0.7837837837837838
svc F1:  0.7832817337461301
             precision    recall  f1-score   support

          0       0.64      0.93      0.76        15
          1       1.00      0.60      0.75        10
          2       0.78      0.58      0.67        12

avg / total       0.78      0.73      0.73        37

[14  0  1  3  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.08      0.14        13
          1       0.74      1.00      0.85       102
          2       1.00      0.07      0.12        15
          3       0.75      0.78      0.76        27
          4       0.86      0.33      0.48        18

avg / total       0.79      0.75      0.68       175

[  1  11   0   1   0   0 102   0   0   0   0   9   1   4   1   0   6   0
  21   0   0  10   0   2   6]
MNB Accuracy:  0.7485714285714286
MNB F1:  0.47229870129870133
             precision    recall  f1-score   support

          0       1.00      0.38      0.56        13
          1       0.81      1.00      0.89       102
          2       0.80      0.53      0.64        15
          3       0.86      0.70      0.78        27
          4       0.75      0.50      0.60        18

avg / total       0.83      0.82      0.80       175

[  5   7   0   1   0   0 102   0   0   0   0   3   8   1   3   0   8   0
  19   0   0 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


156
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(156, 79)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(156, 25)
2
(156, 104)
             precision    recall  f1-score   support

          0   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(35, 21)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(35, 11)
2
(35, 32)
             precision    recall  f1-score   support

          0       0.76

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


['0000-0003-0664-2891', '0000-0002-9519-5714', '0000-0001-7811-134X']
Total sample size after apply threshold:  123
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(123, 73)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


213
Counter({'0000-0002-7258-7977': 60, '0000-0002-6296-5137': 55, '0000-0001-7202-0527': 48, '0000-0003-2315-2261': 26, '0000-0003-2735-6608': 22, '0000-0002-7596-9735': 1, '0000-0002-1593-8064': 1})
['0000-0001-7202-0527', '0000-0002-7258-7977', '0000-0003-2315-2261', '0000-0003-2735-6608', '0000-0002-6296-5137']
Total sample size after apply threshold:  211
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(211, 84)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.74      0.90      0.81        48
          1       1.00      0.92      0.96        60
          2       0.96      0.85      0.90        26
          3       0.62      0.36      0.46        22
          4       0.76      0.85      0.80        55

avg / total       0.83      0.83      0.82       211

[43  0  0  1  4  4 55  0  0  1  1  0 22  1  2  6  0  0  8  8  4  0  1  3
 47]
svc Accuracy:  0.8293838862559242
svc F1:  0.7852726676165093
             precision    recall  f1-score   support

          0       0.76      0.79      0.78        48
          1       0.92      0.90      0.91        60
          2       1.00      0.77      0.87        26
          3       0.75      0.41      0.53        22
          4       0.70      0.89      0.78        55

avg / total       0.82      0.81      0.80       211

[38  2  0  1  7  3 54  0  0  3  1  1 20  1  3  3  2  0  9  8  5  0  0  1
 49]
LR Accuracy:  0.8056872037914692
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(201, 81)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(201, 17)
2
(201, 98)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.62      0.42      0.50        12
          1       0.98      0.72      0.83        65
          2       0.85      0.99      0.91       124

avg / total       0.88      0.87      0.86       201

[  5   1   6   2  47  16   1   0 123]
svc Accuracy:  0.8706467661691543
svc F1:  0.7487855161145288
             precision    recall  f1-score   support

          0       0.75      0.25      0.38        12
          1       0.95      0.63      0.76        65
          2       0.81      1.00      0.89       124

avg / total       0.85      0.84      0.82       201

[  3   2   7   1  41  23   0   0 124]
LR Accuracy:  0.835820895522388
LR F1:  0.6754485300648371
For name:  f_williams
total sample size before apply threshold:  149
Counter({'0000-0002-2998-2744': 84, '0000-0002-6194-2734': 33, '0000-0002-3046-9235': 29, '0000-0003-4144-1411': 2, '0000-0001-7507-4870': 1})
['0000-0002-2998-2744', '0000-0002-6194-2734', '0000-00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(146, 20)
2
(146, 97)
             precision    recall  f1-score   support

          0       0.79      0.99      0.88        84
          1       0.85      0.67      0.75        33
          2       0.87      0.45      0.59        29

avg / total       0.82      0.81      0.79       146

[83  0  1 10 22  1 12  4 13]
MNB Accuracy:  0.8082191780821918
MNB F1:  0.738326227026792
             precision    recall  f1-score   support

          0       0.82      0.98      0.89        84
          1       1.00      0.82      0.90        33
          2       0.84      0.55      0.67        29

avg / total       0.87      0.86      0.85       146

[82  0  2  5 27  1 13  0 16]
svc Accuracy:  0.8561643835616438
svc F1:  0.8193236714975844
             precision    recall  f1-score   support

          0       0.73      0.99      0.84        84
          1       1.00      0.64      0.78        33
          2       0.92      0.38      0.54        29

avg / total       0.83      0.79      0.77     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(52, 28)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(52, 17)
2
(52, 45)
             precision    recall  f1-score   support

          0       0.75

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(53, 35)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(53, 12)
2
(53, 47)
             precision    recall  f1-score   support

          0       0.50

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(167, 95)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(167, 15)
2
(167, 110)
             precision    recall  f1-score   support

          0       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.67      0.20      0.31        10
          1       0.78      0.64      0.70        44
          2       0.81      0.66      0.72        32
          3       0.59      0.88      0.71        68
          4       0.94      0.79      0.86        75

avg / total       0.77      0.74      0.74       229

[ 2  1  0  5  2  0 28  1 14  1  0  1 21 10  0  0  5  2 60  1  1  1  2 12
 59]
svc Accuracy:  0.74235807860262
svc F1:  0.6593923748185079
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.74      0.59      0.66        44
          2       0.70      0.50      0.58        32
          3       0.54      0.68      0.60        68
          4       0.69      0.79      0.74        75

avg / total       0.63      0.64      0.63       229

[ 0  1  0  4  5  0 26  3 11  4  0  3 16 12  1  0  4  2 46 16  1  1  2 12
 59]
LR Accuracy:  0.6419213973799127
LR 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.77      0.59      0.67        29
          1       0.89      0.90      0.89        61
          2       0.83      0.95      0.89        42

avg / total       0.84      0.85      0.84       132

[17  7  5  3 55  3  2  0 40]
svc Accuracy:  0.8484848484848485
svc F1:  0.8166214995483289
             precision    recall  f1-score   support

          0       1.00      0.62      0.77        29
          1       0.84      0.93      0.88        61
          2       0.80      0.88      0.84        42

avg / total       0.86      0.85      0.84       132

[18  6  5  0 57  4  0  5 37]
LR Accuracy:  0.8484848484848485
LR F1:  0.8301958226500532
For name:  a_rao
total sample size before apply threshold:  93
Counter({'0000-0002-2676-2762': 36, '0000-0003-0320-2962': 20, '0000-0002-2550-6097': 11, '0000-0001-6440-1274': 8, '0000-0003-2319-6539': 5, '0000-0002-2474-5010': 5, '0000-0003-4480-3190': 3, '0000-0003-4879-1123': 2, '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


h_cui
total sample size before apply threshold:  40
Counter({'0000-0001-6394-4808': 11, '0000-0003-3358-8958': 10, '0000-0002-9870-748X': 9, '0000-0002-6343-1014': 9, '0000-0002-8627-8534': 1})
['0000-0001-6394-4808', '0000-0003-3358-8958']
Total sample size after apply threshold:  21
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(21, 15)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preproce

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



svc Accuracy:  0.6057692307692307
svc F1:  0.5980141843971631
             precision    recall  f1-score   support

          0       0.56      0.56      0.56        16
          1       0.53      0.53      0.53        17
          2       0.75      0.72      0.73        25
          3       0.63      0.60      0.62        20
          4       0.75      0.81      0.78        26

avg / total       0.66      0.66      0.66       104

[ 9  3  0  3  1  1  9  2  2  3  1  1 18  2  3  4  1  3 12  0  1  3  1  0
 21]
LR Accuracy:  0.6634615384615384
LR F1:  0.6439536070838592
For name:  j_fernandes
total sample size before apply threshold:  208
Counter({'0000-0002-2550-1640': 63, '0000-0003-1556-1698': 38, '0000-0001-5512-4092': 33, '0000-0002-8565-2942': 27, '0000-0002-6726-5324': 22, '0000-0002-9089-273X': 6, '0000-0001-8205-5870': 4, '0000-0001-6387-2939': 3, '0000-0002-4505-4809': 3, '0000-0001-6616-3513': 3, '0000-0003-0337-7084': 3, '0000-0003-1519-8032': 2, '0000-0003-0934-9244': 1})
['

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.65      0.95      0.77        63
          1       0.66      0.50      0.57        38
          2       0.90      0.67      0.77        27
          3       1.00      0.91      0.95        33
          4       0.75      0.41      0.53        22

avg / total       0.76      0.74      0.73       183

[60  2  0  0  1 16 19  2  0  1  5  3 18  0  1  2  1  0 30  0  9  4  0  0
  9]
svc Accuracy:  0.7431693989071039
svc F1:  0.717821578277384
             precision    recall  f1-score   support

          0       0.56      0.95      0.70        63
          1       0.46      0.34      0.39        38
          2       0.81      0.48      0.60        27
          3       1.00      0.82      0.90        33
          4       0.50      0.09      0.15        22

avg / total       0.65      0.63      0.59       183

[60  2  0  0  1 23 13  2  0  0  6  7 13  0  1  4  2  0 27  0 15  4  1  0
  2]
LR Accuracy:  0.6284153005464481
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 0.7926634768740032
For name:  d_zhang
total sample size before apply threshold:  94
Counter({'0000-0002-4175-5982': 17, '0000-0002-7665-2182': 12, '0000-0003-0779-6438': 11, '0000-0003-4280-0068': 8, '0000-0001-9295-4992': 7, '0000-0001-9508-8209': 7, '0000-0001-6930-5994': 6, '0000-0001-9478-5344': 6, '0000-0001-5809-0027': 5, '0000-0002-4149-4938': 4, '0000-0002-1581-2357': 4, '0000-0001-5956-4618': 2, '0000-0001-7063-7742': 2, '0000-0002-2541-837X': 1, '0000-0001-6259-7082': 1, '0000-0002-4515-2070': 1})
['0000-0002-4175-5982', '0000-0002-7665-2182', '0000-0003-0779-6438']
Total sample size after apply threshold:  40
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_patt

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



total sample size before apply threshold:  43
Counter({'0000-0001-9324-5901': 20, '0000-0002-9586-6303': 20, '0000-0003-0587-2505': 1, '0000-0002-5507-1987': 1, '0000-0002-7324-1660': 1})
['0000-0001-9324-5901', '0000-0002-9586-6303']
Total sample size after apply threshold:  40
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(40, 27)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



['0000-0001-6330-6048', '0000-0003-3473-4611', '0000-0002-0463-1024', '0000-0003-1098-3138', '0000-0002-4004-2518', '0000-0002-2334-5664']
Total sample size after apply threshold:  206
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(206, 113)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.67      0.29      0.40        14
          1       0.90      0.75      0.82        12
          2       0.86      0.93      0.89        27
          3       0.42      0.44      0.43        18
          4       0.64      0.83      0.72        30
          5       0.60      0.27      0.37        11
          6       0.56      0.64      0.60        28

avg / total       0.66      0.66      0.64       140

[ 4  0  1  3  5  1  0  0  9  0  0  0  0  3  0  0 25  2  0  0  0  0  0  2
  8  4  0  4  2  0  1  1 25  0  1  0  0  0  2  0  3  6  0  1  0  3  5  1
 18]
LR Accuracy:  0.6571428571428571
LR F1:  0.6061584392329734
For name:  n_ali
total sample size before apply threshold:  14
Counter({'0000-0001-8121-0939': 6, '0000-0003-2063-2745': 3, '0000-0003-1245-4299': 2, '0000-0002-8292-0091': 1, '0000-0003-0858-7849': 1, '0000-0003-2924-6429': 1})
[]
Total sample size after apply threshold:  0
For name:  h_ng
total sample size

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.82      1.00      0.90        41
          1       0.90      0.89      0.90        53
          2       0.60      0.23      0.33        13

avg / total       0.83      0.85      0.83       107

[41  0  0  4 47  2  5  5  3]
svc Accuracy:  0.8504672897196262
svc F1:  0.7098901098901099
             precision    recall  f1-score   support

          0       0.80      0.88      0.84        41
          1       0.77      0.89      0.82        53
          2       0.00      0.00      0.00        13

avg / total       0.69      0.78      0.73       107

[36  5  0  5 47  1  4  9  0]
LR Accuracy:  0.7757009345794392
LR F1:  0.5539235686114511
For name:  m_viana
total sample size before apply threshold:  139
Counter({'0000-0002-0464-4845': 34, '0000-0003-4356-8109': 31, '0000-0002-4073-3802': 29, '0000-0001-9665-2115': 26, '0000-0001-9288-2108': 13, '0000-0002-3074-767X': 5, '0000-0002-5657-5570': 1})
['0000-0001-9665-2115

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.95      0.77      0.85        26
          1       0.67      0.65      0.66        31
          2       0.71      0.86      0.78        29
          3       1.00      0.15      0.27        13
          4       0.64      0.85      0.73        34

avg / total       0.76      0.72      0.70       133

[20  3  0  0  3  1 20  5  0  5  0  1 25  0  3  0  3  3  2  5  0  3  2  0
 29]
MNB Accuracy:  0.7218045112781954
MNB F1:  0.6577790833123613
             precision    recall  f1-score   support

          0       1.00      0.81      0.89        26
          1       0.79      0.71      0.75        31
          2       0.89      0.86      0.88        29
          3       0.40      0.15      0.22        13
          4       0.55      0.82      0.66        34

avg / total       0.75      0.74      0.73       133

[21  0  0  0  5  0 22  1  1  7  0  1 25  0  3  0  2  1  2  8  0  3  1  2
 28]
svc Accuracy:  0.7368421052631579


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(22, 20)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(22, 9)
2
(22, 29)
             precision    recall  f1-score   support

          0       0.50      0.

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



2
(139, 79)
             precision    recall  f1-score   support

          0       0.86      0.99      0.92        68
          1       1.00      0.06      0.11        18
          2       0.78      0.89      0.83        53

avg / total       0.85      0.83      0.78       139

[67  0  1  5  1 12  6  0 47]
MNB Accuracy:  0.8273381294964028
MNB F1:  0.6183099280508216
             precision    recall  f1-score   support

          0       0.85      0.97      0.90        68
          1       0.36      0.22      0.28        18
          2       0.88      0.83      0.85        53

avg / total       0.80      0.82      0.80       139

[66  2  0  8  4  6  4  5 44]
svc Accuracy:  0.8201438848920863
svc F1:  0.6781135300151493
             precision    recall  f1-score   support

          0       0.85      1.00      0.92        68
          1       0.50      0.06      0.10        18
          2       0.82      0.89      0.85        53

avg / total       0.79      0.83      0.79       139

[

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(69, 40)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(69, 10)
2
(69, 50)
             precision    recall  f1-score   support

          0       0.79

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(35, 19)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(35, 11)
2
(35, 30)
             precision    recall  f1-score   support

          0       0.89

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(100, 58)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(100, 18)
2
(100, 76)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(269, 105)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(269, 17)
2
(269, 122)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.95      1.00      0.97       238
          1       1.00      0.58      0.73        31

avg / total       0.95      0.95      0.95       269

[238   0  13  18]
svc Accuracy:  0.9516728624535316
svc F1:  0.854054505237678
             precision    recall  f1-score   support

          0       0.89      1.00      0.94       238
          1       1.00      0.10      0.18        31

avg / total       0.91      0.90      0.86       269

[238   0  28   3]
LR Accuracy:  0.895910780669145
LR F1:  0.5604575163398693
For name:  k_jacobsen
total sample size before apply threshold:  113
Counter({'0000-0002-4198-6246': 93, '0000-0002-1121-2979': 17, '0000-0002-3450-0850': 2, '0000-0003-0135-0988': 1})
['0000-0002-4198-6246', '0000-0002-1121-2979']
Total sample size after apply threshold:  110
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', inpu

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.96      1.00      0.98        93
          1       1.00      0.76      0.87        17

avg / total       0.97      0.96      0.96       110

[93  0  4 13]
svc Accuracy:  0.9636363636363636
svc F1:  0.9228070175438596
             precision    recall  f1-score   support

          0       0.90      1.00      0.95        93
          1       1.00      0.41      0.58        17

avg / total       0.92      0.91      0.89       110

[93  0 10  7]
LR Accuracy:  0.9090909090909091
LR F1:  0.7661564625850341
For name:  s_kelly
total sample size before apply threshold:  102
Counter({'0000-0003-4002-048X': 31, '0000-0001-8583-5362': 26, '0000-0002-8245-0181': 20, '0000-0003-3533-5268': 12, '0000-0002-0375-1040': 11, '0000-0002-3078-8404': 2})
['0000-0002-8245-0181', '0000-0001-8583-5362', '0000-0003-3533-5268', '0000-0002-0375-1040', '0000-0003-4002-048X']
Total sample size after apply threshold:  100
(0, 0)
TfidfVectorize

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.48      0.80      0.60        20
          1       0.61      0.54      0.57        26
          2       1.00      0.75      0.86        12
          3       0.83      0.45      0.59        11
          4       0.69      0.65      0.67        31

avg / total       0.68      0.64      0.64       100

[16  1  0  1  2  5 14  0  0  7  3  0  9  0  0  5  1  0  5  0  4  7  0  0
 20]
svc Accuracy:  0.64
svc F1:  0.6574493948522806
             precision    recall  f1-score   support

          0       0.54      0.70      0.61        20
          1       0.64      0.62      0.63        26
          2       0.64      0.58      0.61        12
          3       1.00      0.45      0.62        11
          4       0.61      0.65      0.62        31

avg / total       0.65      0.62      0.62       100

[14  1  2  0  3  2 16  1  0  7  2  0  7  0  3  4  1  1  5  0  4  7  0  0
 20]
LR Accuracy:  0.62
LR F1:  0.6189684569479966
Fo

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.86      0.35      0.50        17
          1       1.00      0.18      0.31        11
          2       1.00      0.08      0.14        13
          3       0.82      1.00      0.90       144

avg / total       0.85      0.83      0.78       185

[  6   0   0  11   1   2   0   8   0   0   1  12   0   0   0 144]
MNB Accuracy:  0.827027027027027
MNB F1:  0.46334269179096765
             precision    recall  f1-score   support

          0       0.36      0.24      0.29        17
          1       1.00      0.45      0.62        11
          2       1.00      0.62      0.76        13
          3       0.85      0.95      0.90       144

avg / total       0.83      0.83      0.82       185

[  4   0   0  13   0   5   0   6   0   0   8   5   7   0   0 137]
svc Accuracy:  0.8324324324324325
svc F1:  0.6427449258391882
             precision    recall  f1-score   support

          0       1.00      0.12      0.21      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(297, 123)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(297, 30)
2
(297, 153)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.73      0.40      0.52        20
          1       0.87      0.68      0.76        19
          2       0.96      0.77      0.86        35
          3       0.70      0.72      0.71        69
          4       0.74      0.94      0.83       112
          5       1.00      0.80      0.89        15
          6       0.89      0.59      0.71        27

avg / total       0.79      0.78      0.77       297

[  8   0   1   3   8   0   0   0  13   0   6   0   0   0   2   0  27   1
   5   0   0   1   2   0  50  16   0   0   0   0   0   5 105   0   2   0
   0   0   1   2  12   0   0   0   0   5   6   0  16]
svc Accuracy:  0.7777777777777778
svc F1:  0.7541478770832691
             precision    recall  f1-score   support

          0       0.71      0.25      0.37        20
          1       1.00      0.11      0.19        19
          2       0.96      0.63      0.76        35
          3       0.76      0.74      0.75   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.86      0.84      0.85        87
          1       0.69      0.94      0.79       117
          2       0.81      0.58      0.68        38
          3       0.00      0.00      0.00        10
          4       1.00      0.58      0.74        24
          5       0.93      0.76      0.83        70

avg / total       0.80      0.79      0.78       346

[ 73   9   2   2   0   1   3 110   1   0   0   3   2  13  22   1   0   0
   4   5   1   0   0   0   0  10   0   0  14   0   3  13   1   0   0  53]
svc Accuracy:  0.7861271676300579
svc F1:  0.6485786479157833
             precision    recall  f1-score   support

          0       0.86      0.77      0.81        87
          1       0.61      0.91      0.74       117
          2       0.80      0.53      0.63        38
          3       0.00      0.00      0.00        10
          4       1.00      0.42      0.59        24
          5       0.88      0.74      0.81   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



Total sample size after apply threshold:  195
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(195, 86)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(195, 29)
2
(195, 115)
             precision   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.82      0.60      0.69        15
          1       0.81      1.00      0.90        56
          2       1.00      0.40      0.57        15

avg / total       0.85      0.83      0.80        86

[ 9  6  0  0 56  0  2  7  6]
svc Accuracy:  0.8255813953488372
svc F1:  0.719912087912088
             precision    recall  f1-score   support

          0       1.00      0.20      0.33        15
          1       0.68      1.00      0.81        56
          2       1.00      0.07      0.12        15

avg / total       0.79      0.70      0.61        86

[ 3 12  0  0 56  0  0 14  1]
LR Accuracy:  0.6976744186046512
LR F1:  0.42330917874396135
For name:  m_gutierrez
total sample size before apply threshold:  32
Counter({'0000-0003-3199-0337': 30, '0000-0003-0964-6222': 2})
['0000-0003-3199-0337']
Total sample size after apply threshold:  30
For name:  s_moon
total sample size before apply threshold:  85
Counter({'0000-0001

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.29      0.44        14
          1       1.00      0.14      0.25        14
          2       0.79      0.52      0.62        29
          3       0.58      0.97      0.72        74
          4       0.60      0.31      0.41        39

avg / total       0.69      0.62      0.57       170

[ 4  0  0 10  0  0  2  1  8  3  0  0 15 10  4  0  0  1 72  1  0  0  2 25
 12]
LR Accuracy:  0.6176470588235294
LR F1:  0.4899684391827311
For name:  a_nielsen
total sample size before apply threshold:  132
Counter({'0000-0003-4372-9961': 70, '0000-0001-6616-0187': 27, '0000-0003-4464-8549': 17, '0000-0002-6469-4473': 7, '0000-0002-4837-9449': 3, '0000-0001-9842-5303': 2, '0000-0002-4741-7992': 2, '0000-0002-8955-9374': 2, '0000-0003-2199-2857': 1, '0000-0002-7130-6432': 1})
['0000-0001-6616-0187', '0000-0003-4372-9961', '0000-0003-4464-8549']
Total sample size after apply threshold:  114
(0, 0)
TfidfVectorizer(analyzer

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Counter({'0000-0001-8422-6792': 35, '0000-0002-2187-479X': 29, '0000-0002-5677-3024': 19, '0000-0001-8739-6893': 1})
['0000-0001-8422-6792', '0000-0002-5677-3024', '0000-0002-2187-479X']
Total sample size after apply threshold:  83
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(83, 47)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, s

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



[12  5  1  3 10  2  3  2  7]
svc Accuracy:  0.6444444444444445
svc F1:  0.6426767676767676
             precision    recall  f1-score   support

          0       0.60      0.67      0.63        18
          1       0.47      0.53      0.50        15
          2       0.62      0.42      0.50        12

avg / total       0.56      0.56      0.55        45

[12  5  1  5  8  2  3  4  5]
LR Accuracy:  0.5555555555555556
LR F1:  0.543859649122807
For name:  m_parker
total sample size before apply threshold:  280
Counter({'0000-0002-3101-1138': 232, '0000-0002-7172-5231': 13, '0000-0003-1007-4612': 11, '0000-0002-3772-3742': 10, '0000-0002-1052-9296': 6, '0000-0002-3170-3505': 4, '0000-0002-1597-4858': 3, '0000-0001-9845-9108': 1})
['0000-0002-3101-1138', '0000-0003-1007-4612', '0000-0002-7172-5231', '0000-0002-3772-3742']
Total sample size after apply threshold:  266
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding=

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.92      1.00      0.96       232
          1       0.67      0.18      0.29        11
          2       1.00      0.46      0.63        13
          3       1.00      0.60      0.75        10

avg / total       0.92      0.92      0.91       266

[231   1   0   0   9   2   0   0   7   0   6   0   4   0   0   6]
svc Accuracy:  0.9210526315789473
svc F1:  0.6559537430532854
             precision    recall  f1-score   support

          0       0.88      1.00      0.94       232
          1       0.00      0.00      0.00        11
          2       1.00      0.15      0.27        13
          3       0.00      0.00      0.00        10

avg / total       0.82      0.88      0.83       266

[232   0   0   0  11   0   0   0  11   0   2   0  10   0   0   0]
LR Accuracy:  0.8796992481203008
LR F1:  0.30053763440860215
For name:  h_huang
total sample size before apply threshold:  224
Counter({'0000-0002-3386-0934': 87, '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.86      0.33      0.48        18
          1       0.38      0.50      0.43        24
          2       1.00      0.50      0.67        14
          3       0.00      0.00      0.00        16
          4       0.66      0.86      0.75        87
          5       0.12      0.06      0.08        16

avg / total       0.56      0.58      0.54       175

[ 6  5  0  1  6  0  0 12  0  1 11  0  1  1  7  1  4  0  0  4  0  0  8  4
  0  7  0  2 75  3  0  3  0  2 10  1]
svc Accuracy:  0.5771428571428572
svc F1:  0.4008066808813078
             precision    recall  f1-score   support

          0       0.83      0.28      0.42        18
          1       0.42      0.33      0.37        24
          2       1.00      0.36      0.53        14
          3       0.00      0.00      0.00        16
          4       0.59      0.93      0.72        87
          5       0.50      0.06      0.11        16

avg / total       0.56     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.83      0.50      0.62        20
          1       0.77      0.94      0.85        35

avg / total       0.79      0.78      0.77        55

[10 10  2 33]
LR Accuracy:  0.7818181818181819
LR F1:  0.7355769230769231
For name:  y_xu
total sample size before apply threshold:  137
Counter({'0000-0002-2195-1695': 47, '0000-0002-6689-7768': 19, '0000-0002-6406-7832': 17, '0000-0001-6643-3173': 9, '0000-0002-0763-9953': 8, '0000-0002-4479-6157': 8, '0000-0001-7429-4724': 5, '0000-0002-5578-4960': 4, '0000-0002-1887-0632': 4, '0000-0002-9834-3006': 3, '0000-0002-9945-3514': 3, '0000-0001-8488-0399': 2, '0000-0001-9106-0049': 1, '0000-0003-4549-6110': 1, '0000-0002-2341-7971': 1, '0000-0003-4420-6353': 1, '0000-0002-7963-6890': 1, '0000-0002-7962-6668': 1, '0000-0003-1355-0055': 1, '0000-0002-1563-8811': 1})
['0000-0002-6406-7832', '0000-0002-2195-1695', '0000-0002-6689-7768']
Total sample size after apply threshold:  83


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(37, 20)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(37, 12)
2
(37, 32)
             precision    recall  f1-score   support

          0       0.76

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.80      0.18      0.30        22
          1       0.88      1.00      0.93       216
          2       0.00      0.00      0.00        11

avg / total       0.84      0.88      0.84       249

[  4  18   0   1 215   0   0  11   0]
MNB Accuracy:  0.8795180722891566
MNB F1:  0.4103596349973162
             precision    recall  f1-score   support

          0       0.88      0.32      0.47        22
          1       0.91      1.00      0.95       216
          2       1.00      0.55      0.71        11

avg / total       0.92      0.92      0.90       249

[  7  15   0   1 215   0   0   5   6]
svc Accuracy:  0.9156626506024096
svc F1:  0.7086619422343956
             precision    recall  f1-score   support

          0       1.00      0.05      0.09        22
          1       0.87      1.00      0.93       216
          2       0.00      0.00      0.00        11

avg / total       0.84      0.87      0.82       2

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



Counter({'0000-0001-6558-4973': 17, '0000-0002-2775-131X': 8, '0000-0002-7105-2815': 6, '0000-0003-3464-6208': 5, '0000-0002-7241-8719': 5, '0000-0001-8496-156X': 3, '0000-0003-3908-0741': 2, '0000-0001-5277-4575': 2, '0000-0001-6534-1824': 1})
['0000-0001-6558-4973']
Total sample size after apply threshold:  17
For name:  j_richard
total sample size before apply threshold:  179
Counter({'0000-0002-0440-2387': 110, '0000-0003-1503-3035': 57, '0000-0001-5750-0418': 10, '0000-0003-2514-8282': 2})
['0000-0002-0440-2387', '0000-0003-1503-3035', '0000-0001-5750-0418']
Total sample size after apply threshold:  177
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.69      0.87      0.77       119
          1       0.78      0.71      0.74       133
          2       0.00      0.00      0.00        19

avg / total       0.69      0.73      0.70       271

[104  15   0  39  94   0   8  11   0]
MNB Accuracy:  0.7306273062730627
MNB F1:  0.5044844581076465
             precision    recall  f1-score   support

          0       0.71      0.82      0.76       119
          1       0.75      0.74      0.75       133
          2       0.33      0.05      0.09        19

avg / total       0.71      0.73      0.71       271

[98 21  0 33 98  2  7 11  1]
svc Accuracy:  0.7269372693726938
svc F1:  0.532934051198317
             precision    recall  f1-score   support

          0       0.70      0.73      0.72       119
          1       0.70      0.77      0.74       133
          2       0.00      0.00      0.00        19

avg / total       0.65      0.70      0.68       271

[ 87  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(67, 39)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(67, 15)
2
(67, 54)
             precision    recall  f1-score   support

          0       0.75

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.71      0.83        35
          1       0.94      1.00      0.97       167

avg / total       0.95      0.95      0.95       202

[ 25  10   0 167]
svc Accuracy:  0.9504950495049505
svc F1:  0.9021317829457365
             precision    recall  f1-score   support

          0       1.00      0.34      0.51        35
          1       0.88      1.00      0.94       167

avg / total       0.90      0.89      0.86       202

[ 12  23   0 167]
LR Accuracy:  0.8861386138613861
LR F1:  0.7231062637821086
For name:  m_reilly
total sample size before apply threshold:  20
Counter({'0000-0001-8029-0084': 17, '0000-0002-5526-8245': 1, '0000-0001-8746-3224': 1, '0000-0003-2506-3190': 1})
['0000-0001-8029-0084']
Total sample size after apply threshold:  17
For name:  d_nguyen
total sample size before apply threshold:  25
Counter({'0000-0002-4997-555X': 8, '0000-0002-3283-3504': 7, '0000-0001-6420-7308': 3, '0000-000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(135, 87)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(135, 29)
2
(135, 116)
             precision    recall  f1-score   support

          0       0.42   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.77      0.98      0.87       125
          1       1.00      0.40      0.57        15
          2       1.00      0.72      0.84        18
          3       0.67      0.17      0.27        12
          4       0.67      0.75      0.71        16
          5       1.00      0.28      0.43        18

avg / total       0.82      0.79      0.76       204

[123   0   0   1   1   0   6   6   0   0   3   0   3   0  13   0   2   0
  10   0   0   2   0   0   4   0   0   0  12   0  13   0   0   0   0   5]
svc Accuracy:  0.7892156862745098
svc F1:  0.613944510041669
             precision    recall  f1-score   support

          0       0.69      1.00      0.81       125
          1       1.00      0.07      0.12        15
          2       1.00      0.50      0.67        18
          3       0.00      0.00      0.00        12
          4       0.67      0.50      0.57        16
          5       0.00      0.00      0.00    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



[24  4  7 17]
MNB Accuracy:  0.7884615384615384
MNB F1:  0.784557438794727
             precision    recall  f1-score   support

          0       0.85      0.79      0.81        28
          1       0.77      0.83      0.80        24

avg / total       0.81      0.81      0.81        52

[22  6  4 20]
svc Accuracy:  0.8076923076923077
svc F1:  0.8074074074074074
             precision    recall  f1-score   support

          0       0.78      0.75      0.76        28
          1       0.72      0.75      0.73        24

avg / total       0.75      0.75      0.75        52

[21  7  6 18]
LR Accuracy:  0.75
LR F1:  0.7491651205936921
For name:  d_collins
total sample size before apply threshold:  31
Counter({'0000-0001-6754-9290': 8, '0000-0002-6248-9644': 7, '0000-0002-3283-0733': 6, '0000-0003-2274-0889': 5, '0000-0003-2484-1640': 2, '0000-0002-8432-7021': 1, '0000-0001-8891-1893': 1, '0000-0002-7981-3586': 1})
[]
Total sample size after apply threshold:  0
For name:  l_davies
total 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.95      0.82      0.88        22
          1       0.96      0.99      0.98       104

avg / total       0.96      0.96      0.96       126

[ 18   4   1 103]
svc Accuracy:  0.9603174603174603
svc F1:  0.9271760490116749
             precision    recall  f1-score   support

          0       1.00      0.41      0.58        22
          1       0.89      1.00      0.94       104

avg / total       0.91      0.90      0.88       126

[  9  13   0 104]
LR Accuracy:  0.8968253968253969
LR F1:  0.760910815939279
For name:  a_fontana
total sample size before apply threshold:  203
Counter({'0000-0002-6660-5315': 65, '0000-0002-5453-461X': 59, '0000-0002-5391-7520': 44, '0000-0002-8481-1219': 16, '0000-0002-4791-8746': 14, '0000-0003-3820-2823': 3, '0000-0003-1556-2770': 2})
['0000-0002-5391-7520', '0000-0002-5453-461X', '0000-0002-4791-8746', '0000-0002-6660-5315', '0000-0002-8481-1219']
Total sample size after apply th

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.64      0.61      0.63        44
          1       0.63      0.58      0.60        59
          2       1.00      0.43      0.60        14
          3       0.71      0.92      0.80        65
          4       0.91      0.62      0.74        16

avg / total       0.71      0.69      0.68       198

[27  9  0  8  0 12 34  0 12  1  0  6  6  2  0  3  2  0 60  0  0  3  0  3
 10]
svc Accuracy:  0.6919191919191919
svc F1:  0.6740835257978703
             precision    recall  f1-score   support

          0       0.58      0.59      0.58        44
          1       0.67      0.59      0.63        59
          2       0.00      0.00      0.00        14
          3       0.63      0.92      0.75        65
          4       1.00      0.38      0.55        16

avg / total       0.62      0.64      0.61       198

[26  6  0 12  0 11 35  0 13  0  2  8  0  4  0  3  2  0 60  0  3  1  0  6
  6]
LR Accuracy:  0.6414141414141414
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.80      0.50      0.62        32
          1       1.00      0.64      0.78        11
          2       0.48      0.31      0.38        32
          3       0.67      0.96      0.79       179
          4       1.00      0.50      0.67        20
          5       1.00      0.29      0.45        24
          6       0.67      0.19      0.30        21
          7       0.77      0.59      0.67        34

avg / total       0.72      0.69      0.66       353

[ 16   0   3  13   0   0   0   0   1   7   1   2   0   0   0   0   1   0
  10  19   0   0   0   2   1   0   3 171   0   0   1   3   0   0   1   9
  10   0   0   0   0   0   0  15   0   7   1   1   0   0   1  16   0   0
   4   0   1   0   2  11   0   0   0  20]
svc Accuracy:  0.6940509915014165
svc F1:  0.5797462891419489
             precision    recall  f1-score   support

          0       1.00      0.31      0.48        32
          1       1.00      0.18     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  603
Counter({'0000-0002-3650-9381': 154, '0000-0003-1673-2954': 113, '0000-0002-2120-2766': 85, '0000-0002-6279-9685': 84, '0000-0003-3528-6793': 65, '0000-0003-4453-9713': 32, '0000-0002-5197-5030': 26, '0000-0002-3945-630X': 10, '0000-0001-7894-6814': 9, '0000-0002-5750-0706': 6, '0000-0002-5495-8906': 4, '0000-0003-3762-6253': 4, '0000-0002-0479-4261': 3, '0000-0003-2389-461X': 2, '0000-0001-6272-8871': 2, '0000-0001-7683-2653': 1, '0000-0002-2104-2264': 1, '0000-0001-9068-4642': 1, '0000-0002-1881-2766': 1})
['0000-0002-3945-630X', '0000-0003-4453-9713', '0000-0003-3528-6793', '0000-0002-6279-9685', '0000-0003-1673-2954', '0000-0002-3650-9381', '0000-0002-2120-2766', '0000-0002-5197-5030']
Total sample size after apply threshold:  569
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       1.00      0.12      0.22        32
          2       0.72      0.52      0.61        65
          3       0.77      0.49      0.60        84
          4       0.58      0.90      0.71       113
          5       0.64      0.85      0.73       154
          6       0.88      0.88      0.88        85
          7       0.00      0.00      0.00        26

avg / total       0.67      0.68      0.64       569

[  0   0   2   0   4   3   1   0   0   4   4   0  16   8   0   0   0   0
  34   8   7  15   1   0   0   0   6  41  15  21   1   0   0   0   0   0
 102  11   0   0   0   0   0   3  17 131   3   0   0   0   0   1   1   8
  75   0   0   0   1   0  13   8   4   0]
MNB Accuracy:  0.680140597539543
MNB F1:  0.46854956422348265
             precision    recall  f1-score   support

          0       1.00      0.40      0.57        10
          1       1.00      0.41     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.50      0.40      0.44        10
          1       0.71      0.50      0.59        10
          2       0.00      0.00      0.00        11
          3       0.83      0.42      0.56        12
          4       0.57      0.94      0.71        34

avg / total       0.54      0.60      0.53        77

[ 4  0  0  0  6  2  5  0  1  2  1  0  0  0 10  1  0  0  5  6  0  2  0  0
 32]
MNB Accuracy:  0.5974025974025974
MNB F1:  0.4598692810457516
             precision    recall  f1-score   support

          0       1.00      0.80      0.89        10
          1       0.60      0.60      0.60        10
          2       0.50      0.18      0.27        11
          3       0.62      0.42      0.50        12
          4       0.64      0.88      0.74        34

avg / total       0.66      0.66      0.64        77

[ 8  0  0  0  2  0  6  0  2  2  0  0  2  1  8  0  1  1  5  5  0  3  1  0
 30]
svc Accuracy:  0.6623376623376623


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(449, 101)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(449, 27)
2
(449, 128)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.33      0.08      0.13        12
          1       0.88      1.00      0.94       231
          2       0.99      0.94      0.97       167
          3       1.00      0.86      0.93        22
          4       0.89      0.47      0.62        17

avg / total       0.92      0.92      0.91       449

[  1  11   0   0   0   1 230   0   0   0   0   9 157   0   1   0   3   0
  19   0   1   7   1   0   8]
svc Accuracy:  0.9242761692650334
svc F1:  0.715712921390533
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.84      1.00      0.91       231
          2       1.00      0.93      0.96       167
          3       1.00      0.68      0.81        22
          4       1.00      0.18      0.30        17

avg / total       0.89      0.90      0.88       449

[  0  12   0   0   0   0 231   0   0   0   0  12 155   0   0   0   7   0
  15   0   0  1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.63      0.85      0.72        26
          1       0.88      0.50      0.64        14
          2       0.50      0.43      0.46        21

avg / total       0.64      0.62      0.61        61

[22  0  4  2  7  5 11  1  9]
LR Accuracy:  0.6229508196721312
LR F1:  0.6064045244373112
For name:  r_little
total sample size before apply threshold:  4
Counter({'0000-0002-4000-946X': 2, '0000-0002-7732-157X': 1, '0000-0003-1870-3241': 1})
[]
Total sample size after apply threshold:  0
For name:  t_kobayashi
total sample size before apply threshold:  150
Counter({'0000-0002-4008-454X': 85, '0000-0002-0237-3623': 22, '0000-0002-2738-373X': 10, '0000-0002-7650-1763': 10, '0000-0002-0903-6259': 6, '0000-0002-9202-7643': 5, '0000-0001-7297-8524': 5, '0000-0002-6952-8669': 4, '0000-0003-0963-2525': 2, '0000-0003-4264-5117': 1})
['0000-0002-4008-454X', '0000-0002-2738-373X', '0000-0002-0237-3623', '0000-0002-7650-1763']
Total 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.09      0.17        11
          1       0.83      1.00      0.91        48

avg / total       0.86      0.83      0.77        59

[ 1 10  0 48]
LR Accuracy:  0.8305084745762712
LR F1:  0.5361635220125787
For name:  h_vogel
total sample size before apply threshold:  15
Counter({'0000-0001-9821-7731': 5, '0000-0002-9902-8120': 4, '0000-0003-2404-9485': 4, '0000-0003-0072-4239': 2})
[]
Total sample size after apply threshold:  0
For name:  m_campos
total sample size before apply threshold:  148
Counter({'0000-0001-7738-9892': 107, '0000-0003-3217-9001': 12, '0000-0003-4313-7069': 8, '0000-0003-1012-6240': 6, '0000-0002-0883-0610': 5, '0000-0002-5233-3769': 5, '0000-0003-4683-0176': 3, '0000-0002-9516-6526': 2})
['0000-0001-7738-9892', '0000-0003-3217-9001']
Total sample size after apply threshold:  119
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


['0000-0001-7360-8592', '0000-0002-8157-7746']
Total sample size after apply threshold:  287
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(287, 124)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.22      0.36        18
          1       0.00      0.00      0.00        10
          2       0.00      0.00      0.00        10
          3       0.67      1.00      0.80        65

avg / total       0.60      0.67      0.57       103

[ 4  0  0 14  0  0  1  9  0  1  0  9  0  0  0 65]
LR Accuracy:  0.6699029126213593
LR F1:  0.29152637485970817
For name:  a_giuliani
total sample size before apply threshold:  196
Counter({'0000-0002-4640-804X': 155, '0000-0003-1710-4933': 36, '0000-0002-4315-1699': 4, '0000-0002-6823-2807': 1})
['0000-0003-1710-4933', '0000-0002-4640-804X']
Total sample size after apply threshold:  191
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.14      0.24        36
          1       0.83      1.00      0.91       155

avg / total       0.86      0.84      0.78       191

[  5  31   0 155]
LR Accuracy:  0.837696335078534
LR F1:  0.5764966740576497
For name:  f_campos
total sample size before apply threshold:  49
Counter({'0000-0001-8376-0977': 14, '0000-0002-5948-472X': 12, '0000-0002-1132-3257': 10, '0000-0001-8332-5043': 9, '0000-0001-9826-751X': 2, '0000-0001-5828-2862': 2})
['0000-0001-8376-0977', '0000-0002-5948-472X', '0000-0002-1132-3257']
Total sample size after apply threshold:  36
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.75      0.45      0.56        20
          1       0.76      0.99      0.86       188
          2       0.90      0.78      0.84        23
          3       0.94      0.68      0.79        65
          4       0.97      0.76      0.85        98
          5       0.94      0.67      0.78        24

avg / total       0.85      0.83      0.82       418

[  9   9   0   2   0   0   1 186   0   0   0   1   0   5  18   0   0   0
   1  19   0  44   1   0   1  21   2   0  74   0   0   6   0   1   1  16]
svc Accuracy:  0.8301435406698564
svc F1:  0.7789381604507418
             precision    recall  f1-score   support

          0       1.00      0.15      0.26        20
          1       0.65      0.99      0.79       188
          2       0.86      0.52      0.65        23
          3       0.97      0.55      0.71        65
          4       0.94      0.64      0.76        98
          5       1.00      0.42      0.59   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.53      0.71      0.61        28
          1       0.95      0.87      0.91        23
          2       0.60      0.40      0.48        15
          3       0.82      0.76      0.78        41

avg / total       0.74      0.72      0.72       107

[20  1  2  5  3 20  0  0  7  0  6  2  8  0  2 31]
svc Accuracy:  0.719626168224299
svc F1:  0.6949904104334484
             precision    recall  f1-score   support

          0       0.46      0.39      0.42        28
          1       0.86      0.83      0.84        23
          2       0.56      0.33      0.42        15
          3       0.65      0.83      0.73        41

avg / total       0.63      0.64      0.63       107

[11  2  2 13  3 19  1  0  4  1  5  5  6  0  1 34]
LR Accuracy:  0.6448598130841121
LR F1:  0.6038427074717397
For name:  m_grant
total sample size before apply threshold:  39
Counter({'0000-0002-1380-2104': 28, '0000-0002-7838-8725': 9, '0000-0003

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.91      0.45      0.61        22
          1       0.56      0.50      0.53        44
          2       0.00      0.00      0.00        14
          3       0.61      0.83      0.70        78

avg / total       0.58      0.61      0.58       158

[10  1  0 11  1 22  0 21  0  4  0 10  0 12  1 65]
MNB Accuracy:  0.6139240506329114
MNB F1:  0.45972094767275495
             precision    recall  f1-score   support

          0       0.76      0.59      0.67        22
          1       0.54      0.59      0.57        44
          2       0.67      0.14      0.24        14
          3       0.69      0.79      0.74        78

avg / total       0.66      0.65      0.64       158

[13  2  0  7  3 26  0 15  1  5  2  6  0 15  1 62]
svc Accuracy:  0.6518987341772152
svc F1:  0.5513183534283279


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.78      0.32      0.45        22
          1       0.61      0.45      0.52        44
          2       0.00      0.00      0.00        14
          3       0.59      0.87      0.70        78

avg / total       0.57      0.60      0.55       158

[ 7  1  0 14  1 20  0 23  0  3  0 11  1  9  0 68]
LR Accuracy:  0.6012658227848101
LR F1:  0.41803108763534436
For name:  m_king
total sample size before apply threshold:  58
Counter({'0000-0002-2587-9117': 26, '0000-0001-6030-5154': 13, '0000-0001-9895-7297': 9, '0000-0001-5611-9498': 7, '0000-0002-9558-8622': 2, '0000-0001-7993-8808': 1})
['0000-0001-6030-5154', '0000-0002-2587-9117']
Total sample size after apply threshold:  39
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), no

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.67      0.71      0.69        14
          1       0.67      0.62      0.64        13

avg / total       0.67      0.67      0.67        27

[10  4  5  8]
svc Accuracy:  0.6666666666666666
svc F1:  0.6648275862068965
             precision    recall  f1-score   support

          0       0.69      0.64      0.67        14
          1       0.64      0.69      0.67        13

avg / total       0.67      0.67      0.67        27

[9 5 4 9]
LR Accuracy:  0.6666666666666666
LR F1:  0.6666666666666666
For name:  m_scholz
total sample size before apply threshold:  42
Counter({'0000-0001-8440-6785': 31, '0000-0002-4300-3020': 9, '0000-0001-9887-9831': 2})
['0000-0001-8440-6785']
Total sample size after apply threshold:  31
For name:  y_ju
total sample size before apply threshold:  27
Counter({'0000-0002-5120-6960': 14, '0000-0001-8325-1494': 9, '0000-0003-0103-1207': 3, '0000-0002-5514-4189': 1})
['0000-0002-5120-6960']

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(295, 104)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(295, 16)
2
(295, 120)
             precision    recall  f1-score   support

          0       0.50  

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.67      0.14      0.24        14
          1       0.87      0.54      0.67        24
          2       0.67      0.32      0.43        19
          3       0.72      0.98      0.83        85

avg / total       0.73      0.73      0.69       142

[ 2  1  1 10  1 13  0 10  0  1  6 12  0  0  2 83]
MNB Accuracy:  0.7323943661971831
MNB F1:  0.5401330532212886
             precision    recall  f1-score   support

          0       0.90      0.64      0.75        14
          1       0.81      0.71      0.76        24
          2       0.83      0.53      0.65        19
          3       0.84      0.98      0.90        85

avg / total       0.84      0.84      0.83       142

[ 9  1  0  4  1 17  0  6  0  3 10  6  0  0  2 83]
svc Accuracy:  0.8380281690140845
svc F1:  0.7632226897304035
             precision    recall  f1-score   support

          0       1.00      0.14      0.25        14
          1       0.93     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(54, 24)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(54, 13)
2
(54, 37)
             precision    recall  f1-score   support

          0       0.8

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.65      0.57      0.60        23
          1       0.88      0.64      0.74        11
          2       0.51      0.67      0.58        27
          3       0.62      0.50      0.56        10

avg / total       0.63      0.61      0.61        71

[13  0 10  0  0  7  3  1  6  1 18  2  1  0  4  5]
svc Accuracy:  0.6056338028169014
svc F1:  0.6194234962249334
             precision    recall  f1-score   support

          0       0.61      0.48      0.54        23
          1       0.55      0.55      0.55        11
          2       0.49      0.70      0.58        27
          3       1.00      0.30      0.46        10

avg / total       0.61      0.55      0.54        71

[11  3  9  0  0  6  5  0  6  2 19  0  1  0  6  3]
LR Accuracy:  0.5492957746478874
LR F1:  0.5298339871510603
For name:  s_chapman
total sample size before apply threshold:  71
Counter({'0000-0003-3347-6024': 23, '0000-0003-0053-1584': 23, '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(42, 21)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(42, 19)
2
(42, 40)
             precision    recall  f1-score   support

          0       0.67

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.62      0.76        13
          1       0.87      1.00      0.93        34

avg / total       0.91      0.89      0.88        47

[ 8  5  0 34]
LR Accuracy:  0.8936170212765957
LR F1:  0.8467058056099153
For name:  e_shaw
total sample size before apply threshold:  16
Counter({'0000-0003-1424-7568': 9, '0000-0002-5653-0145': 4, '0000-0002-4148-3526': 2, '0000-0002-4334-1900': 1})
[]
Total sample size after apply threshold:  0
For name:  m_cameron
total sample size before apply threshold:  28
Counter({'0000-0001-5788-8790': 17, '0000-0002-2277-7035': 9, '0000-0001-9464-8796': 1, '0000-0002-2508-7718': 1})
['0000-0001-5788-8790']
Total sample size after apply threshold:  17
For name:  a_reid
total sample size before apply threshold:  44
Counter({'0000-0002-0523-926X': 18, '0000-0003-1752-3302': 18, '0000-0003-4713-2951': 6, '0000-0002-2500-2980': 2})
['0000-0002-0523-926X', '0000-0003-1752-3302']
Total sa

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.44      0.25      0.32        16
          1       0.64      0.78      0.71        23
          2       0.61      0.69      0.65        16

avg / total       0.58      0.60      0.58        55

[ 4  7  5  3 18  2  2  3 11]
MNB Accuracy:  0.6
MNB F1:  0.5576470588235294
             precision    recall  f1-score   support

          0       0.30      0.19      0.23        16
          1       0.53      0.70      0.60        23
          2       0.60      0.56      0.58        16

avg / total       0.48      0.51      0.49        55

[ 3  9  4  5 16  2  2  5  9]
svc Accuracy:  0.509090909090909
svc F1:  0.4717293256550712
             precision    recall  f1-score   support

          0       0.29      0.12      0.17        16
          1       0.59      0.74      0.65        23
          2       0.53      0.62      0.57        16

avg / total       0.48      0.53      0.49        55

[ 2  8  6  3 17  3  2  4 10]
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



Total sample size after apply threshold:  56
For name:  p_ding
total sample size before apply threshold:  17
Counter({'0000-0002-3535-6053': 8, '0000-0003-2559-4696': 8, '0000-0002-2613-2496': 1})
[]
Total sample size after apply threshold:  0
For name:  g_morris
total sample size before apply threshold:  128
Counter({'0000-0003-1731-8405': 50, '0000-0003-2588-6349': 23, '0000-0002-1097-4453': 19, '0000-0001-9893-6648': 16, '0000-0002-3067-3359': 15, '0000-0003-2892-8428': 5})
['0000-0001-9893-6648', '0000-0003-2588-6349', '0000-0002-1097-4453', '0000-0002-3067-3359', '0000-0003-1731-8405']
Total sample size after apply threshold:  123
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  129
Counter({'0000-0001-8802-9606': 93, '0000-0003-4268-4757': 21, '0000-0003-3509-0686': 10, '0000-0001-7235-5554': 4, '0000-0003-2258-2817': 1})
['0000-0003-4268-4757', '0000-0001-8802-9606', '0000-0003-3509-0686']
Total sample size after apply threshold:  124
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(124, 50)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(120, 55)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(120, 18)
2
(120, 73)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.77      0.89      0.82        37
          1       0.79      0.60      0.68        25

avg / total       0.78      0.77      0.77        62

[33  4 10 15]
svc Accuracy:  0.7741935483870968
svc F1:  0.7534090909090909
             precision    recall  f1-score   support

          0       0.77      0.97      0.86        37
          1       0.93      0.56      0.70        25

avg / total       0.83      0.81      0.79        62

[36  1 11 14]
LR Accuracy:  0.8064516129032258
LR F1:  0.7785714285714286
For name:  c_he
total sample size before apply threshold:  49
Counter({'0000-0002-4868-331X': 20, '0000-0002-1918-5186': 13, '0000-0002-0663-275X': 7, '0000-0001-7869-7627': 5, '0000-0001-5426-769X': 2, '0000-0001-9867-9629': 1, '0000-0001-5842-9617': 1})
['0000-0002-4868-331X', '0000-0002-1918-5186']
Total sample size after apply threshold:  33
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='stri

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



             precision    recall  f1-score   support

          0       0.74      1.00      0.85        35
          1       1.00      0.14      0.25        14

avg / total       0.82      0.76      0.68        49

[35  0 12  2]
LR Accuracy:  0.7551020408163265
LR F1:  0.5518292682926829
For name:  k_saito
total sample size before apply threshold:  61
Counter({'0000-0003-4663-1134': 26, '0000-0002-2151-6204': 16, '0000-0002-5726-8775': 11, '0000-0003-2557-1726': 7, '0000-0001-6310-5342': 1})
['0000-0002-2151-6204', '0000-0003-4663-1134', '0000-0002-5726-8775']
Total sample size after apply threshold:  53
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(1370, 492)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(1370, 28)
2
(1370, 520)
             precision    recall  f1-score   support

          0   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.47      0.60      0.53        25
          1       1.00      0.08      0.15        12
          2       0.45      0.45      0.45        11
          3       0.58      0.25      0.35        28
          4       0.60      0.43      0.50        14
          5       0.88      0.41      0.56        17
          6       0.94      0.46      0.62        35
          7       0.33      0.35      0.34        31
          8       0.71      0.63      0.67        27
          9       0.50      0.17      0.25        12
         10       1.00      0.54      0.70        13
         11       0.30      0.21      0.25        14
         12       0.65      0.45      0.53        29
         13       0.00      0.00      0.00        11
         14       1.00      0.18      0.31        11
         15       0.21      0.18      0.19        17
         16       0.33      0.33      0.33        24
         17       0.24      0.23      0.24   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.55      0.50      0.52        24
          1       0.00      0.00      0.00        14
          2       0.30      0.39      0.34        36
          3       0.33      0.57      0.42        44
          4       0.85      0.65      0.74        26
          5       1.00      0.18      0.31        11
          6       0.29      0.17      0.21        12
          7       1.00      0.58      0.74        12
          8       1.00      0.55      0.71        20

avg / total       0.54      0.45      0.46       199

[12  0  6  6  0  0  0  0  0  0  0  6  6  2  0  0  0  0  4  2 14 16  0  0
  0  0  0  2  2 10 25  1  0  4  0  0  0  4  0  4 17  0  1  0  0  0  0  4
  5  0  2  0  0  0  1  0  2  7  0  0  2  0  0  1  0  2  2  0  0  0  7  0
  2  0  2  5  0  0  0  0 11]
svc Accuracy:  0.45226130653266333
svc F1:  0.4426375327353314
             precision    recall  f1-score   support

          0       0.67      0.42      0.51       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(52, 35)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(52, 10)
2
(52, 45)
             precision    recall  f1-score   support

          0       0.81

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.68      0.95      0.79        57
          1       0.56      0.56      0.56        32
          2       0.57      0.25      0.35        16
          3       0.50      0.18      0.27        11
          4       1.00      0.50      0.67        14

avg / total       0.65      0.65      0.62       130

[54  3  0  0  0 12 18  2  0  0  5  6  4  1  0  5  3  1  2  0  4  2  0  1
  7]
svc Accuracy:  0.6538461538461539
svc F1:  0.5263961176346134
             precision    recall  f1-score   support

          0       0.59      1.00      0.75        57
          1       0.68      0.53      0.60        32
          2       1.00      0.12      0.22        16
          3       0.00      0.00      0.00        11
          4       1.00      0.50      0.67        14

avg / total       0.66      0.64      0.57       130

[57  0  0  0  0 15 17  0  0  0 10  4  2  0  0  9  2  0  0  0  5  2  0  0
  7]
LR Accuracy:  0.6384615384615384
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.59      0.61      0.60        44
          1       1.00      1.00      1.00        32
          2       0.58      0.39      0.47        18
          3       0.58      0.58      0.58        19
          4       1.00      0.69      0.82        13
          5       0.78      0.35      0.48        20
          6       0.33      0.19      0.24        16
          7       0.44      0.71      0.55        42

avg / total       0.65      0.62      0.61       204

[27  0  3  1  0  0  1 12  0 32  0  0  0  0  0  0  4  0  7  0  0  1  2  4
  1  0  0 11  0  1  1  5  0  0  0  0  9  0  0  4  3  0  0  1  0  7  1  8
  3  0  1  4  0  0  3  5  8  0  1  2  0  0  1 30]
svc Accuracy:  0.6176470588235294
svc F1:  0.5915011274267172
             precision    recall  f1-score   support

          0       0.54      0.68      0.60        44
          1       1.00      1.00      1.00        32
          2       0.58      0.39      0.47       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(169, 75)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(169, 15)
2
(169, 90)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.83      0.42      0.56        12
          1       1.00      0.09      0.17        11
          2       0.72      1.00      0.83        53
          3       1.00      0.77      0.87        22

avg / total       0.83      0.78      0.73        98

[ 5  0  7  0  1  1  9  0  0  0 53  0  0  0  5 17]
MNB Accuracy:  0.7755102040816326
MNB F1:  0.6071656908271081
             precision    recall  f1-score   support

          0       1.00      0.75      0.86        12
          1       1.00      0.73      0.84        11
          2       0.83      0.98      0.90        53
          3       0.94      0.77      0.85        22

avg / total       0.89      0.88      0.88        98

[ 9  0  3  0  0  8  3  0  0  0 52  1  0  0  5 17]
svc Accuracy:  0.8775510204081632
svc F1:  0.8614499611096708
             precision    recall  f1-score   support

          0       0.83      0.42      0.56        12
          1       1.00     

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.64      0.87      0.74        90
          1       1.00      0.16      0.27        19
          2       0.78      0.88      0.83       166
          3       1.00      0.31      0.47        13
          4       0.47      0.18      0.26        44

avg / total       0.72      0.72      0.68       332

[ 78   0  11   0   1  11   3   3   0   2  14   0 146   0   6   7   0   2
   4   0  12   0  24   0   8]
svc Accuracy:  0.7198795180722891
svc F1:  0.5142010202275663
             precision    recall  f1-score   support

          0       0.65      0.78      0.71        90
          1       0.00      0.00      0.00        19
          2       0.70      0.87      0.78       166
          3       0.00      0.00      0.00        13
          4       0.47      0.18      0.26        44

avg / total       0.59      0.67      0.62       332

[ 70   0  19   0   1  11   0   8   0   0  13   0 145   0   8   5   0   8
   0   0   9  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(76, 50)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(76, 18)
2
(76, 68)
             precision    recall  f1-score   support

          0       1.00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        14
          1       0.57      0.27      0.36        15
          2       0.72      0.62      0.67        21
          3       0.56      0.69      0.62        13
          4       0.83      0.59      0.69        17
          5       0.59      0.78      0.67        50

avg / total       0.58      0.58      0.56       130

[ 0  0  2  1  0 11  0  4  1  1  0  9  1  1 13  1  1  4  1  0  1  9  1  1
  2  0  0  3 10  2  7  2  1  1  0 39]
svc Accuracy:  0.5769230769230769
svc F1:  0.5021769418321141
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        14
          1       0.00      0.00      0.00        15
          2       0.72      0.62      0.67        21
          3       0.53      0.69      0.60        13
          4       0.90      0.53      0.67        17
          5       0.56      0.94      0.70        50

avg / total       0.50     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.30      0.46        20
          1       0.00      0.00      0.00        15
          2       0.84      1.00      0.91       237
          3       0.00      0.00      0.00        16

avg / total       0.76      0.84      0.78       288

[  6   0  14   0   0   0  15   0   0   0 237   0   0   0  16   0]
LR Accuracy:  0.84375
LR F1:  0.3437083148065807
For name:  s_rossi
total sample size before apply threshold:  199
Counter({'0000-0003-3257-8248': 86, '0000-0002-9963-8121': 34, '0000-0002-9919-0494': 25, '0000-0002-8854-7072': 14, '0000-0003-0346-8410': 13, '0000-0002-3278-8993': 10, '0000-0002-2694-9535': 8, '0000-0001-5134-8398': 5, '0000-0001-7048-7158': 1, '0000-0001-8853-0775': 1, '0000-0001-9511-3857': 1, '0000-0001-7479-5756': 1})
['0000-0002-8854-7072', '0000-0002-9919-0494', '0000-0003-0346-8410', '0000-0002-9963-8121', '0000-0002-3278-8993', '0000-0003-3257-8248']
Total sample size after apply t

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.50      0.21      0.30        14
          1       1.00      0.80      0.89        25
          2       0.00      0.00      0.00        13
          3       0.87      0.59      0.70        34
          4       0.67      0.20      0.31        10
          5       0.64      0.94      0.76        86

avg / total       0.68      0.69      0.65       182

[ 3  0  1  1  0  9  0 20  0  0  0  5  0  0  0  1  0 12  0  0  0 20  0 14
  1  0  1  0  2  6  2  0  1  1  1 81]
svc Accuracy:  0.6923076923076923
svc F1:  0.49314982713796646
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        14
          1       1.00      0.64      0.78        25
          2       0.00      0.00      0.00        13
          3       0.87      0.59      0.70        34
          4       0.00      0.00      0.00        10
          5       0.60      0.99      0.75        86

avg / total       0.58    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(22, 20)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(22, 8)
2
(22, 28)
             precision    recall  f1-score   support

          0       0.73 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.84      0.98      0.91       112
          1       0.93      0.55      0.69        47

avg / total       0.87      0.86      0.84       159

[110   2  21  26]
svc Accuracy:  0.8553459119496856
svc F1:  0.7993415637860083
             precision    recall  f1-score   support

          0       0.79      0.99      0.88       112
          1       0.95      0.38      0.55        47

avg / total       0.84      0.81      0.78       159

[111   1  29  18]
LR Accuracy:  0.8113207547169812
LR F1:  0.7132034632034632
For name:  l_rasmussen
total sample size before apply threshold:  249
Counter({'0000-0002-7480-3004': 214, '0000-0002-4497-8049': 24, '0000-0001-6613-2469': 5, '0000-0001-5962-6647': 4, '0000-0001-5795-4794': 1, '0000-0002-7301-3182': 1})
['0000-0002-4497-8049', '0000-0002-7480-3004']
Total sample size after apply threshold:  238
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.90      0.95        42
          1       0.94      0.75      0.83        40
          2       0.93      0.99      0.96       179

avg / total       0.94      0.94      0.94       261

[ 38   0   4   0  30  10   0   2 177]
svc Accuracy:  0.9386973180076629
svc F1:  0.9133633633633634
             precision    recall  f1-score   support

          0       1.00      0.74      0.85        42
          1       1.00      0.40      0.57        40
          2       0.84      1.00      0.91       179

avg / total       0.89      0.87      0.85       261

[ 31   0  11   0  16  24   0   0 179]
LR Accuracy:  0.8659003831417624
LR F1:  0.7772283719162315
For name:  j_fraser
total sample size before apply threshold:  101
Counter({'0000-0002-5080-2859': 38, '0000-0002-6505-1883': 36, '0000-0002-5980-3989': 9, '0000-0003-0111-9137': 6, '0000-0002-8020-2985': 6, '0000-0001-9697-3795': 3, '0000-0003-4941-1997': 3})
['000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(98, 55)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(98, 21)
2
(98, 76)
             precision    recall  f1-score   support

          0       0.88

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.91      0.71      0.80        14
          1       0.82      0.60      0.69        15
          2       0.88      0.79      0.83        19
          3       0.59      0.80      0.68        25

avg / total       0.77      0.74      0.74        73

[10  0  0  4  0  9  0  6  0  0 15  4  1  2  2 20]
svc Accuracy:  0.7397260273972602
svc F1:  0.7509017818339851
             precision    recall  f1-score   support

          0       0.90      0.64      0.75        14
          1       0.86      0.40      0.55        15
          2       0.88      0.74      0.80        19
          3       0.53      0.84      0.65        25

avg / total       0.76      0.68      0.69        73

[ 9  0  0  5  0  6  0  9  0  0 14  5  1  1  2 21]
LR Accuracy:  0.684931506849315
LR F1:  0.6854020979020978
For name:  w_lee
total sample size before apply threshold:  590
Counter({'0000-0003-3171-7672': 108, '0000-0001-5833-989X': 100, '0000-00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.80      0.33      0.47        36
          1       0.00      0.00      0.00        11
          2       0.80      0.38      0.52        21
          3       0.17      0.10      0.12        10
          4       0.37      0.35      0.36        40
          5       0.45      0.90      0.60       108
          6       0.91      0.60      0.72        82
          7       0.98      0.95      0.96       100
          8       0.92      0.52      0.67        21
          9       0.74      0.63      0.68        62
         10       0.00      0.00      0.00        11
         11       1.00      0.56      0.72        16

avg / total       0.70      0.65      0.64       518

[12  0  0  0  3 18  1  0  0  2  0  0  0  0  0  0  7  4  0  0  0  0  0  0
  0  0  8  1  0 11  0  0  0  1  0  0  0  0  0  1  0  6  0  0  0  3  0  0
  2  6  0  1 14 15  1  1  0  0  0  0  0  0  1  0  4 97  3  0  1  2  0  0
  1  0  1  1  7 21 49  0  0  2  0  0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.88      0.79      0.83        19
          1       0.78      0.88      0.82        16

avg / total       0.83      0.83      0.83        35

[15  4  2 14]
svc Accuracy:  0.8285714285714286
svc F1:  0.8284313725490196
             precision    recall  f1-score   support

          0       0.84      0.84      0.84        19
          1       0.81      0.81      0.81        16

avg / total       0.83      0.83      0.83        35

[16  3  3 13]
LR Accuracy:  0.8285714285714286
LR F1:  0.8273026315789473
For name:  g_lewis
total sample size before apply threshold:  367
Counter({'0000-0001-5205-8245': 343, '0000-0002-2548-8423': 12, '0000-0003-3081-9319': 7, '0000-0003-4112-5048': 5})
['0000-0002-2548-8423', '0000-0001-5205-8245']
Total sample size after apply threshold:  355
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='conte

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.50      0.67        12
          1       0.98      1.00      0.99       343

avg / total       0.98      0.98      0.98       355

[  6   6   0 343]
svc Accuracy:  0.9830985915492958
svc F1:  0.8289980732177264
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.97      1.00      0.98       343

avg / total       0.93      0.97      0.95       355

[  0  12   0 343]
LR Accuracy:  0.9661971830985916
LR F1:  0.49140401146131807
For name:  j_albert
total sample size before apply threshold:  78
Counter({'0000-0002-3420-7371': 40, '0000-0001-6538-9801': 19, '0000-0001-5330-1892': 13, '0000-0002-8256-2650': 6})
['0000-0002-3420-7371', '0000-0001-6538-9801', '0000-0001-5330-1892']
Total sample size after apply threshold:  72
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>,

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


 130
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(130, 44)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(130, 33)
2
(130, 77)
             precision    recall  f1-score   support

          0   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.95      0.90      0.93        21
          1       0.95      0.98      0.96        42

avg / total       0.95      0.95      0.95        63

[19  2  1 41]
svc Accuracy:  0.9523809523809523
svc F1:  0.9457675753228121
             precision    recall  f1-score   support

          0       0.89      0.81      0.85        21
          1       0.91      0.95      0.93        42

avg / total       0.90      0.90      0.90        63

[17  4  2 40]
LR Accuracy:  0.9047619047619048
LR F1:  0.8901162790697674
For name:  h_liu
total sample size before apply threshold:  439
Counter({'0000-0001-6715-6366': 100, '0000-0002-0253-647X': 45, '0000-0002-1006-6666': 39, '0000-0001-7639-0904': 39, '0000-0002-7233-1509': 31, '0000-0001-9366-6204': 26, '0000-0002-4723-845X': 18, '0000-0003-3326-2640': 17, '0000-0002-3745-7202': 13, '0000-0003-4837-5373': 11, '0000-0003-3103-6949': 10, '0000-0002-4548-2002': 9, '0000-0003-0266-9472': 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       0.38      0.31      0.34        39
          2       0.90      0.53      0.67        17
          3       1.00      0.15      0.27        13
          4       0.58      0.89      0.70       100
          5       0.88      0.58      0.70        26
          6       0.71      0.78      0.74        45
          7       0.58      0.39      0.47        18
          8       0.62      0.64      0.63        39
          9       0.00      0.00      0.00        10
         10       0.79      0.61      0.69        31

avg / total       0.62      0.61      0.59       349

[ 0  3  0  0  3  0  0  1  0  3  1  0 12  0  0 18  0  1  3  3  0  2  0  0
  9  0  2  0  5  0  1  0  0  1  0  0  2  9  0  0  0  0  0  1  0  4  0  0
 89  2  3  0  1  0  1  0  2  0  0  5 15  1  0  3  0  0  0  0  1  0  6  0
 35  1  2  0  0  1  5  0  0  2  0  1  7  2  0  0  0  4  0  0 10  0  0  0
 25  0  0  4  2 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.77      0.89      0.83        19
          1       0.85      0.65      0.73        17
          2       0.72      0.76      0.74        17

avg / total       0.78      0.77      0.77        53

[17  1  1  2 11  4  3  1 13]
LR Accuracy:  0.7735849056603774
LR F1:  0.7684862562911343
For name:  s_fernandes
total sample size before apply threshold:  38
Counter({'0000-0003-1128-833X': 20, '0000-0002-1295-5010': 6, '0000-0002-9035-793X': 5, '0000-0002-7871-6717': 5, '0000-0002-0790-303X': 2})
['0000-0003-1128-833X']
Total sample size after apply threshold:  20
For name:  a_miller
total sample size before apply threshold:  109
Counter({'0000-0002-7056-8502': 33, '0000-0001-8474-5090': 28, '0000-0002-7293-764X': 22, '0000-0002-0553-8470': 15, '0000-0001-9735-6609': 5, '0000-0001-8527-1595': 1, '0000-0002-1761-4143': 1, '0000-0002-3099-1648': 1, '0000-0002-0941-1717': 1, '0000-0001-9739-8462': 1, '0000-0003-0924-8443': 1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


[17  1  5 21  2 16  3 24  1  2 40  8  3  2  7 63]
MNB Accuracy:  0.6325581395348837
MNB F1:  0.6016785041055175
             precision    recall  f1-score   support

          0       0.90      0.61      0.73        44
          1       0.58      0.42      0.49        45
          2       0.89      0.78      0.83        51
          3       0.60      0.85      0.70        75

avg / total       0.72      0.70      0.69       215

[27  4  1 12  1 19  1 24  1  3 40  7  1  7  3 64]
svc Accuracy:  0.6976744186046512
svc F1:  0.6883848133848134
             precision    recall  f1-score   support

          0       0.77      0.45      0.57        44
          1       0.61      0.38      0.47        45
          2       0.83      0.75      0.78        51
          3       0.57      0.88      0.69        75

avg / total       0.68      0.66      0.64       215

[20  4  3 17  2 17  2 24  1  4 38  8  3  3  3 66]
LR Accuracy:  0.6558139534883721
LR F1:  0.6288559982076359
For name:  m_thomsen
tot

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.84      0.67      0.74        24
          1       1.00      0.75      0.86        24
          2       0.88      0.98      0.93        98

avg / total       0.89      0.89      0.89       146

[16  0  8  1 18  5  2  0 96]
svc Accuracy:  0.8904109589041096
svc F1:  0.8429550451795144
             precision    recall  f1-score   support

          0       1.00      0.38      0.55        24
          1       1.00      0.71      0.83        24
          2       0.82      1.00      0.90        98

avg / total       0.88      0.85      0.83       146

[ 9  0 15  0 17  7  0  0 98]
LR Accuracy:  0.8493150684931506
LR F1:  0.7579351356482705
For name:  y_ye
total sample size before apply threshold:  85
Counter({'0000-0002-7517-1715': 75, '0000-0002-2029-4558': 8, '0000-0003-3962-8463': 1, '0000-0002-9172-6514': 1})
['0000-0002-7517-1715']
Total sample size after apply threshold:  75
For name:  m_guerreiro
total sample si

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(55, 32)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(55, 11)
2
(55, 43)
             precision    recall  f1-score   support

          0       0.50

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.76      0.74      0.75        34
          1       0.83      1.00      0.91        86
          2       0.67      0.11      0.18        19

avg / total       0.79      0.81      0.77       139

[25  8  1  0 86  0  8  9  2]
svc Accuracy:  0.8129496402877698
svc F1:  0.61271324952917
             precision    recall  f1-score   support

          0       0.70      0.62      0.66        34
          1       0.81      1.00      0.90        86
          2       0.33      0.05      0.09        19

avg / total       0.72      0.78      0.73       139

[21 11  2  0 86  0  9  9  1]
LR Accuracy:  0.7769784172661871
LR F1:  0.5476641414141414
For name:  s_teixeira
total sample size before apply threshold:  36
Counter({'0000-0003-0419-2348': 12, '0000-0001-5845-058X': 11, '0000-0002-2462-8535': 3, '0000-0002-9473-0113': 3, '0000-0002-7464-3944': 3, '0000-0002-6603-7936': 3, '0000-0003-3664-2577': 1})
['0000-0003-0419-2348', 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.58      0.58      0.58        12
          1       0.55      0.55      0.55        11

avg / total       0.57      0.57      0.57        23

[7 5 5 6]
svc Accuracy:  0.5652173913043478
svc F1:  0.5643939393939394
             precision    recall  f1-score   support

          0       0.62      0.67      0.64        12
          1       0.60      0.55      0.57        11

avg / total       0.61      0.61      0.61        23

[8 4 5 6]
LR Accuracy:  0.6086956521739131
LR F1:  0.6057142857142856
For name:  l_almeida
total sample size before apply threshold:  133
Counter({'0000-0002-4861-8649': 57, '0000-0002-7769-4712': 43, '0000-0003-1370-961X': 12, '0000-0003-0370-214X': 8, '0000-0002-0651-7014': 5, '0000-0001-9346-7520': 4, '0000-0002-1324-0068': 1, '0000-0002-9544-3028': 1, '0000-0003-4711-4454': 1, '0000-0002-0921-887X': 1})
['0000-0003-1370-961X', '0000-0002-4861-8649', '0000-0002-7769-4712']
Total sample size

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.80      0.33      0.47        12
          1       0.74      0.95      0.83        57
          2       0.85      0.67      0.75        43

avg / total       0.79      0.78      0.76       112

[ 4  5  3  1 54  2  0 14 29]
MNB Accuracy:  0.7767857142857143
MNB F1:  0.6848680731033672
             precision    recall  f1-score   support

          0       0.50      0.17      0.25        12
          1       0.77      0.88      0.82        57
          2       0.79      0.79      0.79        43

avg / total       0.75      0.77      0.75       112

[ 2  7  3  1 50  6  1  8 34]
svc Accuracy:  0.7678571428571429
svc F1:  0.6201232685220486
             precision    recall  f1-score   support

          0       1.00      0.17      0.29        12
          1       0.75      0.95      0.84        57
          2       0.84      0.74      0.79        43

avg / total       0.81      0.79      0.76       112

[ 2  7  3  0 5

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(164, 45)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(164, 17)
2
(164, 62)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(102, 24)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(102, 15)
2
(102, 39)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(209, 110)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(209, 34)
2
(209, 144)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      0.99      0.94       174
          1       0.93      0.37      0.53        35

avg / total       0.89      0.89      0.87       209

[173   1  22  13]
LR Accuracy:  0.8899521531100478
LR F1:  0.734140810795863
For name:  h_moreira
total sample size before apply threshold:  28
Counter({'0000-0002-1487-0539': 13, '0000-0002-5481-0688': 10, '0000-0002-4674-5417': 3, '0000-0002-4556-5027': 1, '0000-0002-5588-374X': 1})
['0000-0002-1487-0539', '0000-0002-5481-0688']
Total sample size after apply threshold:  23
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=N

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(59, 41)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(59, 14)
2
(59, 55)
             precision    recall  f1-score   support

          0       0.61

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.61      0.64      0.62        47
          1       0.55      0.33      0.41        18
          2       0.64      0.60      0.62        62
          3       0.80      0.33      0.47        24
          4       0.42      0.31      0.36        26
          5       0.54      0.81      0.65        62

avg / total       0.59      0.58      0.57       239

[30  0  9  0  2  6  2  6  2  1  4  3  7  0 37  1  2 15  4  2  4  8  1  5
  1  2  2  0  8 13  5  1  4  0  2 50]
MNB Accuracy:  0.5815899581589958
MNB F1:  0.5218257017192108
             precision    recall  f1-score   support

          0       0.82      0.68      0.74        47
          1       0.53      0.44      0.48        18
          2       0.65      0.89      0.75        62
          3       0.82      0.58      0.68        24
          4       0.62      0.62      0.62        26
          5       0.79      0.73      0.76        62

avg / total       0.72     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(143, 69)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(143, 22)
2
(143, 91)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.74      0.85      0.79        40
          1       0.88      0.79      0.84        58

avg / total       0.83      0.82      0.82        98

[34  6 12 46]
MNB Accuracy:  0.8163265306122449
MNB F1:  0.8135306553911205
             precision    recall  f1-score   support

          0       0.91      0.80      0.85        40
          1       0.87      0.95      0.91        58

avg / total       0.89      0.89      0.89        98

[32  8  3 55]
svc Accuracy:  0.8877551020408163
svc F1:  0.8812121212121211
             precision    recall  f1-score   support

          0       0.89      0.78      0.83        40
          1       0.86      0.93      0.89        58

avg / total       0.87      0.87      0.87        98

[31  9  4 54]
LR Accuracy:  0.8673469387755102
LR F1:  0.8596143250688705
For name:  h_brown
total sample size before apply threshold:  48
Counter({'0000-0001-8578-5510': 17, '0000-0002-0067-991X': 9, '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.50      0.85      0.63        78
          1       0.32      0.33      0.32        18
          2       0.54      0.30      0.39        23
          3       1.00      0.55      0.71        11
          4       0.53      0.31      0.39        32
          5       1.00      0.23      0.38        13
          6       0.29      0.09      0.13        23

avg / total       0.53      0.51      0.47       198

[66  1  1  0  7  0  3  9  6  3  0  0  0  0 13  3  7  0  0  0  0  5  0  0
  6  0  0  0 15  4  1  0 10  0  2  9  1  0  0  0  3  0 14  4  1  0  2  0
  2]
svc Accuracy:  0.5050505050505051
svc F1:  0.42159495851446316
             precision    recall  f1-score   support

          0       0.47      0.92      0.62        78
          1       0.40      0.33      0.36        18
          2       0.75      0.26      0.39        23
          3       1.00      0.18      0.31        11
          4       0.61      0.34      0.

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.52      0.68        29
          1       1.00      0.92      0.96        12
          2       0.77      0.99      0.87        97
          3       0.50      0.07      0.12        14

avg / total       0.81      0.81      0.77       152

[15  0 14  0  0 11  1  0  0  0 96  1  0  0 13  1]
LR Accuracy:  0.8092105263157895
LR F1:  0.6580295503729008
For name:  m_sahin
total sample size before apply threshold:  48
Counter({'0000-0001-7044-2953': 41, '0000-0002-3490-6009': 3, '0000-0001-6502-2209': 2, '0000-0001-7677-8423': 2})
['0000-0001-7044-2953']
Total sample size after apply threshold:  41
For name:  c_feng
total sample size before apply threshold:  88
Counter({'0000-0002-1854-356X': 30, '0000-0002-2130-8851': 26, '0000-0003-3267-0968': 12, '0000-0002-7031-4211': 12, '0000-0002-3278-9451': 7, '0000-0003-1085-4395': 1})
['0000-0002-1854-356X', '0000-0002-2130-8851', '0000-0003-3267-0968', '0000-0002-7031-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.75      0.86        12
          1       0.90      1.00      0.95        26

avg / total       0.93      0.92      0.92        38

[ 9  3  0 26]
LR Accuracy:  0.9210526315789473
LR F1:  0.9012987012987013
For name:  a_rocha
total sample size before apply threshold:  73
Counter({'0000-0003-3218-7001': 26, '0000-0001-9710-9835': 21, '0000-0003-2165-5519': 12, '0000-0002-4094-7982': 3, '0000-0002-5637-1041': 3, '0000-0001-6528-9034': 3, '0000-0003-4940-6522': 2, '0000-0003-0298-8246': 2, '0000-0001-8679-2886': 1})
['0000-0001-9710-9835', '0000-0003-2165-5519', '0000-0003-3218-7001']
Total sample size after apply threshold:  59
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=T

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(350, 165)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(350, 31)
2
(350, 196)
             precision    recall  f1-score   support

          0      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.76      0.78      0.77        85
          1       1.00      0.38      0.55        16
          2       0.84      0.75      0.79       108
          3       0.67      0.15      0.25        13
          4       0.94      0.80      0.87        41
          5       0.62      0.87      0.72        87

avg / total       0.78      0.75      0.75       350

[66  0  6  0  0 13  2  6  0  0  0  8 13  0 81  1  0 13  2  0  3  2  0  6
  1  0  0  0 33  7  3  0  6  0  2 76]
svc Accuracy:  0.7542857142857143
svc F1:  0.658207438236598
             precision    recall  f1-score   support

          0       0.79      0.68      0.73        85
          1       1.00      0.12      0.22        16
          2       0.69      0.87      0.77       108
          3       0.00      0.00      0.00        13
          4       0.96      0.56      0.71        41
          5       0.59      0.78      0.67        87

avg / total       0.71      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.64      0.96      0.77       155
          1       0.97      0.70      0.82        44
          2       0.81      0.48      0.60        60
          3       0.92      0.31      0.46        36
          4       1.00      0.19      0.32        21

avg / total       0.77      0.71      0.68       316

[149   0   5   1   0  12  31   1   0   0  30   1  29   0   0  24   0   1
  11   0  17   0   0   0   4]
svc Accuracy:  0.7088607594936709
svc F1:  0.5936630626954985
             precision    recall  f1-score   support

          0       0.62      0.94      0.75       155
          1       0.83      0.45      0.59        44
          2       0.64      0.50      0.56        60
          3       0.80      0.22      0.35        36
          4       1.00      0.10      0.17        21

avg / total       0.70      0.65      0.61       316

[145   2   8   0   0  19  20   4   1   0  30   0  30   0   0  24   1   3
   8   0  15  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.71      0.45      0.56        11
          1       0.77      0.89      0.83        57
          2       0.60      0.40      0.48        15
          3       0.70      0.90      0.79        42
          4       1.00      0.72      0.84        25
          5       0.83      0.45      0.59        11

avg / total       0.77      0.76      0.75       161

[ 5  2  1  2  0  1  0 51  3  3  0  0  0  5  6  4  0  0  0  4  0 38  0  0
  0  3  0  4 18  0  2  1  0  3  0  5]
svc Accuracy:  0.7639751552795031
svc F1:  0.6803225185580629
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       0.71      0.95      0.81        57
          2       0.80      0.27      0.40        15
          3       0.61      0.86      0.71        42
          4       1.00      0.68      0.81        25
          5       1.00      0.36      0.53        11

avg / total       0.71     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.78      0.49      0.60        43
          1       0.56      0.48      0.51        21
          2       1.00      0.65      0.79        17
          3       0.92      0.37      0.52        30
          4       1.00      0.33      0.50        12
          5       0.94      0.85      0.89       124
          6       0.71      0.29      0.42        17
          7       1.00      0.78      0.88        18
          8       0.49      0.88      0.63        99

avg / total       0.78      0.70      0.70       381

[ 21   0   0   1   0   0   0   0  21   0  10   0   0   0   0   0   0  11
   0   0  11   0   0   0   0   0   6   2   2   0  11   0   1   0   0  14
   0   1   0   0   4   0   0   0   7   1   0   0   0   0 105   0   0  18
   0   1   0   0   0   1   5   0  10   1   0   0   0   0   1   0  14   2
   2   4   0   0   0   4   2   0  87]
svc Accuracy:  0.7034120734908137
svc F1:  0.6373965300236487
             precision

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(41, 27)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(41, 7)
2
(41, 34)
             precision    recall  f1-score   support

          0       0.67 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(291, 100)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(291, 18)
2
(291, 118)
             precision    recall  f1-score   support

          0     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  180
Counter({'0000-0003-1643-6506': 64, '0000-0002-2707-8110': 56, '0000-0002-4804-6687': 20, '0000-0003-4882-4044': 17, '0000-0003-0710-2336': 10, '0000-0002-4395-0511': 7, '0000-0001-5448-0140': 4, '0000-0001-9076-6197': 2})
['0000-0003-0710-2336', '0000-0003-4882-4044', '0000-0003-1643-6506', '0000-0002-2707-8110', '0000-0002-4804-6687']
Total sample size after apply threshold:  167
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(167, 88)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.67      0.67      0.67        12
          1       0.67      0.67      0.67        12

avg / total       0.67      0.67      0.67        24

[8 4 4 8]
MNB Accuracy:  0.6666666666666666
MNB F1:  0.6666666666666666
             precision    recall  f1-score   support

          0       0.64      0.58      0.61        12
          1       0.62      0.67      0.64        12

avg / total       0.63      0.62      0.62        24

[7 5 4 8]
svc Accuracy:  0.625
svc F1:  0.6243478260869566
             precision    recall  f1-score   support

          0       0.67      0.67      0.67        12
          1       0.67      0.67      0.67        12

avg / total       0.67      0.67      0.67        24

[8 4 4 8]
LR Accuracy:  0.6666666666666666
LR F1:  0.6666666666666666
For name:  p_stevenson
total sample size before apply threshold:  122
Counter({'0000-0002-3520-5060': 86, '0000-0001-6780-6859': 33, '0000-0002-3232-5155'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(55, 22)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(55, 14)
2
(55, 36)
             precision    recall  f1-score   support

          0       0.64

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.86      0.30      0.44        20
          1       1.00      0.08      0.14        13
          2       0.69      1.00      0.81        55

avg / total       0.77      0.70      0.63        88

[ 6  0 14  1  1 11  0  0 55]
LR Accuracy:  0.7045454545454546
LR F1:  0.4673721340388007
For name:  y_kamiya
total sample size before apply threshold:  161
Counter({'0000-0003-4415-520X': 113, '0000-0001-9790-9867': 42, '0000-0001-8716-2536': 5, '0000-0002-0758-0234': 1})
['0000-0003-4415-520X', '0000-0001-9790-9867']
Total sample size after apply threshold:  155
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.50      0.67        18
          1       0.96      1.00      0.98       218

avg / total       0.96      0.96      0.96       236

[  9   9   0 218]
svc Accuracy:  0.961864406779661
svc F1:  0.8232209737827716
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        18
          1       0.92      1.00      0.96       218

avg / total       0.85      0.92      0.89       236

[  0  18   0 218]
LR Accuracy:  0.923728813559322
LR F1:  0.4801762114537445
For name:  a_ward
total sample size before apply threshold:  164
Counter({'0000-0001-7945-7975': 92, '0000-0003-4102-8694': 40, '0000-0002-7000-2453': 10, '0000-0001-6948-4814': 9, '0000-0002-6376-0061': 6, '0000-0003-0038-9426': 4, '0000-0002-9774-8677': 2, '0000-0003-1321-3358': 1})
['0000-0001-7945-7975', '0000-0002-7000-2453', '0000-0003-4102-8694']
Total sample size after apply threshold:  142
(0, 0)
Tfidf

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.75      0.97      0.85        92
          1       0.00      0.00      0.00        10
          2       0.88      0.53      0.66        40

avg / total       0.74      0.77      0.73       142

[89  0  3 10  0  0 19  0 21]
LR Accuracy:  0.7746478873239436
LR F1:  0.5012896825396825
For name:  j_chen
total sample size before apply threshold:  1139
Counter({'0000-0001-5077-4483': 92, '0000-0002-5756-3336': 87, '0000-0001-7858-8236': 73, '0000-0002-1752-4201': 61, '0000-0002-9220-8436': 55, '0000-0001-5859-3070': 51, '0000-0003-2996-5781': 49, '0000-0001-8807-3607': 43, '0000-0001-6527-4801': 41, '0000-0001-6879-5936': 35, '0000-0002-7253-2722': 33, '0000-0001-7336-8808': 31, '0000-0001-8634-1145': 29, '0000-0001-6491-6577': 28, '0000-0002-0662-782X': 19, '0000-0001-7381-0918': 18, '0000-0002-4429-283X': 17, '0000-0001-5168-7074': 16, '0000-0002-1591-9744': 14, '0000-0002-7409-7859': 14, '0000-0002-8021-7458': 13, '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.71      0.16      0.26        31
          1       0.41      0.79      0.54        73
          2       0.00      0.00      0.00        33
          3       0.40      0.29      0.34        55
          4       0.77      0.61      0.68        28
          5       0.00      0.00      0.00        14
          6       0.00      0.00      0.00        17
          7       0.28      0.73      0.40        92
          8       0.00      0.00      0.00        11
          9       0.30      0.90      0.45        87
         10       0.00      0.00      0.00        16
         11       0.61      0.29      0.39        49
         12       0.00      0.00      0.00        12
         13       0.00      0.00      0.00        10
         14       0.00      0.00      0.00        12
         15       0.32      0.21      0.25        43
         16       0.00      0.00      0.00        18
         17       0.00      0.00      0.00   

             precision    recall  f1-score   support

          0       0.67      0.58      0.62        31
          1       0.65      0.81      0.72        73
          2       0.38      0.18      0.24        33
          3       0.49      0.40      0.44        55
          4       0.76      0.68      0.72        28
          5       0.80      0.57      0.67        14
          6       0.00      0.00      0.00        17
          7       0.33      0.77      0.46        92
          8       0.00      0.00      0.00        11
          9       0.63      0.87      0.73        87
         10       0.57      0.25      0.35        16
         11       0.57      0.41      0.48        49
         12       1.00      0.50      0.67        12
         13       1.00      0.30      0.46        10
         14       0.25      0.08      0.12        12
         15       0.34      0.28      0.31        43
         16       0.18      0.11      0.14        18
         17       0.56      0.47      0.51   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(338, 139)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(338, 28)
2
(338, 167)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.93      0.82      0.87        17
          1       0.86      0.86      0.86        28
          2       0.62      0.38      0.48        13
          3       0.59      0.91      0.72       107
          4       0.50      0.18      0.26        17
          5       0.74      0.50      0.60        34
          6       1.00      0.53      0.70        15
          7       1.00      0.74      0.85        19
          8       0.73      0.69      0.71        32
          9       0.54      0.50      0.52        38
         10       1.00      0.39      0.56        18

avg / total       0.71      0.68      0.67       338

[14  0  0  1  0  2  0  0  0  0  0  0 24  0  2  0  1  0  0  0  1  0  0  0
  5  3  0  0  0  0  0  5  0  0  1  0 97  2  2  0  0  2  3  0  0  0  0 12
  3  0  0  0  0  2  0  1  1  0 15  0 17  0  0  0  0  0  0  0  0  7  0  0
  8  0  0  0  0  0  2  0  3  0  0  0 14  0  0  0  0  0  0  6  0  0  0  0
 22  4  0  0  0 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.58      0.41      0.48        17
          1       0.83      0.62      0.71        16
          2       0.50      0.18      0.27        11
          3       0.73      0.50      0.59        16
          4       1.00      1.00      1.00        13
          5       0.60      0.55      0.57        22
          6       0.79      0.73      0.76        30
          7       0.34      0.61      0.44        31

avg / total       0.65      0.60      0.60       156

[ 7  1  0  0  0  1  3  5  0 10  0  1  0  1  0  4  1  0  2  0  0  1  1  6
  0  1  0  8  0  0  0  7  0  0  0  0 13  0  0  0  1  0  0  0  0 12  1  8
  0  0  0  0  0  1 22  7  3  0  2  2  0  4  1 19]
svc Accuracy:  0.5961538461538461
svc F1:  0.6028918080642218
             precision    recall  f1-score   support

          0       0.50      0.41      0.45        17
          1       0.86      0.38      0.52        16
          2       0.50      0.09      0.15       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.48      0.59      0.53        22
          1       1.00      0.54      0.70        13
          2       0.69      0.90      0.78        63
          3       0.78      0.37      0.50        19
          4       0.00      0.00      0.00        13

avg / total       0.63      0.65      0.61       130

[13  0  7  1  1  2  7  3  0  1  4  0 57  1  1  4  0  7  7  1  4  0  9  0
  0]
svc Accuracy:  0.6461538461538462
svc F1:  0.5022868325412356
             precision    recall  f1-score   support

          0       0.43      0.45      0.44        22
          1       1.00      0.54      0.70        13
          2       0.62      0.95      0.75        63
          3       0.50      0.05      0.10        19
          4       0.00      0.00      0.00        13

avg / total       0.55      0.60      0.52       130

[10  0 12  0  0  2  7  4  0  0  2  0 60  1  0  5  0 12  1  1  4  0  9  0
  0]
LR Accuracy:  0.6
LR F1:  0.397936

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.45      0.56      0.50        16
          1       0.60      0.46      0.52        13
          2       0.64      0.60      0.62        15

avg / total       0.56      0.55      0.55        44

[9 3 4 6 6 1 5 1 9]
LR Accuracy:  0.5454545454545454
LR F1:  0.5474762618690655
For name:  s_morris
total sample size before apply threshold:  33
Counter({'0000-0003-2551-9717': 14, '0000-0002-5334-5809': 11, '0000-0002-7023-8634': 4, '0000-0002-8056-0934': 2, '0000-0003-4866-110X': 2})
['0000-0002-5334-5809', '0000-0003-2551-9717']
Total sample size after apply threshold:  25
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(376, 161)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(376, 23)
2
(376, 184)
             precision    recall  f1-score   support

          0      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.61      0.76        28
          1       0.81      0.67      0.73        57
          2       0.00      0.00      0.00        10
          3       0.83      0.36      0.50        14
          4       0.09      0.05      0.06        21
          5       0.95      0.73      0.83        81
          6       1.00      0.90      0.95        48
          7       0.58      0.93      0.71       117

avg / total       0.75      0.72      0.71       376

[ 17   0   1   0   2   0   0   8   0  38   0   0   0   2   0  17   0   1
   0   0   2   0   0   7   0   1   0   5   0   1   0   7   0   1   0   0
   1   0   0  19   0   2   0   1   3  59   0  16   0   0   0   0   0   0
  43   5   0   4   1   0   3   0   0 109]
svc Accuracy:  0.723404255319149
svc F1:  0.5667260818644014
             precision    recall  f1-score   support

          0       1.00      0.61      0.76        28
          1       0.80      0.65      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.85      0.60      0.70        47
          1       0.67      0.18      0.29        11
          2       0.62      0.81      0.70       115
          3       0.49      0.46      0.48        71
          4       0.75      0.21      0.33        14

avg / total       0.63      0.62      0.60       258

[28  0 13  6  0  0  2  4  5  0  2  1 93 18  1  0  0 38 33  0  3  0  3  5
  3]
svc Accuracy:  0.6162790697674418
svc F1:  0.4993113217827177
             precision    recall  f1-score   support

          0       0.73      0.40      0.52        47
          1       0.00      0.00      0.00        11
          2       0.56      0.83      0.66       115
          3       0.48      0.39      0.43        71
          4       1.00      0.21      0.35        14

avg / total       0.57      0.56      0.53       258

[19  0 21  7  0  0  0  9  2  0  3  0 95 17  0  0  0 43 28  0  4  0  3  4
  3]
LR Accuracy:  0.562015503875969
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(171, 98)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(171, 18)
2
(171, 116)
             precision    recall  f1-score   support

          0       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        13
          1       0.58      0.67      0.62        21
          2       0.61      0.79      0.69        28

avg / total       0.47      0.58      0.52        62

[ 0  5  8  1 14  6  1  5 22]
LR Accuracy:  0.5806451612903226
LR F1:  0.43657407407407406
For name:  a_vitale
total sample size before apply threshold:  56
Counter({'0000-0001-5586-2255': 43, '0000-0002-8682-3125': 7, '0000-0002-7339-4034': 4, '0000-0003-4980-5574': 2})
['0000-0001-5586-2255']
Total sample size after apply threshold:  43
For name:  q_yang
total sample size before apply threshold:  102
Counter({'0000-0002-3510-8906': 18, '0000-0001-9849-6996': 17, '0000-0003-4205-1909': 17, '0000-0001-6628-5393': 15, '0000-0002-4378-2335': 10, '0000-0003-4038-2464': 8, '0000-0002-6788-8775': 7, '0000-0003-0279-8784': 5, '0000-0001-6720-8795': 2, '0000-0001-8253-2278': 1, '0000-0002-1437-4498': 1, '0000-0003-2067-5999': 1})


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(102, 47)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(102, 16)
2
(102, 63)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.89      1.00      0.94        54
          1       1.00      0.81      0.90        37

avg / total       0.93      0.92      0.92        91

[54  0  7 30]
svc Accuracy:  0.9230769230769231
svc F1:  0.917326411421155
             precision    recall  f1-score   support

          0       0.84      1.00      0.92        54
          1       1.00      0.73      0.84        37

avg / total       0.91      0.89      0.89        91

[54  0 10 27]
LR Accuracy:  0.8901098901098901
LR F1:  0.8795021186440677
For name:  h_chang
total sample size before apply threshold:  182
Counter({'0000-0001-5411-6680': 55, '0000-0002-7997-4822': 39, '0000-0002-8417-8847': 22, '0000-0001-8877-1886': 18, '0000-0001-5810-7562': 11, '0000-0001-5577-2356': 9, '0000-0002-9812-8015': 6, '0000-0002-5248-3433': 5, '0000-0003-4987-5943': 4, '0000-0003-1832-8509': 4, '0000-0002-9405-2121': 2, '0000-0001-7378-8212': 2, '0000-0003-4843-1259': 1, '00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(33, 19)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(33, 14)
2
(33, 33)
             precision    recall  f1-score   support

          0       0.86

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.67      0.96      0.79        56
          1       1.00      0.71      0.83        28
          2       0.98      0.90      0.94        71
          3       0.95      0.81      0.88        75

avg / total       0.90      0.87      0.87       230

[54  0  0  2  8 20  0  0  6  0 64  1 13  0  1 61]
svc Accuracy:  0.8652173913043478
svc F1:  0.8601322033828497
             precision    recall  f1-score   support

          0       0.65      0.84      0.73        56
          1       1.00      0.57      0.73        28
          2       0.91      0.89      0.90        71
          3       0.86      0.84      0.85        75

avg / total       0.84      0.82      0.82       230

[47  0  5  4  9 16  0  3  5  0 63  3 11  0  1 63]
LR Accuracy:  0.8217391304347826
LR F1:  0.8032497696560197
For name:  c_lopez
total sample size before apply threshold:  53
Counter({'0000-0001-9298-2969': 34, '0000-0003-3668-7468': 12, '0000-00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(86, 39)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(86, 16)
2
(86, 55)
             precision    recall  f1-score   support

          0       0.84

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        15
          1       0.75      0.89      0.81        27
          2       0.73      0.50      0.59        16

avg / total       0.81      0.81      0.80        58

[15  0  0  0 24  3  0  8  8]
LR Accuracy:  0.8103448275862069
LR F1:  0.8020506382088303
For name:  a_lima
total sample size before apply threshold:  85
Counter({'0000-0002-1507-2264': 16, '0000-0002-9779-0584': 12, '0000-0002-3582-2640': 10, '0000-0002-2396-9880': 9, '0000-0003-2261-2801': 8, '0000-0001-6980-6553': 8, '0000-0001-8251-6286': 7, '0000-0002-3714-9904': 5, '0000-0002-1055-0554': 5, '0000-0002-9083-3377': 2, '0000-0002-4473-5311': 2, '0000-0002-4568-9126': 1})
['0000-0002-1507-2264', '0000-0002-9779-0584', '0000-0002-3582-2640']
Total sample size after apply threshold:  38
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



(128, 24)
2
(128, 85)
             precision    recall  f1-score   support

          0       0.81      0.63      0.71        27
          1       0.91      0.96      0.93       101

avg / total       0.89      0.89      0.89       128

[17 10  4 97]
MNB Accuracy:  0.890625
MNB F1:  0.8205128205128205
             precision    recall  f1-score   support

          0       0.94      0.59      0.73        27
          1       0.90      0.99      0.94       101

avg / total       0.91      0.91      0.90       128

[ 16  11   1 100]
svc Accuracy:  0.90625
svc F1:  0.8353344768439107
             precision    recall  f1-score   support

          0       1.00      0.52      0.68        27
          1       0.89      1.00      0.94       101

avg / total       0.91      0.90      0.89       128

[ 14  13   0 101]
LR Accuracy:  0.8984375
LR F1:  0.8112308564946115
For name:  z_he
total sample size before apply threshold:  160
Counter({'0000-0002-6098-7893': 46, '0000-0001-6302-6556': 40, '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(49, 27)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(49, 12)
2
(49, 39)
             precision    recall  f1-score   support

          0       0.89

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.65      0.88      0.75        25
          1       0.60      0.57      0.59        21
          2       0.86      0.40      0.55        15

avg / total       0.68      0.66      0.64        61

[22  3  0  8 12  1  4  5  6]
LR Accuracy:  0.6557377049180327
LR F1:  0.6255277036591629
For name:  r_walker
total sample size before apply threshold:  87
Counter({'0000-0002-5936-1068': 25, '0000-0003-0348-2407': 16, '0000-0001-7383-7846': 15, '0000-0002-6089-8225': 11, '0000-0003-0032-9925': 10, '0000-0001-9736-3497': 7, '0000-0002-2064-4546': 2, '0000-0002-5332-3562': 1})
['0000-0001-7383-7846', '0000-0003-0348-2407', '0000-0002-5936-1068', '0000-0003-0032-9925', '0000-0002-6089-8225']
Total sample size after apply threshold:  77
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_feat

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.33      0.29      0.31        14
          1       0.63      0.67      0.65        18
          2       0.81      0.85      0.83        20

avg / total       0.62      0.63      0.63        52

[ 4  6  4  6 12  0  2  1 17]
svc Accuracy:  0.6346153846153846
svc F1:  0.595203083007961
             precision    recall  f1-score   support

          0       0.40      0.29      0.33        14
          1       0.57      0.67      0.62        18
          2       0.76      0.80      0.78        20

avg / total       0.60      0.62      0.60        52

[ 4  6  4  5 12  1  1  3 16]
LR Accuracy:  0.6153846153846154
LR F1:  0.5764019178653325
For name:  f_andrade
total sample size before apply threshold:  37
Counter({'0000-0002-3856-3816': 12, '0000-0003-1199-2837': 12, '0000-0002-4947-2346': 11, '0000-0001-6257-1712': 2})
['0000-0002-4947-2346', '0000-0002-3856-3816', '0000-0003-1199-2837']
Total sample size after apply t

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(69, 52)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(69, 19)
2
(69, 71)
             precision    recall  f1-score   support

          0       0.50

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.7402597402597403
svc F1:  0.7639626919950371
             precision    recall  f1-score   support

          0       1.00      0.73      0.85        15
          1       0.48      0.48      0.48        21
          2       0.67      0.57      0.62        14
          3       0.61      0.74      0.67        27

avg / total       0.66      0.64      0.64        77

[11  1  1  2  0 10  3  8  0  3  8  3  0  7  0 20]
LR Accuracy:  0.6363636363636364
LR F1:  0.651098901098901
For name:  k_ryan
total sample size before apply threshold:  182
Counter({'0000-0002-1059-9681': 79, '0000-0003-3670-8505': 36, '0000-0001-5304-2026': 23, '0000-0003-4563-3744': 22, '0000-0001-9149-260X': 11, '0000-0002-0582-3693': 7, '0000-0002-9454-8768': 3, '0000-0002-6057-452X': 1})
['0000-0001-9149-260X', '0000-0003-3670-8505', '0000-0001-5304-2026', '0000-0003-4563-3744', '0000-0002-1059-9681']
Total sample size after apply threshold:  171
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='stric

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(75, 50)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(75, 14)
2
(75, 64)
             precision    recall  f1-score   support

          0       0.50

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.95      0.66      0.78        29
          1       0.77      0.96      0.86       107
          2       1.00      0.50      0.67        12
          3       0.90      0.67      0.77        42

avg / total       0.84      0.82      0.81       190

[ 19  10   0   0   1 103   0   3   0   6   6   0   0  14   0  28]
svc Accuracy:  0.8210526315789474
svc F1:  0.7669083729382163
             precision    recall  f1-score   support

          0       1.00      0.55      0.71        29
          1       0.70      1.00      0.83       107
          2       1.00      0.25      0.40        12
          3       0.95      0.43      0.59        42

avg / total       0.82      0.76      0.73       190

[ 16  12   0   1   0 107   0   0   0   9   3   0   0  24   0  18]
LR Accuracy:  0.7578947368421053
LR F1:  0.6318824679480417
For name:  a_gordon
total sample size before apply threshold:  126
Counter({'0000-0003-1676-9853': 36, '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



Total sample size after apply threshold:  24
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(24, 23)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(24, 7)
2
(24, 30)
             precision    recal

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.23      0.38        13
          1       0.84      1.00      0.91        53

avg / total       0.87      0.85      0.81        66

[ 3 10  0 53]
LR Accuracy:  0.8484848484848485
LR F1:  0.6443965517241379
For name:  m_crespo
total sample size before apply threshold:  49
Counter({'0000-0002-7732-7808': 20, '0000-0002-1852-2259': 12, '0000-0001-8762-7874': 9, '0000-0002-7086-9751': 8})
['0000-0002-7732-7808', '0000-0002-1852-2259']
Total sample size after apply threshold:  32
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        voc

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



(169, 16)
2
(169, 76)
             precision    recall  f1-score   support

          0       0.85      0.98      0.91       138
          1       0.75      0.30      0.43        20
          2       0.67      0.18      0.29        11

avg / total       0.83      0.85      0.81       169

[135   2   1  14   6   0   9   0   2]
MNB Accuracy:  0.8461538461538461
MNB F1:  0.5421492921492922
             precision    recall  f1-score   support

          0       0.88      0.98      0.93       138
          1       0.79      0.55      0.65        20
          2       1.00      0.18      0.31        11

avg / total       0.88      0.88      0.85       169

[135   3   0   9  11   0   9   0   2]
svc Accuracy:  0.8757396449704142
svc F1:  0.6275287275893705
             precision    recall  f1-score   support

          0       0.83      0.99      0.90       138
          1       0.00      0.00      0.00        20
          2       1.00      0.09      0.17        11

avg / total       0.74     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      0.71      0.79        66
          1       0.71      0.91      0.80        11
          2       0.84      0.94      0.89        86

avg / total       0.85      0.85      0.84       163

[47  4 15  1 10  0  5  0 81]
svc Accuracy:  0.8466257668711656
svc F1:  0.826675285498815
             precision    recall  f1-score   support

          0       0.90      0.67      0.77        66
          1       0.73      0.73      0.73        11
          2       0.81      0.97      0.88        86

avg / total       0.84      0.83      0.82       163

[44  3 19  2  8  1  3  0 83]
LR Accuracy:  0.8282208588957055
LR F1:  0.7902656656279845
For name:  l_wilson
total sample size before apply threshold:  59
Counter({'0000-0001-8709-8968': 18, '0000-0003-4175-7125': 11, '0000-0001-6659-6001': 11, '0000-0002-3779-8277': 11, '0000-0002-3532-0309': 5, '0000-0002-8333-5660': 3})
['0000-0001-8709-8968', '0000-0003-4175-7125', '0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(346, 161)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(346, 20)
2
(346, 181)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.75      0.79      0.77        95
          1       0.93      0.61      0.74        46
          2       1.00      0.46      0.63        13
          3       0.98      0.82      0.90        74
          4       0.61      0.85      0.71        95
          5       1.00      0.20      0.33        10

avg / total       0.80      0.76      0.76       333

[75  0  0  1 19  0  3 28  0  0 15  0  3  0  6  0  4  0  5  0  0 61  8  0
 14  0  0  0 81  0  0  2  0  0  6  2]
svc Accuracy:  0.7597597597597597
svc F1:  0.6797617157524277
             precision    recall  f1-score   support

          0       0.71      0.79      0.75        95
          1       0.85      0.48      0.61        46
          2       0.00      0.00      0.00        13
          3       0.76      0.77      0.77        74
          4       0.58      0.77      0.66        95
          5       0.00      0.00      0.00        10

avg / total       0.65     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.86      0.98      0.91        87
          1       0.95      0.75      0.84        55

avg / total       0.90      0.89      0.88       142

[85  2 14 41]
LR Accuracy:  0.8873239436619719
LR F1:  0.8753565942506034
For name:  s_henderson
total sample size before apply threshold:  82
Counter({'0000-0002-1076-3867': 52, '0000-0002-9032-3828': 25, '0000-0003-3019-1891': 4, '0000-0001-6389-4927': 1})
['0000-0002-1076-3867', '0000-0002-9032-3828']
Total sample size after apply threshold:  77
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.71      0.59      0.64        41
          1       0.76      0.85      0.80        65

avg / total       0.74      0.75      0.74       106

[24 17 10 55]
svc Accuracy:  0.7452830188679245
svc F1:  0.7214598540145984
             precision    recall  f1-score   support

          0       0.69      0.44      0.54        41
          1       0.71      0.88      0.79        65

avg / total       0.70      0.71      0.69       106

[18 23  8 57]
LR Accuracy:  0.7075471698113207
LR F1:  0.6617601646937725
For name:  z_xie
total sample size before apply threshold:  99
Counter({'0000-0003-2974-1825': 48, '0000-0001-5816-6159': 17, '0000-0002-8348-4455': 16, '0000-0002-1539-5100': 8, '0000-0002-4526-9746': 6, '0000-0003-0308-5233': 1, '0000-0002-3137-561X': 1, '0000-0003-2492-0592': 1, '0000-0002-6600-8190': 1})
['0000-0003-2974-1825', '0000-0002-8348-4455', '0000-0001-5816-6159']
Total sample size after apply threshold:

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(370, 130)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(370, 22)
2
(370, 152)
             precision    recall  f1-score   support

          0      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.33      0.12      0.17        17
          1       0.76      0.98      0.85       213
          2       1.00      0.52      0.68        25
          3       1.00      0.15      0.27        13
          4       1.00      0.84      0.91        87
          5       0.00      0.00      0.00        15

avg / total       0.79      0.81      0.77       370

[  2  15   0   0   0   0   4 209   0   0   0   0   0  12  13   0   0   0
   0  11   0   2   0   0   0  14   0   0  73   0   0  15   0   0   0   0]
svc Accuracy:  0.8081081081081081
svc F1:  0.4820159937386812
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        17
          1       0.74      1.00      0.85       213
          2       1.00      0.44      0.61        25
          3       0.00      0.00      0.00        13
          4       1.00      0.80      0.89        87
          5       0.00      0.00      0.00   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.29      0.48      0.36        23
          1       1.00      0.54      0.70        26
          2       0.54      0.77      0.64        70
          3       0.53      0.30      0.38        33
          4       0.86      0.55      0.67        11
          5       1.00      0.62      0.76        13
          6       0.59      0.53      0.56        19
          7       1.00      0.20      0.33        10

avg / total       0.64      0.56      0.56       205

[11  0 10  2  0  0  0  0  4 14  7  0  0  0  1  0  5  0 54  5  1  0  5  0
  7  0 16 10  0  0  0  0  4  0  1  0  6  0  0  0  1  0  3  0  0  8  1  0
  4  0  5  0  0  0 10  0  2  0  4  2  0  0  0  2]
svc Accuracy:  0.5609756097560976
svc F1:  0.5497531946784598
             precision    recall  f1-score   support

          0       0.32      0.30      0.31        23
          1       0.92      0.46      0.62        26
          2       0.42      0.83      0.56       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      0.49      0.63       112
          1       0.91      0.53      0.67        80
          2       0.64      0.94      0.76       180
          3       0.61      0.58      0.59        19

avg / total       0.76      0.71      0.70       391

[ 55   1  56   0   2  42  32   4   5   3 169   3   0   0   8  11]
svc Accuracy:  0.7084398976982097
svc F1:  0.6632489327762479
             precision    recall  f1-score   support

          0       0.76      0.54      0.63       112
          1       0.90      0.35      0.50        80
          2       0.62      0.93      0.74       180
          3       0.78      0.37      0.50        19

avg / total       0.72      0.67      0.65       391

[ 60   1  51   0   7  28  43   2  10   2 168   0   2   0  10   7]
LR Accuracy:  0.6726342710997443
LR F1:  0.594034896917953
For name:  r_sinha
total sample size before apply threshold:  27
Counter({'0000-0001-5497-5055': 11, '000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


190
Counter({'0000-0002-1771-9017': 83, '0000-0002-5390-4113': 24, '0000-0003-3537-6246': 23, '0000-0003-3398-6294': 17, '0000-0001-8434-1758': 15, '0000-0003-2660-9183': 10, '0000-0003-2193-5473': 5, '0000-0002-4293-5037': 2, '0000-0001-7557-8518': 2, '0000-0002-4643-917X': 1, '0000-0001-8528-0694': 1, '0000-0003-0790-5905': 1, '0000-0003-0355-3981': 1, '0000-0002-8201-1592': 1, '0000-0002-4172-7981': 1, '0000-0001-6544-126X': 1, '0000-0001-7622-7269': 1, '0000-0002-8466-0043': 1})
['0000-0001-8434-1758', '0000-0003-3398-6294', '0000-0002-5390-4113', '0000-0003-3537-6246', '0000-0002-1771-9017', '0000-0003-2660-9183']
Total sample size after apply threshold:  172
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.69      0.76      0.72        29
          1       0.82      0.56      0.67        16
          2       0.59      0.64      0.62        25

avg / total       0.68      0.67      0.67        70

[22  0  7  3  9  4  7  2 16]
svc Accuracy:  0.6714285714285714
svc F1:  0.6677875858203727
             precision    recall  f1-score   support

          0       0.62      0.79      0.70        29
          1       0.90      0.56      0.69        16
          2       0.65      0.60      0.63        25

avg / total       0.70      0.67      0.67        70

[23  0  6  5  9  2  9  1 15]
LR Accuracy:  0.6714285714285714
LR F1:  0.6714257964257965
For name:  j_zhong
total sample size before apply threshold:  280
Counter({'0000-0002-2265-9338': 115, '0000-0002-1494-6396': 70, '0000-0003-3148-4143': 37, '0000-0002-3534-7480': 21, '0000-0003-1801-9642': 19, '0000-0001-7157-603X': 8, '0000-0002-8815-4105': 4, '0000-0002-8945-4599'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.61      0.38      0.47        37
          1       0.60      0.32      0.41        19
          2       0.40      0.19      0.26        21
          3       0.74      0.91      0.82       115
          4       0.81      0.89      0.84        70

avg / total       0.70      0.73      0.70       262

[ 14   1   3  16   3   3   6   1   8   1   3   0   4   8   6   2   3   0
 105   5   1   0   2   5  62]
svc Accuracy:  0.7290076335877863
svc F1:  0.5598364647556109
             precision    recall  f1-score   support

          0       1.00      0.22      0.36        37
          1       1.00      0.16      0.27        19
          2       0.50      0.05      0.09        21
          3       0.69      0.95      0.80       115
          4       0.70      0.91      0.80        70

avg / total       0.74      0.71      0.64       262

[  8   0   0  20   9   0   3   0  15   1   0   0   1   9  11   0   0   0
 109   6   0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.82      0.92      0.87        72
          1       1.00      0.13      0.24        15
          2       0.75      0.84      0.79        43

avg / total       0.82      0.80      0.77       130

[66  0  6  7  2  6  7  0 36]
LR Accuracy:  0.8
LR F1:  0.6316413204958096
For name:  m_walsh
total sample size before apply threshold:  37
Counter({'0000-0001-5683-1151': 30, '0000-0001-8920-7419': 3, '0000-0002-1770-3314': 2, '0000-0003-0982-4105': 2})
['0000-0001-5683-1151']
Total sample size after apply threshold:  30
For name:  r_figueiredo
total sample size before apply threshold:  48
Counter({'0000-0002-2122-6530': 29, '0000-0001-5806-0944': 17, '0000-0002-4304-6434': 1, '0000-0002-0933-4854': 1})
['0000-0001-5806-0944', '0000-0002-2122-6530']
Total sample size after apply threshold:  46
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8',

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.50      0.15      0.24        13
          1       0.75      0.23      0.35        13
          2       0.72      0.33      0.46        39
          3       1.00      0.75      0.86        20
          4       0.80      0.36      0.50        33
          5       0.71      0.45      0.55        49
          6       0.25      0.20      0.22        20
          7       0.68      0.78      0.73       146
          8       0.24      0.17      0.20        23
          9       0.00      0.00      0.00        12
         10       0.87      0.83      0.85        24
         11       1.00      0.27      0.42        15
         12       0.38      0.93      0.54       115
         13       0.92      0.53      0.67        64
         14       1.00      0.40      0.57        15
         15       1.00      0.45      0.62        22
         16       0.50      0.24      0.32        21
         17       0.00      0.00      0.00   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      0.73      0.80        11
          1       0.80      0.92      0.86        13

avg / total       0.84      0.83      0.83        24

[ 8  3  1 12]
LR Accuracy:  0.8333333333333334
LR F1:  0.8285714285714285
For name:  e_ford
total sample size before apply threshold:  54
Counter({'0000-0002-7885-0019': 34, '0000-0001-5613-8509': 14, '0000-0001-7358-798X': 4, '0000-0003-0952-3660': 2})
['0000-0002-7885-0019', '0000-0001-5613-8509']
Total sample size after apply threshold:  48
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocab

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.43      0.25      0.32        12
          1       0.89      0.95      0.92        74

avg / total       0.82      0.85      0.83        86

[ 3  9  4 70]
MNB Accuracy:  0.8488372093023255
MNB F1:  0.6154110767113863
             precision    recall  f1-score   support

          0       0.60      0.50      0.55        12
          1       0.92      0.95      0.93        74

avg / total       0.88      0.88      0.88        86

[ 6  6  4 70]
svc Accuracy:  0.8837209302325582
svc F1:  0.7393939393939393
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.86      0.99      0.92        74

avg / total       0.74      0.85      0.79        86

[ 0 12  1 73]
LR Accuracy:  0.8488372093023255
LR F1:  0.4591194968553459
For name:  m_thomas
total sample size before apply threshold:  225
Counter({'0000-0002-2452-981X': 69, '0000-0001-5939-1155': 52,

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.65      0.54      0.59        24
          1       1.00      0.80      0.89        10
          2       0.44      0.29      0.35        14
          3       0.00      0.00      0.00        13
          4       0.67      0.79      0.73        52
          5       0.68      0.91      0.78        69
          6       0.29      0.11      0.16        18

avg / total       0.59      0.66      0.61       200

[13  0  0  0  5  4  2  1  8  0  0  0  1  0  2  0  4  0  3  3  2  1  0  0
  0  4  7  1  1  0  3  0 41  7  0  0  0  0  2  4 63  0  2  0  2  0  4  8
  2]
svc Accuracy:  0.655
svc F1:  0.4987236516209198
             precision    recall  f1-score   support

          0       0.65      0.54      0.59        24
          1       1.00      0.20      0.33        10
          2       0.67      0.29      0.40        14
          3       0.00      0.00      0.00        13
          4       0.62      0.77      0.69        52
 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.43      0.27      0.33        11
          1       0.73      0.90      0.81        68
          2       1.00      0.73      0.84        11
          3       0.56      0.59      0.57        34
          4       0.71      0.45      0.56        11
          5       0.50      0.20      0.29        10

avg / total       0.67      0.68      0.66       145

[ 3  5  0  3  0  0  4 61  0  3  0  0  0  2  8  1  0  0  0 12  0 20  1  1
  0  1  0  4  5  1  0  2  0  5  1  2]
svc Accuracy:  0.6827586206896552
svc F1:  0.5660140048428651
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        11
          1       0.60      0.96      0.73        68
          2       1.00      0.73      0.84        11
          3       0.70      0.47      0.56        34
          4       0.00      0.00      0.00        11
          5       0.67      0.20      0.31        10

avg / total       0.56     

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(57, 26)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(57, 18)
2
(57, 44)
             precision    recall  f1-score   support

          0       0.67

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.62      0.77        16
          1       0.57      0.29      0.38        14
          2       0.76      0.95      0.84        79
          3       0.83      0.53      0.65        19

avg / total       0.78      0.77      0.75       128

[10  0  6  0  0  4 10  0  0  2 75  2  0  1  8 10]
svc Accuracy:  0.7734375
svc F1:  0.6595102674298035
             precision    recall  f1-score   support

          0       1.00      0.25      0.40        16
          1       0.00      0.00      0.00        14
          2       0.65      1.00      0.79        79
          3       1.00      0.11      0.19        19

avg / total       0.67      0.66      0.56       128

[ 4  0 12  0  0  0 14  0  0  0 79  0  0  0 17  2]
LR Accuracy:  0.6640625
LR F1:  0.344136460554371
For name:  s_clark
total sample size before apply threshold:  39
Counter({'0000-0001-5907-9671': 12, '0000-0002-7488-3438': 9, '0000-0001-7328-0726': 8, '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.64      0.64      0.64        14
          1       0.62      0.45      0.53        11
          2       0.64      0.82      0.72        11

avg / total       0.64      0.64      0.63        36

[9 2 3 4 5 2 1 1 9]
LR Accuracy:  0.6388888888888888
LR F1:  0.6297243107769424
For name:  a_kirby
total sample size before apply threshold:  64
Counter({'0000-0002-2440-9316': 26, '0000-0001-5663-2961': 25, '0000-0002-6928-668X': 9, '0000-0003-0395-6684': 4})
['0000-0002-2440-9316', '0000-0001-5663-2961']
Total sample size after apply threshold:  51
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.30      0.46        10
          1       0.54      0.35      0.42        20
          2       0.50      0.07      0.12        14
          3       0.53      0.87      0.66        47
          4       0.75      0.50      0.60        12

avg / total       0.60      0.56      0.52       103

[ 3  0  0  7  0  0  7  0 13  0  0  1  1 11  1  0  4  1 41  1  0  1  0  5
  6]
MNB Accuracy:  0.5631067961165048
MNB F1:  0.4544142416723062
             precision    recall  f1-score   support

          0       0.75      0.30      0.43        10
          1       0.56      0.45      0.50        20
          2       0.43      0.21      0.29        14
          3       0.55      0.79      0.65        47
          4       0.78      0.58      0.67        12

avg / total       0.58      0.57      0.55       103

[ 3  0  0  7  0  0  9  0 11  0  0  2  3  8  1  0  5  4 37  1  1  0  0  4
  7]
svc Accuracy:  0.5728155339805825


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.61      0.88      0.72       102
          1       1.00      0.40      0.57        40
          2       0.62      0.82      0.71        91
          3       0.90      0.22      0.35        41
          4       0.75      0.55      0.64        74

avg / total       0.72      0.66      0.64       348

[90  0  8  0  4 11 16 12  0  1 12  0 75  0  4 11  0 16  9  5 23  0  9  1
 41]
MNB Accuracy:  0.6637931034482759
MNB F1:  0.5987641405653102
             precision    recall  f1-score   support

          0       0.71      0.83      0.77       102
          1       1.00      0.68      0.81        40
          2       0.65      0.86      0.74        91
          3       0.80      0.39      0.52        41
          4       0.76      0.64      0.69        74

avg / total       0.75      0.73      0.72       348

[85  0  9  0  8  5 27  8  0  0  7  0 78  2  4  5  0 17 16  3 17  0  8  2
 47]
svc Accuracy:  0.7270114942528736


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(210, 73)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(210, 32)
2
(210, 105)
             precision    recall  f1-score   support

          0       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.54      0.50      0.52        14
          1       0.50      0.74      0.60        19
          2       0.44      0.33      0.38        12
          3       0.60      0.30      0.40        10

avg / total       0.52      0.51      0.49        55

[ 7  4  1  2  2 14  3  0  2  6  4  0  2  4  1  3]
MNB Accuracy:  0.509090909090909
MNB F1:  0.4738038950804908
             precision    recall  f1-score   support

          0       0.45      0.36      0.40        14
          1       0.48      0.84      0.62        19
          2       0.71      0.42      0.53        12
          3       0.50      0.20      0.29        10

avg / total       0.53      0.51      0.48        55

[ 5  6  1  2  2 16  1  0  1  6  5  0  3  5  0  2]
svc Accuracy:  0.509090909090909
svc F1:  0.4568536726431463
             precision    recall  f1-score   support

          0       0.50      0.43      0.46        14
          1       0.48      0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.84      1.00      0.91       149
          1       1.00      0.26      0.41        39

avg / total       0.87      0.85      0.81       188

[149   0  29  10]
LR Accuracy:  0.8457446808510638
LR F1:  0.6597391250078013
For name:  j_young
total sample size before apply threshold:  267
Counter({'0000-0002-1514-1522': 124, '0000-0003-4182-341X': 40, '0000-0003-3849-3392': 30, '0000-0002-1294-942X': 23, '0000-0002-2711-9701': 17, '0000-0001-7219-7824': 16, '0000-0003-4886-9517': 10, '0000-0003-1745-2401': 4, '0000-0001-9791-2513': 2, '0000-0001-6583-7643': 1})
['0000-0003-3849-3392', '0000-0002-1294-942X', '0000-0001-7219-7824', '0000-0003-4182-341X', '0000-0002-2711-9701', '0000-0003-4886-9517', '0000-0002-1514-1522']
Total sample size after apply threshold:  260
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.74      0.47      0.57        30
          1       0.76      0.57      0.65        23
          2       0.83      0.31      0.45        16
          3       1.00      0.75      0.86        40
          4       1.00      0.41      0.58        17
          5       1.00      0.60      0.75        10
          6       0.70      0.99      0.82       124

avg / total       0.80      0.76      0.74       260

[ 14   2   0   0   0   0  14   1  13   1   0   0   0   8   2   2   5   0
   0   0   7   0   0   0  30   0   0  10   1   0   0   0   7   0   9   0
   0   0   0   0   6   4   1   0   0   0   0   0 123]
svc Accuracy:  0.7615384615384615
svc F1:  0.6698846701952293
             precision    recall  f1-score   support

          0       0.64      0.30      0.41        30
          1       0.92      0.48      0.63        23
          2       0.00      0.00      0.00        16
          3       0.96      0.60      0.74   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(136, 49)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(136, 16)
2
(136, 65)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.64      0.58      0.61        12
          1       0.78      0.82      0.80        22

avg / total       0.73      0.74      0.73        34

[ 7  5  4 18]
svc Accuracy:  0.7352941176470589
svc F1:  0.7043478260869566
             precision    recall  f1-score   support

          0       0.86      0.50      0.63        12
          1       0.78      0.95      0.86        22

avg / total       0.81      0.79      0.78        34

[ 6  6  1 21]
LR Accuracy:  0.7941176470588235
LR F1:  0.7443609022556391
For name:  x_yang
total sample size before apply threshold:  164
Counter({'0000-0001-5207-4210': 40, '0000-0002-2036-1220': 32, '0000-0003-3454-3604': 13, '0000-0003-0437-2015': 12, '0000-0002-1142-3100': 10, '0000-0002-7398-4229': 7, '0000-0002-5118-7755': 6, '0000-0002-5083-1799': 6, '0000-0002-4862-7422': 6, '0000-0003-2642-4963': 4, '0000-0002-1375-4800': 4, '0000-0001-8231-5556': 3, '0000-0002-5095-6735': 3, '00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  36
Counter({'0000-0002-6360-849X': 13, '0000-0003-1592-4823': 8, '0000-0001-7935-7245': 7, '0000-0003-4982-8127': 3, '0000-0003-4645-0384': 3, '0000-0003-3804-3041': 1, '0000-0002-8171-3242': 1})
['0000-0002-6360-849X']
Total sample size after apply threshold:  13
For name:  a_cooper
total sample size before apply threshold:  265
Counter({'0000-0001-6709-7343': 112, '0000-0001-6050-3863': 72, '0000-0002-5897-2107': 23, '0000-0003-1025-0268': 16, '0000-0003-3975-3897': 15, '0000-0003-4588-2513': 12, '0000-0003-4097-5569': 4, '0000-0002-0815-0084': 4, '0000-0001-6027-8272': 3, '0000-0002-8305-8587': 2, '0000-0002-7328-4361': 1, '0000-0001-8763-8530': 1})
['0000-0002-5897-2107', '0000-0001-6709-7343', '0000-0003-3975-3897', '0000-0003-4588-2513', '0000-0001-6050-3863', '0000-0003-1025-0268']
Total sample size after apply threshold:  250
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.48      0.48      0.48        23
          1       0.78      0.89      0.83       112
          2       0.78      0.47      0.58        15
          3       0.17      0.08      0.11        12
          4       0.73      0.75      0.74        72
          5       0.80      0.50      0.62        16

avg / total       0.71      0.72      0.71       250

[ 11   7   1   0   4   0   3 100   0   0   7   2   1   3   7   3   1   0
   2   3   0   1   6   0   4  11   1   2  54   0   2   4   0   0   2   8]
svc Accuracy:  0.724
svc F1:  0.5601915483541452
             precision    recall  f1-score   support

          0       0.22      0.09      0.12        23
          1       0.67      0.95      0.78       112
          2       0.00      0.00      0.00        15
          3       0.00      0.00      0.00        12
          4       0.63      0.68      0.65        72
          5       1.00      0.25      0.40        16

avg 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


 43
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(43, 17)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(43, 12)
2
(43, 29)
             precision    recall  f1-score   support

          0       

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.26      0.42        19
          1       0.90      1.00      0.95       131

avg / total       0.92      0.91      0.88       150

[  5  14   0 131]
svc Accuracy:  0.9066666666666666
svc F1:  0.6829710144927537
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        19
          1       0.87      1.00      0.93       131

avg / total       0.76      0.87      0.81       150

[  0  19   0 131]
LR Accuracy:  0.8733333333333333
LR F1:  0.46619217081850534
For name:  m_moore
total sample size before apply threshold:  112
Counter({'0000-0002-5127-4509': 45, '0000-0003-3074-6631': 38, '0000-0002-7853-5756': 18, '0000-0003-4768-5329': 7, '0000-0002-7914-0166': 4})
['0000-0002-7853-5756', '0000-0003-3074-6631', '0000-0002-5127-4509']
Total sample size after apply threshold:  101
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        d

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(286, 133)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(286, 24)
2
(286, 157)
             precision    recall  f1-score   support

          0      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.38      0.24      0.29        21
          1       1.00      1.00      1.00        44
          2       0.50      0.12      0.20        16
          3       0.75      0.25      0.38        12
          4       0.75      0.95      0.84       114
          5       0.67      0.44      0.53        18
          6       0.64      0.81      0.72        47
          7       0.50      0.21      0.30        14

avg / total       0.71      0.74      0.70       286

[  5   0   0   0   4   2  10   0   0  44   0   0   0   0   0   0   0   0
   2   0   9   0   5   0   2   0   0   3   4   1   2   0   2   0   1   0
 108   0   0   3   0   0   0   0   7   8   3   0   4   0   1   1   3   0
  38   0   0   0   0   0   9   1   1   3]
svc Accuracy:  0.7377622377622378
svc F1:  0.5320801768491512
             precision    recall  f1-score   support

          0       1.00      0.10      0.17        21
          1       1.00      0.98     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(49, 29)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(49, 10)
2
(49, 39)
             precision    recall  f1-score   support

          0       0.87

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.67      0.15      0.25        13
          1       0.83      0.33      0.48        15
          2       0.75      0.33      0.46        18
          3       1.00      0.79      0.88        14
          4       0.63      0.77      0.69        61
          5       0.58      0.94      0.72        70
          6       1.00      0.27      0.43        11
          7       0.81      0.55      0.66        40
          8       0.57      0.42      0.48        19

avg / total       0.70      0.65      0.63       261

[ 2  1  1  0  1  8  0  0  0  0  5  0  0  7  2  0  1  0  1  0  6  0  4  6
  0  1  0  0  0  0 11  0  2  0  0  1  0  0  0  0 47 12  0  2  0  0  0  1
  0  3 66  0  0  0  0  0  0  0  4  2  3  0  2  0  0  0  0  1 14  0 22  3
  0  0  0  0  8  2  0  1  8]
svc Accuracy:  0.6513409961685823
svc F1:  0.5607147826661512
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.57      0.82      0.67        57
          1       0.75      0.40      0.52        30
          2       0.82      0.63      0.72        52

avg / total       0.70      0.66      0.66       139

[47  4  6 17 12  1 19  0 33]
svc Accuracy:  0.6618705035971223
svc F1:  0.6368530020703934
             precision    recall  f1-score   support

          0       0.57      0.77      0.66        57
          1       0.71      0.33      0.45        30
          2       0.77      0.71      0.74        52

avg / total       0.68      0.65      0.64       139

[44  4  9 18 10  2 15  0 37]
LR Accuracy:  0.6546762589928058
LR F1:  0.6170872908186341
For name:  y_zhang
total sample size before apply threshold:  1244
Counter({'0000-0001-8642-4071': 104, '0000-0002-3254-8965': 64, '0000-0001-7307-9408': 56, '0000-0002-9956-3879': 48, '0000-0003-2932-4159': 48, '0000-0003-2317-2190': 45, '0000-0003-2753-7601': 37, '0000-0001-6118-66

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(967, 24)
2
(967, 394)
             precision    recall  f1-score   support

          0       1.00      0.05      0.09        22
          1       0.00      0.00      0.00        14
          2       0.00      0.00      0.00        21
          3       0.00      0.00      0.00        20
          4       0.00      0.00      0.00        13
          5       0.00      0.00      0.00        22
          6       0.29      0.70      0.41        64
          7       0.36      0.21      0.27        56
          8       0.00      0.00      0.00        22
          9       0.50      0.08      0.13        26
         10       0.29      0.14      0.19        28
         11       0.94      0.60      0.73        48
         12       0.00      0.00      0.00        14
         13       0.00      0.00      0.00        12
         14       0.26      0.38      0.31        48
         15       0.33      0.07      0.12        27
         16       0.00      0.00      0.00        26
         17       0.44

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(51, 30)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(51, 14)
2
(51, 44)
             precision    recall  f1-score   support

          0       0.71      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  86
Counter({'0000-0002-8493-4649': 51, '0000-0001-6394-658X': 31, '0000-0002-8973-104X': 2, '0000-0001-9689-7040': 1, '0000-0002-1779-4535': 1})
['0000-0002-8493-4649', '0000-0001-6394-658X']
Total sample size after apply threshold:  82
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(82, 65)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=N

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.82      0.69      0.75        13
          1       0.82      0.64      0.72        22
          2       1.00      0.81      0.90        16
          3       0.35      0.64      0.45        67
          4       0.76      0.57      0.65        28
          5       0.85      0.49      0.62        35
          6       0.48      0.56      0.52        78
          7       1.00      0.07      0.13        14
          8       0.87      0.41      0.55        32
          9       0.50      0.25      0.33        20
         10       1.00      0.60      0.75        15
         11       1.00      0.29      0.44        28
         12       0.66      0.85      0.74       171
         13       0.80      0.50      0.62        56

avg / total       0.69      0.61      0.61       595

[  9   0   0   2   0   0   1   0   0   0   0   0   1   0   0  14   0   6
   1   0   0   0   0   0   0   0   0   1   0   0  13   0   0   0   1   0
   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.92      0.86      0.89        14
          1       0.91      0.95      0.93        21

avg / total       0.91      0.91      0.91        35

[12  2  1 20]
MNB Accuracy:  0.9142857142857143
MNB F1:  0.9095607235142119
             precision    recall  f1-score   support

          0       0.92      0.86      0.89        14
          1       0.91      0.95      0.93        21

avg / total       0.91      0.91      0.91        35

[12  2  1 20]
svc Accuracy:  0.9142857142857143
svc F1:  0.9095607235142119
             precision    recall  f1-score   support

          0       0.92      0.86      0.89        14
          1       0.91      0.95      0.93        21

avg / total       0.91      0.91      0.91        35

[12  2  1 20]
LR Accuracy:  0.9142857142857143
LR F1:  0.9095607235142119
For name:  b_liu
total sample size before apply threshold:  298
Counter({'0000-0002-0956-2777': 97, '0000-0002-8662-0512': 24, '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(78, 50)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(78, 13)
2
(78, 63)
             precision    recall  f1-score   support

          0       0.00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.84      0.93      0.88        67
          1       0.92      0.85      0.88        13
          2       0.82      0.67      0.74        21
          3       0.58      0.71      0.64        31
          4       1.00      0.18      0.31        11

avg / total       0.80      0.78      0.76       143

[62  0  0  5  0  0 11  0  2  0  3  0 14  4  0  6  1  2 22  0  3  0  1  5
  2]
svc Accuracy:  0.7762237762237763
svc F1:  0.688329639297846
             precision    recall  f1-score   support

          0       0.64      0.97      0.77        67
          1       1.00      0.77      0.87        13
          2       0.86      0.57      0.69        21
          3       0.67      0.39      0.49        31
          4       0.00      0.00      0.00        11

avg / total       0.66      0.69      0.65       143

[65  0  0  2  0  3 10  0  0  0  7  0 12  2  0 17  0  2 12  0  9  0  0  2
  0]
LR Accuracy:  0.6923076923076923
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.95      0.97        38
          1       0.55      0.43      0.48        14
          2       0.75      0.19      0.30        16
          3       0.71      0.93      0.80        55

avg / total       0.79      0.78      0.75       123

[36  0  0  2  0  6  0  8  0  2  3 11  0  3  1 51]
svc Accuracy:  0.7804878048780488
svc F1:  0.6390306448180464
             precision    recall  f1-score   support

          0       1.00      0.89      0.94        38
          1       0.33      0.07      0.12        14
          2       0.00      0.00      0.00        16
          3       0.63      0.98      0.77        55

avg / total       0.63      0.72      0.65       123

[34  0  0  4  0  1  0 13  0  1  0 15  0  1  0 54]
LR Accuracy:  0.7235772357723578
LR F1:  0.4570122375191211
For name:  s_ferreira
total sample size before apply threshold:  70
Counter({'0000-0001-7159-2769': 20, '0000-0001-8308-2862': 17, '0000

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(85, 49)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(85, 12)
2
(85, 61)
             precision    recall  f1-score   support

          0       0.76

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.77      0.79      0.78        42
          1       0.78      0.58      0.67        12
          2       0.82      0.84      0.83        58

avg / total       0.79      0.79      0.79       112

[33  1  8  2  7  3  8  1 49]
svc Accuracy:  0.7946428571428571
svc F1:  0.7578819098260773
             precision    recall  f1-score   support

          0       0.76      0.76      0.76        42
          1       0.86      0.50      0.63        12
          2       0.81      0.88      0.84        58

avg / total       0.80      0.79      0.79       112

[32  1  9  3  6  3  7  0 51]
LR Accuracy:  0.7946428571428571
LR F1:  0.7454863052949178
For name:  h_tanaka
total sample size before apply threshold:  28
Counter({'0000-0002-4378-5747': 21, '0000-0003-1511-8557': 4, '0000-0002-3153-8802': 1, '0000-0002-1760-691X': 1, '0000-0001-8622-7422': 1})
['0000-0002-4378-5747']
Total sample size after apply threshold:  21
For name

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(21, 15)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(21, 6)
2
(21, 21)
             precision    recall  f1-score   support

          0       0.67 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  786
Counter({'0000-0003-2861-8286': 158, '0000-0001-5525-0494': 156, '0000-0002-8984-8600': 74, '0000-0001-5790-5116': 41, '0000-0002-7524-9914': 39, '0000-0002-7802-8690': 39, '0000-0003-4968-5138': 30, '0000-0002-7713-1813': 22, '0000-0002-9968-3707': 18, '0000-0002-6332-5182': 16, '0000-0002-5668-7134': 16, '0000-0001-5554-7714': 14, '0000-0002-4839-8279': 14, '0000-0002-1112-1209': 11, '0000-0003-0423-2514': 10, '0000-0002-4386-5851': 10, '0000-0002-9679-8357': 10, '0000-0003-3786-2889': 10, '0000-0002-1673-2164': 10, '0000-0001-7604-792X': 8, '0000-0002-1840-1473': 8, '0000-0003-0393-1655': 7, '0000-0003-4212-5955': 7, '0000-0002-0067-0288': 5, '0000-0002-0634-0546': 5, '0000-0003-2002-4774': 4, '0000-0001-5470-9523': 4, '0000-0002-4364-4979': 4, '0000-0002-5388-1732': 3, '0000-0001-5203-5908': 3, '0000-0001-7231-7021': 3, '0000-0002-5334-0047': 3, '0000-0002-1718-0744': 2, '0000-0003-0384-4447': 2, '0000-0002-2100-7223': 2, '0000-0003-45

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[  8   0   0   0   0   0   6  24   1   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   1  16   0   0   0   0   0   0   0   1   0
   0   0   0   0   0   0   0   0   2  11   1   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   2   7   1   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   1   7   2   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   6   3   0   0   0   0
   0   1   0   0   0   0   1   0   0   0   0   0 122  23   6   0   0   0
   0   0   2   2   0   0   0   0   0   0   0   0   0   7 151   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0  11  29  31   0
   0   0   0   0   3   0   0   0   0   0   0   0   0   0   0   2  18   2
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2   7
   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   2
   8   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   1  14   1   0   0   0   0   0   0   0   0   0   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.60      0.23      0.33        13
          1       0.82      0.87      0.85       103
          2       0.89      0.70      0.78        46
          3       0.79      0.91      0.85        69

avg / total       0.81      0.81      0.80       231

[ 3  5  1  4  1 90  2 10  1 10 32  3  0  5  1 63]
svc Accuracy:  0.8138528138528138
svc F1:  0.7011322861598027
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        13
          1       0.72      0.86      0.78       103
          2       0.82      0.59      0.68        46
          3       0.76      0.81      0.78        69

avg / total       0.71      0.74      0.72       231

[ 0  7  2  4  0 89  3 11  0 16 27  3  0 12  1 56]
LR Accuracy:  0.7445887445887446
LR F1:  0.5627255140443118
For name:  m_kobayashi
total sample size before apply threshold:  51
Counter({'0000-0002-6657-1928': 33, '0000-0003-0219-9108': 9, '0000

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(164, 46)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(164, 25)
2
(164, 71)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.97      1.00      0.99       155
          1       1.00      0.80      0.89        20

avg / total       0.98      0.98      0.98       175

[155   0   4  16]
MNB Accuracy:  0.9771428571428571
MNB F1:  0.9380750176928521
             precision    recall  f1-score   support

          0       0.99      1.00      0.99       155
          1       1.00      0.90      0.95        20

avg / total       0.99      0.99      0.99       175

[155   0   2  18]
svc Accuracy:  0.9885714285714285
svc F1:  0.9704790823211876
             precision    recall  f1-score   support

          0       0.97      1.00      0.99       155
          1       1.00      0.80      0.89        20

avg / total       0.98      0.98      0.98       175

[155   0   4  16]
LR Accuracy:  0.9771428571428571
LR F1:  0.9380750176928521
For name:  a_marino
total sample size before apply threshold:  15
Counter({'0000-0002-1709-538X': 7, '0000-0002-0528-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.78      0.94      0.86        54
          1       1.00      0.58      0.73        19
          2       0.76      0.68      0.72        28

avg / total       0.82      0.80      0.79       101

[51  0  3  5 11  3  9  0 19]
LR Accuracy:  0.801980198019802
LR F1:  0.769152440850554
For name:  t_becker
total sample size before apply threshold:  21
Counter({'0000-0002-4117-8249': 12, '0000-0002-5656-4564': 5, '0000-0003-3432-783X': 3, '0000-0002-5193-4044': 1})
['0000-0002-4117-8249']
Total sample size after apply threshold:  12
For name:  s_pedersen
total sample size before apply threshold:  322
Counter({'0000-0002-7838-8063': 166, '0000-0002-3044-7714': 80, '0000-0002-6500-9263': 40, '0000-0002-4786-6464': 21, '0000-0001-8055-3251': 11, '0000-0002-8566-7693': 1, '0000-0002-4355-1764': 1, '0000-0002-3822-5075': 1, '0000-0001-8017-4227': 1})
['0000-0002-7838-8063', '0000-0002-3044-7714', '0000-0002-4786-6464', '0000-

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.73      0.94      0.82       166
          1       0.75      0.55      0.63        80
          2       1.00      0.43      0.60        21
          3       1.00      0.45      0.62        11
          4       1.00      0.78      0.87        40

avg / total       0.79      0.77      0.76       318

[156  10   0   0   0  36  44   0   0   0   9   3   9   0   0   5   1   0
   5   0   8   1   0   0  31]
svc Accuracy:  0.7704402515723271
svc F1:  0.7104771186757044
             precision    recall  f1-score   support

          0       0.67      0.95      0.79       166
          1       0.76      0.51      0.61        80
          2       1.00      0.10      0.17        21
          3       1.00      0.09      0.17        11
          4       1.00      0.65      0.79        40

avg / total       0.77      0.72      0.68       318

[158   8   0   0   0  39  41   0   0   0  14   5   2   0   0  10   0   0
   1   0  14  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(567, 136)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(567, 27)
2
(567, 163)
             precision    recall  f1-score   support

          0      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.91      0.56      0.69        18
          1       0.90      0.69      0.78        55
          2       0.58      0.54      0.56        13
          3       0.61      0.38      0.47        29
          4       0.97      0.96      0.96       331
          5       1.00      0.47      0.64        15
          6       0.55      0.89      0.68        74
          7       0.97      0.88      0.92        32

avg / total       0.88      0.86      0.86       567

[ 10   0   1   0   1   0   5   1   0  38   1   0   6   0  10   0   0   0
   7   1   1   0   4   0   0   0   1  11   1   0  16   0   0   3   1   1
 318   0   8   0   0   0   0   0   1   7   7   0   0   1   1   5   1   0
  66   0   1   0   0   0   0   0   3  28]
svc Accuracy:  0.855379188712522
svc F1:  0.7129020055194237
             precision    recall  f1-score   support

          0       0.67      0.11      0.19        18
          1       0.87      0.62      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.82      0.97      0.89        32
          1       0.94      0.71      0.81        24

avg / total       0.87      0.86      0.85        56

[31  1  7 17]
LR Accuracy:  0.8571428571428571
LR F1:  0.8476190476190477
For name:  c_marshall
total sample size before apply threshold:  106
Counter({'0000-0003-4186-0368': 32, '0000-0002-7571-5700': 29, '0000-0001-5901-2004': 28, '0000-0002-1285-7648': 6, '0000-0002-8227-2354': 6, '0000-0001-6669-3231': 3, '0000-0002-7397-6472': 1, '0000-0002-0592-7716': 1})
['0000-0001-5901-2004', '0000-0003-4186-0368', '0000-0002-7571-5700']
Total sample size after apply threshold:  89
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_word

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.75      0.88      0.81        49
          1       0.84      0.67      0.74        24
          2       0.87      0.79      0.83        34

avg / total       0.81      0.80      0.80       107

[43  2  4  8 16  0  6  1 27]
svc Accuracy:  0.8037383177570093
svc F1:  0.79542534399928
             precision    recall  f1-score   support

          0       0.69      0.84      0.76        49
          1       0.79      0.46      0.58        24
          2       0.79      0.79      0.79        34

avg / total       0.75      0.74      0.73       107

[41  2  6 12 11  1  6  1 27]
LR Accuracy:  0.7383177570093458
LR F1:  0.7107747582463784
For name:  g_volpe
total sample size before apply threshold:  31
Counter({'0000-0001-9993-5348': 15, '0000-0001-5057-1846': 14, '0000-0002-3916-5393': 1, '0000-0003-0760-4627': 1})
['0000-0001-9993-5348', '0000-0001-5057-1846']
Total sample size after apply threshold:  29
(0, 0)
TfidfV

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(401, 142)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(401, 32)
2
(401, 174)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(166, 135)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(166, 3)
2
(166, 138)
             precision    recall  f1-score   support

          0       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.73      0.69      0.71        16
          1       0.71      0.75      0.73        16

avg / total       0.72      0.72      0.72        32

[11  5  4 12]
svc Accuracy:  0.71875
svc F1:  0.7184750733137829
             precision    recall  f1-score   support

          0       0.69      0.69      0.69        16
          1       0.69      0.69      0.69        16

avg / total       0.69      0.69      0.69        32

[11  5  5 11]
LR Accuracy:  0.6875
LR F1:  0.6875
For name:  s_jacobs
total sample size before apply threshold:  21
Counter({'0000-0002-6199-5748': 9, '0000-0002-9959-5627': 8, '0000-0003-4674-4817': 2, '0000-0002-9382-1646': 1, '0000-0002-8103-1700': 1})
[]
Total sample size after apply threshold:  0
For name:  c_hong
total sample size before apply threshold:  32
Counter({'0000-0002-1058-3073': 23, '0000-0001-7745-9205': 7, '0000-0002-5118-620X': 1, '0000-0002-7397-1671': 1})
['0000-0002-1058-3073']

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(109, 48)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(109, 15)
2
(109, 63)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.63      0.75      0.69        16
          1       0.50      0.36      0.42        11

avg / total       0.58      0.59      0.58        27

[12  4  7  4]
svc Accuracy:  0.5925925925925926
svc F1:  0.5533834586466165
             precision    recall  f1-score   support

          0       0.60      0.75      0.67        16
          1       0.43      0.27      0.33        11

avg / total       0.53      0.56      0.53        27

[12  4  8  3]
LR Accuracy:  0.5555555555555556
LR F1:  0.4999999999999999
For name:  d_lloyd
total sample size before apply threshold:  157
Counter({'0000-0002-0824-9682': 104, '0000-0003-0658-8995': 50, '0000-0003-3589-7383': 1, '0000-0003-1759-6106': 1, '0000-0003-1497-6808': 1})
['0000-0003-0658-8995', '0000-0002-0824-9682']
Total sample size after apply threshold:  154
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, enco

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.93      0.50      0.65        50
          1       0.80      0.98      0.88       104

avg / total       0.84      0.82      0.81       154

[ 25  25   2 102]
LR Accuracy:  0.8246753246753247
LR F1:  0.7662337662337662
For name:  a_mohammadi
total sample size before apply threshold:  8
Counter({'0000-0002-8345-8206': 3, '0000-0001-7845-1707': 2, '0000-0001-7491-6423': 1, '0000-0003-4272-2733': 1, '0000-0002-8477-0939': 1})
[]
Total sample size after apply threshold:  0
For name:  d_dean
total sample size before apply threshold:  189
Counter({'0000-0002-4512-9065': 174, '0000-0002-5688-703X': 10, '0000-0002-8599-773X': 2, '0000-0002-2279-3393': 2, '0000-0003-4793-6511': 1})
['0000-0002-5688-703X', '0000-0002-4512-9065']
Total sample size after apply threshold:  184
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.80      0.21      0.33        19
          1       0.73      0.99      0.84       322
          2       1.00      0.64      0.78        11
          3       0.75      0.27      0.40        11
          4       1.00      0.67      0.80        15
          5       0.97      0.78      0.87        37
          6       0.75      0.21      0.33        28
          7       0.93      0.64      0.76        61
          8       0.50      0.10      0.17        20
          9       0.62      0.42      0.50        12
         10       1.00      0.32      0.48        22

avg / total       0.79      0.77      0.73       558

[  4  13   0   0   0   0   0   1   0   1   0   1 318   0   0   0   1   1
   0   1   0   0   0   4   7   0   0   0   0   0   0   0   0   0   7   0
   3   0   0   0   1   0   0   0   0   5   0   0  10   0   0   0   0   0
   0   0   8   0   0   0  29   0   0   0   0   0   0  21   0   0   0   0
   6   0   1   0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



             precision    recall  f1-score   support

          0       0.63      0.92      0.75        48
          1       0.85      0.52      0.65        21
          2       1.00      0.56      0.72        16
          3       0.69      0.48      0.56        23

avg / total       0.74      0.69      0.68       108

[44  1  0  3  9 11  0  1  6  0  9  1 11  1  0 11]
svc Accuracy:  0.6944444444444444
svc F1:  0.6692310248740957
             precision    recall  f1-score   support

          0       0.54      0.90      0.68        48
          1       0.80      0.38      0.52        21
          2       1.00      0.25      0.40        16
          3       0.60      0.39      0.47        23

avg / total       0.67      0.59      0.56       108

[43  1  0  4 12  8  0  1 11  0  4  1 13  1  0  9]
LR Accuracy:  0.5925925925925926
LR F1:  0.5167446492787723
For name:  i_wilson
total sample size before apply threshold:  220
Counter({'0000-0002-0246-738X': 102, '0000-0001-8996-1518': 85, '000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Counter({'0000-0002-3261-6868': 98, '0000-0001-5372-7885': 63, '0000-0003-2084-2718': 35, '0000-0002-6740-2472': 12, '0000-0002-3652-7029': 7, '0000-0001-9951-4674': 2})
['0000-0002-3261-6868', '0000-0003-2084-2718', '0000-0001-5372-7885', '0000-0002-6740-2472']
Total sample size after apply threshold:  208
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(208, 95)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1,

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.85      0.97      0.91       180
          1       0.83      0.58      0.69        60
          2       1.00      0.61      0.76        18

avg / total       0.86      0.86      0.85       258

[175   5   0  25  35   0   5   2  11]
MNB Accuracy:  0.8565891472868217
MNB F1:  0.7846620361833344
             precision    recall  f1-score   support

          0       0.88      1.00      0.94       180
          1       1.00      0.70      0.82        60
          2       1.00      0.61      0.76        18

avg / total       0.91      0.90      0.90       258

[180   0   0  18  42   0   7   0  11]
svc Accuracy:  0.9031007751937985
svc F1:  0.8390716788282712
             precision    recall  f1-score   support

          0       0.82      1.00      0.90       180
          1       1.00      0.45      0.62        60
          2       1.00      0.61      0.76        18

avg / total       0.87      0.84      0.83       2

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(77, 47)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(77, 14)
2
(77, 61)
             precision    recall  f1-score   support

          0       0.67

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  56
Counter({'0000-0002-0086-252X': 32, '0000-0002-5655-3320': 13, '0000-0001-9075-7100': 4, '0000-0003-2465-3033': 4, '0000-0002-4303-1574': 3})
['0000-0002-0086-252X', '0000-0002-5655-3320']
Total sample size after apply threshold:  45
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(45, 16)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=N

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.91      1.00      0.96        32
          1       1.00      0.77      0.87        13

avg / total       0.94      0.93      0.93        45

[32  0  3 10]
LR Accuracy:  0.9333333333333333
LR F1:  0.9123945489941597
For name:  s_hasan
total sample size before apply threshold:  12
Counter({'0000-0002-9089-5367': 4, '0000-0002-0158-703X': 2, '0000-0002-7269-094X': 2, '0000-0001-5589-8741': 1, '0000-0001-7789-2842': 1, '0000-0003-4271-395X': 1, '0000-0001-6832-9150': 1})
[]
Total sample size after apply threshold:  0
For name:  m_teixeira
total sample size before apply threshold:  313
Counter({'0000-0003-4124-6237': 149, '0000-0002-5676-6174': 51, '0000-0002-4896-5982': 48, '0000-0001-9355-2143': 17, '0000-0002-9466-7951': 17, '0000-0002-6944-3008': 13, '0000-0001-7456-5192': 7, '0000-0002-3338-8588': 4, '0000-0003-3989-9474': 3, '0000-0002-2228-2673': 2, '0000-0003-1205-3233': 2})
['0000-0002-4896-5982', '0000-0002-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.73      0.98      0.84        48
          1       0.67      0.20      0.30        51
          2       0.00      0.00      0.00        17
          3       0.00      0.00      0.00        13
          4       0.69      0.99      0.81       149
          5       0.00      0.00      0.00        17

avg / total       0.58      0.69      0.60       295

[ 47   1   0   0   0   0  13  10   0   0  28   0   2   2   0   0  13   0
   0   0   0   0  13   0   0   1   0   0 148   0   2   1   0   0  14   0]
MNB Accuracy:  0.6949152542372882
MNB F1:  0.32554582023760104
             precision    recall  f1-score   support

          0       0.77      0.96      0.85        48
          1       0.64      0.45      0.53        51
          2       1.00      0.41      0.58        17
          3       0.50      0.15      0.24        13
          4       0.77      0.95      0.85       149
          5       1.00      0.18      0.30  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.86      0.90      0.88        21
          1       0.88      0.82      0.85        17

avg / total       0.87      0.87      0.87        38

[19  2  3 14]
svc Accuracy:  0.868421052631579
svc F1:  0.8661028893587033
             precision    recall  f1-score   support

          0       0.90      0.90      0.90        21
          1       0.88      0.88      0.88        17

avg / total       0.89      0.89      0.89        38

[19  2  2 15]
LR Accuracy:  0.8947368421052632
LR F1:  0.8935574229691876
For name:  s_lam
total sample size before apply threshold:  90
Counter({'0000-0003-3294-6637': 69, '0000-0001-7468-1142': 6, '0000-0002-5318-1760': 5, '0000-0002-2982-9192': 3, '0000-0002-1888-1067': 3, '0000-0001-7943-5004': 3, '0000-0002-1471-5176': 1})
['0000-0003-3294-6637']
Total sample size after apply threshold:  69
For name:  t_tran
total sample size before apply threshold:  54
Counter({'0000-0002-4686-8601': 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(115, 76)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(115, 14)
2
(115, 90)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.87      0.92      0.89        73
          1       0.77      0.67      0.71        30

avg / total       0.84      0.84      0.84       103

[67  6 10 20]
MNB Accuracy:  0.8446601941747572
MNB F1:  0.8038095238095238
             precision    recall  f1-score   support

          0       0.90      0.99      0.94        73
          1       0.96      0.73      0.83        30

avg / total       0.92      0.91      0.91       103

[72  1  8 22]
svc Accuracy:  0.912621359223301
svc F1:  0.8856825749167592
             precision    recall  f1-score   support

          0       0.87      1.00      0.93        73
          1       1.00      0.63      0.78        30

avg / total       0.91      0.89      0.88       103

[73  0 11 19]
LR Accuracy:  0.8932038834951457
LR F1:  0.8527232549070584
For name:  p_thompson
total sample size before apply threshold:  148
Counter({'0000-0002-5910-7625': 69, '0000-0002-2268-9748': 48

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  965
Counter({'0000-0002-4319-4285': 188, '0000-0002-6601-9180': 58, '0000-0003-2493-5209': 58, '0000-0003-3373-9621': 57, '0000-0002-0889-7057': 40, '0000-0001-8041-1608': 28, '0000-0002-9831-6796': 24, '0000-0002-1905-8750': 24, '0000-0002-9231-0844': 21, '0000-0002-7737-0785': 20, '0000-0002-5822-2226': 20, '0000-0002-5108-2072': 19, '0000-0002-1138-2556': 19, '0000-0002-8798-7316': 18, '0000-0002-2195-2997': 18, '0000-0003-3460-0867': 16, '0000-0003-4649-6526': 16, '0000-0001-5903-6487': 15, '0000-0002-1041-793X': 14, '0000-0001-8683-509X': 14, '0000-0002-7068-5135': 14, '0000-0002-9405-9024': 12, '0000-0001-9803-7140': 11, '0000-0002-8344-5907': 10, '0000-0003-1338-8887': 9, '0000-0003-0391-7298': 9, '0000-0002-1221-3033': 9, '0000-0003-3812-3850': 9, '0000-0002-6457-0235': 8, '0000-0003-1572-8339': 8, '0000-0001-8828-114X': 8, '0000-0001-5289-6062': 8, '0000-0001-8970-4466': 7, '0000-0003-1113-6264': 7, '0000-0003-3526-4586': 7, '0000-000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.80      0.50      0.62        24
          1       0.00      0.00      0.00        12
          2       0.00      0.00      0.00        15
          3       0.96      0.79      0.86        28
          4       0.00      0.00      0.00        18
          5       0.00      0.00      0.00        16
          6       0.86      0.30      0.44        20
          7       0.00      0.00      0.00        10
          8       0.00      0.00      0.00        21
          9       0.37      0.98      0.54       188
         10       0.67      0.11      0.19        18
         11       0.00      0.00      0.00        24
         12       0.62      0.31      0.41        58
         13       1.00      0.25      0.40        16
         14       0.00      0.00      0.00        19
         15       0.00      0.00      0.00        14
         16       0.00      0.00      0.00        20
         17       1.00      0.16      0.27   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.75      0.25      0.38        12
          1       0.50      0.43      0.47        23
          2       0.58      0.37      0.45        19
          3       0.56      0.62      0.59        16
          4       1.00      0.80      0.89        20
          5       0.71      0.50      0.59        24
          6       0.46      0.79      0.58        34
          7       0.52      0.74      0.61        39
          8       0.00      0.00      0.00        12
          9       0.83      0.38      0.53        13

avg / total       0.58      0.56      0.54       212

[ 3  0  3  2  0  0  4  0  0  0  0 10  0  0  0  3  2  7  0  1  1  2  7  1
  0  0  3  4  1  0  0  0  0 10  0  0  5  1  0  0  0  1  0  0 16  0  0  3
  0  0  0  2  0  1  0 12  5  4  0  0  0  1  0  2  0  1 27  2  1  0  0  2
  0  0  0  1  6 29  1  0  0  0  1  2  0  0  6  3  0  0  0  2  1  0  0  0
  1  3  1  5]
svc Accuracy:  0.5613207547169812
svc F1:  0.5071706485

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.86      0.66      0.75        38
          1       0.75      0.50      0.60        12
          2       0.69      0.89      0.78        45

avg / total       0.77      0.75      0.74        95

[25  1 12  0  6  6  4  1 40]
LR Accuracy:  0.7473684210526316
LR F1:  0.7076558952808772
For name:  c_barros
total sample size before apply threshold:  34
Counter({'0000-0003-4666-5000': 16, '0000-0003-3244-7467': 13, '0000-0003-2330-398X': 2, '0000-0002-5863-2874': 2, '0000-0003-2236-4553': 1})
['0000-0003-3244-7467', '0000-0003-4666-5000']
Total sample size after apply threshold:  29
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(157, 17)
2
(157, 73)
             precision    recall  f1-score   support

          0       0.90      0.99      0.95       139
          1       0.75      0.17      0.27        18

avg / total       0.88      0.90      0.87       157

[138   1  15   3]
MNB Accuracy:  0.8980891719745223
MNB F1:  0.6089663760896639
             precision    recall  f1-score   support

          0       0.91      1.00      0.96       139
          1       1.00      0.28      0.43        18

avg / total       0.92      0.92      0.90       157

[139   0  13   5]
svc Accuracy:  0.9171974522292994
svc F1:  0.695054534588376
             precision    recall  f1-score   support

          0       0.89      1.00      0.94       139
          1       1.00      0.06      0.11        18

avg / total       0.90      0.89      0.85       157

[139   0  17   1]
LR Accuracy:  0.89171974522293
LR F1:  0.5238180196253346
For name:  m_pinto
total sample size before apply threshold:  201
Counter({'0000-0002-4676-1409': 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(311, 140)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(311, 19)
2
(311, 159)
             precision    recall  f1-score   support

          0      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.63      0.82      0.71        85
          1       1.00      0.58      0.74        12
          2       0.75      0.35      0.48        17
          3       0.43      0.39      0.41        23
          4       0.36      0.58      0.44        33
          5       1.00      0.60      0.75        15
          6       0.77      0.59      0.67        29
          7       0.70      0.52      0.59        31
          8       0.57      0.61      0.59        38
          9       0.92      0.71      0.80        17
         10       0.00      0.00      0.00        11

avg / total       0.63      0.60      0.60       311

[70  0  0  0  5  0  1  4  5  0  0  4  7  0  0  1  0  0  0  0  0  0  4  0
  6  1  3  0  0  0  0  0  3  1  0  0  9 13  0  0  0  0  0  0  5  0  0  8
 19  0  0  0  1  0  0  3  0  0  0  3  9  0  0  0  0  0  3  0  0  0  3  0
 17  0  5  0  1 10  0  0  1  2  0  0 16  2  0  0  5  0  0  2  4  0  2  1
 23  1  0  1  0 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.17      0.09      0.12        11
          1       0.43      0.53      0.48        19
          2       0.58      0.85      0.69        13
          3       0.64      0.50      0.56        14
          4       0.36      0.31      0.33        13

avg / total       0.45      0.47      0.45        70

[ 1  5  2  2  1  0 10  2  2  5  2  0 11  0  0  2  2  2  7  1  1  6  2  0
  4]
LR Accuracy:  0.4714285714285714
LR F1:  0.43493417366946785
For name:  j_choi
total sample size before apply threshold:  441
Counter({'0000-0002-2775-3315': 98, '0000-0002-8439-6035': 42, '0000-0003-0018-8712': 25, '0000-0003-2379-2226': 23, '0000-0001-9760-9514': 21, '0000-0002-7491-6711': 21, '0000-0003-2206-4593': 20, '0000-0002-4850-8204': 19, '0000-0003-4897-3277': 15, '0000-0003-3257-2508': 15, '0000-0001-5408-9029': 14, '0000-0002-1161-6586': 13, '0000-0002-9663-4790': 13, '0000-0001-7348-9861': 12, '0000-0002-7532-5315': 10, '0000-00

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.57      0.27      0.36        15
          1       1.00      0.40      0.57        25
          2       0.35      0.39      0.37        23
          3       0.60      0.23      0.33        13
          4       0.56      0.45      0.50        42
          5       0.80      0.38      0.52        21
          6       1.00      0.40      0.57        10
          7       1.00      0.67      0.80        21
          8       1.00      0.20      0.33        15
          9       1.00      0.69      0.82        13
         10       0.82      0.45      0.58        20
         11       0.46      0.96      0.62        98
         12       1.00      0.50      0.67        14
         13       1.00      0.63      0.77        19
         14       1.00      0.58      0.74        12
         15       1.00      0.80      0.89        10

avg / total       0.71      0.59      0.58       371

[ 4  0  4  0  1  0  0  0  0  0  0  6  0  0 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.62      0.71      0.67        14
          1       0.67      0.57      0.62        14

avg / total       0.65      0.64      0.64        28

[10  4  6  8]
svc Accuracy:  0.6428571428571429
svc F1:  0.641025641025641
             precision    recall  f1-score   support

          0       0.67      0.86      0.75        14
          1       0.80      0.57      0.67        14

avg / total       0.73      0.71      0.71        28

[12  2  6  8]
LR Accuracy:  0.7142857142857143
LR F1:  0.7083333333333333
For name:  j_barbosa
total sample size before apply threshold:  35
Counter({'0000-0002-1854-1572': 11, '0000-0001-5879-9458': 9, '0000-0002-8664-8152': 9, '0000-0003-4135-2347': 3, '0000-0002-7259-2901': 1, '0000-0002-7828-2912': 1, '0000-0001-7869-5533': 1})
['0000-0002-1854-1572']
Total sample size after apply threshold:  11
For name:  e_o'connor
total sample size before apply threshold:  18
Counter({'0000-0001-6727

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Total sample size after apply threshold:  133
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(133, 71)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(133, 25)
2
(133, 96)
             precision    r

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(107, 68)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(107, 23)
2
(107, 91)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.84      0.91       104
          1       0.72      0.46      0.56        39
          2       1.00      0.71      0.83        24
          3       0.79      0.98      0.88       487
          4       0.92      0.34      0.50        35
          5       0.87      0.46      0.60        57
          6       0.67      0.15      0.25        13
          7       0.94      0.65      0.77        23
          8       0.91      0.77      0.84       105

avg / total       0.84      0.83      0.82       887

[ 87   0   0  17   0   0   0   0   0   0  18   0  20   1   0   0   0   0
   0   0  17   7   0   0   0   0   0   0   4   0 479   0   3   0   0   1
   0   2   0  20  12   1   0   0   0   0   1   0  30   0  26   0   0   0
   0   0   0   4   0   0   2   0   7   0   0   0   8   0   0   0  15   0
   0   0   0  22   0   0   1   1  81]
svc Accuracy:  0.830890642615558
svc F1:  0.6811591199683865
             precision 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  176
Counter({'0000-0001-8238-798X': 76, '0000-0002-9554-7273': 27, '0000-0002-3270-0525': 19, '0000-0003-0274-7258': 11, '0000-0001-6322-2821': 11, '0000-0002-1568-0489': 11, '0000-0001-8322-7443': 9, '0000-0002-8524-229X': 5, '0000-0002-7707-656X': 3, '0000-0001-5326-2758': 2, '0000-0003-0488-1207': 1, '0000-0002-5193-7560': 1})
['0000-0002-3270-0525', '0000-0003-0274-7258', '0000-0002-9554-7273', '0000-0001-8238-798X', '0000-0001-6322-2821', '0000-0002-1568-0489']
Total sample size after apply threshold:  155
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabu

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.78      0.37      0.50        19
          1       0.00      0.00      0.00        11
          2       0.77      0.37      0.50        27
          3       0.58      0.99      0.73        76
          4       0.00      0.00      0.00        11
          5       1.00      0.36      0.53        11

avg / total       0.59      0.62      0.55       155

[ 7  0  1 11  0  0  0  0  1 10  0  0  1  0 10 16  0  0  0  0  1 75  0  0
  1  0  0 10  0  0  0  0  0  7  0  4]
LR Accuracy:  0.6193548387096774
LR F1:  0.3775067750677507
For name:  b_ferreira
total sample size before apply threshold:  29
Counter({'0000-0002-8565-3101': 10, '0000-0002-6781-2236': 6, '0000-0003-2156-2988': 6, '0000-0002-0221-3160': 3, '0000-0003-1388-5015': 3, '0000-0002-5612-5385': 1})
['0000-0002-8565-3101']
Total sample size after apply threshold:  10
For name:  r_neves
total sample size before apply threshold:  25
Counter({'0000-0003-4866-5215': 1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(84, 54)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(84, 18)
2
(84, 72)
             precision    recall  f1-score   support

          0       0.76

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.90      0.81      0.85        57
          1       0.77      0.88      0.82        41

avg / total       0.85      0.84      0.84        98

[46 11  5 36]
svc Accuracy:  0.8367346938775511
svc F1:  0.835016835016835
             precision    recall  f1-score   support

          0       0.80      0.84      0.82        57
          1       0.76      0.71      0.73        41

avg / total       0.78      0.79      0.78        98

[48  9 12 29]
LR Accuracy:  0.7857142857142857
LR F1:  0.7773450178513469
For name:  r_dias
total sample size before apply threshold:  26
Counter({'0000-0002-9214-2166': 15, '0000-0001-7921-405X': 7, '0000-0002-6804-7409': 3, '0000-0003-1503-998X': 1})
['0000-0002-9214-2166']
Total sample size after apply threshold:  15
For name:  s_sengupta
total sample size before apply threshold:  149
Counter({'0000-0003-3357-1216': 64, '0000-0002-6365-1770': 31, '0000-0001-7441-5856': 31, '0000-0001-818

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


[10  2  6  2  6  9  2  1 25]
MNB Accuracy:  0.6507936507936508
MNB F1:  0.6072775263951735
             precision    recall  f1-score   support

          0       0.67      0.67      0.67        18
          1       0.64      0.53      0.58        17
          2       0.74      0.82      0.78        28

avg / total       0.69      0.70      0.69        63

[12  3  3  3  9  5  3  2 23]
svc Accuracy:  0.6984126984126984
svc F1:  0.6756576149687139
             precision    recall  f1-score   support

          0       0.73      0.61      0.67        18
          1       0.60      0.35      0.44        17
          2       0.66      0.89      0.76        28

avg / total       0.66      0.67      0.65        63

[11  3  4  2  6  9  2  1 25]
LR Accuracy:  0.6666666666666666
LR F1:  0.622895622895623
For name:  v_wong
total sample size before apply threshold:  35
Counter({'0000-0001-6751-7942': 14, '0000-0002-2951-8108': 12, '0000-0001-9356-7556': 8, '0000-0003-2844-3789': 1})
['0000-0001-67

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(140, 51)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(140, 21)
2
(140, 72)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.60      0.53      0.56        17
          1       0.86      0.91      0.89        81
          2       0.90      0.82      0.86        33

avg / total       0.84      0.84      0.84       131

[ 9  8  0  4 74  3  2  4 27]
MNB Accuracy:  0.8396946564885496
MNB F1:  0.7686234673510123
             precision    recall  f1-score   support

          0       0.93      0.82      0.87        17
          1       0.94      0.94      0.94        81
          2       0.86      0.91      0.88        33

avg / total       0.92      0.92      0.92       131

[14  3  0  0 76  5  1  2 30]
svc Accuracy:  0.916030534351145
svc F1:  0.8985415153715807
             precision    recall  f1-score   support

          0       1.00      0.41      0.58        17
          1       0.82      0.96      0.89        81
          2       0.90      0.79      0.84        33

avg / total       0.86      0.85      0.84       131

[ 7 10  0  0 78

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(467, 211)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(467, 22)
2
(467, 233)
             precision    recall  f1-score   support

          0      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.98      0.89      0.93        53
          1       0.29      0.40      0.33        15
          2       0.00      0.00      0.00        13
          3       0.32      0.61      0.42        41
          4       0.61      0.80      0.69        71
          5       1.00      0.38      0.56        13
          6       1.00      0.75      0.86        12
          7       1.00      0.17      0.29        12
          8       0.55      0.26      0.35        42
          9       0.25      0.06      0.10        17
         10       1.00      0.20      0.33        10
         11       0.83      0.45      0.59        11
         12       0.74      0.78      0.76        32
         13       1.00      0.68      0.81        25
         14       0.35      0.73      0.48        48
         15       0.93      0.78      0.85        18
         16       0.29      0.12      0.17        17
         17       0.40      0.12      0.18   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.89      0.91      0.90       206
          1       1.00      0.21      0.35        14
          2       0.75      0.17      0.27        18
          3       0.74      0.87      0.80       120

avg / total       0.84      0.83      0.81       358

[187   0   0  19   5   3   0   6   4   0   3  11  15   0   1 104]
svc Accuracy:  0.8296089385474861
svc F1:  0.5806377358006642
             precision    recall  f1-score   support

          0       0.87      0.92      0.89       206
          1       1.00      0.21      0.35        14
          2       0.00      0.00      0.00        18
          3       0.74      0.85      0.79       120

avg / total       0.79      0.82      0.79       358

[189   0   0  17   5   3   0   6   5   0   0  13  18   0   0 102]
LR Accuracy:  0.8212290502793296
LR F1:  0.5093139680414471
For name:  s_keating
total sample size before apply threshold:  54
Counter({'0000-0002-8324-3694': 28, '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(197, 71)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(197, 29)
2
(197, 100)
             precision    recall  f1-score   support

          0       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.69      0.69      0.69        13
          1       0.67      0.67      0.67        12

avg / total       0.68      0.68      0.68        25

[9 4 4 8]
MNB Accuracy:  0.68
MNB F1:  0.6794871794871795
             precision    recall  f1-score   support

          0       0.60      0.46      0.52        13
          1       0.53      0.67      0.59        12

avg / total       0.57      0.56      0.56        25

[6 7 4 8]
svc Accuracy:  0.56
svc F1:  0.5571658615136876
             precision    recall  f1-score   support

          0       0.67      0.62      0.64        13
          1       0.62      0.67      0.64        12

avg / total       0.64      0.64      0.64        25

[8 5 4 8]
LR Accuracy:  0.64
LR F1:  0.64
For name:  c_klein
total sample size before apply threshold:  106
Counter({'0000-0003-3522-9182': 47, '0000-0002-8230-8038': 21, '0000-0003-2991-1791': 11, '0000-0002-7580-8536': 11, '0000-0001-973

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  152
Counter({'0000-0001-7582-8791': 40, '0000-0002-7928-8216': 27, '0000-0003-4279-6572': 26, '0000-0002-4921-1461': 15, '0000-0002-3364-6647': 14, '0000-0002-1450-8046': 11, '0000-0002-7267-8377': 9, '0000-0003-3699-138X': 8, '0000-0003-0619-1074': 1, '0000-0001-5057-4908': 1})
['0000-0002-1450-8046', '0000-0003-4279-6572', '0000-0001-7582-8791', '0000-0002-3364-6647', '0000-0002-4921-1461', '0000-0002-7928-8216']
Total sample size after apply threshold:  133
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(133, 79)
TfidfVectorizer(analyzer='word'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.80      0.29      0.42        14
          1       0.76      0.59      0.67        22
          2       1.00      0.43      0.60        14
          3       0.73      0.87      0.80        63
          4       1.00      0.73      0.85        30
          5       0.43      0.50      0.46        26
          6       0.54      0.74      0.62        38

avg / total       0.72      0.68      0.68       207

[ 4  0  0  3  0  4  3  0 13  0  4  0  3  2  0  0  6  2  0  1  5  0  1  0
 55  0  3  4  0  0  0  2 22  2  4  0  2  0  5  0 13  6  1  1  0  4  0  4
 28]
svc Accuracy:  0.6811594202898551
svc F1:  0.6310689328832514
             precision    recall  f1-score   support

          0       1.00      0.14      0.25        14
          1       1.00      0.55      0.71        22
          2       1.00      0.29      0.44        14
          3       0.62      0.90      0.74        63
          4       0.83      0.67      0.7

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(76, 38)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(76, 13)
2
(76, 51)
             precision    recall  f1-score   support

          0       0.74

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.83      0.42      0.56        12
          1       0.81      1.00      0.90        74
          2       0.89      0.40      0.55        20

avg / total       0.83      0.82      0.79       106

[ 5  6  1  0 74  0  1 11  8]
svc Accuracy:  0.8207547169811321
svc F1:  0.6680831301520956
             precision    recall  f1-score   support

          0       1.00      0.33      0.50        12
          1       0.73      1.00      0.85        74
          2       1.00      0.05      0.10        20

avg / total       0.81      0.75      0.66       106

[ 4  8  0  0 74  0  0 19  1]
LR Accuracy:  0.7452830188679245
LR F1:  0.4803174603174603
For name:  a_oliveira
total sample size before apply threshold:  302
Counter({'0000-0001-9103-6532': 41, '0000-0003-2186-8100': 24, '0000-0002-6714-5939': 24, '0000-0002-0107-9940': 24, '0000-0001-8012-4203': 20, '0000-0001-8638-5594': 20, '0000-0003-3787-9138': 12, '0000-0003-2790-6

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.83      0.42      0.56        12
          1       0.94      0.75      0.83        20
          2       0.92      0.55      0.69        20
          3       0.52      0.62      0.57        24
          4       0.89      0.67      0.76        12
          5       1.00      0.20      0.33        10
          6       0.28      0.38      0.32        24
          7       0.80      0.50      0.62        24
          8       0.41      0.66      0.50        41

avg / total       0.66      0.56      0.57       187

[ 5  0  0  2  0  0  2  0  3  0 15  0  2  0  0  1  0  2  0  0 11  2  0  0
  4  0  3  0  0  1 15  0  0  0  2  6  0  0  0  0  8  0  2  0  2  0  0  0
  3  0  2  1  0  4  1  0  0  1  0  0  9  0 13  0  0  0  2  1  0  3 12  6
  0  1  0  2  0  0 10  1 27]
svc Accuracy:  0.5561497326203209
svc F1:  0.5754612004428321
             precision    recall  f1-score   support

          0       1.00      0.08      0.15        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.70      0.97      0.81        69
          1       0.80      0.38      0.51        32
          2       0.00      0.00      0.00        13

avg / total       0.65      0.69      0.63       114

[67  0  2 19 12  1 10  3  0]
LR Accuracy:  0.6929824561403509
LR F1:  0.4409198366645175
For name:  k_brown
total sample size before apply threshold:  231
Counter({'0000-0003-2434-0037': 89, '0000-0002-0729-4959': 61, '0000-0003-3382-5546': 33, '0000-0002-6803-5336': 12, '0000-0003-2472-5754': 9, '0000-0001-7716-1425': 7, '0000-0001-9428-9420': 6, '0000-0002-1047-4328': 3, '0000-0001-6836-1572': 3, '0000-0001-8350-5888': 2, '0000-0001-7766-6810': 1, '0000-0002-0201-0558': 1, '0000-0002-2358-8578': 1, '0000-0002-9093-8742': 1, '0000-0001-5348-7893': 1, '0000-0001-5748-5123': 1})
['0000-0003-2434-0037', '0000-0003-3382-5546', '0000-0002-0729-4959', '0000-0002-6803-5336']
Total sample size after apply threshold:  195
(0, 0)
T

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.68      0.88      0.77        89
          1       1.00      0.48      0.65        33
          2       0.67      0.70      0.69        61
          3       1.00      0.08      0.15        12

avg / total       0.75      0.71      0.69       195

[78  0 11  0 11 16  6  0 18  0 43  0  7  0  4  1]
LR Accuracy:  0.7076923076923077
LR F1:  0.5658450711849726
For name:  s_hong
total sample size before apply threshold:  383
Counter({'0000-0002-8344-6774': 102, '0000-0002-8888-6007': 84, '0000-0002-0300-1944': 83, '0000-0002-6305-8731': 27, '0000-0001-7291-1020': 19, '0000-0002-0324-2414': 15, '0000-0003-3031-2753': 12, '0000-0003-2401-6368': 12, '0000-0002-2667-1983': 10, '0000-0003-4926-1044': 3, '0000-0002-0020-6215': 3, '0000-0002-8473-919X': 2, '0000-0002-4800-636X': 2, '0000-0001-8722-3124': 1, '0000-0002-6905-7932': 1, '0000-0002-3755-3683': 1, '0000-0002-2498-7546': 1, '0000-0002-9470-5700': 1, '0000-0003-1119-4

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.64      0.90      0.75        84
          1       0.83      0.42      0.56        12
          2       0.50      0.20      0.29        10
          3       0.95      0.85      0.90       102
          4       0.65      0.58      0.61        19
          5       0.45      0.67      0.54        15
          6       0.71      0.42      0.53        12
          7       1.00      0.93      0.96        27
          8       0.93      0.81      0.86        83

avg / total       0.82      0.79      0.79       364

[76  0  0  1  0  5  0  0  2  5  5  0  0  0  1  1  0  0  4  0  2  0  3  1
  0  0  0  9  0  0 87  0  3  0  0  3  4  0  2  0 11  1  1  0  0  5  0  0
  0  0 10  0  0  0  4  1  0  0  2  0  5  0  0  0  0  0  0  1  1  0 25  0
 12  0  0  4  0  0  0  0 67]
svc Accuracy:  0.7912087912087912
svc F1:  0.6656630624852384
             precision    recall  f1-score   support

          0       0.57      0.82      0.67        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(106, 29)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(106, 14)
2
(106, 43)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.92      0.89      0.90        37
          1       0.90      0.93      0.92        41

avg / total       0.91      0.91      0.91        78

[33  4  3 38]
svc Accuracy:  0.9102564102564102
svc F1:  0.9098861198217527
             precision    recall  f1-score   support

          0       0.88      0.81      0.85        37
          1       0.84      0.90      0.87        41

avg / total       0.86      0.86      0.86        78

[30  7  4 37]
LR Accuracy:  0.8589743589743589
LR F1:  0.8578293289146645
For name:  c_meyer
total sample size before apply threshold:  136
Counter({'0000-0001-7599-3973': 34, '0000-0002-9877-1393': 29, '0000-0003-1334-2512': 27, '0000-0002-7214-9598': 18, '0000-0002-2268-3055': 14, '0000-0003-0851-2767': 6, '0000-0001-9958-8913': 5, '0000-0002-3166-3101': 3})
['0000-0002-9877-1393', '0000-0001-7599-3973', '0000-0003-1334-2512', '0000-0002-7214-9598', '0000-0002-2268-3055']
Total sample si

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.62      0.76        13
          1       0.87      0.68      0.76        19
          2       0.56      0.36      0.43        14
          3       0.43      0.25      0.32        12
          4       1.00      0.64      0.78        11
          5       0.57      0.90      0.70        59
          6       0.85      0.46      0.59        24

avg / total       0.71      0.66      0.65       152

[ 8  0  0  0  0  5  0  0 13  0  0  0  5  1  0  0  5  0  0  9  0  0  0  0
  3  0  9  0  0  0  1  0  7  3  0  0  0  2  3  0 53  1  0  2  1  1  0  9
 11]
svc Accuracy:  0.6578947368421053
svc F1:  0.6209890742946529
             precision    recall  f1-score   support

          0       1.00      0.31      0.47        13
          1       0.85      0.58      0.69        19
          2       1.00      0.07      0.13        14
          3       0.25      0.08      0.12        12
          4       1.00      0.27      0.4

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(46, 32)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(46, 13)
2
(46, 45)
             precision    recall  f1-score   support

          0       0.78

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


[ 6  7  3 19]
LR Accuracy:  0.7142857142857143
LR F1:  0.6685606060606061
For name:  k_evans
total sample size before apply threshold:  29
Counter({'0000-0002-6856-8423': 16, '0000-0002-9819-1049': 9, '0000-0001-6981-7703': 3, '0000-0003-2850-7674': 1})
['0000-0002-6856-8423']
Total sample size after apply threshold:  16
For name:  k_yoo
total sample size before apply threshold:  10
Counter({'0000-0002-5213-4575': 7, '0000-0001-7952-7902': 1, '0000-0002-6186-7535': 1, '0000-0002-5539-345X': 1})
[]
Total sample size after apply threshold:  0
For name:  d_turner
total sample size before apply threshold:  71
Counter({'0000-0002-3754-6459': 27, '0000-0003-1603-7994': 26, '0000-0002-3447-7662': 5, '0000-0002-0249-4513': 4, '0000-0002-8891-9155': 3, '0000-0002-7369-8791': 3, '0000-0002-2891-2664': 2, '0000-0001-6802-1703': 1})
['0000-0002-3754-6459', '0000-0003-1603-7994']
Total sample size after apply threshold:  53
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(61, 11)
2
(61, 50)
             precision    recall  f1-score   support

          0       0.57      0.65      0.60        20
          1       0.60      0.60      0.60        20
          2       0.67      0.57      0.62        21

avg / total       0.61      0.61      0.61        61

[13  3  4  6 12  2  4  5 12]
MNB Accuracy:  0.6065573770491803
MNB F1:  0.6066785927251043
             precision    recall  f1-score   support

          0       0.54      0.65      0.59        20
          1       0.82      0.45      0.58        20
          2       0.50      0.62      0.55        21

avg / total       0.62      0.57      0.57        61

[13  0  7  5  9  6  6  2 13]
svc Accuracy:  0.5737704918032787
svc F1:  0.5749152471870386
             precision    recall  f1-score   support

          0       0.48      0.50      0.49        20
          1       0.63      0.60      0.62        20
          2       0.57      0.57      0.57        21

avg / total       0.56      0.56      0.56      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


41
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(41, 18)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(41, 15)
2
(41, 33)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.91      0.97      0.94       111
          1       0.83      0.58      0.68        26

avg / total       0.89      0.90      0.89       137

[108   3  11  15]
svc Accuracy:  0.8978102189781022
svc F1:  0.8104743083003952
             precision    recall  f1-score   support

          0       0.85      0.98      0.91       111
          1       0.78      0.27      0.40        26

avg / total       0.84      0.85      0.81       137

[109   2  19   7]
LR Accuracy:  0.8467153284671532
LR F1:  0.6560669456066945
For name:  a_watts
total sample size before apply threshold:  25
Counter({'0000-0003-4299-2717': 14, '0000-0002-8385-1091': 9, '0000-0003-0623-4601': 1, '0000-0003-3480-582X': 1})
['0000-0003-4299-2717']
Total sample size after apply threshold:  14
For name:  b_oliveira
total sample size before apply threshold:  60
Counter({'0000-0002-7710-4284': 22, '0000-0002-7687-4746': 17, '0000-0002-6767-6596': 13, '0000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.15      0.27        13
          1       0.81      0.98      0.89       125
          2       0.92      0.59      0.72        41

avg / total       0.85      0.83      0.81       179

[  2  11   0   0 123   2   0  17  24]
svc Accuracy:  0.8324022346368715
svc F1:  0.624796308313505
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        13
          1       0.77      1.00      0.87       125
          2       0.94      0.37      0.53        41

avg / total       0.75      0.78      0.73       179

[  0  12   1   0 125   0   0  26  15]
LR Accuracy:  0.7821229050279329
LR F1:  0.46479044834308
For name:  c_guo
total sample size before apply threshold:  6
Counter({'0000-0001-9253-3469': 2, '0000-0002-0432-8121': 2, '0000-0002-4000-8141': 1, '0000-0003-2182-3287': 1})
[]
Total sample size after apply threshold:  0
For name:  m_hansen
total sample size before a

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.58      0.82      0.68        55
          1       0.92      0.65      0.76        17
          2       1.00      0.27      0.43        11
          3       0.70      0.58      0.64        24
          4       0.61      0.69      0.65        29
          5       0.60      0.56      0.58        27
          6       0.46      0.38      0.41        16
          7       0.97      0.88      0.92        40

avg / total       0.71      0.68      0.68       219

[45  0  0  2  5  2  1  0  3 11  0  0  0  2  0  1  3  0  3  0  3  2  0  0
  5  0  0 14  0  3  2  0  6  0  0  0 20  1  2  0  5  1  0  1  3 15  2  0
  6  0  0  2  2  0  6  0  4  0  0  1  0  0  0 35]
svc Accuracy:  0.680365296803653
svc F1:  0.6327880048351625
             precision    recall  f1-score   support

          0       0.63      0.80      0.70        55
          1       1.00      0.59      0.74        17
          2       1.00      0.18      0.31        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.20      0.33        10
          1       0.80      0.27      0.40        15
          2       0.90      0.93      0.91       141
          3       1.00      0.31      0.48        16
          4       0.71      0.98      0.82        98
          5       1.00      0.13      0.24        15

avg / total       0.84      0.81      0.78       295

[  2   0   1   0   7   0   0   4   0   0  11   0   0   1 131   0   9   0
   0   0   6   5   5   0   0   0   2   0  96   0   0   0   6   0   7   2]
svc Accuracy:  0.8135593220338984
svc F1:  0.5302907079995892
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       1.00      0.20      0.33        15
          2       0.87      0.94      0.90       141
          3       0.00      0.00      0.00        16
          4       0.67      0.97      0.79        98
          5       0.00      0.00      0.00   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.80      0.91      0.85        77
          1       0.86      0.72      0.78        60

avg / total       0.83      0.82      0.82       137

[70  7 17 43]
LR Accuracy:  0.8248175182481752
LR F1:  0.8177383592017737
For name:  b_zhou
total sample size before apply threshold:  20
Counter({'0000-0002-1535-6283': 13, '0000-0003-2846-1813': 2, '0000-0003-2634-1527': 1, '0000-0001-9774-2737': 1, '0000-0003-1560-4950': 1, '0000-0003-0638-2428': 1, '0000-0003-4421-9787': 1})
['0000-0002-1535-6283']
Total sample size after apply threshold:  13
For name:  x_yan
total sample size before apply threshold:  111
Counter({'0000-0002-6114-5743': 44, '0000-0001-8547-4210': 18, '0000-0003-3973-3669': 14, '0000-0002-7528-5771': 12, '0000-0001-9327-5756': 7, '0000-0003-2091-6967': 6, '0000-0001-8221-9345': 3, '0000-0001-5026-0239': 3, '0000-0001-5606-0158': 2, '0000-0002-8292-130X': 1, '0000-0002-1300-5498': 1})
['0000-0001-8547-4210

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        16
          1       0.41      0.54      0.47        13
          2       0.62      0.77      0.69        13
          3       0.91      0.71      0.80        14
          4       0.65      0.99      0.78       244
          5       0.88      0.47      0.61        30
          6       1.00      0.20      0.33        10
          7       1.00      0.17      0.29        12
          8       0.83      0.85      0.84        67
          9       0.25      0.23      0.24        22
         10       1.00      0.50      0.67        20
         11       1.00      0.06      0.11        18
         12       1.00      0.50      0.67        10
         13       0.91      0.38      0.54        26
         14       1.00      0.83      0.91        30
         15       0.63      0.65      0.64        26
         16       1.00      0.29      0.44        21
         17       0.91      0.61      0.73   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



(31, 15)
2
(31, 36)
             precision    recall  f1-score   support

          0       0.78      1.00      0.88        18
          1       1.00      0.62      0.76        13

avg / total       0.87      0.84      0.83        31

[18  0  5  8]
MNB Accuracy:  0.8387096774193549
MNB F1:  0.8199767711962835
             precision    recall  f1-score   support

          0       0.77      0.94      0.85        18
          1       0.89      0.62      0.73        13

avg / total       0.82      0.81      0.80        31

[17  1  5  8]
svc Accuracy:  0.8064516129032258
svc F1:  0.7886363636363637
             precision    recall  f1-score   support

          0       0.78      1.00      0.88        18
          1       1.00      0.62      0.76        13

avg / total       0.87      0.84      0.83        31

[18  0  5  8]
LR Accuracy:  0.8387096774193549
LR F1:  0.8199767711962835
For name:  c_santos
total sample size before apply threshold:  293
Counter({'0000-0002-0405-3500': 68, '0000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.64      0.66      0.65        41
          1       0.60      0.30      0.40        10
          2       0.58      0.28      0.38        25
          3       0.46      0.43      0.44        37
          4       0.57      0.32      0.41        38
          5       0.45      0.76      0.57        68
          6       0.27      0.18      0.21        17
          7       0.81      0.59      0.68        22

avg / total       0.54      0.52      0.50       258

[27  0  0  2  0 11  1  0  1  3  0  0  2  3  1  0  1  0  7  3  1 11  2  0
  3  0  0 16  2 13  1  2  1  2  3  2 12 16  2  0  5  0  2  5  3 52  0  1
  3  0  0  4  1  6  3  0  1  0  0  3  0  4  1 13]
svc Accuracy:  0.5155038759689923
svc F1:  0.4679898156730222
             precision    recall  f1-score   support

          0       0.49      0.61      0.54        41
          1       1.00      0.30      0.46        10
          2       1.00      0.12      0.21       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(70, 22)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(70, 20)
2
(70, 42)
             precision    recall  f1-score   support

          0       0.5

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.61      0.61      0.61        18
          1       0.55      0.81      0.65        21
          2       0.50      0.08      0.14        12

avg / total       0.56      0.57      0.52        51

[11  6  1  4 17  0  3  8  1]
LR Accuracy:  0.5686274509803921
LR F1:  0.46927146927146923
For name:  k_chong
total sample size before apply threshold:  39
Counter({'0000-0003-2587-1323': 28, '0000-0002-7350-597X': 4, '0000-0003-4754-8957': 4, '0000-0003-0786-842X': 3})
['0000-0003-2587-1323']
Total sample size after apply threshold:  28
For name:  j_kumar
total sample size before apply threshold:  16
Counter({'0000-0002-9754-3305': 9, '0000-0002-4153-1495': 3, '0000-0002-0159-0546': 2, '0000-0001-9666-8280': 2})
[]
Total sample size after apply threshold:  0
For name:  a_shenoy
total sample size before apply threshold:  33
Counter({'0000-0001-6228-9303': 24, '0000-0003-4306-7582': 3, '0000-0001-8639-2751': 3, '0000-0003-42

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.54      0.70        13
          1       1.00      0.45      0.62        11
          2       0.74      0.57      0.65        40
          3       0.77      0.60      0.68        45
          4       0.67      0.20      0.31        10
          5       0.74      0.50      0.60        34
          6       0.44      0.22      0.30        18
          7       0.46      0.82      0.59        44
          8       0.43      0.82      0.57        65
          9       0.80      0.52      0.63        31
         10       0.88      0.47      0.61        15
         11       0.33      0.09      0.14        11
         12       1.00      0.47      0.64        15

avg / total       0.66      0.58      0.58       352

[ 7  0  0  0  1  0  0  1  2  2  0  0  0  0  5  0  0  0  0  0  2  4  0  0
  0  0  0  0 23  2  0  2  0  6  6  1  0  0  0  0  0  1 27  0  0  4  2 11
  0  0  0  0  0  0  0  0  2  4  0  1  3  0  0  0  0  0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.50      0.10      0.17        10
          1       0.76      0.86      0.81        22
          2       0.72      0.88      0.79        24

avg / total       0.70      0.73      0.69        56

[ 1  4  5  0 19  3  1  2 21]
LR Accuracy:  0.7321428571428571
LR F1:  0.5892100450510728
For name:  m_foster
total sample size before apply threshold:  103
Counter({'0000-0001-9645-7491': 43, '0000-0002-4524-141X': 43, '0000-0001-6392-7418': 6, '0000-0002-4453-7788': 5, '0000-0002-3100-0885': 3, '0000-0003-2257-4825': 3})
['0000-0001-9645-7491', '0000-0002-4524-141X']
Total sample size after apply threshold:  86
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, st

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.88      0.94      0.91        32
          1       0.86      0.75      0.80        16

avg / total       0.87      0.88      0.87        48

[30  2  4 12]
svc Accuracy:  0.875
svc F1:  0.8545454545454545
             precision    recall  f1-score   support

          0       0.84      1.00      0.91        32
          1       1.00      0.62      0.77        16

avg / total       0.89      0.88      0.87        48

[32  0  6 10]
LR Accuracy:  0.875
LR F1:  0.8417582417582418
For name:  p_moreira
total sample size before apply threshold:  217
Counter({'0000-0001-5177-6747': 133, '0000-0002-7035-7799': 68, '0000-0002-2800-3903': 6, '0000-0002-5454-7971': 4, '0000-0003-0452-6790': 2, '0000-0002-0004-851X': 2, '0000-0001-7247-6815': 1, '0000-0001-6919-0904': 1})
['0000-0001-5177-6747', '0000-0002-7035-7799']
Total sample size after apply threshold:  201
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_err

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.82      0.97      0.89       133
          1       0.91      0.57      0.70        68

avg / total       0.85      0.84      0.82       201

[129   4  29  39]
svc Accuracy:  0.835820895522388
svc F1:  0.7946503204235162
             precision    recall  f1-score   support

          0       0.74      0.95      0.84       133
          1       0.80      0.35      0.49        68

avg / total       0.76      0.75      0.72       201

[127   6  44  24]
LR Accuracy:  0.7512437810945274
LR F1:  0.6626611170784104
For name:  s_mukhopadhyay
total sample size before apply threshold:  119
Counter({'0000-0001-8033-5748': 49, '0000-0001-9660-2599': 37, '0000-0003-1242-9958': 18, '0000-0003-4790-3090': 8, '0000-0002-1838-2815': 5, '0000-0002-4056-2185': 1, '0000-0002-6290-6380': 1})
['0000-0003-1242-9958', '0000-0001-8033-5748', '0000-0001-9660-2599']
Total sample size after apply threshold:  104
(0, 0)
TfidfVectorizer(analyz

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(114, 52)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(114, 29)
2
(114, 81)
             precision    recall  f1-score   support

          0       0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.6416416416416416
             precision    recall  f1-score   support

          0       0.70      0.70      0.70        10
          1       0.62      0.76      0.68        17
          2       0.71      0.45      0.56        11

avg / total       0.67      0.66      0.65        38

[ 7  2  1  3 13  1  0  6  5]
LR Accuracy:  0.6578947368421053
LR F1:  0.646588693957115
For name:  w_smith
total sample size before apply threshold:  61
Counter({'0000-0002-4610-998X': 37, '0000-0003-2108-3899': 11, '0000-0002-5785-6489': 6, '0000-0001-9640-1172': 4, '0000-0003-1749-023X': 1, '0000-0001-6611-0817': 1, '0000-0002-8814-015X': 1})
['0000-0002-4610-998X', '0000-0003-2108-3899']
Total sample size after apply threshold:  48
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, sm

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.50      0.67        12
          1       0.83      0.45      0.59        11
          2       0.76      0.78      0.77        36
          3       1.00      0.50      0.67        14
          4       0.73      0.86      0.79       105
          5       0.62      0.60      0.61        55

avg / total       0.74      0.73      0.72       233

[ 6  1  1  0  2  2  0  5  2  0  2  2  0  0 28  0  6  2  0  0  3  7  3  1
  0  0  2  0 90 13  0  0  1  0 21 33]
svc Accuracy:  0.7253218884120172
svc F1:  0.6809715378511144
             precision    recall  f1-score   support

          0       1.00      0.25      0.40        12
          1       1.00      0.09      0.17        11
          2       0.71      0.56      0.63        36
          3       0.00      0.00      0.00        14
          4       0.59      0.88      0.70       105
          5       0.60      0.49      0.54        55

avg / total       0.62     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(138, 78)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(138, 24)
2
(138, 102)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.82      1.00      0.90        59
          1       1.00      0.79      0.88        14
          2       1.00      0.52      0.69        21

avg / total       0.89      0.86      0.85        94

[59  0  0  3 11  0 10  0 11]
MNB Accuracy:  0.8617021276595744
MNB F1:  0.8227544529262086
             precision    recall  f1-score   support

          0       0.84      1.00      0.91        59
          1       1.00      0.71      0.83        14
          2       1.00      0.67      0.80        21

avg / total       0.90      0.88      0.88        94

[59  0  0  4 10  0  7  0 14]
svc Accuracy:  0.8829787234042553
svc F1:  0.8493540051679588
             precision    recall  f1-score   support

          0       0.77      1.00      0.87        59
          1       1.00      0.71      0.83        14
          2       1.00      0.33      0.50        21

avg / total       0.85      0.81      0.78        94

[59  0  0  4 1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.53      0.69        17
          1       0.93      0.76      0.83        49
          2       0.82      0.98      0.89        87

avg / total       0.87      0.86      0.85       153

[ 9  1  7  0 37 12  0  2 85]
svc Accuracy:  0.8562091503267973
svc F1:  0.8046069074953127
             precision    recall  f1-score   support

          0       1.00      0.12      0.21        17
          1       0.97      0.71      0.82        49
          2       0.76      1.00      0.86        87

avg / total       0.85      0.81      0.78       153

[ 2  1 14  0 35 14  0  0 87]
LR Accuracy:  0.8104575163398693
LR F1:  0.6318139553893469
For name:  s_chow
total sample size before apply threshold:  29
Counter({'0000-0001-9471-4722': 21, '0000-0002-3600-0497': 6, '0000-0003-0544-6928': 1, '0000-0002-4392-3863': 1})
['0000-0001-9471-4722']
Total sample size after apply threshold:  21
For name:  m_simon
total sample size

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(99, 43)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(99, 14)
2
(99, 57)
             precision    recall  f1-score   support

          0       0.95

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.33      0.31      0.32        13
          1       0.00      0.00      0.00        10
          2       0.28      0.44      0.34        25
          3       0.35      0.35      0.35        17
          4       0.69      0.56      0.62        16
          5       0.89      0.67      0.76        12

avg / total       0.42      0.41      0.41        93

[ 4  1  6  1  1  0  3  0  5  2  0  0  4  1 11  6  2  1  0  0 10  6  1  0
  1  0  5  1  9  0  0  0  3  1  0  8]
svc Accuracy:  0.40860215053763443
svc F1:  0.39899952200155037
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        13
          1       0.00      0.00      0.00        10
          2       0.24      0.44      0.31        25
          3       0.31      0.29      0.30        17
          4       0.50      0.50      0.50        16
          5       0.60      0.50      0.55        12

avg / total       0.29   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(117, 60)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(117, 25)
2
(117, 85)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.64      0.77      0.70        30
          1       0.59      0.43      0.50        23

avg / total       0.62      0.62      0.61        53

[23  7 13 10]
svc Accuracy:  0.6226415094339622
svc F1:  0.5984848484848485
             precision    recall  f1-score   support

          0       0.63      0.80      0.71        30
          1       0.60      0.39      0.47        23

avg / total       0.62      0.62      0.61        53

[24  6 14  9]
LR Accuracy:  0.6226415094339622
LR F1:  0.5897832817337462
For name:  f_esposito
total sample size before apply threshold:  342
Counter({'0000-0002-5099-9786': 92, '0000-0003-1051-5924': 91, '0000-0001-9340-6875': 53, '0000-0002-4420-2611': 44, '0000-0003-2550-0805': 26, '0000-0001-9725-7977': 25, '0000-0001-7781-2558': 7, '0000-0003-0586-5866': 2, '0000-0001-9962-1648': 1, '0000-0002-1075-3239': 1})
['0000-0003-2550-0805', '0000-0001-9340-6875', '0000-0002-4420-2611', '0000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.79      0.58      0.67        26
          1       0.74      0.55      0.63        53
          2       1.00      0.66      0.79        44
          3       0.54      0.88      0.67        92
          4       0.62      0.20      0.30        25
          5       0.90      0.85      0.87        91

avg / total       0.76      0.71      0.71       331

[15  0  0 10  0  1  0 29  0 19  2  3  0  2 29 12  0  1  3  4  0 81  0  4
  0  3  0 17  5  0  1  1  0 11  1 77]
svc Accuracy:  0.7129909365558912
svc F1:  0.6556883808382197
             precision    recall  f1-score   support

          0       0.73      0.31      0.43        26
          1       0.76      0.49      0.60        53
          2       1.00      0.55      0.71        44
          3       0.53      0.84      0.65        92
          4       0.60      0.12      0.20        25
          5       0.73      0.90      0.81        91

avg / total       0.71     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(562, 234)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(562, 16)
2
(562, 250)
             precision    recall  f1-score   support

          0      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.65      0.79        17
          1       0.00      0.00      0.00        10
          2       1.00      0.42      0.60        33
          3       0.36      0.86      0.51       108
          4       1.00      0.25      0.40        12
          5       0.30      0.21      0.25        14
          6       0.62      0.79      0.69        94
          7       1.00      0.21      0.35        14
          8       0.90      0.29      0.44        31
          9       0.47      0.35      0.40        26
         10       0.00      0.00      0.00        10
         11       0.29      0.11      0.15        19
         12       1.00      0.19      0.32        21
         13       0.83      0.29      0.43        17
         14       0.00      0.00      0.00        12
         15       0.65      0.42      0.51        36
         16       0.33      0.16      0.22        25
         17       0.33      0.37      0.35   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.92      0.46      0.62        26
          1       0.33      0.19      0.24        16
          2       0.77      0.93      0.84        98

avg / total       0.75      0.76      0.73       140

[12  0 14  0  3 13  1  6 91]
svc Accuracy:  0.7571428571428571
svc F1:  0.5659924026590694
             precision    recall  f1-score   support

          0       1.00      0.31      0.47        26
          1       0.00      0.00      0.00        16
          2       0.74      0.99      0.85        98

avg / total       0.70      0.75      0.68       140

[ 8  0 18  0  0 16  0  1 97]
LR Accuracy:  0.75
LR F1:  0.43924993578217314
For name:  a_santoro
total sample size before apply threshold:  189
Counter({'0000-0002-0798-6816': 83, '0000-0003-1709-9492': 58, '0000-0002-5086-1453': 21, '0000-0003-2503-8219': 10, '0000-0002-1014-197X': 9, '0000-0002-6193-2050': 8})
['0000-0003-1709-9492', '0000-0002-5086-1453', '0000-0003-2

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.83      0.98      0.90        58
          1       0.85      0.52      0.65        21
          2       1.00      0.50      0.67        10
          3       0.88      0.90      0.89        83

avg / total       0.87      0.86      0.85       172

[57  0  0  1  4 11  0  6  2  0  5  3  6  2  0 75]
svc Accuracy:  0.8604651162790697
svc F1:  0.7760551070822029
             precision    recall  f1-score   support

          0       0.78      0.98      0.87        58
          1       0.67      0.10      0.17        21
          2       0.00      0.00      0.00        10
          3       0.79      0.92      0.85        83

avg / total       0.73      0.78      0.72       172

[57  0  0  1  9  2  0 10  1  0  0  9  6  1  0 76]
LR Accuracy:  0.7848837209302325
LR F1:  0.4715144213683597
For name:  q_lu
total sample size before apply threshold:  35
Counter({'0000-0002-2804-0827': 22, '0000-0002-4261-5121': 5, '0000-0002-4

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.75      0.47      0.58        19
          1       1.00      0.71      0.83        14
          2       0.70      0.97      0.81       130
          3       1.00      0.92      0.96        25
          4       0.79      0.64      0.71        42
          5       1.00      0.86      0.92        21
          6       1.00      0.54      0.70        13
          7       0.60      0.17      0.26        18
          8       1.00      0.38      0.56        13

avg / total       0.80      0.77      0.75       295

[  9   0   8   0   1   0   0   1   0   0  10   4   0   0   0   0   0   0
   1   0 126   0   3   0   0   0   0   0   0   2  23   0   0   0   0   0
   1   0  13   0  27   0   0   1   0   0   0   2   0   1  18   0   0   0
   0   0   6   0   0   0   7   0   0   1   0  13   0   1   0   0   3   0
   0   0   7   0   1   0   0   0   5]
svc Accuracy:  0.7728813559322034
svc F1:  0.7036255085182064
             precision

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.70      0.85      0.77        47
          1       0.81      0.71      0.75        41
          2       0.87      0.75      0.81        36

avg / total       0.79      0.77      0.77       124

[40  4  3 11 29  1  6  3 27]
svc Accuracy:  0.7741935483870968
svc F1:  0.7761492239104179
             precision    recall  f1-score   support

          0       0.69      0.85      0.76        47
          1       0.82      0.68      0.75        41
          2       0.81      0.72      0.76        36

avg / total       0.77      0.76      0.76       124

[40  3  4 11 28  2  7  3 26]
LR Accuracy:  0.7580645161290323
LR F1:  0.7577591036414567
For name:  t_han
total sample size before apply threshold:  53
Counter({'0000-0002-9063-4052': 42, '0000-0002-3095-7714': 8, '0000-0003-3535-8582': 2, '0000-0003-1404-1578': 1})
['0000-0002-9063-4052']
Total sample size after apply threshold:  42
For name:  m_sandberg
total sample si

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(606, 266)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(606, 20)
2
(606, 286)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.90      0.64      0.75        14
          1       0.83      0.40      0.54        25
          2       0.40      0.74      0.52        58
          3       0.40      0.18      0.25        11
          4       0.89      0.50      0.64        16
          5       0.71      0.63      0.67        19
          6       0.83      0.52      0.64        29
          7       0.71      0.71      0.71        17
          8       0.60      0.54      0.57        46
          9       0.46      0.71      0.56        63
         10       0.46      0.46      0.46        13
         11       0.80      0.36      0.50        11
         12       1.00      0.42      0.59        12
         13       0.41      0.39      0.40        18
         14       0.77      0.65      0.70        51
         15       0.80      0.30      0.43        27
         16       1.00      0.80      0.89        10
         17       0.84      0.90      0.87   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.73      0.70      0.71        23
          1       0.40      0.29      0.33        14
          2       0.85      0.94      0.90        50

avg / total       0.75      0.77      0.76        87

[16  4  3  5  4  5  1  2 47]
MNB Accuracy:  0.7701149425287356
MNB F1:  0.6465608465608464
             precision    recall  f1-score   support

          0       0.66      0.83      0.73        23
          1       1.00      0.43      0.60        14
          2       0.85      0.88      0.86        50

avg / total       0.82      0.79      0.79        87

[19  0  4  4  6  4  6  0 44]
svc Accuracy:  0.7931034482758621
svc F1:  0.7311714429361489
             precision    recall  f1-score   support

          0       0.68      0.65      0.67        23
          1       0.80      0.29      0.42        14
          2       0.78      0.94      0.85        50

avg / total       0.76      0.76      0.74        87

[15  1  7  4  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(53, 23)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(53, 10)
2
(53, 33)
             precision    recall  f1-score   support

          0       0.85

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  208
Counter({'0000-0002-2565-1825': 110, '0000-0002-8369-8349': 90, '0000-0002-3063-8844': 7, '0000-0001-9914-3850': 1})
['0000-0002-2565-1825', '0000-0002-8369-8349']
Total sample size after apply threshold:  200
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(200, 82)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.73      0.84        11
          1       0.79      1.00      0.88        11

avg / total       0.89      0.86      0.86        22

[ 8  3  0 11]
svc Accuracy:  0.8636363636363636
svc F1:  0.8610526315789474
             precision    recall  f1-score   support

          0       0.80      0.73      0.76        11
          1       0.75      0.82      0.78        11

avg / total       0.78      0.77      0.77        22

[8 3 2 9]
LR Accuracy:  0.7727272727272727
LR F1:  0.7722567287784678
For name:  x_gu
total sample size before apply threshold:  61
Counter({'0000-0002-9373-987X': 23, '0000-0002-8521-3667': 13, '0000-0003-2266-5516': 7, '0000-0002-0437-5606': 5, '0000-0003-2641-1740': 5, '0000-0001-8299-6451': 4, '0000-0003-3803-3951': 4})
['0000-0002-8521-3667', '0000-0002-9373-987X']
Total sample size after apply threshold:  36
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(135, 56)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(135, 16)
2
(135, 72)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(238, 95)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(238, 29)
2
(238, 124)
             precision    recall  f1-score   support

          0       0.83   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.38      0.19      0.25        16
          1       0.44      0.36      0.40        11
          2       1.00      0.40      0.57        10
          3       0.71      0.83      0.77        12
          4       0.65      0.76      0.70        17
          5       0.50      0.79      0.61        19

avg / total       0.59      0.58      0.55        85

[ 3  4  0  1  1  7  3  4  0  2  0  2  0  0  4  0  4  2  0  0  0 10  1  1
  0  1  0  0 13  3  2  0  0  1  1 15]
MNB Accuracy:  0.5764705882352941
MNB F1:  0.5509344902202046
             precision    recall  f1-score   support

          0       0.37      0.44      0.40        16
          1       1.00      0.64      0.78        11
          2       1.00      0.40      0.57        10
          3       0.67      0.67      0.67        12
          4       0.67      0.71      0.69        17
          5       0.52      0.68      0.59        19

avg / total       0.66     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.71      0.67      0.69        60
          1       0.94      0.64      0.76        25
          2       0.69      0.80      0.74        71

avg / total       0.74      0.72      0.72       156

[40  0 20  3 16  6 13  1 57]
svc Accuracy:  0.7243589743589743
svc F1:  0.7306065581927651
             precision    recall  f1-score   support

          0       0.71      0.57      0.63        60
          1       1.00      0.52      0.68        25
          2       0.64      0.86      0.73        71

avg / total       0.72      0.69      0.69       156

[34  0 26  4 13  8 10  0 61]
LR Accuracy:  0.6923076923076923
LR F1:  0.6829266383271878
For name:  s_yun
total sample size before apply threshold:  102
Counter({'0000-0001-7737-4746': 76, '0000-0002-1498-3701': 24, '0000-0002-3774-0622': 1, '0000-0002-9510-5133': 1})
['0000-0001-7737-4746', '0000-0002-1498-3701']
Total sample size after apply threshold:  100
(0, 0)
Tfid

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(45, 12)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(45, 14)
2
(45, 26)
             precision    recall  f1-score   support

          0       0.6

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        16
          1       0.64      0.36      0.46        50
          2       0.46      0.76      0.57        71
          3       0.00      0.00      0.00        20
          4       0.44      0.61      0.51        71
          5       0.60      0.60      0.60        55
          6       0.35      0.23      0.27        40

avg / total       0.44      0.49      0.45       323

[ 0  2  8  0  3  2  1  0 18 18  0 10  1  3  0  0 54  0 15  0  2  0  2  6
  0  6  2  4  0  1 18  0 43  6  3  0  1  6  0 11 33  4  0  4  7  0  9 11
  9]
MNB Accuracy:  0.48606811145510836
MNB F1:  0.3458055116109828
             precision    recall  f1-score   support

          0       0.50      0.19      0.27        16
          1       0.84      0.42      0.56        50
          2       0.47      0.83      0.60        71
          3       0.50      0.10      0.17        20
          4       0.53      0.52      0.

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(75, 48)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(75, 20)
2
(75, 68)
             precision    recall  f1-score   support

          0       0.66

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      0.84      0.89        19
          1       0.92      0.88      0.90        74
          2       0.84      0.90      0.87        72

avg / total       0.89      0.88      0.89       165

[16  0  3  0 65  9  1  6 65]
svc Accuracy:  0.8848484848484849
svc F1:  0.8859746115011099
             precision    recall  f1-score   support

          0       1.00      0.37      0.54        19
          1       0.85      0.89      0.87        74
          2       0.81      0.90      0.86        72

avg / total       0.85      0.84      0.82       165

[ 7  5  7  0 66  8  0  7 65]
LR Accuracy:  0.8363636363636363
LR F1:  0.7540485829959515
For name:  l_song
total sample size before apply threshold:  58
Counter({'0000-0003-0585-8519': 38, '0000-0003-1691-9583': 15, '0000-0002-0400-8283': 3, '0000-0003-2454-1576': 1, '0000-0002-7299-5719': 1})
['0000-0003-1691-9583', '0000-0003-0585-8519']
Total sample size after apply t

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        28
          1       1.00      0.14      0.25        14
          2       0.55      0.87      0.67        98
          3       0.00      0.00      0.00        11
          4       1.00      0.19      0.32        16
          5       0.59      0.49      0.53        55
          6       0.00      0.00      0.00        12
          7       0.43      0.69      0.53        65
          8       0.33      0.12      0.17        17

avg / total       0.48      0.52      0.45       316

[ 0  0 12  0  0  4  0 10  2  0  2  6  0  0  2  0  4  0  0  0 85  0  0  3
  0  9  1  0  0  1  0  0  2  0  7  1  0  0  5  0  3  1  0  7  0  1  0 17
  0  0 27  0 10  0  0  0  5  0  0  2  0  5  0  0  0 17  0  0  3  0 45  0
  0  0  6  0  0  2  0  7  2]
MNB Accuracy:  0.5189873417721519
MNB F1:  0.2757226150900451
             precision    recall  f1-score   support

          0       0.71      0.36      0.48        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(57, 32)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(57, 16)
2
(57, 48)
             precision    recall  f1-score   support

          0       0.77

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.81      1.00      0.89        34
          1       1.00      0.20      0.33        10

avg / total       0.85      0.82      0.77        44

[34  0  8  2]
LR Accuracy:  0.8181818181818182
LR F1:  0.6140350877192983
For name:  a_young
total sample size before apply threshold:  442
Counter({'0000-0002-1202-6297': 138, '0000-0001-5702-4220': 78, '0000-0002-4163-6772': 54, '0000-0002-9367-9213': 38, '0000-0002-7288-3469': 31, '0000-0001-8551-5078': 25, '0000-0003-3969-3249': 22, '0000-0002-0077-137X': 22, '0000-0001-6251-0944': 20, '0000-0002-8486-0643': 8, '0000-0002-8127-7380': 2, '0000-0002-1994-9211': 1, '0000-0002-1486-5561': 1, '0000-0001-6800-1454': 1, '0000-0003-4822-6335': 1})
['0000-0001-6251-0944', '0000-0001-8551-5078', '0000-0002-4163-6772', '0000-0002-7288-3469', '0000-0002-1202-6297', '0000-0001-5702-4220', '0000-0003-3969-3249', '0000-0002-0077-137X', '0000-0002-9367-9213']
Total sample size after app

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.50      0.67        20
          1       1.00      0.48      0.65        25
          2       0.90      0.65      0.75        54
          3       0.85      0.74      0.79        31
          4       0.61      0.93      0.74       138
          5       0.81      0.64      0.71        78
          6       0.54      0.59      0.57        22
          7       1.00      0.77      0.87        22
          8       0.85      0.61      0.71        38

avg / total       0.78      0.73      0.73       428

[ 10   0   1   0   8   0   0   0   1   0  12   0   1   6   1   5   0   0
   0   0  35   1  14   1   2   0   1   0   0   1  23   4   3   0   0   0
   0   0   1   0 129   5   1   0   2   0   0   0   1  24  50   3   0   0
   0   0   0   1   8   0  13   0   0   0   0   0   0   5   0   0  17   0
   0   0   1   0  12   2   0   0  23]
svc Accuracy:  0.7289719626168224
svc F1:  0.7179418367840286
             precision

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.67      0.27      0.38        15
          1       0.52      0.59      0.55        29
          2       0.83      0.29      0.43        17
          3       0.69      0.90      0.78        71
          4       0.62      0.36      0.45        14

avg / total       0.66      0.65      0.62       146

[ 4  3  0  8  0  1 17  1  9  1  0  4  5  6  2  1  6  0 64  0  0  3  0  6
  5]
svc Accuracy:  0.6506849315068494
svc F1:  0.519831069169146
             precision    recall  f1-score   support

          0       1.00      0.27      0.42        15
          1       0.64      0.48      0.55        29
          2       1.00      0.18      0.30        17
          3       0.59      0.94      0.73        71
          4       1.00      0.29      0.44        14

avg / total       0.73      0.63      0.58       146

[ 4  3  0  8  0  0 14  0 15  0  0  0  3 14  0  0  4  0 67  0  0  1  0  9
  4]
LR Accuracy:  0.6301369863013698
LR

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.92      0.43      0.59        28
          1       1.00      0.64      0.78        22
          2       0.74      0.98      0.84       140
          3       0.80      0.54      0.65        37
          4       0.78      0.57      0.66        37

avg / total       0.80      0.77      0.76       264

[ 12   0  16   0   0   0  14   8   0   0   0   0 137   2   1   1   0  11
  20   5   0   0  13   3  21]
svc Accuracy:  0.7727272727272727
svc F1:  0.7015263689671636
             precision    recall  f1-score   support

          0       1.00      0.11      0.19        28
          1       1.00      0.27      0.43        22
          2       0.63      0.98      0.77       140
          3       0.87      0.35      0.50        37
          4       0.78      0.49      0.60        37

avg / total       0.76      0.67      0.62       264

[  3   0  25   0   0   0   6  16   0   0   0   0 137   2   1   0   0  20
  13   4   0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.45      0.85      0.59        53
          1       0.20      0.08      0.11        13
          2       0.18      0.17      0.17        12
          3       1.00      0.19      0.32        16
          4       0.67      0.27      0.39        22
          5       0.00      0.00      0.00        12

avg / total       0.47      0.45      0.38       128

[45  2  4  0  2  0 11  1  0  0  0  1 10  0  2  0  0  0 12  1  0  3  0  0
 12  1  3  0  6  0  9  0  2  0  1  0]
svc Accuracy:  0.4453125
svc F1:  0.2633359442708376
             precision    recall  f1-score   support

          0       0.44      0.89      0.58        53
          1       0.33      0.08      0.12        13
          2       0.17      0.08      0.11        12
          3       1.00      0.06      0.12        16
          4       0.67      0.27      0.39        22
          5       0.00      0.00      0.00        12

avg / total       0.47      0.44    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.56      0.75      0.64        12
          1       0.84      0.80      0.82        20
          2       0.67      0.24      0.35        17
          3       0.56      0.71      0.63        35
          4       0.38      0.33      0.35        18
          5       0.82      0.82      0.82        40

avg / total       0.66      0.65      0.64       142

[ 9  0  0  2  1  0  0 16  0  1  1  2  1  1  4  5  2  4  3  0  2 25  4  1
  3  1  0  8  6  0  0  1  0  4  2 33]
svc Accuracy:  0.6549295774647887
svc F1:  0.602356204466179
             precision    recall  f1-score   support

          0       0.54      0.58      0.56        12
          1       0.89      0.85      0.87        20
          2       1.00      0.18      0.30        17
          3       0.52      0.66      0.58        35
          4       0.38      0.17      0.23        18
          5       0.62      0.85      0.72        40

avg / total       0.64      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(77, 39)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(77, 23)
2
(77, 62)
             precision    recall  f1-score   support

          0       0.71

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.86      0.97      0.91        76
          1       0.85      0.48      0.61        23

avg / total       0.86      0.86      0.84        99

[74  2 12 11]
svc Accuracy:  0.8585858585858586
svc F1:  0.7623456790123457
             precision    recall  f1-score   support

          0       0.78      0.99      0.87        76
          1       0.67      0.09      0.15        23

avg / total       0.75      0.78      0.71        99

[75  1 21  2]
LR Accuracy:  0.7777777777777778
LR F1:  0.5129695885509838
For name:  m_schneider
total sample size before apply threshold:  367
Counter({'0000-0001-9645-1938': 110, '0000-0002-9570-3491': 91, '0000-0002-9260-7357': 56, '0000-0001-7190-3379': 34, '0000-0002-7114-2060': 29, '0000-0002-1223-1266': 14, '0000-0001-7147-8915': 10, '0000-0002-3842-2618': 10, '0000-0003-1488-4743': 8, '0000-0001-7534-5431': 2, '0000-0001-9846-7132': 2, '0000-0002-4918-1389': 1})
['0000-0002-9570-34

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.50      0.81      0.62        91
          1       0.94      0.57      0.71        56
          2       0.71      0.82      0.76       110
          3       1.00      0.62      0.76        34
          4       0.75      0.30      0.43        10
          5       0.67      0.20      0.31        10
          6       0.88      0.52      0.65        29
          7       1.00      0.14      0.25        14

avg / total       0.75      0.68      0.67       354

[74  0 17  0  0  0  0  0 14 32  9  0  0  0  1  0 17  0 90  0  1  1  1  0
 10  0  3 21  0  0  0  0  6  0  1  0  3  0  0  0  6  0  2  0  0  2  0  0
  9  2  3  0  0  0 15  0 11  0  1  0  0  0  0  2]
svc Accuracy:  0.6751412429378532
svc F1:  0.5622182159946585
             precision    recall  f1-score   support

          0       0.47      0.77      0.59        91
          1       0.71      0.54      0.61        56
          2       0.70      0.85      0.77       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



total sample size before apply threshold:  83
Counter({'0000-0002-1505-5173': 67, '0000-0002-9724-4540': 9, '0000-0001-9028-1990': 6, '0000-0001-7887-5016': 1})
['0000-0002-1505-5173']
Total sample size after apply threshold:  67
For name:  s_shim
total sample size before apply threshold:  41
Counter({'0000-0001-8043-2257': 14, '0000-0003-4143-7383': 10, '0000-0001-5203-6038': 10, '0000-0002-5188-688X': 7})
['0000-0001-8043-2257', '0000-0003-4143-7383', '0000-0001-5203-6038']
Total sample size after apply threshold:  34
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(34, 21)
TfidfVectoriz

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.94      0.90      0.92        51
          1       0.90      0.93      0.91        46

avg / total       0.92      0.92      0.92        97

[46  5  3 43]
MNB Accuracy:  0.9175257731958762
MNB F1:  0.9174468085106383
             precision    recall  f1-score   support

          0       0.94      0.98      0.96        51
          1       0.98      0.93      0.96        46

avg / total       0.96      0.96      0.96        97

[50  1  3 43]
svc Accuracy:  0.9587628865979382
svc F1:  0.9585470085470087
             precision    recall  f1-score   support

          0       0.91      0.94      0.92        51
          1       0.93      0.89      0.91        46

avg / total       0.92      0.92      0.92        97

[48  3  5 41]
LR Accuracy:  0.9175257731958762
LR F1:  0.917094017094017
For name:  m_acosta
total sample size before apply threshold:  47
Counter({'0000-0002-5018-339X': 24, '0000-0003-4827-7271': 17, '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.71      0.91      0.80        11
          1       0.96      0.85      0.90        27

avg / total       0.89      0.87      0.87        38

[10  1  4 23]
MNB Accuracy:  0.868421052631579
MNB F1:  0.8509803921568628
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       0.96      1.00      0.98        27

avg / total       0.97      0.97      0.97        38

[10  1  0 27]
svc Accuracy:  0.9736842105263158
svc F1:  0.967099567099567
             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       0.96      1.00      0.98        27

avg / total       0.97      0.97      0.97        38

[10  1  0 27]
LR Accuracy:  0.9736842105263158
LR F1:  0.967099567099567
For name:  j_weiner
total sample size before apply threshold:  61
Counter({'0000-0002-3352-2847': 35, '0000-0002-0736-7943': 15, '00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(93, 56)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(93, 19)
2
(93, 75)
             precision    recall  f1-score   support

          0       0.76

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.90      0.75      0.82        24
          1       0.70      0.98      0.82        62
          2       0.00      0.00      0.00        11
          3       1.00      0.17      0.29        12

avg / total       0.71      0.74      0.68       109

[18  6  0  0  1 61  0  0  0 11  0  0  1  9  0  2]
svc Accuracy:  0.7431192660550459
svc F1:  0.4806720125512072
             precision    recall  f1-score   support

          0       0.93      0.54      0.68        24
          1       0.67      1.00      0.80        62
          2       0.00      0.00      0.00        11
          3       1.00      0.17      0.29        12

avg / total       0.69      0.71      0.64       109

[13 11  0  0  0 62  0  0  0 11  0  0  1  9  0  2]
LR Accuracy:  0.7064220183486238
LR F1:  0.4424812030075188
For name:  c_pan
total sample size before apply threshold:  161
Counter({'0000-0002-2652-5134': 66, '0000-0002-6654-9309': 33, '0000-000

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.45      0.61      0.52        33
          1       0.45      0.39      0.42        23
          2       0.50      0.09      0.15        11
          3       0.90      0.97      0.93        66
          4       0.73      0.53      0.62        15

avg / total       0.68      0.69      0.67       148

[20  7  1  4  1 11  9  0  1  2  8  1  1  1  0  2  0  0 64  0  3  3  0  1
  8]
LR Accuracy:  0.6891891891891891
LR F1:  0.528324501843429
For name:  x_cao
total sample size before apply threshold:  74
Counter({'0000-0002-3004-7518': 25, '0000-0001-7222-5450': 14, '0000-0002-3476-9833': 12, '0000-0002-4782-853X': 11, '0000-0001-7571-6482': 10, '0000-0002-6771-0571': 1, '0000-0001-8124-7491': 1})
['0000-0001-7571-6482', '0000-0002-3004-7518', '0000-0001-7222-5450', '0000-0002-4782-853X', '0000-0002-3476-9833']
Total sample size after apply threshold:  72
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.81      0.92      0.87        66
          1       0.78      0.44      0.56        16
          2       0.81      0.76      0.78        38

avg / total       0.81      0.81      0.80       120

[61  1  4  6  7  3  8  1 29]
svc Accuracy:  0.8083333333333333
svc F1:  0.7363440035780462
             precision    recall  f1-score   support

          0       0.78      0.95      0.86        66
          1       1.00      0.19      0.32        16
          2       0.81      0.76      0.78        38

avg / total       0.82      0.79      0.76       120

[63  0  3  9  3  4  9  0 29]
LR Accuracy:  0.7916666666666666
LR F1:  0.6522387048702839
For name:  h_chen
total sample size before apply threshold:  986
Counter({'0000-0001-5108-8338': 147, '0000-0002-5799-6705': 93, '0000-0003-0708-6073': 73, '0000-0001-6758-1995': 49, '0000-0001-5051-9896': 40, '0000-0003-0676-4610': 40, '0000-0002-7748-4440': 39, '0000-0001-6883-3752

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        16
          1       0.00      0.00      0.00        20
          2       0.00      0.00      0.00        18
          3       0.00      0.00      0.00        12
          4       0.82      0.23      0.36        39
          5       0.00      0.00      0.00        36
          6       1.00      0.25      0.40        20
          7       0.00      0.00      0.00        25
          8       0.00      0.00      0.00        10
          9       1.00      0.05      0.10        19
         10       0.00      0.00      0.00        28
         11       0.60      0.07      0.13        40
         12       0.63      0.70      0.66        73
         13       0.60      0.07      0.13        40
         14       0.00      0.00      0.00        15
         15       0.31      0.26      0.28        93
         16       0.00      0.00      0.00        14
         17       0.00      0.00      0.00   

             precision    recall  f1-score   support

          0       0.50      0.06      0.11        16
          1       1.00      0.50      0.67        20
          2       0.00      0.00      0.00        18
          3       0.00      0.00      0.00        12
          4       0.67      0.41      0.51        39
          5       0.18      0.08      0.11        36
          6       0.55      0.30      0.39        20
          7       0.60      0.12      0.20        25
          8       1.00      0.40      0.57        10
          9       0.89      0.84      0.86        19
         10       0.47      0.29      0.36        28
         11       0.33      0.12      0.18        40
         12       0.59      0.74      0.65        73
         13       0.79      0.28      0.41        40
         14       0.86      0.40      0.55        15
         15       0.24      0.65      0.35        93
         16       0.50      0.21      0.30        14
         17       0.00      0.00      0.00   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.50      0.33      0.40        36
          1       0.75      0.23      0.35        13
          2       1.00      1.00      1.00        10
          3       0.67      0.13      0.22        15
          4       0.50      0.90      0.64       117
          5       0.64      0.38      0.47        48
          6       1.00      0.94      0.97        17
          7       0.20      0.07      0.11        14
          8       0.75      0.21      0.33        14
          9       0.92      0.33      0.49        33

avg / total       0.62      0.57      0.53       317

[ 12   0   0   0  22   2   0   0   0   0   1   3   0   0   8   1   0   0
   0   0   0   0  10   0   0   0   0   0   0   0   0   1   0   2  11   1
   0   0   0   0   7   0   0   1 105   2   0   1   0   1   4   0   0   0
  26  18   0   0   0   0   0   0   0   0   1   0  16   0   0   0   0   0
   0   0  11   1   0   1   1   0   0   0   0   0   9   1   0   1   3 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(168, 82)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(168, 20)
2
(168, 102)
             precision    recall  f1-score   support

          0       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(629, 243)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(629, 28)
2
(629, 271)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.92      0.38      0.54        29
          1       0.96      0.79      0.87        33
          2       0.00      0.00      0.00        11
          3       0.33      0.10      0.15        10
          4       0.77      0.57      0.66        42
          5       1.00      0.77      0.87        13
          6       0.00      0.00      0.00        22
          7       0.83      0.52      0.64        56
          8       0.61      0.96      0.74       219
          9       0.95      0.79      0.86       194

avg / total       0.75      0.74      0.72       629

[ 11   0   0   0   0   0   0   1  16   1   0  26   0   0   0   0   1   0
   5   1   0   0   0   1   1   0   0   0   9   0   0   0   0   1   0   0
   0   1   8   0   0   0   0   0  24   0   0   0  16   2   0   0   0   0
   0  10   0   0   3   0   0   1   0   0   0   0   0   2  17   2   0   0
   0   0   0   0   2  29  24   1   1   0   1   1   4   0   1   0 210 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.89      0.40      0.55        40
          2       0.56      0.46      0.51        48
          3       0.86      0.38      0.52        16
          4       0.72      0.85      0.78        48
          5       0.78      0.24      0.37        29
          6       0.51      0.90      0.65        81

avg / total       0.64      0.61      0.57       272

[ 0  0  2  0  0  0  8  0 16  3  0  7  0 14  0  1 22  0  1  1 23  0  0  3
  6  1  1  5  0  0  0  0 41  0  7  0  0  7  1  2  7 12  0  1  2  0  5  0
 73]
MNB Accuracy:  0.6066176470588235
MNB F1:  0.4833274783665616
             precision    recall  f1-score   support

          0       1.00      0.40      0.57        10
          1       0.92      0.55      0.69        40
          2       0.47      0.75      0.58        48
          3       0.92      0.75      0.83        16
          4       0.93      0.83      0.8

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.90      0.82      0.86        34
          1       0.41      0.41      0.41        17
          2       0.51      0.69      0.59        29
          3       0.60      0.30      0.40        10
          4       0.79      0.70      0.74        37
          5       0.61      0.69      0.65        16

avg / total       0.68      0.66      0.67       143

[28  1  1  0  0  4  0  7  6  2  1  1  0  4 20  0  4  1  0  3  3  3  1  0
  0  1  9  0 26  1  3  1  0  0  1 11]
svc Accuracy:  0.6643356643356644
svc F1:  0.6085757379875026
             precision    recall  f1-score   support

          0       0.90      0.82      0.86        34
          1       0.64      0.41      0.50        17
          2       0.61      0.69      0.65        29
          3       0.67      0.20      0.31        10
          4       0.68      0.81      0.74        37
          5       0.52      0.69      0.59        16

avg / total       0.69     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        14
          1       0.81      0.98      0.89       134
          2       1.00      0.11      0.19        19

avg / total       0.77      0.80      0.73       167

[  0  14   0   3 131   0   1  16   2]
svc Accuracy:  0.7964071856287425
svc F1:  0.35953726123217655
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        14
          1       0.80      1.00      0.89       134
          2       0.00      0.00      0.00        19

avg / total       0.64      0.80      0.71       167

[  0  14   0   0 134   0   0  19   0]
LR Accuracy:  0.8023952095808383
LR F1:  0.2967884828349945
For name:  l_rocha
total sample size before apply threshold:  81
Counter({'0000-0001-9402-887X': 24, '0000-0002-4345-6994': 20, '0000-0002-5469-0911': 11, '0000-0001-7832-058X': 8, '0000-0001-8184-8801': 6, '0000-0003-2146-9708': 5, '0000-0002-7219-1518': 5, '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(23, 13)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(23, 11)
2
(23, 24)
             precision    recall  f1-score   support

          0       1.00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      0.65      0.76        26
          1       1.00      0.32      0.48        22
          2       0.73      0.98      0.84        63

avg / total       0.82      0.77      0.75       111

[17  0  9  1  7 14  1  0 62]
svc Accuracy:  0.7747747747747747
svc F1:  0.6920506713610162
             precision    recall  f1-score   support

          0       0.85      0.42      0.56        26
          1       1.00      0.14      0.24        22
          2       0.65      0.98      0.78        63

avg / total       0.77      0.68      0.63       111

[11  0 15  1  3 18  1  0 62]
LR Accuracy:  0.6846846846846847
LR F1:  0.5296375635616143
For name:  m_rodriguez
total sample size before apply threshold:  214
Counter({'0000-0001-6328-6497': 195, '0000-0001-8926-2987': 8, '0000-0002-9380-6614': 4, '0000-0002-4476-004X': 3, '0000-0001-6778-1663': 2, '0000-0002-4452-7627': 1, '0000-0002-2640-5888': 1})
['0000-0001-6328-649

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


total sample size before apply threshold:  354
Counter({'0000-0002-0026-9494': 194, '0000-0001-7630-7285': 31, '0000-0001-8594-9554': 28, '0000-0003-0541-0431': 23, '0000-0003-3672-6631': 11, '0000-0002-7249-2580': 11, '0000-0001-9096-9728': 9, '0000-0001-6159-1842': 9, '0000-0003-3430-8110': 7, '0000-0001-6275-0235': 6, '0000-0001-7065-1157': 6, '0000-0003-4680-7080': 4, '0000-0002-1698-0408': 3, '0000-0001-9753-0777': 2, '0000-0002-7196-5825': 2, '0000-0001-6136-7111': 2, '0000-0002-6319-2068': 2, '0000-0002-3708-5859': 1, '0000-0003-1523-2368': 1, '0000-0002-0669-7860': 1, '0000-0002-8765-5178': 1})
['0000-0003-0541-0431', '0000-0003-3672-6631', '0000-0001-7630-7285', '0000-0002-7249-2580', '0000-0001-8594-9554', '0000-0002-0026-9494']
Total sample size after apply threshold:  298
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, m

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.57      0.57      0.57        23
          1       0.00      0.00      0.00        11
          2       0.95      0.68      0.79        31
          3       0.67      0.18      0.29        11
          4       1.00      0.68      0.81        28
          5       0.83      0.95      0.88       194

avg / total       0.80      0.80      0.79       298

[ 13   3   0   0   0   7   2   0   0   0   0   9   0   0  21   0   0  10
   2   1   0   2   0   6   1   1   0   0  19   7   5   3   1   1   0 184]
svc Accuracy:  0.802013422818792
svc F1:  0.5557315250502247
             precision    recall  f1-score   support

          0       0.82      0.39      0.53        23
          1       0.00      0.00      0.00        11
          2       1.00      0.39      0.56        31
          3       0.00      0.00      0.00        11
          4       1.00      0.25      0.40        28
          5       0.72      0.99      0.83    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.55      0.71        11
          1       0.75      0.91      0.82        44
          2       0.65      0.83      0.73        29
          3       1.00      0.50      0.67        12
          4       0.93      0.65      0.76        20

avg / total       0.81      0.77      0.76       116

[ 6  2  3  0  0  0 40  4  0  0  0  4 24  0  1  0  5  1  6  0  0  2  5  0
 13]
LR Accuracy:  0.7672413793103449
LR F1:  0.7378539794549497
For name:  t_williams
total sample size before apply threshold:  190
Counter({'0000-0003-3414-3440': 78, '0000-0002-5857-3851': 42, '0000-0003-1072-0223': 25, '0000-0001-6299-3747': 24, '0000-0003-1710-3914': 9, '0000-0002-3866-1344': 6, '0000-0003-0072-3316': 3, '0000-0002-9319-1701': 2, '0000-0003-3463-9200': 1})
['0000-0001-6299-3747', '0000-0003-1072-0223', '0000-0003-3414-3440', '0000-0002-5857-3851']
Total sample size after apply threshold:  169
(0, 0)
TfidfVectorizer(analyzer=

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(174, 64)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(174, 22)
2
(174, 86)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.75      0.97      0.85        40
          1       0.76      0.79      0.77        47
          2       0.88      0.68      0.77        41
          3       0.72      0.62      0.67        37

avg / total       0.78      0.77      0.77       165

[39  0  0  1  4 37  3  3  4  4 28  5  5  8  1 23]
MNB Accuracy:  0.7696969696969697
MNB F1:  0.7631123436569387
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        40
          1       0.79      0.79      0.79        47
          2       0.79      0.66      0.72        41
          3       0.66      0.78      0.72        37

avg / total       0.81      0.81      0.81       165

[40  0  0  0  0 37  4  6  0  5 27  9  0  5  3 29]
svc Accuracy:  0.806060606060606
svc F1:  0.8058208563173103
             precision    recall  f1-score   support

          0       1.00      0.93      0.96        40
          1       0.66      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.82      0.85      0.84       111
          2       0.74      0.82      0.78        92

avg / total       0.74      0.79      0.76       215

[ 0  3  9  0 94 17  0 17 75]
MNB Accuracy:  0.786046511627907
MNB F1:  0.5375858760314718
             precision    recall  f1-score   support

          0       1.00      0.83      0.91        12
          1       0.90      0.85      0.87       111
          2       0.82      0.89      0.85        92

avg / total       0.87      0.87      0.87       215

[10  1  1  0 94 17  0 10 82]
svc Accuracy:  0.8651162790697674
svc F1:  0.8778759820426486
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.82      0.86      0.84       111
          2       0.76      0.80      0.78        92

avg / total       0.75      0.79      0.77       215

[ 0  3  9  0 96

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.75      0.50      0.60        18
          1       0.75      0.90      0.82        30

avg / total       0.75      0.75      0.74        48

[ 9  9  3 27]
LR Accuracy:  0.75
LR F1:  0.7090909090909091
For name:  a_green
total sample size before apply threshold:  169
Counter({'0000-0002-2753-4841': 79, '0000-0002-1268-4951': 39, '0000-0003-2058-1204': 35, '0000-0003-0454-1798': 8, '0000-0002-3674-4242': 4, '0000-0001-7666-5584': 2, '0000-0002-1241-4230': 1, '0000-0003-3404-4995': 1})
['0000-0002-1268-4951', '0000-0002-2753-4841', '0000-0003-2058-1204']
Total sample size after apply threshold:  153
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_ac

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(73, 54)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(73, 10)
2
(73, 64)
             precision    recall  f1-score   support

          0       0.8

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.56      0.92      0.70        38
          1       0.50      0.31      0.38        16
          2       1.00      0.64      0.78        11
          3       1.00      0.61      0.76        18
          4       1.00      0.59      0.74        17

avg / total       0.75      0.68      0.68       100

[35  3  0  0  0 11  5  0  0  0  4  0  7  0  0  6  1  0 11  0  6  1  0  0
 10]
svc Accuracy:  0.68
svc F1:  0.6723509185578151
             precision    recall  f1-score   support

          0       0.50      0.89      0.64        38
          1       0.57      0.25      0.35        16
          2       1.00      0.27      0.43        11
          3       0.83      0.56      0.67        18
          4       0.90      0.53      0.67        17

avg / total       0.69      0.60      0.58       100

[34  1  0  2  1 12  4  0  0  0  8  0  3  0  0  7  1  0 10  0  7  1  0  0
  9]
LR Accuracy:  0.6
LR F1:  0.5502480565647095
For

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



(124, 69)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(124, 16)
2
(124, 85)
             precision    recall  f1-score   support

          0       0.66      0.91      0.76        46
          1       1.00      0.50      0.67        14
          2       0.71      0.75      0.73        36
          3       0.73      0.39      0.51        28

avg / total       0.73      0.70      0.69       124

[42  0  2  2  2  7  4  1  8  0 27  1 12  0  5 11]
MNB Accuracy:  0.7016129032258065
MNB F1:  0.667915166752376
             precision    recall  f1-score   support

          0       0.79      0.83     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.81      0.93      0.86        27
          1       0.78      0.54      0.64        13

avg / total       0.80      0.80      0.79        40

[25  2  6  7]
svc Accuracy:  0.8
svc F1:  0.7492163009404389
             precision    recall  f1-score   support

          0       0.71      1.00      0.83        27
          1       1.00      0.15      0.27        13

avg / total       0.80      0.72      0.65        40

[27  0 11  2]
LR Accuracy:  0.725
LR F1:  0.5487179487179488
For name:  y_yao
total sample size before apply threshold:  58
Counter({'0000-0001-5827-8716': 19, '0000-0002-4338-2606': 18, '0000-0002-0814-6675': 11, '0000-0002-2943-5994': 3, '0000-0003-4892-052X': 3, '0000-0003-1132-592X': 1, '0000-0001-6502-6226': 1, '0000-0001-9359-2030': 1, '0000-0003-3612-3742': 1})
['0000-0002-4338-2606', '0000-0001-5827-8716', '0000-0002-0814-6675']
Total sample size after apply threshold:  48
(0, 0)
TfidfVectorizer(

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.23      0.27      0.25        11
          1       0.47      0.60      0.53        15
          2       0.12      0.14      0.13        14
          3       0.55      0.61      0.58        36
          4       0.52      0.60      0.56        20
          5       1.00      0.57      0.73        14
          6       0.77      0.82      0.79        33
          7       0.89      0.40      0.55        20

avg / total       0.61      0.56      0.56       163

[ 3  0  2  3  1  0  2  0  1  9  2  2  1  0  0  0  3  3  2  5  0  0  1  0
  1  2  6 22  1  0  3  1  2  2  0  3 12  0  1  0  1  0  1  2  2  8  0  0
  1  0  0  1  4  0 27  0  1  3  3  2  2  0  1  8]
svc Accuracy:  0.558282208588957
svc F1:  0.5153683142008219
             precision    recall  f1-score   support

          0       0.75      0.27      0.40        11
          1       0.57      0.53      0.55        15
          2       0.20      0.07      0.11        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        28
          1       0.00      0.00      0.00        19
          2       0.00      0.00      0.00        10
          3       0.00      0.00      0.00        16
          4       0.00      0.00      0.00        12
          5       1.00      0.10      0.19        29
          6       0.00      0.00      0.00        13
          7       0.36      0.99      0.53       194
          8       1.00      0.05      0.10        39
          9       1.00      0.03      0.06        30
         10       0.00      0.00      0.00        14
         11       1.00      0.07      0.14        27
         12       0.00      0.00      0.00        15
         13       0.46      0.45      0.45       101
         14       0.00      0.00      0.00        21
         15       0.00      0.00      0.00        13
         16       1.00      0.16      0.27        19
         17       0.69      0.38      0.49   

Total sample size after apply threshold:  373
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(373, 113)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(373, 23)
2
(373, 136)
             precision   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[356   0  17   0]
LR Accuracy:  0.9544235924932976
LR F1:  0.4883401920438957
For name:  k_yamamoto
total sample size before apply threshold:  106
Counter({'0000-0002-7935-7015': 93, '0000-0003-0866-3207': 4, '0000-0002-7590-3568': 4, '0000-0001-6642-7961': 2, '0000-0002-6831-5346': 2, '0000-0002-1619-4407': 1})
['0000-0002-7935-7015']
Total sample size after apply threshold:  93
For name:  j_silva
total sample size before apply threshold:  268
Counter({'0000-0001-9523-9441': 128, '0000-0003-3977-7418': 28, '0000-0002-3696-3955': 22, '0000-0002-6725-5767': 14, '0000-0003-2583-9518': 13, '0000-0001-9959-4272': 8, '0000-0002-5656-0897': 7, '0000-0001-9487-4259': 6, '0000-0002-6041-1763': 6, '0000-0002-1520-0799': 4, '0000-0001-8055-8925': 3, '0000-0001-9708-1043': 3, '0000-0002-7268-6465': 3, '0000-0003-1224-1699': 2, '0000-0002-7211-1661': 2, '0000-0002-7206-0550': 2, '0000-0003-1244-6483': 2, '0000-0003-4180-565X': 2, '0000-0001-9522-6181': 2, '0000-0001-9554-8797': 2, '0000-0002-6455-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.79      0.88        14
          1       0.71      0.38      0.50        13
          2       0.93      0.50      0.65        28
          3       0.83      0.23      0.36        22
          4       0.75      0.98      0.85       128

avg / total       0.80      0.78      0.75       205

[ 11   0   0   0   3   0   5   0   0   8   0   1  14   0  13   0   0   0
   5  17   0   1   1   1 125]
svc Accuracy:  0.7804878048780488
svc F1:  0.6477291567789907
             precision    recall  f1-score   support

          0       1.00      0.43      0.60        14
          1       1.00      0.15      0.27        13
          2       1.00      0.29      0.44        28
          3       1.00      0.09      0.17        22
          4       0.68      1.00      0.81       128

avg / total       0.80      0.71      0.64       205

[  6   0   0   0   8   0   2   0   0  11   0   0   8   0  20   0   0   0
   2  20   0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(32, 24)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(32, 16)
2
(32, 40)
             precision    recall  f1-score   support

          0       0.71

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.67      0.80        18
          1       0.35      0.26      0.30        27
          2       1.00      0.56      0.71        18
          3       0.52      0.76      0.62        33
          4       0.45      0.53      0.49        32

avg / total       0.60      0.55      0.56       128

[12  2  0  2  2  0  7  0  9 11  0  1 10  4  3  0  3  0 25  5  0  7  0  8
 17]
svc Accuracy:  0.5546875
svc F1:  0.5830312582085633
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        18
          1       0.35      0.26      0.30        27
          2       0.88      0.39      0.54        18
          3       0.49      0.73      0.59        33
          4       0.45      0.59      0.51        32

avg / total       0.58      0.52      0.51       128

[ 9  2  0  5  2  0  7  0 10 10  0  2  7  3  6  0  3  1 24  5  0  6  0  7
 19]
LR Accuracy:  0.515625
LR F1:  0.5203759825

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.60      0.78      0.68        27
          1       1.00      0.09      0.17        11
          2       1.00      0.25      0.40        12
          3       0.00      0.00      0.00        12
          4       0.53      0.86      0.65        36

avg / total       0.59      0.57      0.49        98

[21  0  0  0  6  2  1  0  0  8  4  0  3  0  5  3  0  0  0  9  5  0  0  0
 31]
LR Accuracy:  0.5714285714285714
LR F1:  0.379343520090549
For name:  y_dong
total sample size before apply threshold:  76
Counter({'0000-0003-0016-9028': 42, '0000-0002-1737-6536': 16, '0000-0003-4550-2322': 9, '0000-0003-1294-4888': 4, '0000-0002-4129-4637': 2, '0000-0001-8595-2868': 2, '0000-0003-1774-1553': 1})
['0000-0002-1737-6536', '0000-0003-0016-9028']
Total sample size after apply threshold:  58
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.00      0.00      0.00        15
          2       0.00      0.00      0.00        10
          3       0.50      0.96      0.66        80
          4       1.00      0.29      0.44        14
          5       0.57      0.25      0.35        16
          6       0.38      0.26      0.31        23
          7       0.72      0.39      0.51        33
          8       0.82      0.90      0.86        10

avg / total       0.49      0.54      0.46       211

[ 0  0  0  7  0  1  2  0  0  0  0  0 15  0  0  0  0  0  0  0  0  7  0  0
  3  0  0  0  1  0 77  0  0  0  2  0  0  0  0  6  4  1  2  1  0  0  0  0
  9  0  4  3  0  0  0  0  0 13  0  1  6  2  1  0  0  0 19  0  0  0 13  1
  0  0  0  1  0  0  0  0  9]
MNB Accuracy:  0.5355450236966824
MNB F1:  0.34722547510271295
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.47      0.40      0.43        20
          1       0.80      0.27      0.40        15
          2       0.89      0.98      0.94       251
          3       1.00      0.58      0.73        26

avg / total       0.87      0.88      0.86       312

[  8   0  12   0   2   4   9   0   5   0 246   0   2   1   8  15]
svc Accuracy:  0.875
svc F1:  0.6248752415589103
             precision    recall  f1-score   support

          0       0.33      0.05      0.09        20
          1       0.00      0.00      0.00        15
          2       0.81      0.99      0.89       251
          3       0.00      0.00      0.00        26

avg / total       0.67      0.80      0.72       312

[  1   0  19   0   0   0  15   0   2   0 249   0   0   0  26   0]
LR Accuracy:  0.8012820512820513
LR F1:  0.24406055900621118
For name:  y_lim
total sample size before apply threshold:  76
Counter({'0000-0002-3484-045X': 21, '0000-0002-8472-2

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(73, 47)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(73, 14)
2
(73, 61)
             precision    recall  f1-score   support

          0       0.80

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.80      0.89        10
          1       0.94      1.00      0.97        34

avg / total       0.96      0.95      0.95        44

[ 8  2  0 34]
svc Accuracy:  0.9545454545454546
svc F1:  0.9301587301587302
             precision    recall  f1-score   support

          0       1.00      0.30      0.46        10
          1       0.83      1.00      0.91        34

avg / total       0.87      0.84      0.81        44

[ 3  7  0 34]
LR Accuracy:  0.8409090909090909
LR F1:  0.6841025641025641
For name:  w_cao
total sample size before apply threshold:  126
Counter({'0000-0002-2447-1486': 91, '0000-0002-8952-9159': 27, '0000-0002-5369-9682': 7, '0000-0001-6209-3482': 1})
['0000-0002-8952-9159', '0000-0002-2447-1486']
Total sample size after apply threshold:  118
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content'

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.67      0.52      0.58        27
          1       0.87      0.92      0.89        91

avg / total       0.82      0.83      0.82       118

[14 13  7 84]
MNB Accuracy:  0.8305084745762712
MNB F1:  0.7384751773049645
             precision    recall  f1-score   support

          0       0.77      0.37      0.50        27
          1       0.84      0.97      0.90        91

avg / total       0.82      0.83      0.81       118

[10 17  3 88]
svc Accuracy:  0.8305084745762712
svc F1:  0.6989795918367347
             precision    recall  f1-score   support

          0       0.83      0.19      0.30        27
          1       0.80      0.99      0.89        91

avg / total       0.81      0.81      0.75       118

[ 5 22  1 90]
LR Accuracy:  0.8050847457627118
LR F1:  0.5948649052097328
For name:  c_ma
total sample size before apply threshold:  126
Counter({'0000-0001-8818-6396': 31, '0000-0002-7480-5528': 28, '00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  27
Counter({'0000-0003-3932-8639': 13, '0000-0002-1840-325X': 9, '0000-0002-2878-8544': 3, '0000-0001-9809-6976': 1, '0000-0001-7626-6778': 1})
['0000-0003-3932-8639']
Total sample size after apply threshold:  13
For name:  h_kwon
total sample size before apply threshold:  35
Counter({'0000-0003-4979-8749': 13, '0000-0002-6919-833X': 7, '0000-0002-0960-0198': 5, '0000-0001-6941-4808': 3, '0000-0003-4026-4572': 3, '0000-0002-2936-1358': 1, '0000-0002-8509-3968': 1, '0000-0003-4465-2708': 1, '0000-0001-9772-1354': 1})
['0000-0003-4979-8749']
Total sample size after apply threshold:  13
For name:  s_gao
total sample size before apply threshold:  31
Counter({'0000-0002-7020-037X': 28, '0000-0002-8919-1338': 1, '0000-0002-3574-6393': 1, '0000-0003-3320-8505': 1})
['0000-0002-7020-037X']
Total sample size after apply threshold:  28
For name:  f_tian
total sample size before apply threshold:  17
Counter({'0000-0002-1247-6896': 9, '0000-0003-3580-022X

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.67      0.20      0.31        10
          1       0.69      0.95      0.80        19

avg / total       0.68      0.69      0.63        29

[ 2  8  1 18]
LR Accuracy:  0.6896551724137931
LR F1:  0.5538461538461538
For name:  s_wolf
total sample size before apply threshold:  363
Counter({'0000-0003-2972-3440': 173, '0000-0002-7467-7028': 102, '0000-0002-5337-5063': 46, '0000-0003-0832-6315': 15, '0000-0002-3747-8097': 12, '0000-0003-1752-6175': 9, '0000-0003-3921-6629': 3, '0000-0001-7717-6993': 2, '0000-0002-6748-3911': 1})
['0000-0002-7467-7028', '0000-0002-5337-5063', '0000-0003-0832-6315', '0000-0003-2972-3440', '0000-0002-3747-8097']
Total sample size after apply threshold:  348
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.91      0.84      0.87       102
          1       0.67      0.26      0.38        46
          2       0.00      0.00      0.00        15
          3       0.73      0.99      0.84       173
          4       0.00      0.00      0.00        12

avg / total       0.72      0.78      0.72       348

[ 86   1   0  15   0   7  12   0  27   0   1   2   0  12   0   1   0   0
 172   0   0   3   0   9   0]
MNB Accuracy:  0.7758620689655172
MNB F1:  0.4182467403204937
             precision    recall  f1-score   support

          0       0.99      0.86      0.92       102
          1       0.67      0.48      0.56        46
          2       0.83      0.33      0.48        15
          3       0.80      0.99      0.88       173
          4       0.50      0.17      0.25        12

avg / total       0.83      0.83      0.81       348

[ 88   3   0  11   0   1  22   1  20   2   0   3   5   7   0   0   1   0
 172   0   0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.97      0.93      0.95        40
          1       0.93      0.97      0.95        39

avg / total       0.95      0.95      0.95        79

[37  3  1 38]
svc Accuracy:  0.9493670886075949
svc F1:  0.9493589743589745
             precision    recall  f1-score   support

          0       0.95      0.90      0.92        40
          1       0.90      0.95      0.92        39

avg / total       0.93      0.92      0.92        79

[36  4  2 37]
LR Accuracy:  0.9240506329113924
LR F1:  0.9240384615384616
For name:  m_adams
total sample size before apply threshold:  190
Counter({'0000-0003-0435-8651': 59, '0000-0001-8989-508X': 46, '0000-0001-6310-1472': 30, '0000-0002-7743-4515': 29, '0000-0003-2849-9096': 12, '0000-0002-5277-5487': 7, '0000-0002-3878-7684': 5, '0000-0002-3602-6849': 1, '0000-0002-4645-2593': 1})
['0000-0002-7743-4515', '0000-0001-6310-1472', '0000-0003-2849-9096', '0000-0003-0435-8651', '0000-0001-8

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(130, 51)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(130, 17)
2
(130, 68)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.62      0.94      0.75        53
          1       1.00      0.14      0.25        14
          2       0.00      0.00      0.00        10
          3       0.81      0.75      0.78        64

avg / total       0.70      0.71      0.66       141

[50  0  0  3  5  2  0  7  9  0  0  1 16  0  0 48]
MNB Accuracy:  0.7092198581560284
MNB F1:  0.4455918760315423
             precision    recall  f1-score   support

          0       0.91      0.92      0.92        53
          1       1.00      0.36      0.53        14
          2       1.00      0.50      0.67        10
          3       0.81      0.97      0.88        64

avg / total       0.88      0.86      0.84       141

[49  0  0  4  1  5  0  8  2  0  5  3  2  0  0 62]
svc Accuracy:  0.8581560283687943
svc F1:  0.7470757326802789
             precision    recall  f1-score   support

          0       0.82      0.89      0.85        53
          1       1.00     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.56      0.72        16
          1       1.00      0.08      0.14        13
          2       1.00      0.44      0.61        16
          3       0.50      0.74      0.60        35
          4       0.52      0.68      0.59        37

avg / total       0.70      0.58      0.56       117

[ 9  0  0  5  2  0  1  0  6  6  0  0  7  3  6  0  0  0 26  9  0  0  0 12
 25]
LR Accuracy:  0.5811965811965812
LR F1:  0.5314978477147981
For name:  a_moura
total sample size before apply threshold:  36
Counter({'0000-0003-0339-1230': 15, '0000-0002-2105-7319': 14, '0000-0003-2140-0196': 4, '0000-0002-1513-5448': 3})
['0000-0002-2105-7319', '0000-0003-0339-1230']
Total sample size after apply threshold:  29
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.71      0.65      0.68        23
          1       0.61      0.81      0.70        27
          2       0.70      0.41      0.52        17

avg / total       0.67      0.66      0.65        67

[15  7  1  3 22  2  3  7  7]
MNB Accuracy:  0.6567164179104478
MNB F1:  0.6329164662497996
             precision    recall  f1-score   support

          0       0.71      0.65      0.68        23
          1       0.68      0.85      0.75        27
          2       0.83      0.59      0.69        17

avg / total       0.73      0.72      0.71        67

[15  7  1  3 23  1  3  4 10]
svc Accuracy:  0.7164179104477612
svc F1:  0.708523904962571
             precision    recall  f1-score   support

          0       0.68      0.65      0.67        23
          1       0.56      0.81      0.67        27
          2       0.67      0.24      0.35        17

avg / total       0.63      0.61      0.59        67

[15  8  0  3 22

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(111, 65)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(111, 22)
2
(111, 87)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.57      0.73        14
          1       0.58      0.41      0.48        17
          2       0.89      0.62      0.73        13
          3       0.85      0.98      0.91        95

avg / total       0.83      0.83      0.82       139

[ 8  1  0  5  0  7  1  9  0  2  8  3  0  2  0 93]
svc Accuracy:  0.8345323741007195
svc F1:  0.7111552871014604
             precision    recall  f1-score   support

          0       1.00      0.29      0.44        14
          1       0.50      0.12      0.19        17
          2       1.00      0.23      0.38        13
          3       0.74      1.00      0.85        95

avg / total       0.76      0.75      0.69       139

[ 4  1  0  9  0  2  0 15  0  1  3  9  0  0  0 95]
LR Accuracy:  0.7482014388489209
LR F1:  0.46548464303509146
For name:  d_gao
total sample size before apply threshold:  23
Counter({'0000-0003-1821-2741': 14, '0000-0002-9391-1756': 7, '0000-0001

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.67      0.80        15
          1       0.81      0.81      0.81        16
          2       0.76      0.92      0.83        24

avg / total       0.84      0.82      0.82        55

[10  1  4  0 13  3  0  2 22]
MNB Accuracy:  0.8181818181818182
MNB F1:  0.8142295597484277
             precision    recall  f1-score   support

          0       1.00      0.80      0.89        15
          1       0.82      0.56      0.67        16
          2       0.69      0.92      0.79        24

avg / total       0.81      0.78      0.78        55

[12  0  3  0  9  7  0  2 22]
svc Accuracy:  0.7818181818181819
svc F1:  0.7804232804232805
             precision    recall  f1-score   support

          0       1.00      0.67      0.80        15
          1       0.81      0.81      0.81        16
          2       0.76      0.92      0.83        24

avg / total       0.84      0.82      0.82        55

[10  1  4  0 1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.79      0.97      0.87       120
          1       0.90      0.72      0.80        25
          2       0.67      0.31      0.42        13
          3       0.93      0.77      0.84        52
          4       0.50      0.10      0.17        10

avg / total       0.81      0.82      0.80       220

[117   2   1   0   0   4  18   1   2   0   9   0   4   0   0  11   0   0
  40   1   8   0   0   1   1]
svc Accuracy:  0.8181818181818182
svc F1:  0.619942607447988
             precision    recall  f1-score   support

          0       0.70      0.99      0.82       120
          1       1.00      0.52      0.68        25
          2       0.50      0.08      0.13        13
          3       0.94      0.65      0.77        52
          4       0.00      0.00      0.00        10

avg / total       0.75      0.76      0.72       220

[119   0   1   0   0  10  13   0   2   0  12   0   1   0   0  18   0   0
  34   0  10   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


 0.6039603960396039
LR F1:  0.49966814297298984
For name:  r_reis
total sample size before apply threshold:  615
Counter({'0000-0002-4295-6129': 423, '0000-0002-9639-7940': 113, '0000-0002-9872-9865': 27, '0000-0001-9689-4085': 21, '0000-0002-0681-4721': 10, '0000-0003-0328-1840': 7, '0000-0003-0937-8045': 7, '0000-0003-3746-6894': 4, '0000-0002-6618-2412': 2, '0000-0002-6935-3459': 1})
['0000-0002-9639-7940', '0000-0001-9689-4085', '0000-0002-9872-9865', '0000-0002-0681-4721', '0000-0002-4295-6129']
Total sample size after apply threshold:  594
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=Non

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.80      0.54      0.65       113
          1       0.80      0.19      0.31        21
          2       0.93      0.52      0.67        27
          3       1.00      0.60      0.75        10
          4       0.84      0.98      0.90       423

avg / total       0.84      0.84      0.82       594

[ 61   0   1   0  51   4   4   0   0  13   2   0  14   0  11   1   0   0
   6   3   8   1   0   0 414]
svc Accuracy:  0.8400673400673401
svc F1:  0.6549559305297011
             precision    recall  f1-score   support

          0       0.80      0.35      0.49       113
          1       0.00      0.00      0.00        21
          2       1.00      0.26      0.41        27
          3       0.00      0.00      0.00        10
          4       0.78      0.99      0.87       423

avg / total       0.75      0.78      0.73       594

[ 40   0   0   0  73   2   0   0   0  19   1   0   7   0  19   2   0   0
   0   8   5  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.55      0.69      0.61        16
          1       0.60      0.82      0.69        22
          2       0.67      0.43      0.52        14
          3       1.00      0.30      0.46        10

avg / total       0.67      0.61      0.60        62

[11  5  0  0  3 18  1  0  5  3  6  0  1  4  2  3]
LR Accuracy:  0.6129032258064516
LR F1:  0.5716740988480119
For name:  j_wong
total sample size before apply threshold:  183
Counter({'0000-0003-2953-7728': 59, '0000-0003-2592-3226': 30, '0000-0002-7213-4898': 24, '0000-0001-5572-4143': 21, '0000-0002-8167-540X': 17, '0000-0001-8268-5610': 10, '0000-0001-8080-1294': 8, '0000-0002-9206-3257': 5, '0000-0002-9329-1075': 4, '0000-0003-3897-7725': 4, '0000-0002-6317-2067': 1})
['0000-0003-2592-3226', '0000-0001-5572-4143', '0000-0002-8167-540X', '0000-0001-8268-5610', '0000-0003-2953-7728', '0000-0002-7213-4898']
Total sample size after apply threshold:  161
(0, 0)
TfidfVecto

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(48, 34)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(48, 14)
2
(48, 48)
             precision    recall  f1-score   support

          0       0.58

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.71      0.36      0.48        14
          1       1.00      0.69      0.82        13
          2       0.47      0.58      0.52        24
          3       0.83      0.59      0.69        17
          4       0.47      0.77      0.59        35
          5       1.00      0.80      0.89        10
          6       1.00      0.73      0.85        15
          7       0.00      0.00      0.00        11
          8       0.78      0.78      0.78        36

avg / total       0.67      0.64      0.64       175

[ 5  0  4  0  5  0  0  0  0  0  9  1  0  1  0  0  1  1  1  0 14  0  9  0
  0  0  0  0  0  1 10  1  0  0  0  5  1  0  5  0 27  0  0  2  0  0  0  1
  0  1  8  0  0  0  0  0  1  0  3  0 11  0  0  0  0  1  0  8  0  0  0  2
  0  0  2  2  2  0  0  2 28]
svc Accuracy:  0.64
svc F1:  0.6224803355404721
             precision    recall  f1-score   support

          0       0.83      0.36      0.50        14
          1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(49, 28)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(49, 11)
2
(49, 39)
             precision    recall  f1-score   support

          0       1.00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



['0000-0002-0958-2639', '0000-0002-1269-1895', '0000-0003-0731-8006']
Total sample size after apply threshold:  155
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(155, 58)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.53      0.95      0.68        38
          1       0.50      0.15      0.24        13
          2       0.62      0.67      0.64        12
          3       0.00      0.00      0.00        10
          4       0.71      0.67      0.69        18
          5       0.71      0.26      0.38        19

avg / total       0.55      0.57      0.51       110

[36  0  1  0  1  0  8  2  1  1  1  0  3  0  8  0  0  1  9  0  0  0  1  0
  4  1  0  0 12  1  8  1  3  0  2  5]
MNB Accuracy:  0.5727272727272728
MNB F1:  0.4374781784992661
             precision    recall  f1-score   support

          0       0.63      0.89      0.74        38
          1       0.40      0.31      0.35        13
          2       0.71      0.83      0.77        12
          3       0.75      0.30      0.43        10
          4       0.76      0.72      0.74        18
          5       0.91      0.53      0.67        19

avg / total       0.69     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.63      1.00      0.77        94
          1       0.00      0.00      0.00        13
          2       0.00      0.00      0.00        12
          3       0.90      0.36      0.51        25
          4       1.00      0.32      0.48        22

avg / total       0.63      0.66      0.58       166

[94  0  0  0  0 12  0  0  1  0 12  0  0  0  0 16  0  0  9  0 15  0  0  0
  7]
MNB Accuracy:  0.6626506024096386
MNB F1:  0.35414137728313977
             precision    recall  f1-score   support

          0       0.72      1.00      0.84        94
          1       1.00      0.54      0.70        13
          2       1.00      0.42      0.59        12
          3       0.94      0.60      0.73        25
          4       1.00      0.32      0.48        22

avg / total       0.83      0.77      0.74       166

[94  0  0  0  0  5  7  0  1  0  7  0  5  0  0 10  0  0 15  0 15  0  0  0
  7]
svc Accuracy:  0.7710843373493976

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(288, 103)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(288, 37)
2
(288, 140)
             precision    recall  f1-score   support

          0       0.88 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(33, 20)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(33, 12)
2
(33, 32)
             precision    recall  f1-score   support

          0       0.68

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):



Total sample size after apply threshold:  36
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(36, 25)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(36, 10)
2
(36, 35)
             precision    reca

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.73      0.42      0.54        26
          1       0.78      0.93      0.85        56

avg / total       0.76      0.77      0.75        82

[11 15  4 52]
LR Accuracy:  0.7682926829268293
LR F1:  0.6910569105691057
For name:  y_cheng
total sample size before apply threshold:  177
Counter({'0000-0001-9150-4690': 38, '0000-0001-7112-8835': 29, '0000-0002-4423-4381': 17, '0000-0003-0125-4267': 16, '0000-0002-2583-228X': 15, '0000-0001-9776-395X': 14, '0000-0002-7529-4408': 11, '0000-0001-6874-8187': 9, '0000-0003-2571-4707': 8, '0000-0002-2077-5335': 5, '0000-0003-4912-9879': 3, '0000-0002-5939-0010': 2, '0000-0002-2431-3197': 2, '0000-0002-1468-6686': 2, '0000-0001-5858-6161': 2, '0000-0002-5906-7694': 1, '0000-0002-2352-8647': 1, '0000-0003-1137-2099': 1, '0000-0003-0822-4458': 1})
['0000-0001-9776-395X', '0000-0002-4423-4381', '0000-0003-0125-4267', '0000-0002-2583-228X', '0000-0002-7529-4408', '0000-0001-7112-88

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(788, 288)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(788, 20)
2
(788, 308)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.11      0.34      0.17        41
          1       0.47      0.57      0.52        28
          2       0.58      0.50      0.54        14
          3       0.45      0.45      0.45        38
          4       0.48      0.62      0.54        42
          5       0.44      0.24      0.31        17
          6       0.58      0.52      0.55        21
          7       0.35      0.41      0.38        37
          8       0.50      0.17      0.25        12
          9       0.48      0.55      0.51        22
         10       0.44      0.62      0.51        50
         11       0.20      0.19      0.19        27
         12       0.35      0.33      0.34        18
         13       0.36      0.25      0.30        16
         14       0.35      0.33      0.34        18
         15       0.00      0.00      0.00        11
         16       0.57      0.53      0.55        15
         17       0.50      0.23      0.32   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


133
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(133, 58)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(133, 16)
2
(133, 74)
             precision    recall  f1-score   support

          0    

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.67      0.18      0.29        11
          1       0.80      0.97      0.88        36

avg / total       0.77      0.79      0.74        47

[ 2  9  1 35]
svc Accuracy:  0.7872340425531915
svc F1:  0.5803571428571429
             precision    recall  f1-score   support

          0       1.00      0.09      0.17        11
          1       0.78      1.00      0.88        36

avg / total       0.83      0.79      0.71        47

[ 1 10  0 36]
LR Accuracy:  0.7872340425531915
LR F1:  0.5223577235772359
For name:  j_christensen
total sample size before apply threshold:  203
Counter({'0000-0002-4299-9479': 100, '0000-0003-1414-1886': 53, '0000-0002-7641-8302': 32, '0000-0002-6741-5839': 13, '0000-0002-2689-1169': 1, '0000-0002-9231-8029': 1, '0000-0003-4225-3359': 1, '0000-0003-2370-2702': 1, '0000-0002-2495-8905': 1})
['0000-0002-7641-8302', '0000-0002-4299-9479', '0000-0002-6741-5839', '0000-0003-1414-1886']
Total 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(31, 25)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(31, 10)
2
(31, 35)
             precision    recall  f1-score   support

          0       0.73

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.88      0.44      0.58        16
          1       0.93      0.99      0.96       128

avg / total       0.93      0.93      0.92       144

[  7   9   1 127]
svc Accuracy:  0.9305555555555556
svc F1:  0.7727272727272727
             precision    recall  f1-score   support

          0       1.00      0.19      0.32        16
          1       0.91      1.00      0.95       128

avg / total       0.92      0.91      0.88       144

[  3  13   0 128]
LR Accuracy:  0.9097222222222222
LR F1:  0.6337311680688711
For name:  a_das
total sample size before apply threshold:  74
Counter({'0000-0002-0883-1816': 14, '0000-0002-7033-1441': 10, '0000-0003-0740-8140': 8, '0000-0001-5924-4235': 6, '0000-0001-7383-9606': 5, '0000-0002-5196-9589': 5, '0000-0002-7510-1805': 5, '0000-0003-1801-7487': 4, '0000-0002-1733-626X': 3, '0000-0003-0616-9715': 3, '0000-0002-7473-6139': 2, '0000-0003-4305-6007': 2, '0000-0002-2101-9056': 2, 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.53      0.87      0.66        54
          1       0.75      0.40      0.52        30
          2       1.00      0.92      0.96        25
          3       0.50      0.14      0.22        14
          4       0.29      0.12      0.17        16

avg / total       0.63      0.62      0.58       139

[47  2  0  2  3 17 12  0  0  1  2  0 23  0  0 10  1  0  2  1 13  1  0  0
  2]
svc Accuracy:  0.6187050359712231
svc F1:  0.5067100773622513
             precision    recall  f1-score   support

          0       0.48      0.87      0.62        54
          1       0.62      0.27      0.37        30
          2       1.00      0.88      0.94        25
          3       0.00      0.00      0.00        14
          4       0.20      0.06      0.10        16

avg / total       0.52      0.56      0.50       139

[47  3  0  2  2 21  8  0  0  1  3  0 22  0  0 12  1  0  0  1 14  1  0  0
  1]
LR Accuracy:  0.5611510791366906
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(33, 24)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(33, 10)
2
(33, 34)
             precision    recall  f1-score   support

          0       0.80

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.89      0.77      0.83        22
          1       0.79      0.94      0.86        33
          2       1.00      0.80      0.89        15
          3       1.00      1.00      1.00        49

avg / total       0.92      0.92      0.92       119

[17  5  0  0  2 31  0  0  0  3 12  0  0  0  0 49]
svc Accuracy:  0.9159663865546218
svc F1:  0.8948170731707318
             precision    recall  f1-score   support

          0       0.88      0.64      0.74        22
          1       0.76      0.94      0.84        33
          2       0.85      0.73      0.79        15
          3       1.00      1.00      1.00        49

avg / total       0.89      0.88      0.88       119

[14  7  1  0  1 31  1  0  1  3 11  0  0  0  0 49]
LR Accuracy:  0.8823529411764706
LR F1:  0.8400985572038203
For name:  y_nishikawa
total sample size before apply threshold:  21
Counter({'0000-0002-0739-8491': 10, '0000-0003-3313-1990': 8, '0000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.82      0.51      0.63        35
          1       0.53      0.97      0.69        63
          2       0.00      0.00      0.00        13
          3       0.00      0.00      0.00        11
          4       1.00      0.43      0.60        30

avg / total       0.61      0.61      0.55       152

[18 15  2  0  0  2 61  0  0  0  2 11  0  0  0  0 11  0  0  0  0 17  0  0
 13]
LR Accuracy:  0.6052631578947368
LR F1:  0.38432467371721696
For name:  v_fernandes
total sample size before apply threshold:  55
Counter({'0000-0001-6060-9035': 17, '0000-0002-3873-2034': 16, '0000-0003-3979-7523': 15, '0000-0002-9671-3923': 6, '0000-0003-0568-2920': 1})
['0000-0003-3979-7523', '0000-0002-3873-2034', '0000-0001-6060-9035']
Total sample size after apply threshold:  48
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        low

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.85      0.93      0.89        30
          1       0.85      0.69      0.76        16

avg / total       0.85      0.85      0.84        46

[28  2  5 11]
MNB Accuracy:  0.8478260869565217
MNB F1:  0.8237547892720307
             precision    recall  f1-score   support

          0       0.85      0.93      0.89        30
          1       0.85      0.69      0.76        16

avg / total       0.85      0.85      0.84        46

[28  2  5 11]
svc Accuracy:  0.8478260869565217
svc F1:  0.8237547892720307
             precision    recall  f1-score   support

          0       0.80      0.93      0.86        30
          1       0.82      0.56      0.67        16

avg / total       0.81      0.80      0.79        46

[28  2  7  9]
LR Accuracy:  0.8043478260869565
LR F1:  0.764102564102564
For name:  j_petersen
total sample size before apply threshold:  41
Counter({'0000-0001-9615-1310': 12, '0000-0001-6116-5114': 10,

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(91, 31)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(91, 18)
2
(91, 49)
             precision    recall  f1-score   support

          0       0.90

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.88      1.00      0.93        21
          1       0.60      0.83      0.70        18
          2       0.00      0.00      0.00        12

avg / total       0.57      0.71      0.63        51

[21  0  0  1 15  2  2 10  0]
LR Accuracy:  0.7058823529411765
LR F1:  0.5436692506459949
For name:  g_coppola
total sample size before apply threshold:  142
Counter({'0000-0002-9574-0081': 61, '0000-0002-8510-6925': 57, '0000-0003-0147-6142': 16, '0000-0003-2675-783X': 7, '0000-0001-7139-3719': 1})
['0000-0002-9574-0081', '0000-0002-8510-6925', '0000-0003-0147-6142']
Total sample size after apply threshold:  134
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, st

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.69      0.64      0.67        14
          1       0.69      0.73      0.71        15

avg / total       0.69      0.69      0.69        29

[ 9  5  4 11]
LR Accuracy:  0.6896551724137931
LR F1:  0.6881720430107526
For name:  m_ramos
total sample size before apply threshold:  251
Counter({'0000-0002-7554-8324': 187, '0000-0002-8950-2079': 22, '0000-0003-3230-8045': 13, '0000-0002-2157-9774': 8, '0000-0001-6176-5048': 7, '0000-0001-8849-6386': 3, '0000-0001-5224-5665': 3, '0000-0002-2582-7616': 2, '0000-0001-5832-0945': 1, '0000-0001-6594-6591': 1, '0000-0001-6821-3692': 1, '0000-0002-3117-4498': 1, '0000-0003-1133-4164': 1, '0000-0002-9480-782X': 1})
['0000-0002-8950-2079', '0000-0003-3230-8045', '0000-0002-7554-8324']
Total sample size after apply threshold:  222
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.27      0.43        22
          1       0.75      0.23      0.35        13
          2       0.88      0.99      0.93       187

avg / total       0.88      0.88      0.85       222

[  6   0  16   0   3  10   0   1 186]
LR Accuracy:  0.8783783783783784
LR F1:  0.571281144036562
For name:  j_tsai
total sample size before apply threshold:  153
Counter({'0000-0003-2723-6841': 83, '0000-0002-8657-3744': 38, '0000-0002-5227-8894': 16, '0000-0001-5202-722X': 7, '0000-0002-8666-2739': 5, '0000-0002-5332-2818': 2, '0000-0003-1693-9437': 1, '0000-0003-4921-3982': 1})
['0000-0003-2723-6841', '0000-0002-8657-3744', '0000-0002-5227-8894']
Total sample size after apply threshold:  137
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(29, 19)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(29, 10)
2
(29, 29)
             precision    recall  f1-score   support

          0       1.00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  262
Counter({'0000-0002-7198-8621': 202, '0000-0002-9161-8070': 39, '0000-0001-9028-5481': 20, '0000-0002-5031-736X': 1})
['0000-0002-9161-8070', '0000-0002-7198-8621', '0000-0001-9028-5481']
Total sample size after apply threshold:  261
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(261, 106)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocesso

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(238, 79)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(238, 18)
2
(238, 97)
             precision    recall  f1-score   support

          0       1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.91      0.83      0.87        64
          1       0.61      0.48      0.54        29
          2       0.61      0.81      0.70        37

avg / total       0.76      0.75      0.75       130

[53  4  7  3 14 12  2  5 30]
svc Accuracy:  0.7461538461538462
svc F1:  0.701662805360861
             precision    recall  f1-score   support

          0       0.69      0.89      0.78        64
          1       0.70      0.48      0.57        29
          2       0.67      0.49      0.56        37

avg / total       0.68      0.68      0.67       130

[57  3  4 10 14  5 16  3 18]
LR Accuracy:  0.6846153846153846
LR F1:  0.6364795918367346
For name:  c_cao
total sample size before apply threshold:  74
Counter({'0000-0003-2139-1648': 25, '0000-0003-2830-4383': 20, '0000-0001-8621-8403': 19, '0000-0002-0320-1110': 5, '0000-0002-3407-7837': 4, '0000-0001-6909-5739': 1})
['0000-0003-2830-4383', '0000-0003-2139-1648', '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(335, 176)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(335, 30)
2
(335, 206)
             precision    recall  f1-score   support

          0      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.57      0.86      0.69        85
          1       0.80      0.36      0.50        11
          2       0.85      0.67      0.75        49
          3       0.93      0.64      0.76        44
          4       0.58      0.54      0.56        28
          5       0.86      0.46      0.60        13
          6       0.88      0.64      0.74        33
          7       0.46      0.53      0.49        60
          8       0.17      0.08      0.11        12

avg / total       0.67      0.64      0.63       335

[73  0  1  0  1  1  0  9  0  1  4  1  0  1  0  0  2  2  7  0 33  0  1  0
  0  8  0  7  0  0 28  0  0  1  6  2  6  0  1  0 15  0  0  6  0  4  0  1
  0  0  6  0  2  0 10  0  0  0  0  0 21  2  0 17  0  2  0  6  0  2 32  1
  3  1  0  2  2  0  0  3  1]
svc Accuracy:  0.6358208955223881
svc F1:  0.5764465811537716
             precision    recall  f1-score   support

          0       0.48      0.89      0.62        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


90
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(90, 41)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(90, 19)
2
(90, 60)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.79      0.88      0.83        59
          1       0.82      0.43      0.56        21
          2       0.93      0.95      0.94       129

avg / total       0.88      0.88      0.87       209

[ 52   1   6   9   9   3   5   1 123]
svc Accuracy:  0.8803827751196173
svc F1:  0.7790095785440613
             precision    recall  f1-score   support

          0       0.84      0.80      0.82        59
          1       1.00      0.05      0.09        21
          2       0.83      0.98      0.90       129

avg / total       0.85      0.83      0.79       209

[ 47   0  12   6   1  14   3   0 126]
LR Accuracy:  0.8325358851674641
LR F1:  0.6016991827606094
For name:  p_antunes
total sample size before apply threshold:  41
Counter({'0000-0002-3553-2678': 25, '0000-0003-3324-4151': 10, '0000-0001-9129-3539': 5, '0000-0003-1969-1860': 1})
['0000-0002-3553-2678', '0000-0003-3324-4151']
Total sample size after apply thresh

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(536, 22)
2
(536, 249)
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        22
          1       0.00      0.00      0.00        17
          2       0.00      0.00      0.00        22
          3       0.00      0.00      0.00        15
          4       0.00      0.00      0.00        34
          5       0.00      0.00      0.00        23
          6       0.00      0.00      0.00        14
          7       0.00      0.00      0.00        11
          8       1.00      0.15      0.26        27
          9       0.64      0.80      0.71        95
         10       0.31      0.94      0.46       109
         11       0.00      0.00      0.00        23
         12       0.63      0.75      0.69        48
         13       0.00      0.00      0.00        10
         14       1.00      0.22      0.36        27
         15       0.94      0.44      0.60        39

avg / total       0.40      0.45      0.36       536

[  0   0   0   0   0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.83      0.45      0.59        22
          1       0.67      0.24      0.35        17
          2       0.28      0.23      0.25        22
          3       0.73      0.53      0.62        15
          4       0.08      0.12      0.10        34
          5       0.50      0.09      0.15        23
          6       0.75      0.21      0.33        14
          7       0.60      0.27      0.37        11
          8       0.89      0.59      0.71        27
          9       0.76      0.78      0.77        95
         10       0.45      0.90      0.60       109
         11       0.50      0.09      0.15        23
         12       1.00      0.85      0.92        48
         13       1.00      0.30      0.46        10
         14       0.76      0.59      0.67        27
         15       0.96      0.59      0.73        39

avg / total       0.65      0.58      0.57       536

[10  0  3  0  2  0  0  0  0  3  4  0  0  0 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.80      0.62      0.70        13
          1       0.75      0.95      0.84        38
          2       0.33      0.10      0.15        10

avg / total       0.69      0.74      0.70        61

[ 8  4  1  1 36  1  1  8  1]
MNB Accuracy:  0.7377049180327869
MNB F1:  0.5622358766949263
             precision    recall  f1-score   support

          0       0.82      0.69      0.75        13
          1       0.80      0.95      0.87        38
          2       0.60      0.30      0.40        10

avg / total       0.77      0.79      0.77        61

[ 9  3  1  1 36  1  1  6  3]
svc Accuracy:  0.7868852459016393
svc F1:  0.6724899598393574
             precision    recall  f1-score   support

          0       1.00      0.54      0.70        13
          1       0.70      0.97      0.81        38
          2       0.00      0.00      0.00        10

avg / total       0.65      0.72      0.66        61

[ 7  6  0  0 3

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.88      0.41      0.56        17
          1       0.89      0.62      0.73        64
          2       0.59      0.92      0.71        71
          3       0.92      0.73      0.81        15
          4       0.87      0.76      0.81        70

avg / total       0.79      0.74      0.74       237

[ 7  0  7  0  3  0 40 21  1  2  1  3 65  0  2  0  1  2 11  1  0  1 16  0
 53]
svc Accuracy:  0.7426160337552743
svc F1:  0.7264411577144962
             precision    recall  f1-score   support

          0       1.00      0.35      0.52        17
          1       0.72      0.67      0.69        64
          2       0.59      0.68      0.63        71
          3       1.00      0.73      0.85        15
          4       0.65      0.73      0.69        70

avg / total       0.70      0.67      0.67       237

[ 6  0  4  0  7  0 43 14  0  7  0 11 48  0 12  0  1  2 11  1  0  5 14  0
 51]
LR Accuracy:  0.6708860759493671
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(125, 84)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(125, 25)
2
(125, 109)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.83      1.00      0.91        34
          1       1.00      0.59      0.74        17

avg / total       0.89      0.86      0.85        51

[34  0  7 10]
MNB Accuracy:  0.8627450980392157
MNB F1:  0.8237037037037037
             precision    recall  f1-score   support

          0       0.85      1.00      0.92        34
          1       1.00      0.65      0.79        17

avg / total       0.90      0.88      0.87        51

[34  0  6 11]
svc Accuracy:  0.8823529411764706
svc F1:  0.8523166023166023
             precision    recall  f1-score   support

          0       0.83      1.00      0.91        34
          1       1.00      0.59      0.74        17

avg / total       0.89      0.86      0.85        51

[34  0  7 10]
LR Accuracy:  0.8627450980392157
LR F1:  0.8237037037037037
For name:  a_norman
total sample size before apply threshold:  28
Counter({'0000-0002-1282-394X': 16, '0000-0002-4208-2708': 4, '

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.20      0.33        10
          1       0.80      0.53      0.64        15
          2       0.55      0.72      0.62        47
          3       0.56      0.89      0.69        66
          4       0.67      0.36      0.47        11
          5       1.00      0.64      0.78        25
          6       0.75      0.43      0.55        14
          7       1.00      0.89      0.94        27
          8       0.00      0.00      0.00        10
          9       0.92      0.46      0.61        24

avg / total       0.71      0.66      0.64       249

[ 2  1  1  6  0  0  0  0  0  0  0  8  5  2  0  0  0  0  0  0  0  1 34  9
  0  0  0  0  3  0  0  0  4 59  0  0  1  0  1  1  0  0  1  5  4  0  1  0
  0  0  0  0  4  5  0 16  0  0  0  0  0  0  2  4  2  0  6  0  0  0  0  0
  1  2  0  0  0 24  0  0  0  0  8  2  0  0  0  0  0  0  0  0  2 11  0  0
  0  0  0 11]
svc Accuracy:  0.6586345381526104
svc F1:  0.5636063191

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.84      0.96      0.90        28
          1       0.86      0.55      0.67        11

avg / total       0.85      0.85      0.83        39

[27  1  5  6]
LR Accuracy:  0.8461538461538461
LR F1:  0.7833333333333332
For name:  d_morgan
total sample size before apply threshold:  86
Counter({'0000-0002-2291-1740': 50, '0000-0002-7410-6591': 27, '0000-0001-8725-9477': 7, '0000-0001-7403-4586': 1, '0000-0002-4911-0046': 1})
['0000-0002-2291-1740', '0000-0002-7410-6591']
Total sample size after apply threshold:  77
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.67      0.31      0.42        13
          1       0.86      0.63      0.73        19
          2       0.87      0.97      0.92       587
          3       0.83      1.00      0.91        10
          4       0.74      0.67      0.71        43
          5       1.00      0.15      0.27        13
          6       1.00      0.21      0.35        19
          7       0.60      0.10      0.17        31

avg / total       0.86      0.86      0.84       735

[  4   0   8   1   0   0   0   0   0  12   7   0   0   0   0   0   2   2
 571   0  10   0   0   2   0   0   0  10   0   0   0   0   0   0  14   0
  29   0   0   0   0   0  11   0   0   2   0   0   0   0  14   1   0   0
   4   0   0   0  28   0   0   0   0   3]
svc Accuracy:  0.8639455782312925
svc F1:  0.5583575629173319
             precision    recall  f1-score   support

          0       1.00      0.23      0.38        13
          1       1.00      0.26     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.33      0.50        12
          1       0.78      1.00      0.88        28

avg / total       0.84      0.80      0.76        40

[ 4  8  0 28]
svc Accuracy:  0.8
svc F1:  0.6875
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.70      1.00      0.82        28

avg / total       0.49      0.70      0.58        40

[ 0 12  0 28]
LR Accuracy:  0.7
LR F1:  0.4117647058823529
For name:  a_grant
total sample size before apply threshold:  45
Counter({'0000-0002-1147-2375': 22, '0000-0001-6146-101X': 9, '0000-0001-7205-5869': 7, '0000-0002-7032-3716': 4, '0000-0001-9746-2989': 2, '0000-0002-1553-596X': 1})
['0000-0002-1147-2375']
Total sample size after apply threshold:  22
For name:  v_kumar
total sample size before apply threshold:  98
Counter({'0000-0003-3522-1121': 18, '0000-0001-6643-7465': 15, '0000-0002-9795-5967': 15, '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.86      0.60      0.71        10
          1       0.82      1.00      0.90        59
          2       1.00      0.71      0.83        17
          3       0.75      0.64      0.69        28
          4       1.00      0.94      0.97        17

avg / total       0.85      0.85      0.84       131

[ 6  0  0  4  0  0 59  0  0  0  0  4 12  1  0  1  9  0 18  0  0  0  0  1
 16]
svc Accuracy:  0.8473282442748091
svc F1:  0.8192473161242033
             precision    recall  f1-score   support

          0       0.50      0.20      0.29        10
          1       0.74      0.98      0.85        59
          2       0.90      0.53      0.67        17
          3       0.64      0.57      0.60        28
          4       1.00      0.82      0.90        17

avg / total       0.76      0.76      0.74       131

[ 2  1  0  7  0  0 58  1  0  0  0  7  9  1  0  1 11  0 16  0  1  1  0  1
 14]
LR Accuracy:  0.7557251908396947
L

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.20      0.33        15
          1       1.00      0.27      0.43        11
          2       0.00      0.00      0.00        10
          3       0.67      0.25      0.36        16
          4       0.67      0.99      0.80        83

avg / total       0.68      0.68      0.61       135

[ 3  0  0  0 12  0  3  0  0  8  0  0  0  2  8  0  0  0  4 12  0  0  1  0
 82]
LR Accuracy:  0.6814814814814815
LR F1:  0.3851082251082251
For name:  c_baker
total sample size before apply threshold:  112
Counter({'0000-0001-6861-8964': 49, '0000-0002-4434-3107': 36, '0000-0001-9134-2994': 10, '0000-0002-7622-1251': 6, '0000-0002-9391-2468': 5, '0000-0002-1171-563X': 3, '0000-0002-2675-1078': 2, '0000-0002-6274-0579': 1})
['0000-0001-9134-2994', '0000-0001-6861-8964', '0000-0002-4434-3107']
Total sample size after apply threshold:  95
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        d

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(188, 98)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(188, 31)
2
(188, 129)
             precision    recall  f1-score   support

          0       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.63      0.60      0.62        81
          1       1.00      0.05      0.10        20
          2       0.46      0.62      0.53        61

avg / total       0.61      0.54      0.52       162

[49  0 32  6  1 13 23  0 38]
MNB Accuracy:  0.5432098765432098
MNB F1:  0.41312269142457825
             precision    recall  f1-score   support

          0       0.63      0.77      0.69        81
          1       1.00      0.40      0.57        20
          2       0.60      0.54      0.57        61

avg / total       0.66      0.64      0.63       162

[62  0 19  9  8  3 28  0 33]
svc Accuracy:  0.6358024691358025
svc F1:  0.6097609925196132
             precision    recall  f1-score   support

          0       0.59      0.79      0.68        81
          1       0.00      0.00      0.00        20
          2       0.54      0.48      0.50        61

avg / total       0.50      0.57      0.53       162

[64  0 17 12 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.91      0.91      0.91        23
          1       0.86      0.86      0.86        14

avg / total       0.89      0.89      0.89        37

[21  2  2 12]
svc Accuracy:  0.8918918918918919
svc F1:  0.8850931677018633
             precision    recall  f1-score   support

          0       0.84      0.91      0.87        23
          1       0.83      0.71      0.77        14

avg / total       0.84      0.84      0.83        37

[21  2  4 10]
LR Accuracy:  0.8378378378378378
LR F1:  0.8221153846153846
For name:  m_ferrari
total sample size before apply threshold:  150
Counter({'0000-0002-3041-2917': 74, '0000-0002-7579-4031': 25, '0000-0002-2986-1272': 22, '0000-0001-6370-605X': 12, '0000-0003-3723-5957': 6, '0000-0001-8535-7348': 5, '0000-0002-7447-6146': 2, '0000-0003-0990-0403': 1, '0000-0003-0283-4263': 1, '0000-0001-7009-6552': 1, '0000-0002-3310-7715': 1})
['0000-0002-2986-1272', '0000-0002-3041-2917', '0000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        22
          1       0.61      0.97      0.75        74
          2       1.00      0.17      0.29        12
          3       0.82      0.36      0.50        25

avg / total       0.58      0.62      0.54       133

[ 0 22  0  0  2 72  0  0  0  8  2  2  0 16  0  9]
LR Accuracy:  0.6240601503759399
LR F1:  0.38392857142857145
For name:  j_paredes
total sample size before apply threshold:  68
Counter({'0000-0002-1076-1343': 44, '0000-0002-7788-8939': 9, '0000-0002-0974-8109': 7, '0000-0002-1566-9044': 5, '0000-0002-0620-0770': 3})
['0000-0002-1076-1343']
Total sample size after apply threshold:  44
For name:  z_zhao
total sample size before apply threshold:  186
Counter({'0000-0003-0654-1193': 79, '0000-0003-2743-9008': 28, '0000-0002-1279-2207': 15, '0000-0002-1876-1284': 15, '0000-0001-6079-1631': 14, '0000-0002-1701-3751': 7, '0000-0002-0862-8471': 6, '0000-0002-2901-5033': 6, '000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.33      0.50        15
          1       0.56      0.18      0.27        28
          2       0.60      0.21      0.32        14
          3       0.58      0.96      0.73        79
          4       0.50      0.07      0.12        15

avg / total       0.61      0.60      0.52       151

[ 5  0  0 10  0  0  5  0 23  0  0  1  3  9  1  0  2  1 76  0  0  1  1 12
  1]
LR Accuracy:  0.5960264900662252
LR F1:  0.3861959060101475
For name:  j_cao
total sample size before apply threshold:  39
Counter({'0000-0002-3586-2319': 11, '0000-0002-1544-7441': 10, '0000-0001-5938-6604': 8, '0000-0001-5196-8239': 5, '0000-0001-7414-7660': 4, '0000-0001-6171-1170': 1})
['0000-0002-3586-2319', '0000-0002-1544-7441']
Total sample size after apply threshold:  21
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercas

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.62      0.83      0.71        18
          1       0.62      0.36      0.45        14

avg / total       0.62      0.62      0.60        32

[15  3  9  5]
MNB Accuracy:  0.625
MNB F1:  0.5844155844155844
             precision    recall  f1-score   support

          0       0.76      0.72      0.74        18
          1       0.67      0.71      0.69        14

avg / total       0.72      0.72      0.72        32

[13  5  4 10]
svc Accuracy:  0.71875
svc F1:  0.7162561576354679
             precision    recall  f1-score   support

          0       0.67      0.78      0.72        18
          1       0.64      0.50      0.56        14

avg / total       0.65      0.66      0.65        32

[14  4  7  7]
LR Accuracy:  0.65625
LR F1:  0.6389743589743591
For name:  m_o'brien
total sample size before apply threshold:  34
Counter({'0000-0002-8509-3650': 20, '0000-0002-1721-0464': 9, '0000-0003-1096-1991': 4, '0000-000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.86      0.48      0.62        25
          1       0.82      0.37      0.51        38
          2       1.00      0.59      0.74        22
          3       0.78      0.75      0.76       180
          4       0.64      0.91      0.75       265
          5       0.54      0.20      0.29        71
          6       0.43      0.10      0.17        29

avg / total       0.69      0.69      0.66       630

[ 12   0   0   0  12   0   1   0  14   0  19   5   0   0   0   0  13   1
   8   0   0   0   3   0 135  41   0   1   0   0   0  11 242  11   1   0
   0   0   6  50  14   1   2   0   0   2  21   1   3]
svc Accuracy:  0.6873015873015873
svc F1:  0.5481319696073716
             precision    recall  f1-score   support

          0       1.00      0.32      0.48        25
          1       0.71      0.13      0.22        38
          2       1.00      0.50      0.67        22
          3       0.70      0.71      0.70   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.89      0.98      0.93       268
          1       0.63      0.39      0.48        44
          2       0.00      0.00      0.00        10

avg / total       0.83      0.87      0.84       322

[262   6   0  27  17   0   6   4   0]
MNB Accuracy:  0.8664596273291926
MNB F1:  0.46986716033322495
             precision    recall  f1-score   support

          0       0.90      0.99      0.94       268
          1       0.81      0.50      0.62        44
          2       1.00      0.20      0.33        10

avg / total       0.89      0.89      0.88       322

[264   4   0  22  22   0   7   1   2]
svc Accuracy:  0.8944099378881988
svc F1:  0.6314093712602412
             precision    recall  f1-score   support

          0       0.87      1.00      0.93       268
          1       1.00      0.30      0.46        44
          2       0.00      0.00      0.00        10

avg / total       0.86      0.87      0.84       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.87      1.00      0.93       212
          1       1.00      0.11      0.20        36

avg / total       0.89      0.87      0.82       248

[212   0  32   4]
LR Accuracy:  0.8709677419354839
LR F1:  0.5649122807017544
For name:  s_oliveira
total sample size before apply threshold:  143
Counter({'0000-0003-4984-4805': 48, '0000-0002-6011-2122': 25, '0000-0001-7919-4191': 23, '0000-0001-8240-0013': 17, '0000-0002-6914-5529': 8, '0000-0002-7322-1184': 8, '0000-0003-0649-2694': 4, '0000-0002-7654-1909': 4, '0000-0002-3504-5749': 3, '0000-0002-8901-9757': 2, '0000-0002-3840-6781': 1})
['0000-0003-4984-4805', '0000-0001-7919-4191', '0000-0002-6011-2122', '0000-0001-8240-0013']
Total sample size after apply threshold:  113
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=N

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


56
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(56, 29)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(56, 15)
2
(56, 44)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.38      0.18      0.24        17
          1       0.80      0.40      0.53        10
          2       0.61      0.95      0.74        44
          3       1.00      0.27      0.42        15

avg / total       0.65      0.62      0.56        86

[ 3  1 13  0  1  4  5  0  2  0 42  0  2  0  9  4]
LR Accuracy:  0.6162790697674418
LR F1:  0.4844371991926719
For name:  m_cruz
total sample size before apply threshold:  141
Counter({'0000-0001-9759-5466': 57, '0000-0001-9846-6754': 46, '0000-0003-1822-0514': 30, '0000-0002-4767-530X': 3, '0000-0001-8152-3054': 3, '0000-0003-3311-7582': 2})
['0000-0001-9846-6754', '0000-0001-9759-5466', '0000-0003-1822-0514']
Total sample size after apply threshold:  133
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.90      0.95      0.93       110
          1       0.50      0.29      0.37        17

avg / total       0.84      0.87      0.85       127

[105   5  12   5]
svc Accuracy:  0.8661417322834646
svc F1:  0.6477402512644803
             precision    recall  f1-score   support

          0       0.88      1.00      0.94       110
          1       1.00      0.12      0.21        17

avg / total       0.90      0.88      0.84       127

[110   0  15   2]
LR Accuracy:  0.8818897637795275
LR F1:  0.5733482642777156
For name:  b_white
total sample size before apply threshold:  47
Counter({'0000-0002-4293-6128': 29, '0000-0002-0684-5210': 7, '0000-0003-3365-939X': 7, '0000-0002-7477-9956': 3, '0000-0003-4191-3511': 1})
['0000-0002-4293-6128']
Total sample size after apply threshold:  29
For name:  p_graham
total sample size before apply threshold:  89
Counter({'0000-0002-3745-0940': 33, '0000-0003-2890-2447': 27, '0000-00

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


32
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(32, 21)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(32, 15)
2
(32, 36)
             precision    recall  f1-score   support

          0       0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


total sample size before apply threshold:  2116
Counter({'0000-0003-1835-9436': 200, '0000-0003-3477-1172': 146, '0000-0003-1232-5307': 124, '0000-0001-6537-0350': 78, '0000-0003-0934-3344': 73, '0000-0001-7964-106X': 56, '0000-0003-2337-6935': 52, '0000-0003-2068-7287': 51, '0000-0002-3573-638X': 46, '0000-0003-4085-293X': 41, '0000-0002-6349-6950': 41, '0000-0002-6931-8581': 38, '0000-0002-4171-3803': 38, '0000-0003-0373-5080': 36, '0000-0002-1299-4300': 36, '0000-0002-8383-8524': 33, '0000-0002-0087-1151': 32, '0000-0002-3500-7494': 32, '0000-0002-4687-6732': 31, '0000-0001-5979-5774': 30, '0000-0001-9660-6303': 29, '0000-0002-1903-8354': 28, '0000-0002-5390-8763': 27, '0000-0003-0767-1918': 26, '0000-0002-4747-9763': 25, '0000-0003-0103-7457': 24, '0000-0003-4035-0438': 23, '0000-0003-2841-147X': 23, '0000-0003-0693-1415': 23, '0000-0002-3566-3379': 19, '0000-0003-4978-1867': 18, '0000-0002-9570-4216': 18, '0000-0001-5080-7097': 17, '0000-0002-1672-5730': 17, '0000-0002-9159-0733':

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       0.00      0.00      0.00        29
          2       0.00      0.00      0.00        27
          3       0.00      0.00      0.00        16
          4       0.00      0.00      0.00        10
          5       0.68      0.38      0.48        56
          6       0.00      0.00      0.00        14
          7       0.00      0.00      0.00        12
          8       0.00      0.00      0.00        17
          9       0.00      0.00      0.00        24
         10       0.39      0.60      0.47       124
         11       0.00      0.00      0.00        11
         12       0.00      0.00      0.00        31
         13       0.00      0.00      0.00        32
         14       0.44      0.30      0.36        73
         15       0.00      0.00      0.00        32
         16       0.00      0.00      0.00        18
         17       0.92      0.26      0.41   

  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.58      0.74        12
          1       0.90      0.47      0.62        19
          2       0.30      0.23      0.26        13
          3       0.67      0.20      0.31        30
          4       0.88      0.58      0.70        36
          5       0.75      0.98      0.85       194
          6       0.70      0.56      0.62        34
          7       1.00      0.50      0.67        10

avg / total       0.76      0.75      0.72       348

[  7   0   0   0   0   5   0   0   0   9   1   0   0   9   0   0   0   1
   3   2   0   5   2   0   0   0   4   6   1  16   3   0   0   0   1   0
  21  13   1   0   0   0   0   0   2 191   1   0   0   0   1   1   0  13
  19   0   0   0   0   0   0   4   1   5]
svc Accuracy:  0.75
svc F1:  0.5955750010716196
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        12
          1       1.00      0.32      0.48        1

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.80      0.62      0.70        26
          1       0.81      0.96      0.88        81
          2       1.00      0.18      0.31        11

avg / total       0.83      0.81      0.79       118

[16 10  0  3 78  0  1  8  2]
svc Accuracy:  0.8135593220338984
svc F1:  0.6282334712695803
             precision    recall  f1-score   support

          0       0.90      0.35      0.50        26
          1       0.75      1.00      0.86        81
          2       0.00      0.00      0.00        11

avg / total       0.71      0.76      0.70       118

[ 9 17  0  0 81  0  1 10  0]
LR Accuracy:  0.7627118644067796
LR F1:  0.4523809523809524
For name:  a_reynolds
total sample size before apply threshold:  40
Counter({'0000-0002-0836-746X': 23, '0000-0001-9534-8699': 7, '0000-0002-6768-5716': 5, '0000-0002-9919-4161': 3, '0000-0003-0554-8107': 1, '0000-0002-6364-6250': 1})
['0000-0002-0836-746X']
Total sample size after a

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.79      1.00      0.88        83
          1       1.00      0.42      0.59        19
          2       0.97      0.75      0.85        48

avg / total       0.88      0.85      0.83       150

[83  0  0 10  8  1 12  0 36]
svc Accuracy:  0.8466666666666667
svc F1:  0.7742100465087532
             precision    recall  f1-score   support

          0       0.70      0.99      0.82        83
          1       1.00      0.05      0.10        19
          2       0.94      0.62      0.75        48

avg / total       0.81      0.75      0.71       150

[82  0  1 17  1  1 18  0 30]
LR Accuracy:  0.7533333333333333
LR F1:  0.5566666666666666
For name:  r_nunes
total sample size before apply threshold:  46
Counter({'0000-0001-7425-5717': 28, '0000-0002-1377-9899': 13, '0000-0001-8633-4404': 3, '0000-0002-9014-0570': 2})
['0000-0002-1377-9899', '0000-0001-7425-5717']
Total sample size after apply threshold:  41
(0, 0)
Tfid

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(392, 163)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(392, 20)
2
(392, 183)
             precision    recall  f1-score   support

          0      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.37      0.54        19
          1       0.68      0.76      0.71       119
          2       0.97      0.58      0.73        48
          3       0.92      0.71      0.80        17
          4       0.73      0.47      0.57        17
          5       0.95      0.78      0.86        23
          6       0.54      0.85      0.66        85
          7       0.87      0.56      0.68        36
          8       0.46      0.39      0.42        28

avg / total       0.73      0.68      0.68       392

[ 7  5  0  0  1  0  6  0  0  0 90  0  0  0  0 16  0 13  0  6 28  0  0  0
 14  0  0  0  1  0 12  0  1  3  0  0  0  2  0  0  8  0  7  0  0  0  4  0
  1  0 18  0  0  0  0  8  1  0  1  0 72  3  0  0  4  0  0  1  0 11 20  0
  0 13  0  0  0  0  4  0 11]
svc Accuracy:  0.6785714285714286
svc F1:  0.6633538768976492
             precision    recall  f1-score   support

          0       1.00      0.05      0.10        

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.88      0.64      0.74        11
          1       0.89      0.62      0.73        13
          2       0.78      0.94      0.85        33

avg / total       0.82      0.81      0.80        57

[ 7  0  4  0  8  5  1  1 31]
LR Accuracy:  0.8070175438596491
LR F1:  0.771143300343012
For name:  h_lu
total sample size before apply threshold:  108
Counter({'0000-0003-1720-6526': 20, '0000-0002-8340-2739': 19, '0000-0003-2180-3091': 17, '0000-0003-4025-3160': 9, '0000-0001-9732-0833': 6, '0000-0002-1440-9902': 6, '0000-0002-3940-3283': 5, '0000-0002-0017-4276': 5, '0000-0003-3604-7145': 5, '0000-0002-6708-0223': 5, '0000-0002-0349-2181': 4, '0000-0002-9090-258X': 3, '0000-0002-5177-3391': 2, '0000-0002-6881-660X': 1, '0000-0002-9443-4031': 1})
['0000-0002-8340-2739', '0000-0003-1720-6526', '0000-0003-2180-3091']
Total sample size after apply threshold:  56
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_er

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.67      0.40      0.50        10
          1       0.86      1.00      0.92       252
          2       0.60      0.12      0.19        26
          3       0.67      0.12      0.21        16
          4       0.70      0.64      0.67        11

avg / total       0.82      0.85      0.81       315

[  4   3   2   0   1   0 251   0   1   0   2  19   3   0   2   0  14   0
   2   0   0   4   0   0   7]
svc Accuracy:  0.8476190476190476
svc F1:  0.4990469847761446
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.81      1.00      0.89       252
          2       0.00      0.00      0.00        26
          3       0.00      0.00      0.00        16
          4       0.00      0.00      0.00        11

avg / total       0.64      0.80      0.71       315

[  0  10   0   0   0   0 252   0   0   0   0  24   0   0   2   0  16   0
   0   0   0  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(191, 108)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(191, 23)
2
(191, 131)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.19      0.76      0.31        91
          1       0.00      0.00      0.00        14
          2       1.00      0.17      0.30        23
          3       0.20      0.04      0.07        25
          4       0.00      0.00      0.00        25
          5       0.00      0.00      0.00        19
          6       0.25      0.07      0.11        30
          7       0.00      0.00      0.00        11
          8       0.00      0.00      0.00        11
          9       1.00      0.67      0.80        15
         10       0.00      0.00      0.00        10
         11       0.60      0.10      0.17        31
         12       0.89      0.33      0.48        24
         13       0.00      0.00      0.00        26
         14       0.00      0.00      0.00        18
         15       1.00      0.20      0.33        30
         16       0.00      0.00      0.00        10
         17       1.00      0.08      0.14   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.96      0.83      0.89        60
          1       1.00      0.89      0.94        18
          2       0.82      0.98      0.89        97
          3       1.00      0.50      0.67        18

avg / total       0.90      0.88      0.88       193

[50  0 10  0  0 16  2  0  2  0 95  0  0  0  9  9]
svc Accuracy:  0.8808290155440415
svc F1:  0.848179764863692
             precision    recall  f1-score   support

          0       0.98      0.77      0.86        60
          1       1.00      0.56      0.71        18
          2       0.72      0.99      0.83        97
          3       1.00      0.17      0.29        18

avg / total       0.85      0.80      0.78       193

[46  0 14  0  0 10  8  0  1  0 96  0  0  0 15  3]
LR Accuracy:  0.8031088082901554
LR F1:  0.6736489232019504
For name:  m_aguilar
total sample size before apply threshold:  108
Counter({'0000-0002-1935-6619': 59, '0000-0001-7395-5754': 18, '0000-

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(61, 22)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(61, 19)
2
(61, 41)
             precision    recall  f1-score   support

          0       0.95

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.92      0.96        36
          1       0.81      0.94      0.87       116
          2       0.70      0.41      0.52        39

avg / total       0.82      0.83      0.81       191

[ 33   3   0   0 109   7   0  23  16]
svc Accuracy:  0.8272251308900523
svc F1:  0.7803922226009474
             precision    recall  f1-score   support

          0       1.00      0.78      0.88        36
          1       0.74      0.98      0.84       116
          2       0.62      0.13      0.21        39

avg / total       0.76      0.77      0.72       191

[ 28   7   1   0 114   2   0  34   5]
LR Accuracy:  0.7696335078534031
LR F1:  0.6430314569103138
For name:  y_yang
total sample size before apply threshold:  665
Counter({'0000-0002-8633-0873': 115, '0000-0002-6266-9864': 97, '0000-0003-1391-8040': 73, '0000-0001-8839-8161': 50, '0000-0002-6782-2813': 43, '0000-0001-7896-1184': 39, '0000-0002-3598-7218': 35, '0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        10
          1       0.00      0.00      0.00        16
          2       1.00      0.23      0.37        35
          3       0.63      0.44      0.52        43
          4       0.00      0.00      0.00        11
          5       0.39      0.59      0.47        97
          6       0.50      0.03      0.05        39
          7       0.00      0.00      0.00        12
          8       0.53      0.88      0.66       115
          9       0.31      0.45      0.37        73
         10       0.58      0.50      0.54        50
         11       0.00      0.00      0.00        26

avg / total       0.44      0.46      0.40       527

[  0   0   0   0   0   0   0   0   1   8   1   0   0   0   0   4   0   4
   0   0   2   4   2   0   0   0   8   1   0  13   0   0   7   5   1   0
   0   0   0  19   0  10   0   0   5   6   3   0   0   0   0   0   0   2
   0   0   5   4   0   0   0   0   0

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.87      0.81      0.84        88
          1       0.76      0.83      0.79        65

avg / total       0.82      0.82      0.82       153

[71 17 11 54]
MNB Accuracy:  0.8169934640522876
MNB F1:  0.8147058823529412
             precision    recall  f1-score   support

          0       0.82      0.92      0.87        88
          1       0.87      0.72      0.79        65

avg / total       0.84      0.84      0.83       153

[81  7 18 47]
svc Accuracy:  0.8366013071895425
svc F1:  0.8281130634071809
             precision    recall  f1-score   support

          0       0.79      0.91      0.85        88
          1       0.85      0.68      0.75        65

avg / total       0.82      0.81      0.81       153

[80  8 21 44]
LR Accuracy:  0.8104575163398693
LR F1:  0.7993487993487993
For name:  c_baptista
total sample size before apply threshold:  19
Counter({'0000-0002-1263-7880': 7, '0000-0002-8158-4743': 7, 

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        16
          1       0.00      0.00      0.00        27
          2       1.00      0.11      0.19        19
          3       0.00      0.00      0.00        15
          4       0.00      0.00      0.00        17
          5       1.00      0.05      0.09        22
          6       0.00      0.00      0.00        17
          7       1.00      0.60      0.75        40
          8       0.38      0.28      0.33        53
          9       0.50      0.58      0.54        64
         10       0.75      0.12      0.21        25
         11       1.00      0.35      0.52        31
         12       0.43      0.39      0.41        56
         13       0.29      0.85      0.43        98
         14       0.00      0.00      0.00        17
         15       0.00      0.00      0.00        14
         16       0.83      0.17      0.28        30
         17       0.30      0.63      0.41   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.90      0.69      0.78        13
          1       0.83      0.95      0.88        20

avg / total       0.86      0.85      0.84        33

[ 9  4  1 19]
LR Accuracy:  0.8484848484848485
LR F1:  0.8331648129423661
For name:  j_castro
total sample size before apply threshold:  39
Counter({'0000-0001-6169-3822': 15, '0000-0002-0382-553X': 10, '0000-0001-8984-475X': 7, '0000-0003-0794-3178': 3, '0000-0002-1939-7859': 2, '0000-0002-7468-5220': 1, '0000-0003-0868-1894': 1})
['0000-0001-6169-3822', '0000-0002-0382-553X']
Total sample size after apply threshold:  25
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       1.00      0.17      0.29        12
          1       0.75      0.64      0.69        33
          2       0.65      0.90      0.76        40

avg / total       0.74      0.69      0.66        85

[ 2  3  7  0 21 12  0  4 36]
MNB Accuracy:  0.6941176470588235
MNB F1:  0.5773778709067751
             precision    recall  f1-score   support

          0       0.57      0.33      0.42        12
          1       0.69      0.67      0.68        33
          2       0.67      0.78      0.72        40

avg / total       0.66      0.67      0.66        85

[ 4  3  5  1 22 10  2  7 31]
svc Accuracy:  0.6705882352941176
svc F1:  0.606301980353388
             precision    recall  f1-score   support

          0       1.00      0.17      0.29        12
          1       0.74      0.61      0.67        33
          2       0.64      0.90      0.75        40

avg / total       0.73      0.68      0.65        85

[ 2  3  7  0 20

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.53      0.83      0.65        24
          1       0.71      0.33      0.45        15
          2       1.00      0.92      0.96        12
          3       0.58      0.68      0.62        22
          4       0.62      0.45      0.53        11
          5       0.68      0.52      0.59        25

avg / total       0.66      0.63      0.62       109

[20  2  0  0  0  2  8  5  0  1  0  1  0  0 11  1  0  0  3  0  0 15  2  2
  2  0  0  3  5  1  5  0  0  6  1 13]
LR Accuracy:  0.6330275229357798
LR F1:  0.6330755607302074
For name:  i_martins
total sample size before apply threshold:  54
Counter({'0000-0002-9284-8599': 12, '0000-0002-0136-1671': 11, '0000-0002-8521-2613': 8, '0000-0002-5362-9801': 7, '0000-0001-6797-2558': 7, '0000-0002-3412-9377': 6, '0000-0003-0897-8807': 1, '0000-0003-4328-7286': 1, '0000-0003-3291-0079': 1})
['0000-0002-0136-1671', '0000-0002-9284-8599']
Total sample size after apply threshold:  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.75      0.21      0.33        14
          1       0.88      0.64      0.74        11
          2       0.76      0.64      0.69        39
          3       1.00      0.52      0.69        21
          4       0.88      0.64      0.74        11
          5       0.98      0.78      0.87        58
          6       0.82      0.99      0.90       222

avg / total       0.85      0.84      0.83       376

[  3   0   2   0   0   0   9   0   7   1   0   1   0   2   1   0  25   0
   0   0  13   0   0   1  11   0   0   9   0   1   0   0   7   0   3   0
   0   2   0   0  45  11   0   0   2   0   0   1 219]
svc Accuracy:  0.8430851063829787
svc F1:  0.7074125124707523
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        14
          1       1.00      0.45      0.62        11
          2       0.66      0.49      0.56        39
          3       1.00      0.29      0.44   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.96      1.00      0.98       267
          1       1.00      0.58      0.74        24

avg / total       0.97      0.97      0.96       291

[267   0  10  14]
svc Accuracy:  0.9656357388316151
svc F1:  0.8592298761609907
             precision    recall  f1-score   support

          0       0.92      1.00      0.96       267
          1       1.00      0.04      0.08        24

avg / total       0.93      0.92      0.89       291

[267   0  23   1]
LR Accuracy:  0.9209621993127147
LR F1:  0.5193536804308797
For name:  m_hartmann
total sample size before apply threshold:  88
Counter({'0000-0001-8069-5284': 28, '0000-0001-6937-5677': 25, '0000-0002-8207-3806': 21, '0000-0001-6046-0365': 10, '0000-0002-4774-2787': 4})
['0000-0001-6937-5677', '0000-0001-8069-5284', '0000-0002-8207-3806', '0000-0001-6046-0365']
Total sample size after apply threshold:  84
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_e

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(191, 106)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(191, 19)
2
(191, 125)
             precision    recall  f1-score   support

          0      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(109, 78)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(109, 26)
2
(109, 104)
             precision    recall  f1-score   support

          0       

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.70      0.70      0.70        10
          1       0.86      0.86      0.86        22

avg / total       0.81      0.81      0.81        32

[ 7  3  3 19]
MNB Accuracy:  0.8125
MNB F1:  0.7818181818181817
             precision    recall  f1-score   support

          0       1.00      0.70      0.82        10
          1       0.88      1.00      0.94        22

avg / total       0.92      0.91      0.90        32

[ 7  3  0 22]
svc Accuracy:  0.90625
svc F1:  0.8798498122653317
             precision    recall  f1-score   support

          0       1.00      0.50      0.67        10
          1       0.81      1.00      0.90        22

avg / total       0.87      0.84      0.83        32

[ 5  5  0 22]
LR Accuracy:  0.84375
LR F1:  0.782312925170068
For name:  s_nielsen
total sample size before apply threshold:  290
Counter({'0000-0003-2417-0787': 108, '0000-0001-6391-7455': 72, '0000-0001-5341-1055': 44, '0000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.84      0.61      0.71        44
          1       1.00      0.95      0.98        21
          2       0.70      0.54      0.61        13
          3       1.00      0.27      0.43        11
          4       0.79      0.74      0.76        72
          5       0.70      0.89      0.78       108

avg / total       0.78      0.77      0.76       269

[27  0  1  0  1 15  0 20  0  0  1  0  0  0  7  0  1  5  0  0  0  3  2  6
  3  0  1  0 53 15  2  0  1  0  9 96]
svc Accuracy:  0.7657992565055762
svc F1:  0.7116110916796142
             precision    recall  f1-score   support

          0       0.76      0.66      0.71        44
          1       1.00      0.81      0.89        21
          2       1.00      0.08      0.14        13
          3       1.00      0.18      0.31        11
          4       0.75      0.79      0.77        72
          5       0.71      0.89      0.79       108

avg / total       0.78     

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(501, 232)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(501, 18)
2
(501, 250)
             precision    recall  f1-score   support

          0      

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      0.21      0.35        14
          1       0.93      0.82      0.87        33
          2       0.47      0.93      0.63       120
          3       1.00      0.27      0.43        11
          4       0.00      0.00      0.00        10
          5       1.00      0.57      0.72        23
          6       0.78      0.58      0.67        12
          7       0.46      0.26      0.33        23
          8       0.67      0.20      0.31        10
          9       0.94      0.74      0.83        43
         10       1.00      0.77      0.87        35
         11       0.53      0.45      0.48        65
         12       0.44      0.30      0.36        23
         13       0.52      0.52      0.52        23
         14       0.57      0.41      0.48        39
         15       1.00      0.47      0.64        17

avg / total       0.66      0.61      0.59       501

[  3   0   8   0   0   0   0   0   0   0   

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.83      0.71      0.77        21
          1       0.86      0.92      0.89        39

avg / total       0.85      0.85      0.85        60

[15  6  3 36]
MNB Accuracy:  0.85
MNB F1:  0.829059829059829
             precision    recall  f1-score   support

          0       1.00      0.86      0.92        21
          1       0.93      1.00      0.96        39

avg / total       0.95      0.95      0.95        60

[18  3  0 39]
svc Accuracy:  0.95
svc F1:  0.9430199430199431
             precision    recall  f1-score   support

          0       1.00      0.67      0.80        21
          1       0.85      1.00      0.92        39

avg / total       0.90      0.88      0.88        60

[14  7  0 39]
LR Accuracy:  0.8833333333333333
LR F1:  0.8588235294117648
For name:  r_mckay
total sample size before apply threshold:  53
Counter({'0000-0001-7781-1539': 31, '0000-0003-2723-5371': 17, '0000-0002-5602-6985': 4, '000

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.58      0.88      0.70        48
          1       0.70      0.79      0.74        61
          2       0.20      0.20      0.20        10
          3       0.78      0.58      0.67        36
          4       0.86      0.51      0.64        35
          5       0.72      0.48      0.58        27

avg / total       0.69      0.66      0.66       217

[42  5  1  0  0  0  5 48  4  2  0  2  0  6  2  0  0  2  6  5  0 21  3  1
 10  2  2  3 18  0  9  3  1  1  0 13]
svc Accuracy:  0.663594470046083
svc F1:  0.5876271876271876
             precision    recall  f1-score   support

          0       0.62      0.79      0.70        48
          1       0.60      0.84      0.70        61
          2       0.00      0.00      0.00        10
          3       0.81      0.61      0.70        36
          4       0.80      0.57      0.67        35
          5       0.62      0.37      0.47        27

avg / total       0.65      

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(65, 36)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
(65, 20)
2
(65, 56)
             precision    recall  f1-score   support

          0       0.92

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.60      0.60      0.60        15
          1       0.62      0.62      0.62        16

avg / total       0.61      0.61      0.61        31

[ 9  6  6 10]
LR Accuracy:  0.6129032258064516
LR F1:  0.6125
For name:  c_adams
total sample size before apply threshold:  69
Counter({'0000-0003-2100-4417': 43, '0000-0001-5602-2741': 20, '0000-0002-7333-9908': 4, '0000-0003-1628-4020': 1, '0000-0002-0667-8088': 1})
['0000-0003-2100-4417', '0000-0001-5602-2741']
Total sample size after apply threshold:  63
(0, 0)
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


             precision    recall  f1-score   support

          0       0.88      0.64      0.74        11
          1       0.70      0.97      0.81        31
          2       1.00      0.10      0.18        10

avg / total       0.79      0.73      0.67        52

[ 7  4  0  1 30  0  0  9  1]
LR Accuracy:  0.7307692307692307
LR F1:  0.5764903659640502
Done


In [16]:
# Accuracy summary: per-classifier sample count and mean accuracy.
from statistics import mean

# Keep only the float scores; any non-float entry in a result list is dropped.
cleaned_mnb_accuracy = list(filter(lambda score: isinstance(score, float), all_mnb_accuracy))
cleaned_svcLinear_accuracy = list(filter(lambda score: isinstance(score, float), all_svcLinear_accuracy))
cleaned_lr_accuracy = list(filter(lambda score: isinstance(score, float), all_LR_accuracy))

# One value per line, in MNB / linear SVC / LR order: first the counts ...
print(
    len(cleaned_mnb_accuracy),
    len(cleaned_svcLinear_accuracy),
    len(cleaned_lr_accuracy),
    sep="\n",
)
# ... then the mean accuracy of each classifier.
print(
    mean(cleaned_mnb_accuracy),
    mean(cleaned_svcLinear_accuracy),
    mean(cleaned_lr_accuracy),
    sep="\n",
)

784
784
784
0.7192512241904999
0.77393245727873
0.7259381668203747


In [17]:
# F1 summary: per-classifier sample count and mean F1 score.
from statistics import mean

# Strip non-float entries (e.g. strings) out of the raw result lists.
cleaned_mnb_f1 = list(filter(lambda score: isinstance(score, float), all_mnb_f1))
cleaned_svcLinear_f1 = list(filter(lambda score: isinstance(score, float), all_svcLinear_f1))
cleaned_lr_f1 = list(filter(lambda score: isinstance(score, float), all_LR_f1))

# One value per line, in MNB / linear SVC / LR order: first the counts ...
print(
    len(cleaned_mnb_f1),
    len(cleaned_svcLinear_f1),
    len(cleaned_lr_f1),
    sep="\n",
)
# ... then the mean F1 of each classifier.
print(
    mean(cleaned_mnb_f1),
    mean(cleaned_svcLinear_f1),
    mean(cleaned_lr_f1),
    sep="\n",
)

784
784
784
0.598677461782394
0.7096296153254906
0.5971807362010636


In [None]:
# IPython line magic: clear the interactive namespace (variables defined by earlier cells).
%reset

In [None]:
# IPython line magic: list the names currently in the interactive namespace
# (presumably a check that the preceding %reset cell emptied it).
%who