# Loading training data

In [36]:
import csv
import numpy as np
X = []
with open("./data/train/crawler/data/tweets.txt.text", newline='', encoding='utf8') as file_data:
    i = 0 
    for row in file_data:
        X.append(row.replace("\n",""))
            
y = []
with open("./data/train/crawler/data/tweets.txt.labels", newline='', encoding='utf8') as file_data:
    j = 0
    for row in file_data:
        y.append(row.replace("\n",""))

In [37]:
X = np.array(X)
y = np.array(y)

# Data vectorisation

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
X_dtm = vect.fit_transform(X)

# Data test loading and vectorisation

In [39]:
test=[]
with open("./data/test/us_test.text", newline='', encoding='utf8') as test_data:
    file = test_data.readlines()
    for row in file:
        test.append(row.replace("\n",""))

test = np.asarray(test)
test_dtm = vect.transform(test)

In [40]:
test_label = []
with open("./data/test/us_test.labels", newline='', encoding='utf8') as test_data_label:
    file = test_data_label.readlines()
    for row in file:
        test_label.append(row.replace("\n",""))
test_label = np.asarray(test_label)
test_label = test_label.reshape(-1,1)

# Model implementation and training
## Measurements calculation and results

In [43]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score, f1_score, jaccard_score, classification_report

models = [
    MultinomialNB(alpha=0.1)    
]

CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
    model_name = model.__class__.__name__
    model.fit(X_dtm, y)
    predict_label = model.predict(test_dtm)
    acc = accuracy_score(predict_label,test_label)
    f1 = f1_score(predict_label, test_label, average = 'weighted')
    cm = multilabel_confusion_matrix(test_label,predict_label)
    jaccard = jaccard_score(test_label, predict_label, average='micro')
    print(classification_report(test_label, predict_label))
    print ("la matrice de confusion : ")
    print(cm)
    entries.append((model_name, acc, f1, jaccard))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'accuracy', 'f1', 'jaccard'])

              precision    recall  f1-score   support

           0       0.35      0.51      0.42     10798
           1       0.24      0.25      0.24      4830
          10       0.16      0.14      0.15      1432
          11       0.46      0.47      0.47      1949
          12       0.24      0.42      0.30      1265
          13       0.20      0.06      0.10      1114
          14       0.08      0.04      0.06      1306
          15       0.22      0.14      0.17      1244
          16       0.08      0.04      0.05      1153
          17       0.59      0.55      0.57      1545
          18       0.29      0.11      0.16      2417
          19       0.05      0.02      0.03      1010
           2       0.30      0.47      0.37      4534
           3       0.15      0.09      0.11      2605
           4       0.45      0.41      0.43      3716
           5       0.08      0.08      0.08      1613
           6       0.14      0.12      0.13      1996
           7       0.27    

In [44]:
print (cv_df)

      model_name  accuracy        f1   jaccard
0  MultinomialNB   0.29694  0.318694  0.174357


# Grid Search use to find the best parameters for this model

In [45]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

# Works better with small alpha, and fit_prior changes nothing
parameters = {'alpha':[1, 2, 0.7, 0, 0.1, 0.5], 'fit_prior':('False', 'True')}
mnb = MultinomialNB()

clf = GridSearchCV(mnb, parameters, cv=3, n_jobs=-1)

clf.fit(X_dtm, y)
                            
print(clf.cv_results_)

{'mean_fit_time': array([1.27850437, 1.15368517, 1.31007473, 1.48485192, 1.31382211,
       1.45402455, 1.34216412, 1.49965429, 1.41114871, 1.44537743,
       1.34590165, 1.25690444]), 'std_fit_time': array([0.06269197, 0.02398118, 0.11619214, 0.18892382, 0.04715809,
       0.10785702, 0.02873494, 0.10679886, 0.12807551, 0.22800635,
       0.12273234, 0.05564568]), 'mean_score_time': array([0.24673923, 0.2404693 , 0.25699496, 0.24162793, 0.26615961,
       0.24808971, 0.31336474, 0.27992964, 0.29836718, 0.25113853,
       0.31259886, 0.20510936]), 'std_score_time': array([0.01003118, 0.01278984, 0.00108068, 0.04193019, 0.05757249,
       0.0936216 , 0.10019844, 0.02738176, 0.03370489, 0.04558999,
       0.02522813, 0.042204  ]), 'param_alpha': masked_array(data=[1, 1, 2, 2, 0.7, 0.7, 0, 0, 0.1, 0.1, 0.5, 0.5],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False],
       fill_value='?',
            dtype=object), 'para

  'setting alpha = %.1e' % _ALPHA_MIN)


In [13]:
print (cv_df)

               model_name    score
0      LogisticRegression  0.29528
1  RandomForestClassifier  0.21596
2               LinearSVC  0.24350
3           MultinomialNB  0.26272
