# Loading training data

In [None]:
import csv
import numpy as np
X = []
with open("./data/train/crawler/data/tweets.txt.text", newline='', encoding='utf8') as file_data:
    i = 0 
    for row in file_data:
        X.append(row.replace("\n",""))
            
y = []
with open("./data/train/crawler/data/tweets.txt.labels", newline='', encoding='utf8') as file_data:
    j = 0
    for row in file_data:
        y.append(row.replace("\n",""))

In [None]:
X = np.array(X)
y = np.array(y)

# Data vectorisation

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
X_dtm = vect.fit_transform(X)

# Data test loading and vectorisation

In [24]:
test=[]
with open("./data/test/us_test.text", newline='', encoding='utf8') as test_data:
    file = test_data.readlines()
    for row in file:
        test.append(row.replace("\n",""))

test = np.asarray(test)
test_dtm = vect.transform(test)

In [25]:
test_label = []
with open("./data/test/us_test.labels", newline='', encoding='utf8') as test_data_label:
    file = test_data_label.readlines()
    for row in file:
        test_label.append(row.replace("\n",""))
test_label = np.asarray(test_label)
test_label = test_label.reshape(-1,1)

# Model implementation and training
## Measurements calculation and results

In [34]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score, f1_score, jaccard_score, classification_report

models = [
    RandomForestClassifier(n_estimators=3, max_depth=50, criterion='gini')    
]

CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

for model in models:
    model_name = model.__class__.__name__
    model.fit(X_dtm, y)
    predict_label = model.predict(test_dtm)
    acc = accuracy_score(predict_label,test_label)
    f1 = f1_score(predict_label, test_label, average = 'weighted')
    cm = multilabel_confusion_matrix(test_label,predict_label)
    jaccard = jaccard_score(test_label, predict_label, average='micro')
    print(classification_report(test_label, predict_label))
    print ("la matrice de confusion : ")
    print(cm)
    entries.append((model_name, acc, f1, jaccard))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'accuracy', 'f1', 'jaccard'])

              precision    recall  f1-score   support

           0       0.22      0.95      0.36     10798
           1       0.16      0.01      0.01      4830
          10       0.11      0.00      0.01      1432
          11       0.56      0.05      0.10      1949
          12       0.47      0.13      0.20      1265
          13       0.21      0.01      0.02      1114
          14       0.07      0.00      0.00      1306
          15       0.03      0.00      0.00      1244
          16       0.10      0.00      0.01      1153
          17       0.65      0.49      0.56      1545
          18       0.23      0.01      0.01      2417
          19       0.02      0.00      0.00      1010
           2       0.30      0.06      0.10      4534
           3       0.09      0.00      0.01      2605
           4       0.56      0.13      0.21      3716
           5       0.05      0.00      0.01      1613
           6       0.20      0.01      0.02      1996
           7       0.20    

In [35]:
print (cv_df)

               model_name  accuracy        f1  jaccard
0  RandomForestClassifier   0.24284  0.351621   0.1382


# Grid Search use to find the best parameters for this model

In [16]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

# Gini works better than entropy and more estimators, better the score is
parameters = {'n_estimators':[200, 300], 'criterion':('entropy', 'gini')}
rdf = RandomForestClassifier()

clf = GridSearchCV(rdf, parameters, cv=3, n_jobs=-1)

clf.fit(X_dtm, y)
                            
print(clf.cv_results_)



{'mean_fit_time': array([1.15674965, 2.13355851, 1.41043146, 1.8539598 ]), 'std_fit_time': array([0.03226974, 0.30035809, 0.03162175, 0.05005567]), 'mean_score_time': array([0.03836799, 0.07133683, 0.06487362, 0.0717663 ]), 'std_score_time': array([0.00107693, 0.01531819, 0.01782814, 0.01722672]), 'param_criterion': masked_array(data=['entropy', 'entropy', 'gini', 'gini'],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[200, 300, 200, 300],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'criterion': 'entropy', 'n_estimators': 200}, {'criterion': 'entropy', 'n_estimators': 300}, {'criterion': 'gini', 'n_estimators': 200}, {'criterion': 'gini', 'n_estimators': 300}], 'split0_test_score': array([0.1849711 , 0.1734104 , 0.20231214, 0.19075145]), 'split1_test_score': array([0.17964072, 0.17365269, 0.20359281, 0.19760479]), 'split2_test_sco

In [13]:
print (cv_df)

               model_name    score
0      LogisticRegression  0.29528
1  RandomForestClassifier  0.21596
2               LinearSVC  0.24350
3           MultinomialNB  0.26272
