# Loading training data

In [1]:
import csv
import numpy as np
# Training tweets recovery
X = []
with open("./data/train/crawler/data/tweets.txt.text", newline='', encoding='utf8') as file_data: 
    for row in file_data:
        X.append(row)

# Training emojis recovery
y = []
with open("./data/train/crawler/data/tweets.txt.labels", newline='', encoding='utf8') as file_data:
    for row in file_data:
        y.append(row.replace("\n",""))
        

In [2]:
X = np.array(X)
y = np.array(y)

# Data vectorisation

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
X_dtm = vect.fit_transform(X)

# Data test loading and vectorisation

In [4]:
# Testing tweets recovery
test=[]
with open("./data/test/us_test.text", newline='', encoding='utf8') as test_data:
    file = test_data.readlines()
    for row in file:
        test.append(row.replace("\n",""))
test = np.asarray(test)
test_dtm = vect.transform(test)

In [5]:
# Testing emojis recovery
test_label = []
with open("./data/test/us_test.labels", newline='', encoding='utf8') as test_data_label:
    file = test_data_label.readlines()
    for row in file:
        test_label.append(row.replace("\n",""))
test_label = np.asarray(test_label)
test_label = test_label.reshape(-1,1)

# Model implementation and training
## Measurements calculation and results

In [6]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score, f1_score, jaccard_score, classification_report

models = [
    LogisticRegression(random_state=0, solver='lbfgs',
                          multi_class='multinomial',
                        n_jobs=-1, max_iter= 1000)
]

# Size of the cross validation 
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    # Training
    model.fit(X_dtm, y)
    # Prediction
    predict_label = model.predict(test_dtm)
    # Measurement calculation
    acc = accuracy_score(predict_label,test_label)
    f1 = f1_score(predict_label, test_label, average = 'weighted')
    cm = multilabel_confusion_matrix(test_label,predict_label)
    jaccard = jaccard_score(test_label, predict_label, average='micro')
    print(classification_report(test_label, predict_label))
    print ("la matrice de confusion : ")
    print(cm)
    entries.append((model_name, acc, f1, jaccard))
cv_df = pd.DataFrame(entries, columns=['model_name', 'accuracy', 'f1', 'jaccard'])

              precision    recall  f1-score   support

           0       0.33      0.62      0.44     10798
           1       0.24      0.25      0.25      4830
          10       0.16      0.14      0.15      1432
          11       0.62      0.50      0.56      1949
          12       0.35      0.38      0.36      1265
          13       0.26      0.06      0.10      1114
          14       0.12      0.05      0.07      1306
          15       0.24      0.18      0.20      1244
          16       0.12      0.05      0.07      1153
          17       0.65      0.59      0.62      1545
          18       0.31      0.12      0.17      2417
          19       0.07      0.03      0.04      1010
           2       0.33      0.45      0.38      4534
           3       0.15      0.06      0.08      2605
           4       0.52      0.42      0.46      3716
           5       0.09      0.07      0.08      1613
           6       0.18      0.14      0.16      1996
           7       0.31    

In [7]:
print (cv_df)

           model_name  accuracy        f1   jaccard
0  LogisticRegression   0.32122  0.350189  0.191341


# Grid Search use to find the best parameters for this model

In [46]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

# Test with solver saga
parameters_saga = {'penalty':('l2', 'elasticnet', 'none'),
              'fit_intercept':[0, 1], 'class_weight':('balanced', 'None'),
                  'l1_ratio':[0, 0.5, 1]}
lg_saga = LogisticRegression(solver='saga')

# Test with solver saga and l1 penalty
parameters_saga_l1 = {'fit_intercept':[0, 1], 'class_weight':('balanced', 'None')}
lg_saga_l1 = LogisticRegression(solver='saga', penalty='l1')


# Test with solver lbfgs
parameters_lbfgs = {'penalty':('l2', 'none'), 'fit_intercept':[0, 1],
                   'class_weight':('balanced', 'None')}
lg_lbfgs = LogisticRegression(solver='lbfgs')


parameters = {'solver':('sag', 'newton-cg'), 'penalty':('l2', 'none'),
              'fit_intercept':[0, 1], 'class_weight':('balanced', 'None')}
lg = LogisticRegression()

# Grid Search based on Logistic Regression with cross validation = 3 and all thread use
# On Logistic Regression we modify the parameters because they can't be all used at the same time
clf = GridSearchCV(lg, parameters, cv=3, n_jobs=-1)

# Start of Grid Search
clf.fit(X_dtm, y)
                            
print(clf.cv_results_)



{'mean_fit_time': array([0.20850412, 0.64488872, 0.36110107, 0.86796069, 0.34987028,
       1.15327382, 0.43410603, 1.43390616, 0.1715188 , 0.88171991,
       0.37534825, 1.22245971, 0.38801901, 1.01836991, 0.55536652,
       1.2311546 ]), 'std_fit_time': array([0.02219167, 0.03748875, 0.02272539, 0.13014933, 0.00539448,
       0.06817575, 0.07801177, 0.20012817, 0.03456551, 0.14927531,
       0.06033472, 0.04752019, 0.08604417, 0.17173747, 0.05328032,
       0.07199231]), 'mean_score_time': array([0.00069332, 0.00070977, 0.00068974, 0.0006427 , 0.00083105,
       0.00107757, 0.00074188, 0.00067385, 0.00075984, 0.00184687,
       0.00070278, 0.00712196, 0.00260623, 0.00085282, 0.00074792,
       0.00102305]), 'std_score_time': array([9.11064746e-05, 8.17834808e-05, 5.54660213e-05, 4.02178215e-05,
       9.68375409e-05, 2.26621956e-04, 1.56191689e-04, 2.94472416e-05,
       1.10294988e-04, 1.46504840e-03, 8.47466172e-05, 6.12829924e-03,
       2.61180159e-03, 1.95306295e-04, 9.56253696e