# Import library use to pre processing

In [1]:
import re
import nltk
from nltk.corpus import stopwords

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, Bidirectional
from keras.utils import np_utils

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/julien/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Loading training data

In [3]:
import csv
import numpy as np
X = []
with open("./data/train/crawler/data/tweets.txt.text", newline='', encoding='utf8') as file_data:
    i = 0 
    for row in file_data:
        X.append(row)

y = []
with open("./data/train/crawler/data/tweets.txt.labels", newline='', encoding='utf8') as file_data:
    j = 0
    for row in file_data:
        y.append(row.replace("\n",""))


In [4]:
X = np.array(X)
y = np.array(y)

# Loading data test

In [8]:
test_dtm=[]
with open("./data/test/us_test.text", newline='', encoding='utf8') as test_data:
    file = test_data.readlines()
    for row in file:
        test_dtm.append(row.replace("\n",""))
test_dtm = np.asarray(test_dtm)

In [9]:
test_label = []
with open("./data/test/us_test.labels", newline='', encoding='utf8') as test_data_label:
    file = test_data_label.readlines()
    for row in file:
        test_label.append(row.replace("\n",""))
test_label = np.asarray(test_label)
test_label = test_label.reshape(-1,1)

## Cleaning the text

In [5]:
def clean_tweets(tweets):
  # Stripping away location for the end of tweets
  p = re.compile(r'\s*@ .*$')
  tweets_1 = [p.sub('', tweet) for tweet in tweets]
  
  # Removing @user mentions
  tweets_2 = [tweet.replace('@user', '') for tweet in tweets_1]
  
  # Removing stopwords
  stop_words = set(stopwords.words('english'))

  tweets_3 = []

  for tweet in tweets_2:
    res = ''
    for w in tweet.split():
      if w.lower() not in stop_words:
        res = res+w+' '
    tweets_3.append(res)
    
  # Removing unicode characters
  tweets_4 = [(tweet.encode('ascii', 'ignore')).decode("utf-8") for tweet in tweets_3]
  
  return tweets_4


In [6]:
X = clean_tweets(X)

In [10]:
test_dtm = clean_tweets(test_dtm)

# Tokenizing the text

In [11]:
vocab_size = 50000
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', num_words=vocab_size, oov_token="UNK")
tokenizer.fit_on_texts(texts=X)

Encoding and adding a padding to the training data

In [12]:
tokenized_train = tokenizer.texts_to_sequences(X)
x_train = pad_sequences(tokenized_train, padding='post')

Retrieving unified vector length after padding


In [13]:
maxlen = len(x_train[0])

Encoding and adding a padding to the test data

In [14]:
tokenized_test = tokenizer.texts_to_sequences(test_dtm)
x_test = pad_sequences(tokenized_test, padding='post', maxlen=maxlen)

# Model implementation and training
## Measurements calculation and results

In [15]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score, f1_score, jaccard_score, classification_report

models = [
    RandomForestClassifier(n_estimators=3, max_depth=50, criterion='gini')    
]

CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    model.fit(x_train, y)
    predict_label = model.predict(x_test)
    acc = accuracy_score(predict_label,test_label)
    f1 = f1_score(predict_label, test_label, average = 'weighted')
    cm = multilabel_confusion_matrix(test_label,predict_label)
    jaccard = jaccard_score(test_label, predict_label, average='micro')
    print(classification_report(test_label, predict_label))
    print ("la matrice de confusion : ")
    print(cm)
    entries.append((model_name, acc, f1, jaccard))
cv_df = pd.DataFrame(entries, columns=['model_name', 'accuracy', 'f1', 'jaccard'])

              precision    recall  f1-score   support

           0       0.24      0.50      0.32     10798
           1       0.10      0.16      0.13      4830
          10       0.04      0.06      0.05      1432
          11       0.14      0.12      0.13      1949
          12       0.10      0.11      0.10      1265
          13       0.02      0.03      0.02      1114
          14       0.03      0.03      0.03      1306
          15       0.05      0.05      0.05      1244
          16       0.03      0.03      0.03      1153
          17       0.28      0.20      0.23      1545
          18       0.08      0.03      0.04      2417
          19       0.03      0.02      0.02      1010
           2       0.12      0.08      0.10      4534
           3       0.08      0.04      0.05      2605
           4       0.16      0.04      0.07      3716
           5       0.04      0.02      0.03      1613
           6       0.08      0.02      0.04      1996
           7       0.16    

In [16]:
print (cv_df)

               model_name  accuracy        f1   jaccard
0  RandomForestClassifier   0.15946  0.191684  0.086638


# Grid Search use to find the best parameters for this model

In [46]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

# Test with solver saga
parameters_saga = {'penalty':('l2', 'elasticnet', 'none'),
              'fit_intercept':[0, 1], 'class_weight':('balanced', 'None'),
                  'l1_ratio':[0, 0.5, 1]}
lg_saga = LogisticRegression(solver='saga')

# Test with solver saga and l1 penalty
parameters_saga_l1 = {'fit_intercept':[0, 1], 'class_weight':('balanced', 'None')}
lg_saga_l1 = LogisticRegression(solver='saga', penalty='l1')


# Test with solver lbfgs
parameters_lbfgs = {'penalty':('l2', 'none'), 'fit_intercept':[0, 1],
                   'class_weight':('balanced', 'None')}
lg_lbfgs = LogisticRegression(solver='lbfgs')


parameters = {'solver':('sag', 'newton-cg'), 'penalty':('l2', 'none'),
              'fit_intercept':[0, 1], 'class_weight':('balanced', 'None')}
lg = LogisticRegression()

clf = GridSearchCV(lg, parameters, cv=3, n_jobs=-1)

clf.fit(X_dtm, y)
                            
print(clf.cv_results_)



{'mean_fit_time': array([0.20850412, 0.64488872, 0.36110107, 0.86796069, 0.34987028,
       1.15327382, 0.43410603, 1.43390616, 0.1715188 , 0.88171991,
       0.37534825, 1.22245971, 0.38801901, 1.01836991, 0.55536652,
       1.2311546 ]), 'std_fit_time': array([0.02219167, 0.03748875, 0.02272539, 0.13014933, 0.00539448,
       0.06817575, 0.07801177, 0.20012817, 0.03456551, 0.14927531,
       0.06033472, 0.04752019, 0.08604417, 0.17173747, 0.05328032,
       0.07199231]), 'mean_score_time': array([0.00069332, 0.00070977, 0.00068974, 0.0006427 , 0.00083105,
       0.00107757, 0.00074188, 0.00067385, 0.00075984, 0.00184687,
       0.00070278, 0.00712196, 0.00260623, 0.00085282, 0.00074792,
       0.00102305]), 'std_score_time': array([9.11064746e-05, 8.17834808e-05, 5.54660213e-05, 4.02178215e-05,
       9.68375409e-05, 2.26621956e-04, 1.56191689e-04, 2.94472416e-05,
       1.10294988e-04, 1.46504840e-03, 8.47466172e-05, 6.12829924e-03,
       2.61180159e-03, 1.95306295e-04, 9.56253696e