In [None]:
import re
from collections import Counter

import numpy  as np
import pandas as pd
import nltk   as nl
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import layers, models, optimizers
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Input, concatenate, Dropout, GRU
from tensorflow.python.keras.optimizers import  RMSprop
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.utils import class_weight
import string

# Loading the data

In [None]:
# Load the training set. dtype=object keeps every column as read, so no
# numeric type inference is applied (labels stay non-numeric until cast).
data = pd.read_csv("Train.csv", dtype=object)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Build a single text field per row from the article body and its title.
# - fillna(''): `.map(str)` on a missing value would inject the literal token "nan".
# - ' ' separator: keeps the last word of `content` from fusing with the
#   first word of `title` into one bogus token.
concatenated_train = data['content'].fillna('').map(str) + ' ' + data['title'].fillna('').map(str)
reduced_data = concatenated_train.to_frame(name='text')

# Preprocessing


Next steps are as follows:

- Tokenize the text column and convert each text to a sequence of integer token ids.
- Pad the sequences as needed: if the number of words in a text is greater than `max_len`, truncate it to `max_len`; if it is smaller than `max_len`, add zeros for the remaining positions.
- Split the training dataset into train and validation samples. Cross-validation is a time-consuming process, so let us do a simple train/validation split.

In [1]:
# Vocabulary cap handed to the Keras Tokenizer as its `num_words` limit.
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)


NameError: name 'Tokenizer' is not defined

In [None]:
# Fit the tokenizer on the combined text column.
tokenizer.fit_on_texts(reduced_data['text'])

In [None]:
# Convert every text into its token sequence (one sequence per input text).
x_tokens = tokenizer.texts_to_sequences(reduced_data['text'])

We will use a sequence length that covers most sequences in the data set, then truncate longer sequences and pad shorter ones.

In [None]:
# Sequence-length statistics, used to choose the padding/truncation length.
num_tokens = np.array([len(tokens) for tokens in x_tokens])
print('The average number of tokens in a sequence is {}'.format(np.mean(num_tokens)))
print('The maximum number of tokens in a sequence is {}'.format(np.max(num_tokens)))

# Chosen length: mean + 2 standard deviations of the sequence lengths.
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
print('The chosen max tokens is {}'.format(max_tokens))

# Double-quoted literal so the apostrophe is actually printed ('don''t' is two
# adjacent literals in Python and renders as "dont"); '{:.2%}' shows the
# fraction as a percentage, matching the label.
share_below_max = np.mean(num_tokens < max_tokens)
print("The percentage of entries that don't reach the max tokens is {:.2%}".format(share_below_max))

In [None]:
# Pad and truncate at the start of each sequence ('pre').
pad = 'pre'
X_pad = pad_sequences(x_tokens, maxlen=max_tokens, padding=pad, truncating=pad)

# Report the shape of the padded array (the name is case-sensitive: X_pad).
print('The new shape of our train data after padding is {}'.format(X_pad.shape))

In [None]:
# `data` was read with dtype=object, so the 'fake' column is not numeric.
# Cast the labels to int before splitting so that training, evaluation and
# metrics all receive numeric targets.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_pad,
    data['fake'].astype(int),
    test_size=0.10,
    random_state=42,  # fixed seed: the split is reproducible across runs
)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

Assign different weights to each class because the data is not balanced

In [None]:
# Balanced class weights computed from the training labels only.
# Keyword arguments are used because recent scikit-learn releases reject the
# positional form of compute_class_weight.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(Y_train),
    y=Y_train,
)
print(class_weights)
# Hand-tuned alternative kept for reference: [0.85, 1.3]

# Build the model: create the recurrent neural network

Now that we are done with all the necessary preprocessing steps, we can train a first model: two stacked GRU layers on top of pre-trained word embeddings.
Let us use the GloVe embeddings.

In [None]:
EMBEDDING_FILE = "./glove.6B.100d.txt"
# Vocabulary cap for the embedding matrix: the same limit the Tokenizer was built with.
max_features = num_words


def get_coefs(word, *arr):
    """Turn one split GloVe line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')


# Explicit UTF-8 avoids platform-dependent decoding; the context manager
# closes the file once every line has been parsed.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*line.rstrip().split(" ")) for line in embedding_file)

# np.stack needs a real sequence; a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]  # vector width is read from the file, not hard-coded

word_index = tokenizer.word_index
# Tokenizer indices start at 1, so a vocabulary of N words needs N + 1 rows
# (row 0 is the padding index); never exceed the configured cap.
nb_words = min(max_features, len(word_index) + 1)
# Words without a pre-trained vector keep a random draw with the same
# mean/std as the pre-trained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    if i >= nb_words:
        # Index falls outside the matrix: the word is beyond the vocabulary cap.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Define the hidden layers

In [None]:
# Functional-API model: frozen pre-trained embeddings -> two stacked GRU
# layers (each followed by dropout) -> single sigmoid unit.

# The Embedding layer dimensions are read from the weight matrix itself so the
# layer and its weights can never disagree.
vocab_size, embed_dim = embedding_matrix.shape

# Input length matches the padding length used by pad_sequences (max_tokens).
nlp_input = layers.Input((max_tokens, ))
embedding = Embedding(vocab_size,
                      embed_dim,
                      weights=[embedding_matrix],
                      trainable=False)(nlp_input)  # keep the pre-trained vectors fixed

gru = GRU(units=16, return_sequences=True)(embedding)  # full sequence feeds the next GRU
gru = Dropout(0.2)(gru)
gru = GRU(units=4)(gru)
gru = Dropout(0.2)(gru)

x = Dense(1, activation='sigmoid')(gru)

model = Model(inputs=[nlp_input], outputs=[x])
# The model is trained with Adam; no separate optimizer instance is built,
# since an unused one only obscures which optimizer is in effect.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output).
model.summary()

We will train the model on data generated batch by batch.
The generator is run in parallel to the model, for efficiency. For instance, this allows you to do real-time data augmentation on images on CPU in parallel to training your model on GPU.

In [None]:
def generator(X_data, y_data, batch_size):
    """Yield ``(X_batch, y_batch)`` float32 pairs forever, cycling over the data.

    Consecutive slices of ``batch_size`` rows are taken from ``X_data`` and
    ``y_data``; the last slice of a pass may be shorter. Once a pass is
    complete the position wraps back to the first batch, so the generator
    never terminates on its own.
    """
    total_rows = X_data.shape[0]
    batches_per_pass = total_rows / batch_size
    batch_index = 0

    while True:
        start = batch_size * batch_index
        stop = start + batch_size
        features = np.array(X_data[start:stop]).astype('float32')
        targets = np.array(y_data[start:stop]).astype('float32')
        yield features, targets

        # Move to the next batch; restart from the first one when the pass is
        # over so the next epoch gets data as well.
        batch_index += 1
        if batch_index >= batches_per_pass:
            batch_index = 0


In [None]:
# Training configuration. `epochs` is the value actually passed to Keras.
epochs = 2
batch_size = 64
val_batch_size = batch_size * 2
# Keras expects a dict mapping class index -> weight.
class_weights = {0: 0.85, 1: 1.3}

# Step counts must be whole numbers; ceil() so the final, shorter batch of
# each pass is consumed too. Validation steps are derived from the validation
# set and its own batch size.
steps_per_epoch = int(np.ceil(X_train.shape[0] / batch_size))
validation_steps = int(np.ceil(X_test.shape[0] / val_batch_size))

history = model.fit_generator(
    generator(X_train, Y_train, batch_size),
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    verbose=1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
    validation_data=generator(X_test, Y_test, val_batch_size),
    validation_steps=validation_steps,
    class_weight=class_weights,
    max_queue_size=10,
    workers=1,
    use_multiprocessing=False,
    shuffle=True,
    initial_epoch=0,
)
  

# Evaluate The model

In [None]:
# Cast the labels to float32 so evaluate() always receives numeric targets,
# whatever dtype the label column was loaded with.
accr = model.evaluate(X_test, Y_test.astype('float32'))
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))
print('The distribution of our label in the test data is {}'.format(Y_test.value_counts()))

In [None]:
# Learning curves recorded during training.
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()

# The accuracy metric is stored as 'accuracy' by recent Keras releases and as
# 'acc' by older ones; pick whichever key this run produced.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'
plt.title('Accuracy')
plt.plot(history.history[acc_key], label='train')
plt.plot(history.history['val_' + acc_key], label='test')
plt.legend()
plt.show()

# Turn predicted probabilities into hard 0/1 labels at the decision threshold.
THRESHHOLD = 0.5
predicted = pd.DataFrame(model.predict(X_test))
predicted[predicted<THRESHHOLD] = 0
predicted[predicted>=THRESHHOLD] = 1

Plot the confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Plain Python ints on both sides so the two label lists are directly comparable.
predicted_values = [int(i) for i in predicted[0].values]
true_values = [int(i) for i in Y_test.values]

labels=[0, 1]
# `labels` must be passed by keyword: recent scikit-learn releases reject it
# as a third positional argument.
cm = confusion_matrix(true_values, predicted_values, labels=labels)

def plot_confusion_matrix(cm,target_names,title='Confusion matrix',cmap=None,normalize=True):
    """Draw a confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : 2-D array of counts; rows are drawn as 'True label', columns as
        'Predicted label'.
    target_names : sequence of tick labels, or None to leave the ticks as is.
    title : figure title.
    cmap : matplotlib colormap; defaults to 'Blues' when None.
    normalize : if True, every row is divided by its total before drawing, so
        each cell shows a per-true-class fraction; if False raw counts are shown.

    The accuracy / misclassification rate in the x-axis label are always
    computed from the raw counts.
    """
    cm = np.asarray(cm)
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        # Normalize BEFORE drawing so the colours, the printed values and the
        # text-colour threshold all refer to the same numbers. Rows with no
        # samples are left at 0 instead of dividing by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text so they stay readable.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in np.ndindex(*cm.shape):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()
    
    
# Show the raw-count confusion matrix.
plot_confusion_matrix(cm, labels, normalize=False)

# Recall of class 1, read from the row of true class 1:
# correctly predicted 1s over all true 1s.
true_positives = cm[1, 1]
false_negatives = cm[1, 0]
recall = true_positives / (true_positives + false_negatives)
print('The recall equals to {}'.format(recall))