# NLP model building with Long Short-Term Memory (LSTM)

Purpose: build a model able to classify sentences (here, whether a review is positive or not).

## 0 - Interesting links

* https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
* https://www.hackdeploy.com/keras-lstm-example-sequence-binary-classification/
* https://www.youtube.com/watch?v=A9QVYOBjZdY&ab_channel=TensorFlow
* https://www.youtube.com/watch?v=ZMudJXhsUpY&ab_channel=TensorFlow

## 1 - Libraries import

In [27]:
import json
import random
import numpy as np
import pandas as pd
import sklearn as sk
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import CSVLogger

## 2 - Variables definition

In [28]:
# --- Dataset split sizes ---
# Number of examples kept for the training slice.
nb_train = 10_000
# Number of examples kept for the validation slice.
nb_valid = 2_500

# --- Text encoding ---
# Vocabulary size: only the most frequent words are kept by the tokenizer.
vocab_size = 10_000
# Every sentence is padded/truncated to this many tokens (network input length).
max_length = 400
# Size of the dense vector learned for each word.
embedding_dim = 32

# --- Optimisation ---
# Maximum number of passes over the training set.
nb_epochs = 20
# Number of training examples per mini-batch.
batch_size = 256

## 3 - Training JSON file reading

In [29]:
URL = []  # Not filled by this cell; kept so any later reference still resolves.

# Parse every line of the file as one JSON document. The context manager
# guarantees the file handle is closed (a bare open() in a for-loop header
# leaves it open until garbage collection). JSON text is UTF-8 by
# specification, so the encoding is stated explicitly instead of relying on
# the platform default.
with open('review_amazon_ordinateurportable.json', 'r', encoding='utf-8') as json_file:
    training_examples = [json.loads(line) for line in json_file]

# Only the first parsed document is used; it is iterated below as a
# collection of records exposing the 'review' and 'positive' keys.
training_examples = training_examples[0]

X = [item['review'] for item in training_examples]    # review texts (model input)
Y = [item['positive'] for item in training_examples]  # 'positive' flag of each review (model target)

## 4 - Training / Validation / Test dataset splitting

In [30]:
# Shuffle texts and labels together, as pairs, so each text keeps its label.
training_examples = list(zip(X, Y))
random.shuffle(training_examples)
X, Y = zip(*training_examples)

# Consecutive, non-overlapping windows over the shuffled data:
# [0, nb_train) -> training, [nb_train, nb_train + nb_valid) -> validation,
# everything after that -> test.
valid_stop = nb_train + nb_valid
train_window = slice(0, nb_train)
valid_window = slice(nb_train, valid_stop)
test_window = slice(valid_stop, None)

X_train, Y_train = X[train_window], Y[train_window]
X_valid, Y_valid = X[valid_window], Y[valid_window]
X_test, Y_test = X[test_window], Y[test_window]

## 5 - Sentences tokenization

In [31]:
# The vocabulary is learned from the training texts only; words outside the
# retained vocabulary are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index


def _encode_and_pad(texts):
    """Turn raw texts into integer sequences of exactly `max_length` tokens.

    Shorter sequences are padded at the end and longer ones are cut at the
    end ('post' padding and 'post' truncating).
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    return pad_sequences(token_ids, maxlen=max_length, padding='post', truncating='post')


# The same encoding is applied to the training, validation and test sets.
X_train = _encode_and_pad(X_train)
X_valid = _encode_and_pad(X_valid)
X_test = _encode_and_pad(X_test)

## 6 - Model architecture definition

In [32]:
keras_layers = tf.keras.layers

# Layers are listed in the order the data flows through them.
layer_stack = [
    # Word index -> dense vector of size embedding_dim.
    keras_layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    # First recurrent layer returns the full sequence so it can feed a second one.
    keras_layers.Bidirectional(keras_layers.LSTM(100, return_sequences=True)),
    # Second recurrent layer returns only its final output.
    keras_layers.Bidirectional(keras_layers.LSTM(100)),
    keras_layers.Dense(24, activation='relu'),
    # Single sigmoid unit: binary classification output.
    keras_layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(layer_stack)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [33]:
# Print the layer-by-layer description of the model (output shapes and parameter counts).
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 400, 32)           320000    
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 400, 200)          106400    
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 200)               240800    
_________________________________________________________________
dense_10 (Dense)             (None, 24)                4824      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 25        
Total params: 672,049
Trainable params: 672,049
Non-trainable params: 0
_________________________________________________________________


## 7 - Model training

In [34]:
callback_list = [
    # Stop training once the validation loss ('val_loss', lower is better)
    # has not improved for 15 epochs.
    EarlyStopping(monitor='val_loss', patience=15, min_delta=0.0, mode='min'),
    # Record the per-epoch metrics in 'training_log.csv'.
    CSVLogger('training_log.csv'),
]

In [None]:
# Inputs and labels of every split are converted to NumPy arrays
# (the labels come out of the split as tuples).
X_train, Y_train = np.array(X_train), np.array(Y_train)
X_valid, Y_valid = np.array(X_valid), np.array(Y_valid)
X_test, Y_test = np.array(X_test), np.array(Y_test)

# Train on the training split and evaluate on the validation split after each
# epoch; the returned History object is kept for the plots further down.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_valid, Y_valid),
    epochs=nb_epochs,
    batch_size=batch_size,
    callbacks=callback_list,
    verbose=2,
)

Epoch 1/20
40/40 - 807s - loss: 0.6692 - accuracy: 0.5777 - val_loss: 0.6010 - val_accuracy: 0.6904
Epoch 2/20
40/40 - 847s - loss: 0.5359 - accuracy: 0.7413 - val_loss: 0.5486 - val_accuracy: 0.7276
Epoch 3/20
40/40 - 874s - loss: 0.4590 - accuracy: 0.7874 - val_loss: 0.5164 - val_accuracy: 0.7280
Epoch 4/20
40/40 - 932s - loss: 0.4009 - accuracy: 0.7979 - val_loss: 0.5112 - val_accuracy: 0.7376
Epoch 5/20
40/40 - 1153s - loss: 0.3537 - accuracy: 0.8148 - val_loss: 0.4805 - val_accuracy: 0.7548
Epoch 6/20


## 8 - Model training analysis

### 8.1 - Accuracy & Loss plots

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against epochs.

    Parameters:
        history: object whose `history` attribute maps metric names to
            per-epoch values (here, the value returned by `model.fit`).
        string: metric name, e.g. "accuracy" or "loss"; the validation curve
            is read from the key 'val_' + string.
    """
    validation_key = 'val_' + string
    for curve_name in (string, validation_key):
        plt.plot(history.history[curve_name])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, validation_key])
    plt.show()
    
# One figure per tracked metric: accuracy first, then loss.
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)

### 8.2 - Loading log file information

In [None]:
# Load the per-epoch metrics written by the CSVLogger callback during training.
df_log = pd.read_csv('training_log.csv')
# Bare expression as the last cell statement: the notebook displays the DataFrame.
df_log

## 9 - Predictions on test set & new sentences

In [None]:
# Round the sigmoid outputs to the nearest class label (0. or 1.).
# np.round is used because the np.round_ alias was removed in NumPy 2.0.
Y_test_prediction = np.round(model.predict(X_test)[:, 0])

# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted classes. labels=[0, 1] forces a 2x2
# matrix even when one class is absent from the test set.
# Class 1 is treated as the "positive" class -- NOTE(review): confirm that the
# 'positive' field of the dataset is encoded as 1/True.
conf_matrix = confusion_matrix(Y_test, Y_test_prediction, labels=[0, 1])

# With labels ordered [0, 1], the flattened matrix reads TN, FP, FN, TP.
true_negatives, false_positives, false_negatives, true_positives = conf_matrix.ravel()
accuracy_percent = np.round(100 * (true_positives + true_negatives) / conf_matrix.sum(), 2)

print('CONFUSION MATRIX' + '\n' + '='*20)
print('TRUE POSITIVES : ' + str(true_positives) + '\nFALSE POSITIVES : ' + str(false_positives) + '\nFALSE NEGATIVES : ' + str(false_negatives) + '\nTRUE NEGATIVES : ' + str(true_negatives))
print('-'*20 + '\nACCURACY : ' + str(accuracy_percent) + ' %')

In [None]:
# A new sentence goes through the same encoding as the datasets: tokenizer,
# then padding/truncating at the end to max_length tokens.
sentence = ["granny starting to fear spiders in the garden might be real"]
sequence = tokenizer.texts_to_sequences(sentence)
sentence = pad_sequences(sequence, maxlen = max_length, padding = 'post', truncating = 'post')

# Round the sigmoid output to the nearest class label.
# np.round replaces np.round_, an alias that was removed in NumPy 2.0.
print(np.round(model.predict(sentence)))

## 10 - Saving the model

In [None]:
# Write the trained model to the file 'seq_model.h5'.
model.save("seq_model.h5")

## 11 - Saved model loading

In [None]:
# Reload the model from the file written by the save cell; this rebinds `model`.
model = load_model('seq_model.h5')
# Print the reloaded model's layers so they can be checked against the original summary.
model.summary()