# RNN VS LSTM VS GRU for tweet dataset from Kaggle

In [None]:
import tensorflow as tf

tf.config.list_physical_devices('GPU')

In [45]:
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, SimpleRNN, Activation, Dropout, Conv1D
from tensorflow.keras.layers import Embedding, Flatten, LSTM, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
import numpy as np
import spacy
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
# https://haochen23.github.io/2020/01/nlp-rnn-sentiment.html#.Y3sIcXZByUk

In [46]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# data = pd.read_csv("https://raw.githubusercontent.com/haochen23/nlp-rnn-lstm-sentiment/master/training.1600000.processed.noemoticon.csv", header=None, encoding='cp437')
# print("The shape of the original dataset is {}".format(data.shape))
# data

raw = pd.read_csv('data/Reddit_Data.csv')
raw

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
37244,jesus,0
37245,kya bhai pure saal chutiya banaya modi aur jab...,1
37246,downvote karna tha par upvote hogaya,0
37247,haha nice,1


In [47]:
raw['category'].value_counts()

 1    15830
 0    13142
-1     8277
Name: category, dtype: int64

In [48]:
NUM = 8277

positive = raw[raw['category']==1].sample(8277)
neutral = raw[raw['category']==0].sample(8277)
negative = raw[raw['category']==-1].sample(8277)

data = pd.concat([positive, neutral, negative], axis=0).sample(frac=1)
# positive.shape, neutral.shape, negative.shape

data

Unnamed: 0,clean_comment,category
28732,pakistan seems have closed their airspace agai...,-1
19562,please read and make video about essar tapes e...,0
36608,needed this stand against pakistan atleast yr...,1
15616,anything get little upset when boyfriend play...,-1
14096,from india its great see president trump host...,1
...,...,...
5485,what that cube thing they looking,0
25093,much was looking forward the modi govt come i...,0
4473,not know real joke anymore fuck you april fuc...,-1
13206,points upvoted votes the army here,0


In [49]:
def load_glove_model(glove_file, encoding='iso-8859-1'):
    print("[INFO]Loading GloVe Model...")
    model = {}
    with open(glove_file, 'r', encoding=encoding) as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embeddings = [float(val) for val in split_line[1:]]
            model[word] = embeddings
    print("[INFO] Done...{} words loaded!".format(len(model)))
    return model
# adopted from utils.py
nlp = spacy.blank("en")

def remove_stopwords(sentence):
    '''
    function to remove stopwords
        input: sentence - string of sentence
    '''
    new = []
    # tokenize sentence
    sentence = nlp(sentence)
    for tk in sentence:
        if (tk.is_stop == False) & (tk.pos_ !="PUNCT"):
            new.append(tk.string.strip())
    # convert back to sentence string
    c = " ".join(str(x) for x in new)
    return c


def lemmatize(sentence):
    '''
    function to do lemmatization
        input: sentence - string of sentence
    '''
    sentence = nlp(sentence)
    s = ""
    for w in sentence:
        s +=" "+w.lemma_
    return nlp(s)

def sent_vectorizer(sent, model):
    '''
    sentence vectorizer using the pretrained glove model
    '''
    sent_vector = np.zeros(200)
    num_w = 0
    for w in sent.split():
        try:
            # add up all token vectors to a sent_vector
            sent_vector = np.add(sent_vector, model[str(w)])
            num_w += 1
        except:
            pass
    return sent_vector

In [50]:
data_X = data['clean_comment'].to_numpy().astype('str')
data_y = data['category']
data_y = pd.get_dummies(data_y).to_numpy()

data_X.shape, data_y.shape

((24831,), (24831, 3))

In [51]:
# load the glove model
glove_model = load_glove_model("glove.twitter.27B.200d.txt", encoding='utf-8')
# number of vocab to keep
max_vocab = 18000
# length of sequence that will generate
max_len = 15

tokenizer = Tokenizer(num_words=max_vocab)

[INFO]Loading GloVe Model...
[INFO] Done...1193514 words loaded!


In [52]:
tokenizer.fit_on_texts(data_X)
sequences = tokenizer.texts_to_sequences(data_X)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data_keras = pad_sequences(sequences, maxlen=max_len, padding="post")

data_keras.shape

Found 44112 unique tokens.


(24831, 15)

In [53]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_keras, data_y, test_size = 0.3, random_state=42)

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((17381, 15), (7450, 15), (17381, 3), (7450, 3))

In [54]:
# calculate number of words
nb_words = len(tokenizer.word_index) + 1
print(f"Number of words: {nb_words}")

# obtain the word embedding matrix
embedding_matrix = np.zeros((nb_words, 200))

for word, i in word_index.items():
    embedding_vector = glove_model.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

44113
Null word embeddings: 12999


In [55]:
# adopted from sent_tran_eval.py
def build_model(nb_words, rnn_model="SimpleRNN", embedding_matrix=None):
    '''
    build_model function:
    inputs: 
        rnn_model - which type of RNN layer to use, choose in (SimpleRNN, LSTM, GRU)
        embedding_matrix - whether to use pretrained embeddings or not
    '''
    model = Sequential()
    # add an embedding layer
    if embedding_matrix is not None:
        model.add(Embedding(nb_words, 
                        200, 
                        weights=[embedding_matrix], 
                        input_length= max_len,
                        trainable = False))
    else:
        model.add(Embedding(nb_words, 
                        200, 
                        input_length= max_len,
                        trainable = False))
        
    # add an RNN layer according to rnn_model
    if rnn_model == "SimpleRNN":
        model.add(SimpleRNN(256))
    elif rnn_model == "LSTM":
        model.add(LSTM(256))
    else:
        model.add(GRU(256))
        
    model.add(Dense(512,activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', 
                optimizer='adam',
                metrics=['accuracy'])
    return model

In [56]:
model_rnn = build_model(nb_words, "SimpleRNN", embedding_matrix)
mode_rnn_history = model_rnn.fit(x_train, y_train, epochs=20, batch_size=120,
          validation_data=(x_test, y_test))
# predictions = model_rnn.predict(x_test)
# predictions = predictions.argmax(axis=1)
# print(classification_report(y_test.argmax(axis=1), predictions))
#, callbacks=EarlyStopping(monitor='val_accuracy', mode='max',patience=3)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1b85d572e20>

In [57]:
model_rnn.evaluate(x_test, y_test)



[2.15606951713562, 0.6056376099586487]

In [None]:
max_val_acc_epoch = np.argmax(list(mode_rnn_history.history['val_accuracy']))+1
max_val_loss_epoch = np.argmin(list(mode_rnn_history.history['val_loss']))+1
epochs = range(1, len(mode_rnn_history.history['accuracy']) + 1)
plt.figure(figsize=(10,3))
plt.minorticks_on()
plt.axvline(x=max_val_acc_epoch, color='0.5', linestyle='--')
plt.axvline(x=max_val_loss_epoch, color='0.5', linestyle=':')
plt.plot(epochs,mode_rnn_history.history['accuracy'], label='Train')
plt.plot(epochs,mode_rnn_history.history['val_accuracy'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

plt.figure(figsize=(10,3))
plt.minorticks_on()
plt.axvline(x=max_val_acc_epoch, color='0.5', linestyle='--')
plt.axvline(x=max_val_loss_epoch, color='0.5', linestyle=':')
plt.plot(epochs,mode_rnn_history.history['loss'], label='Train')
plt.plot(epochs,mode_rnn_history.history['val_loss'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
import visualkeras
from PIL import ImageFont
visualkeras.layered_view(model_rnn, legend=True)

In [None]:
model_rnn.save_weights("models/rnn-model.h5")

In [58]:
model_lstm = build_model(nb_words, "LSTM", embedding_matrix)
model_lstm_history = model_lstm.fit(x_train, y_train, epochs=20, batch_size=120,
          validation_data=(x_test, y_test))
# predictions = model_lstm.predict(x_test)
# predictions = predictions.argmax(axis=1)
# print(classification_report(y_test.argmax(axis=1), predictions))
#, callbacks=EarlyStopping(monitor='val_accuracy', mode='max',patience=3)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1b9f4e277f0>

In [59]:
model_lstm.evaluate(x_test, y_test)



[1.8574057817459106, 0.7428187727928162]

In [None]:
max_val_acc_epoch = np.argmax(list(model_lstm_history.history['val_accuracy']))+1
max_val_loss_epoch = np.argmin(list(model_lstm_history.history['val_loss']))+1
epochs = range(1, len(model_lstm_history.history['accuracy']) + 1)
plt.figure(figsize=(10,3))
plt.minorticks_on()
plt.axvline(x=max_val_acc_epoch, color='0.5', linestyle='--')
plt.axvline(x=max_val_loss_epoch, color='0.5', linestyle=':')
plt.plot(epochs,model_lstm_history.history['accuracy'], label='Train')
plt.plot(epochs,model_lstm_history.history['val_accuracy'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

plt.figure(figsize=(10,3))
plt.minorticks_on()
plt.axvline(x=max_val_acc_epoch, color='0.5', linestyle='--')
plt.axvline(x=max_val_loss_epoch, color='0.5', linestyle=':')
plt.plot(epochs,model_lstm_history.history['loss'], label='Train')
plt.plot(epochs,model_lstm_history.history['val_loss'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
import visualkeras
from PIL import ImageFont
visualkeras.layered_view(model_lstm, legend=True)

In [None]:
model_lstm.save_weights("models/model-lstm.h5")

In [60]:
model_gru = build_model(nb_words, "GRU", embedding_matrix)
model_gru_history = model_gru.fit(x_train, y_train, epochs=20, batch_size=120,
          validation_data=(x_test, y_test))
# predictions = model_gru.predict(x_test)
# predictions = predictions.argmax(axis=1)
# print(classification_report(y_test.argmax(axis=1), predictions))
#, callbacks=EarlyStopping(monitor='val_accuracy', mode='max',patience=3)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1b635dd8040>

In [61]:
model_gru.evaluate(x_test, y_test)



[1.6071900129318237, 0.7499328851699829]

In [None]:
max_val_acc_epoch = np.argmax(list(model_gru_history.history['val_accuracy']))+1
max_val_loss_epoch = np.argmin(list(model_gru_history.history['val_loss']))+1
epochs = range(1, len(model_gru_history.history['accuracy']) + 1)
plt.figure(figsize=(10,3))
plt.minorticks_on()
plt.axvline(x=max_val_acc_epoch, color='0.5', linestyle='--')
plt.axvline(x=max_val_loss_epoch, color='0.5', linestyle=':')
plt.plot(epochs,model_gru_history.history['accuracy'], label='Train')
plt.plot(epochs,model_gru_history.history['val_accuracy'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

plt.figure(figsize=(10,3))
plt.minorticks_on()
plt.axvline(x=max_val_acc_epoch, color='0.5', linestyle='--')
plt.axvline(x=max_val_loss_epoch, color='0.5', linestyle=':')
plt.plot(epochs,model_gru_history.history['loss'], label='Train')
plt.plot(epochs,model_gru_history.history['val_loss'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
import visualkeras
from PIL import ImageFont
visualkeras.layered_view(model_gru, legend=True)

In [None]:
model_gru.save_weights("models/model-gru.h5")

### Using a non deeplearning model (TFIDF vectorization) and other traditional machine learning models

In [62]:
data

Unnamed: 0,clean_comment,category
28732,pakistan seems have closed their airspace agai...,-1
19562,please read and make video about essar tapes e...,0
36608,needed this stand against pakistan atleast yr...,1
15616,anything get little upset when boyfriend play...,-1
14096,from india its great see president trump host...,1
...,...,...
5485,what that cube thing they looking,0
25093,much was looking forward the modi govt come i...,0
4473,not know real joke anymore fuck you april fuc...,-1
13206,points upvoted votes the army here,0


In [63]:
from sklearn.model_selection import train_test_split

x = data['clean_comment'].astype('str')
y = data['category']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

x_train_vec.shape, x_test_vec.shape 

((19864, 10000), (4967, 10000))

## Using RandomForestClassifier

In [65]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(x_train_vec, y_train)
model.score(x_test_vec, y_test)

0.7608214213811154

## Using Logistic Regression

In [66]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(x_train_vec, y_train)
model.score(x_test_vec, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8361183813166901