# Data Preprocessing + Modelling

In [68]:
import os
import re
import h5py

import numpy as np
import tensorflow as tf

np.random.seed(2021)
tf.random.set_seed(2021)

import pandas as pd
import keras
# from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from sklearn.metrics import f1_score, classification_report, log_loss, confusion_matrix

import nltk
from nltk.stem.wordnet import WordNetLemmatizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional, Flatten
from keras.layers import Dropout, Conv1D, GlobalMaxPool1D, GRU, GlobalAvgPool1D
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [18]:
df = pd.read_csv('data/data_after_preprocessing.csv')

In [19]:
df['label'].value_counts()

0    7221
1    2017
Name: label, dtype: int64

## Text Preprocessing

In [20]:
# Remove punctuation
df['tweet'] = df['tweet'].str.replace('[^\w\s]','')
df['tweet'].head()

0    service connected covid19 pandemic impacting t...
1    im not gone lie ion like normal girls i like e...
3    why am i helping my suicidal irl im literally ...
4    the polluter pays principle is a threat to thi...
Name: tweet, dtype: object

In [21]:
# Lemmatization
nltk.download('wordnet')

df['tweet'] = df['tweet'].apply(lambda x:' '.join(WordNetLemmatizer().lemmatize(i) for i in x.split(' ')))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
df['tweet'].head()

0    service connected covid19 pandemic impacting t...
1    im not gone lie ion like normal girl i like em...
3    why am i helping my suicidal irl im literally ...
4    the polluter pay principle is a threat to this...
Name: tweet, dtype: object

In [8]:
df.dtypes

tweet              object
label               int64
day                 int64
nlikes              int64
nreplies            int64
nretweets           int64
reply_to            int64
url                 int64
join_time           int64
tweets              int64
following           int64
followers           int64
likes               int64
media               int64
day_after           int64
tweet_length        int64
tweet_sentiment     int64
bio_sentiment       int64
first_person        int64
second_person       int64
third_person        int64
text_vec           object
dtype: object

### Train test split

In [30]:
data_y = df['label']
data_x = df['tweet']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size = 0.2, stratify=data_y, random_state = 2021)

In [31]:
x_train.apply(lambda x : len(x.split(' '))).quantile(0.95)

53.0

### Tokenizer

In [33]:
# Run keras Tokenizer
# Tokenize the sentences
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(x_train)
x_train_tok = tokenizer.texts_to_sequences(x_train)
x_test_tok = tokenizer.texts_to_sequences(x_test)

### Pad sequences

In [34]:
#training constants
MAX_SEQ_LEN = 53 # Based on above

In [35]:
train_text_vec = pad_sequences(x_train_tok, maxlen=MAX_SEQ_LEN)
test_text_vec = pad_sequences(x_test_tok, maxlen=MAX_SEQ_LEN)

In [38]:
print('Number of Tokens:', len(tokenizer.word_index))
print("Max Token Index:", train_text_vec.max(), "\n")

print('Sample Tweet Before Processing:', x_train.values[0])
print('Sample Tweet After Processing:', tokenizer.sequences_to_texts([train_text_vec[0]]), '\n')

print('What the model will interpret:', train_text_vec[0].tolist())

Number of Tokens: 14216
Max Token Index: 14216 

Sample Tweet Before Processing: we can at least prevent people from such suicidal act  using coronil remains their choice eventually marne ke baad coronil kaam ka nahi bol ke kya fayda
Sample Tweet After Processing: ['we can at least prevent people from such suicidal act using coronil remains their choice eventually marne ke baad coronil kaam ka nahi bol ke kya fayda'] 

What the model will interpret: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 45, 43, 358, 788, 27, 42, 296, 1, 372, 470, 2691, 2146, 79, 768, 1153, 4575, 1957, 4576, 2691, 4577, 1958, 3629, 4578, 1957, 4579, 4580]


In [39]:
# One Hot Encode Y values:
encoder = LabelEncoder()

y_train_label = encoder.fit_transform(y_train.values)
y_train_label = to_categorical(y_train_label) 

y_test_label = encoder.fit_transform(y_test.values)
y_test_label = to_categorical(y_test_label) 

### Get class weights for the training data, this will be used in training

In [42]:
from collections import Counter

ctr = Counter(y_train.values)
print('Distribution of Classes:', ctr)

Distribution of Classes: Counter({0: 5776, 1: 1614})


In [47]:
# get class weights for the training data, this will be used data
y_train_int = np.argmax(y_train_label,axis=1)
cws_raw = class_weight.compute_class_weight('balanced', np.unique(y_train_int), y_train_int)
label = [0,1]

cws = dict(zip(label, cws_raw))

print(cws)

{0: 0.6397160664819944, 1: 2.2893432465923174}


## Modelling

### Bidirectional LSTM

In [69]:
DEFAULT_BATCH_SIZE = 128
DEFAULT_EPOCHS = 100


model = Sequential()
model.add(Embedding(input_dim = (len(tokenizer.word_counts) + 1), output_dim = 128, input_length = MAX_SEQ_LEN))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(64, activation='relu'))
model.add(Dense(2, activation='softmax'))

model.compile(optimizer=tf.keras.optimizers.Adam(), 
              loss='categorical_crossentropy',
              metrics=['accuracy'])

early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.002, patience=2, 
                                              verbose=0, mode='auto', baseline=None)
checkpoint = ModelCheckpoint('LSTM_models/best_LSTM_model.h5', monitor='loss', mode='auto', 
                             verbose = 1, save_best_only=True)
callbacks_list = [checkpoint, early_stop]

model.fit(x=train_text_vec,
          y=y_train_label,
          class_weight=cws,
          batch_size=DEFAULT_BATCH_SIZE,
          epochs=DEFAULT_EPOCHS,
          callbacks=callbacks_list,
          verbose=1,
          validation_data=(
              test_text_vec,
              y_test_label
          ))

Epoch 1/100

Epoch 00001: loss improved from inf to 0.59100, saving model to LSTM_models\best_LSTM_model.h5
Epoch 2/100

Epoch 00002: loss improved from 0.59100 to 0.39309, saving model to LSTM_models\best_LSTM_model.h5
Epoch 3/100

Epoch 00003: loss improved from 0.39309 to 0.29655, saving model to LSTM_models\best_LSTM_model.h5
Epoch 4/100

Epoch 00004: loss improved from 0.29655 to 0.23611, saving model to LSTM_models\best_LSTM_model.h5
Epoch 5/100

Epoch 00005: loss improved from 0.23611 to 0.18397, saving model to LSTM_models\best_LSTM_model.h5
Epoch 6/100

Epoch 00006: loss improved from 0.18397 to 0.13678, saving model to LSTM_models\best_LSTM_model.h5
Epoch 7/100

Epoch 00007: loss improved from 0.13678 to 0.08906, saving model to LSTM_models\best_LSTM_model.h5
Epoch 8/100

Epoch 00008: loss improved from 0.08906 to 0.07393, saving model to LSTM_models\best_LSTM_model.h5
Epoch 9/100

Epoch 00009: loss improved from 0.07393 to 0.06550, saving model to LSTM_models\best_LSTM_model

<tensorflow.python.keras.callbacks.History at 0x1dbda82ffa0>

In [82]:
model.load_weights('LSTM_models/best_LSTM_model.h5')
results = model.evaluate(test_text_vec, y_test_label)
print("Test loss:\n", results[0])
print("\n")
print("Test accuracy:\n", results[1])
print("\n")

y_test_hat = model.predict(test_text_vec)
confusion = confusion_matrix(np.argmax(y_test_label,axis=1), np.argmax(y_test_hat,axis=1))
class_report = classification_report(np.argmax(y_test_label, axis=1), np.argmax(y_test_hat, axis=1))
                                     
print("Confusion matrix:\n", confusion)
print("\n")
print("Classification report:\n",class_report)

Test loss:
 1.3180272579193115


Test accuracy:
 0.7992424368858337


Confusion matrix:
 [[1270  175]
 [ 196  207]]


Classification report:
               precision    recall  f1-score   support

           0       0.87      0.88      0.87      1445
           1       0.54      0.51      0.53       403

    accuracy                           0.80      1848
   macro avg       0.70      0.70      0.70      1848
weighted avg       0.80      0.80      0.80      1848



### Bidirectional LSTM with CNN layer

In [71]:
DEFAULT_BATCH_SIZE = 128
DEFAULT_EPOCHS = 100

model_cnn = Sequential()
model_cnn.add(Embedding(input_dim = (len(tokenizer.word_counts) + 1), output_dim = 128, input_length = MAX_SEQ_LEN))
model_cnn.add(SpatialDropout1D(0.2))
model_cnn.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model_cnn.add(Conv1D(64, 4))
model_cnn.add(GlobalMaxPool1D())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dense(2, activation='softmax'))

model_cnn.compile(optimizer=tf.keras.optimizers.Adam(), 
              loss='categorical_crossentropy',
              metrics=['accuracy'])

early_stop_cnn = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.002, patience=2, 
                                              verbose=0, mode='auto', baseline=None)
checkpoint_cnn = ModelCheckpoint('LSTM_models/best_LSTM_CNN_model.h5', monitor='loss', mode='auto', 
                             verbose = 1, save_best_only=True)
callbacks_list_cnn = [checkpoint_cnn, early_stop_cnn]

model_cnn.fit(x=train_text_vec,
              y=y_train_label,
              class_weight=cws,
              batch_size=DEFAULT_BATCH_SIZE,
              epochs=DEFAULT_EPOCHS,
              callbacks=callbacks_list_cnn,
              verbose=1,
              validation_data=(
                  test_text_vec,
                  y_test_label
              ))

Epoch 1/100

Epoch 00001: loss improved from inf to 0.56902, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 2/100

Epoch 00002: loss improved from 0.56902 to 0.35814, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 3/100

Epoch 00003: loss improved from 0.35814 to 0.25431, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 4/100

Epoch 00004: loss improved from 0.25431 to 0.17871, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 5/100

Epoch 00005: loss improved from 0.17871 to 0.12314, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 6/100

Epoch 00006: loss improved from 0.12314 to 0.09564, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 7/100

Epoch 00007: loss improved from 0.09564 to 0.06468, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 8/100

Epoch 00008: loss improved from 0.06468 to 0.05896, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 9/100

Epoch 00009: loss did not improve from 0.05896
Epoch 10/100

Ep

<tensorflow.python.keras.callbacks.History at 0x1dc01ab26d0>

In [84]:
model_cnn.load_weights('LSTM_models/best_LSTM_CNN_model.h5')
results_cnn = model_cnn.evaluate(test_text_vec, y_test_label)
print("Test loss:\n", results_cnn[0])
print("\n")
print("Test accuracy:\n", results_cnn[1])
print("\n")

y_test_hat_cnn = model_cnn.predict(test_text_vec)
confusion_cnn = confusion_matrix(np.argmax(y_test_label,axis=1), np.argmax(y_test_hat_cnn,axis=1))
class_report_cnn = classification_report(np.argmax(y_test_label, axis=1), np.argmax(y_test_hat_cnn, axis=1))
                                     
print("Confusion matrix:\n", confusion_cnn)
print("\n")
print("Classification report:\n",class_report_cnn)

Test loss:
 0.9623579382896423


Test accuracy:
 0.7949134111404419


Confusion matrix:
 [[1229  216]
 [ 163  240]]


Classification report:
               precision    recall  f1-score   support

           0       0.88      0.85      0.87      1445
           1       0.53      0.60      0.56       403

    accuracy                           0.79      1848
   macro avg       0.70      0.72      0.71      1848
weighted avg       0.81      0.79      0.80      1848

