# Data Preprocessing + Modelling

In [1]:
import os
import re
import h5py
import pickle

import evaluate

import numpy as np
import tensorflow as tf

np.random.seed(2021)
tf.random.set_seed(2021)

import pandas as pd
import keras
# from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from sklearn.metrics import f1_score, classification_report, log_loss, confusion_matrix

import nltk
from nltk.stem.wordnet import WordNetLemmatizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional, Flatten
from keras.layers import Dropout, Conv1D, GlobalMaxPool1D, GRU, GlobalAvgPool1D
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [2]:
df = pd.read_csv('data/data_after_preprocessing.csv')

In [3]:
df['label'].value_counts()

0    7221
1    2017
Name: label, dtype: int64

## Text Preprocessing

In [4]:
# Remove punctuation
df['tweet'] = df['tweet'].str.replace('[^\w\s]','')
df['tweet'].head()

0    service connected covid19 pandemic impacting t...
1    im not gone lie ion like normal girls i like e...
3    why am i helping my suicidal irl im literally ...
4    the polluter pays principle is a threat to thi...
Name: tweet, dtype: object

In [5]:
# Lemmatization
nltk.download('wordnet')

df['tweet'] = df['tweet'].apply(lambda x:' '.join(WordNetLemmatizer().lemmatize(i) for i in x.split(' ')))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
df['tweet'].head()

0    service connected covid19 pandemic impacting t...
1    im not gone lie ion like normal girl i like em...
3    why am i helping my suicidal irl im literally ...
4    the polluter pay principle is a threat to this...
Name: tweet, dtype: object

In [7]:
df.dtypes

tweet              object
label               int64
day                 int64
nlikes              int64
nreplies            int64
nretweets           int64
reply_to            int64
url                 int64
join_time           int64
tweets              int64
following           int64
followers           int64
likes               int64
media               int64
day_after           int64
tweet_length        int64
tweet_sentiment     int64
bio_sentiment       int64
first_person        int64
second_person       int64
third_person        int64
dtype: object

### Train test split

In [8]:
data_y = df['label']
data_x = df['tweet']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size = 0.2, stratify=data_y, random_state = 2021)

In [9]:
x_train.apply(lambda x : len(x.split(' '))).quantile(0.95)

53.0

### Tokenizer

In [10]:
# Run keras Tokenizer
# Tokenize the sentences
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(x_train)
x_train_tok = tokenizer.texts_to_sequences(x_train)
x_test_tok = tokenizer.texts_to_sequences(x_test)

In [11]:
import pickle

with open('LSTM_models/tokenizer.pkl','wb') as f:
    pickle.dump(tokenizer,f)

### Pad sequences

In [12]:
#training constants
MAX_SEQ_LEN = 53 # Based on above

In [13]:
train_text_vec = pad_sequences(x_train_tok, maxlen=MAX_SEQ_LEN)
test_text_vec = pad_sequences(x_test_tok, maxlen=MAX_SEQ_LEN)

In [14]:
print('Number of Tokens:', len(tokenizer.word_index))
print("Max Token Index:", train_text_vec.max(), "\n")

print('Sample Tweet Before Processing:', x_train.values[0])
print('Sample Tweet After Processing:', tokenizer.sequences_to_texts([train_text_vec[0]]), '\n')

print('What the model will interpret:', train_text_vec[0].tolist())

Number of Tokens: 14216
Max Token Index: 14216 

Sample Tweet Before Processing: we can at least prevent people from such suicidal act  using coronil remains their choice eventually marne ke baad coronil kaam ka nahi bol ke kya fayda
Sample Tweet After Processing: ['we can at least prevent people from such suicidal act using coronil remains their choice eventually marne ke baad coronil kaam ka nahi bol ke kya fayda'] 

What the model will interpret: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 45, 43, 358, 788, 27, 42, 296, 1, 372, 470, 2691, 2146, 79, 768, 1153, 4575, 1957, 4576, 2691, 4577, 1958, 3629, 4578, 1957, 4579, 4580]


In [15]:
# One Hot Encode Y values:
encoder = LabelEncoder()

y_train_label = encoder.fit_transform(y_train.values)
y_train_label = to_categorical(y_train_label) 

y_test_label = encoder.fit_transform(y_test.values)
y_test_label = to_categorical(y_test_label) 

### Get class weights for the training data, this will be used in training

In [16]:
from collections import Counter

ctr = Counter(y_train.values)
print('Distribution of Classes:', ctr)

Distribution of Classes: Counter({0: 5776, 1: 1614})


In [17]:
# get class weights for the training data, this will be used data
y_train_int = np.argmax(y_train_label,axis=1)
cws_raw = class_weight.compute_class_weight('balanced', np.unique(y_train_int), y_train_int)
label = [0,1]

cws = dict(zip(label, cws_raw))

print(cws)

{0: 0.6397160664819944, 1: 2.2893432465923174}




## Modelling

### Bidirectional LSTM

In [30]:
DEFAULT_BATCH_SIZE = 64
DEFAULT_EPOCHS = 100

model = Sequential()
model.add(Embedding(input_dim = (len(tokenizer.word_counts) + 1), output_dim = 128, input_length = MAX_SEQ_LEN))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(64, activation='relu'))
model.add(Dense(2, activation='sigmoid'))

model.compile(optimizer = 'adam',
              loss = keras.losses.BinaryCrossentropy(from_logits = False),
              metrics = ['accuracy']
             )

early_stop = keras.callbacks.EarlyStopping(monitor = 'loss',
                                              verbose = 1,
                                              patience = 10,
                                              mode = 'auto',
                                              restore_best_weights = True
                                             )

checkpoint = ModelCheckpoint('LSTM_models/best_LSTM_model.h5', monitor='loss', mode='auto', 
                             verbose = 1, save_best_only=True)

callbacks_list = [checkpoint, early_stop]

model.fit(x=train_text_vec,
          y=y_train_label,
          class_weight=cws,
          batch_size=DEFAULT_BATCH_SIZE,
          epochs=DEFAULT_EPOCHS,
          callbacks=callbacks_list,
          verbose=1,
          validation_data=(
              test_text_vec,
              y_test_label,
          ))

Epoch 1/100

Epoch 00001: loss improved from inf to 0.56624, saving model to LSTM_models\best_LSTM_model.h5
Epoch 2/100

Epoch 00002: loss improved from 0.56624 to 0.38841, saving model to LSTM_models\best_LSTM_model.h5
Epoch 3/100

Epoch 00003: loss improved from 0.38841 to 0.29063, saving model to LSTM_models\best_LSTM_model.h5
Epoch 4/100

Epoch 00004: loss improved from 0.29063 to 0.20948, saving model to LSTM_models\best_LSTM_model.h5
Epoch 5/100

Epoch 00005: loss improved from 0.20948 to 0.16417, saving model to LSTM_models\best_LSTM_model.h5
Epoch 6/100

Epoch 00006: loss improved from 0.16417 to 0.11353, saving model to LSTM_models\best_LSTM_model.h5
Epoch 7/100

Epoch 00007: loss improved from 0.11353 to 0.08735, saving model to LSTM_models\best_LSTM_model.h5
Epoch 8/100

Epoch 00008: loss improved from 0.08735 to 0.08411, saving model to LSTM_models\best_LSTM_model.h5
Epoch 9/100

Epoch 00009: loss improved from 0.08411 to 0.05828, saving model to LSTM_models\best_LSTM_model


Epoch 00077: loss did not improve from 0.00563
Epoch 78/100

Epoch 00078: loss did not improve from 0.00563
Epoch 79/100

Epoch 00079: loss did not improve from 0.00563
Epoch 80/100

Epoch 00080: loss did not improve from 0.00563
Epoch 81/100

Epoch 00081: loss did not improve from 0.00563
Restoring model weights from the end of the best epoch.
Epoch 00081: early stopping


<tensorflow.python.keras.callbacks.History at 0x1cca0540310>

In [4]:
reconstructed_model = keras.models.load_model("LSTM_models/best_LSTM_model.h5")
reconstructed_model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 53, 128)           1819776   
_________________________________________________________________
spatial_dropout1d_5 (Spatial (None, 53, 128)           0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_10 (Dense)             (None, 64)                16448     
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 130       
Total params: 2,099,522
Trainable params: 2,099,522
Non-trainable params: 0
_________________________________________________________________


In [31]:
model.load_weights('LSTM_models/best_LSTM_model.h5')

y_test_hat = model.predict(test_text_vec)
confusion = confusion_matrix(np.argmax(y_test_label,axis=1), np.argmax(y_test_hat,axis=1))
class_report = classification_report(np.argmax(y_test_label, axis=1), np.argmax(y_test_hat, axis=1))
                                     
print("Confusion matrix:\n", confusion)
print("\n")
print("Classification report:\n",class_report)

perf_metrics = evaluate.performance(y_test, np.argmax(y_test_hat,axis=1), y_test_hat)
print(perf_metrics['report'])

threshold_metrics = evaluate.threshold(y_test_hat, y_test)
print(threshold_metrics)

Confusion matrix:
 [[1246  199]
 [ 176  227]]


Classification report:
               precision    recall  f1-score   support

           0       0.88      0.86      0.87      1445
           1       0.53      0.56      0.55       403

    accuracy                           0.80      1848
   macro avg       0.70      0.71      0.71      1848
weighted avg       0.80      0.80      0.80      1848


The evaluation report of classification is:
Confusion Matrix:
[[1246  199]
 [ 176  227]]
Accuracy: 0.797077922077922
Precision: 0.5328638497652582
Recall: 0.5632754342431762
F2 Score: 0.5569185475956819
AUC Score: 0.8074493204083562

{'threshold': 0.01, 'score': 0.6023255813953488, 'y_pred': array([0, 0, 0, ..., 1, 0, 0])}


In [8]:
# import seaborn as sns

# sns.distplot(y_test_hat)

### Bidirectional LSTM with CNN layer

In [27]:
DEFAULT_BATCH_SIZE = 64
DEFAULT_EPOCHS = 100

model_cnn = Sequential()
model_cnn.add(Embedding(input_dim = (len(tokenizer.word_counts) + 1), output_dim = 128, input_length = MAX_SEQ_LEN))
model_cnn.add(SpatialDropout1D(0.2))
model_cnn.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model_cnn.add(Conv1D(64, 4))
model_cnn.add(GlobalMaxPool1D())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dense(2, activation='sigmoid'))


model_cnn.compile(optimizer = 'adam', 
              loss=keras.losses.BinaryCrossentropy(from_logits = False),
              metrics=['accuracy'])

early_stop_cnn = keras.callbacks.EarlyStopping(monitor = 'loss',
                                                  verbose = 1,
                                                  patience = 10,
                                                  mode = 'auto',
                                                  restore_best_weights = True
                                                 )

checkpoint_cnn = ModelCheckpoint('LSTM_models/best_LSTM_CNN_model.h5', monitor='loss', mode='auto', 
                             verbose = 1, save_best_only=True)

callbacks_list_cnn = [checkpoint_cnn, early_stop_cnn]

model_cnn.fit(x=train_text_vec,
              y=y_train_label,
              class_weight=cws,
              batch_size=DEFAULT_BATCH_SIZE,
              epochs=DEFAULT_EPOCHS,
              callbacks=callbacks_list_cnn,
              verbose=1,
              validation_data=(
                  test_text_vec,
                  y_test_label
              ))

Epoch 1/100

Epoch 00001: loss improved from inf to 0.54548, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 2/100

Epoch 00002: loss improved from 0.54548 to 0.35497, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 3/100

Epoch 00003: loss improved from 0.35497 to 0.24929, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 4/100

Epoch 00004: loss improved from 0.24929 to 0.17787, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 5/100

Epoch 00005: loss improved from 0.17787 to 0.11824, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 6/100

Epoch 00006: loss improved from 0.11824 to 0.08743, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 7/100

Epoch 00007: loss improved from 0.08743 to 0.06990, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 8/100

Epoch 00008: loss improved from 0.06990 to 0.05597, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 9/100

Epoch 00009: loss improved from 0.05597 to 0.04819, saving mode


Epoch 00037: loss did not improve from 0.00858
Epoch 38/100

Epoch 00038: loss did not improve from 0.00858
Epoch 39/100

Epoch 00039: loss improved from 0.00858 to 0.00747, saving model to LSTM_models\best_LSTM_CNN_model.h5
Epoch 40/100

Epoch 00040: loss did not improve from 0.00747
Epoch 41/100

Epoch 00041: loss did not improve from 0.00747
Epoch 42/100

Epoch 00042: loss did not improve from 0.00747
Epoch 43/100

Epoch 00043: loss did not improve from 0.00747
Epoch 44/100

Epoch 00044: loss did not improve from 0.00747
Epoch 45/100

Epoch 00045: loss did not improve from 0.00747
Epoch 46/100

Epoch 00046: loss did not improve from 0.00747
Epoch 47/100

Epoch 00047: loss did not improve from 0.00747
Epoch 48/100

Epoch 00048: loss did not improve from 0.00747
Epoch 49/100

Epoch 00049: loss did not improve from 0.00747
Restoring model weights from the end of the best epoch.
Epoch 00049: early stopping


<tensorflow.python.keras.callbacks.History at 0x1cd012fba60>

In [6]:
reconstructed_model_cnn = keras.models.load_model("LSTM_models/best_LSTM_CNN_model.h5")
reconstructed_model_cnn.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 53, 128)           1819776   
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 53, 128)           0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 53, 256)           263168    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 50, 64)            65600     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 64)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_9 (Dense)              (None, 2)                

In [28]:
model_cnn.load_weights('LSTM_models/best_LSTM_CNN_model.h5')

y_test_hat_cnn = model_cnn.predict(test_text_vec)
confusion_cnn = confusion_matrix(np.argmax(y_test_label,axis=1), np.argmax(y_test_hat_cnn,axis=1))
class_report_cnn = classification_report(np.argmax(y_test_label, axis=1), np.argmax(y_test_hat_cnn, axis=1))
                                     
print("Confusion matrix:\n", confusion_cnn)
print("\n")
print("Classification report:\n",class_report_cnn)

perf_metrics_cnn = evaluate.performance(y_test, np.argmax(y_test_hat_cnn,axis=1), y_test_hat_cnn)
print(perf_metrics_cnn['report'])

threshold_metrics_cnn = evaluate.threshold(y_test_hat_cnn, y_test)
print(threshold_metrics_cnn)

Confusion matrix:
 [[1045  400]
 [  95  308]]


Classification report:
               precision    recall  f1-score   support

           0       0.92      0.72      0.81      1445
           1       0.44      0.76      0.55       403

    accuracy                           0.73      1848
   macro avg       0.68      0.74      0.68      1848
weighted avg       0.81      0.73      0.75      1848


The evaluation report of classification is:
Confusion Matrix:
[[1045  400]
 [  95  308]]
Accuracy: 0.7321428571428571
Precision: 0.4350282485875706
Recall: 0.7642679900744417
F2 Score: 0.6637931034482758
AUC Score: 0.8174452849304954

{'threshold': 0.03, 'score': 0.6939868204283361, 'y_pred': array([1, 0, 0, ..., 0, 0, 0])}


In [7]:
# import seaborn as sns

# sns.distplot(y_test_hat_cnn)