<a href="https://colab.research.google.com/github/DSPOWER93/quora-insincere/blob/main/Bi_LSTM_insincere_question_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Synopsis:

The Following file is a working on NLP classifier to identify insincere questions. Data is taken from kaggle Competition [Quora Insicere Question Classification](https://www.kaggle.com/c/quora-insincere-questions-classification).

- **Champion Model**: Bi-Directional LSTM + Conv1D
- **Params**:
- **Framework**:


#### Mounting G-drive to get training & embedding Data.

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


Importing python Library & packages

In [2]:
#  Importing base Libraries.
import numpy as np
np.set_printoptions(suppress=True)
import pandas as pd
import re, os 


# TensorFlow & Keras Libraries
import tensorflow as tf
from tensorflow import keras
physical_devices = tf.config.list_physical_devices("GPU")
tf.config.experimental.set_memory_growth(physical_devices[0], True)
from tensorflow.keras import layers
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import initializers
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D, Dropout, CuDNNLSTM
from keras.initializers import Constant
from sklearn.model_selection import train_test_split 

# For Parallel Processing
import multiprocessing
import concurrent.futures as c

### Installing Remaining packages

In [3]:
%%capture
!pip install pyspellchecker

### Importing the raw Data
Quora's question classfier data is stored on gdrive for easy loading.

In [4]:
# Loading csv file
# Location to be changed as per file location
df = pd.read_csv('/content/drive/MyDrive/Quora_project/train.csv')

In [5]:
#  Instpecting the Dataset
print(df.info())

print(df.groupby('target').count()/ len(df) * 100)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306122 entries, 0 to 1306121
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   qid            1306122 non-null  object
 1   question_text  1306122 non-null  object
 2   target         1306122 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 29.9+ MB
None
              qid  question_text
target                          
0       93.812982      93.812982
1        6.187018       6.187018


### Importing pre-processing libraries

In [6]:
# Importing Libraries for NLP 
import re
import spacy
spacy.prefer_gpu()
import string

# to make spacy work in pipeline mode.
nlp_vocab = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])
nlp_vocab.add_pipe(nlp_vocab.create_pipe('sentencizer'))

# Importing spellchecker & NLP
from spellchecker import SpellChecker

### Corpus Cleaning

Defining function to clean corpus. 

In [7]:
# lemmatization of words from spacy. 
def spacy_lemmatize(x):
  x = nlp_vocab(x)
  x = [s.lemma_ for s in x]
  x = " ".join(x)
  return x

# Spelling collection 
spell = SpellChecker()
def correct_spellings(x, spell=spell):
    """correct the misspelled words of a given corpus"""
    x = x.split()
    misspelled = spell.unknown(x)
    result = map(lambda word : spell.correction(word) if word in  misspelled else word, x)
    return " ".join(result)

# corpus cleaning. keeping Lemmatization default as False for pre-processing as it's time serialbased activity we will later use it in parallel computing.
def corpus_cleaning(x, correct_spelling=True, remove_emojis=True, remove_stop_words=False, lemmatize=False):
    """Apply function to a clean a corpus"""
    x = x.lower().strip()
    # romove urls
    url = re.compile(r'https?://\S+|www\.\S+')
    x = url.sub(r'',x)
    # remove html tags
    html = re.compile(r'<.*?>')
    x = html.sub(r'',x)
    # remove punctuation
    operator = str.maketrans('','',string.punctuation) #????
    x = x.translate(operator)
    if correct_spelling:
        x = correct_spellings(x)
    if lemmatize:
        x = spacy_lemmatize(x)
    if remove_emojis:
        x = x.encode('ascii', 'ignore').decode('utf8').strip()
    if remove_stop_words:
        x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    return x

### Lemmatization using parallel processing

Spacy in general creates metadata of each corpus element which is quite of time consuming task in general. This creates room for parallel processing as running for loop will be doing sequential job not utilizing potential power of computing instance. The codes used to execute were used from well witten article on Spacy for parallel processing. [Link](https://prrao87.github.io/blog/spacy/nlp/performance/2020/05/02/spacy-multiprocess.html)

In [8]:
#%%time
from joblib import Parallel, delayed

def lemmatize_pipe(doc):
    lemma_list = [s.lemma_ for s in doc] 
    return lemma_list
    
def preprocess_pipe(texts):
    preproc_pipe = []
    for doc in nlp_vocab.pipe(texts, batch_size=20):
        preproc_pipe.append(lemmatize_pipe(doc))
    return preproc_pipe

def chunker(iterable, total_length, chunksize):
    return (iterable[pos: pos + chunksize] for pos in range(0, total_length, chunksize))

def flatten(list_of_lists):
    "Flatten a list of lists to a combined list"
    return [item for sublist in list_of_lists for item in sublist]

def process_chunk(texts):
    preproc_pipe = []
    for doc in nlp_vocab.pipe(texts, batch_size=20):
        preproc_pipe.append(lemmatize_pipe(doc))
    return preproc_pipe

def preprocess_parallel(texts, chunksize=1000):
    executor = Parallel(n_jobs=7, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk)
    tasks = (do(chunk) for chunk in chunker(texts, len(texts), chunksize=chunksize))
    result = executor(tasks)
    return flatten(result)

#### Sample test code for Cleaning using Parallelization.

In [9]:
import multiprocessing
pool = multiprocessing.Pool()

corpuss = ['I am not, sure whether  halth living this is an rigth answer', 'greatness', 'are you even seriuos', 'please dont tell its a joke']

preprocess_parallel(corpuss)
pool_processing = pool.map(corpus_cleaning, corpuss)
print(pool_processing)

['i am not sure whether health living this is an rigth answer', 'greatness', 'are you even serious', 'please dont tell its a joke']


### View of original Data

In [10]:
# normal questions
print(df[df['target'] == 0]['question_text'][:5])
# immature questions
print(df[df['target'] == 1]['question_text'][:5])

0    How did Quebec nationalists see their province...
1    Do you have an adopted dog, how would you enco...
2    Why does velocity affect time? Does velocity a...
3    How did Otto von Guericke used the Magdeburg h...
4    Can I convert montra helicon D to a mountain b...
Name: question_text, dtype: object
22     Has the United States become the largest dicta...
30     Which babies are more sweeter to their parents...
110    If blacks support school choice and mandatory ...
114    I am gay boy and I love my cousin (boy). He is...
115                 Which races have the smallest penis?
Name: question_text, dtype: object


### Cleaning of Data

Before moving to corpus Cleaning demonstrating advantage and comparison of using parallel processing in pre-processing using small textual corpus, comparing traditional sequential Methods.

In [None]:
%%time
dd = list(df[:500]['question_text'])
fd = [corpus_cleaning(k) for k in dd]

CPU times: user 33.4 s, sys: 112 ms, total: 33.5 s
Wall time: 33.4 s


In [None]:
%%time
with c.ProcessPoolExecutor() as executor:
  results = [ k for k in executor.map(corpus_cleaning,dd)]

CPU times: user 224 ms, sys: 107 ms, total: 331 ms
Wall time: 12.8 s


In [None]:
fd == results

True

#### Observation

The Computational time has reduced by almost more than 50% for Calculation. we would inculcate the option 

#### One time activity of cleaning the Data


We will Proceed ahead with cleaning the data as one time activity of one million records. As cleaning Large Data takes quite amount of time. The objective is that not to repeat the cleaning process again in analysis. We will split the Data in small Parts and download seperately, Because if Python crashes midway we can resume from checkpoints.

In [None]:
'''
# %%time
# corpus transformation.

from google.colab import files

corpus = list(df[0:1000000]['question_text'])
target =  list(df[0:1000000]['target'])
rows = list(range(0,len(corpus)))

f = 0 
for i in range(0,len(corpus), int(len(corpus)/5)):
  f =  f+1
  iter = corpus[i:i+int(len(corpus)/5)]
  iter_target = target[i:i+int(len(corpus)/5)]
  iter_rows = rows[i:i+int(len(corpus)/5)]
  print('completion {}'.format(i),'/{}'.format(len(corpus)))
  # using parallel processing to complete to clean the corpus
  with c.ProcessPoolExecutor() as executor:
    my_list = [ k for k in executor.map(corpus_cleaning,iter)]
  # Parallel computing of lemmatization. 
  my_list = preprocess_parallel(my_list)
  joined_corpus =[]
  for l in my_list:
    joined_corpus.append(" ".join(l))
  iter_df = pd.DataFrame({
      'Rows' : iter_rows,
      'Text' : joined_corpus,
      'target': iter_target
  })
  iter_df.to_csv('clean_data_'+str(f)+'.csv')
  files.download('clean_data_'+str(f)+'.csv') 
'''

### Importing Cleaned Data from above step


In [11]:
clean_df = pd.read_csv('/content/drive/MyDrive/Quora_project/final_data.csv', index_col= 'Unnamed: 0').reset_index()

# Drop Null entries
clean_df = clean_df[clean_df['Text'].notnull()]

### Stop Words Removal 
#### (In order text to maintain the sentiment context better haven't dropped stop words in analysis)

Removing specific Stop Words from Data set which wouldn't be having much impact on the corpus Data. Usually the Stop Words from existing Libraries would be having including Negated words like No, Not, aren't, can't etc... which does have impact on sentiment present in text

In [None]:
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", 
             "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during",
             "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", 
             "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into",
             "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
             "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", 
             "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
             "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
             "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's",
             "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've",
             "your", "yours", "yourself", "yourselves" ]

def drop_stopwords(x): 
    corpus = " ".join([word for word in x.split() if word not in (stopwords)])
    return corpus
drop_stopwords('i am happy who is this')

In [None]:
# cleaning stop words
# clean_df['Text'] =  clean_df['Text'].apply(drop_stopwords)

#### Inspecting final  clean Data set for Model consumption.

In [None]:
# normal questions
print(clean_df[clean_df['target'] == 0]['Text'][:5])
# immature questions
print(clean_df[clean_df['target'] == 1]['Text'][:5])

0    how do quebec nationalist see their province a...
1    do you have a adopt dog how would you encourag...
2    why doe velocity affect time doe velocity affe...
3    how do otto von guericke use the magdeburg hem...
4    can i convert mantra helicon i to a mountain b...
Name: Text, dtype: object
22     have the unite state become the large dictator...
30     which baby be much sweet to their parent dark ...
110    if black support school choice and mandatory s...
114    i be gay boy and i love my cousin boy he be se...
115                      which race have the small penis
Name: Text, dtype: object


### Filtering out Sample Data

The base Dataset is having category ratio  of 93:6, which makes distribution inbalance. Have resized the proportion 90:10 to reduce data imbalance.


In [12]:
import random
random.seed(0)

# Seperating insincere questions.
#  Insincere Question 
insincere=clean_df[clean_df['target'] == 1]
# Normal Question 
sincere=clean_df[clean_df['target'] == 0]


#  Consuming 50% of insincere questions in train & test of Data
top_50 = int(round(len(insincere)*0.5,0))

# Generating seperate Dataframe with 30K 
insincere_train = insincere[:top_50]
insincere_prod =  insincere[top_50:]

sincere_train = sincere[:270000]
sincere_prod = sincere[270000:]

insincere_train = insincere_train.append(sincere_train)
insincere_prod = insincere_prod.append(sincere_prod)


train_shuffle = (list(random.sample(range(len(insincere_train)), len(insincere_train))))
test_shuffle = (list(random.sample(range(len(insincere_prod)), len(insincere_prod))))


insincere_train = insincere_train.iloc[train_shuffle,:]
insincere_prod = insincere_prod.iloc[test_shuffle,:]

del(sincere_train,sincere_prod)


insincere_train.drop(['index','Rows'], axis=1, inplace= True)
insincere_prod.drop(['index','Rows'], axis=1, inplace= True)

print(insincere_train.shape, insincere_prod.shape)
insincere_train.groupby('target').count()/ len(insincere_train) * 100

(300894, 2) (699104, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0_level_0,Text
target,Unnamed: 1_level_1
0,89.732597
1,10.267403


### Train & Test Split (60:40)

In [None]:
from sklearn.model_selection import train_test_split

# Split Train and Validation data
X_train, X_test = train_test_split( insincere_train, test_size=0.4, random_state=0)

y_train = X_train['target']
y_valid = X_test['target']

print(X_train.shape, X_test.shape)

(180536, 2) (120358, 2)


## Create a vocabulary index

Let's use the `TextVectorization` to index the vocabulary found in the dataset.
Later, we'll use the same layer instance to vectorize the samples.

Our layer will only consider the top 30,000 words, and will truncate or pad sequences to be 40 tokens long.  We would do word embedding on  entire Data set. As model would have context of entire Dataset.

Also below cell contains codes to save & reload the vectorizers, this would required for model deployment.

In [None]:
from tensorflow.keras.layers import TextVectorization

vectorizer = TextVectorization(max_tokens=30000, output_sequence_length=40)
text_ds = tf.data.Dataset.from_tensor_slices(X_train['Text']).batch(128)
vectorizer.adapt(text_ds)

In [None]:
# Saving & loading the vectorizer again 
import pickle

# Vector for word "this"
print (vectorizer("this"))

# to save Pickle the config and weights
pickle.dump({'config': vectorizer.get_config(),
             'weights': vectorizer.get_weights()}
            , open("vectorizer.pkl", "wb"))

print ("*"*100)

from_disk = pickle.load(open("vectorizer.pkl", "rb"))
loaded_vector = TextVectorization.from_config(from_disk['config'])
# You have to call `adapt` with some dummy data (BUG in Keras)
loaded_vector.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
loaded_vector.set_weights(from_disk['weights'])


s= [(loaded_vector("this"))]
kk = tf.keras.preprocessing.sequence.pad_sequences( s,maxlen= 40, padding='post')
print(kk)

tf.Tensor(
[55  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0], shape=(40,), dtype=int64)
****************************************************************************************************
[[55  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]


#### Glimpse of First 10 words vocabulary

In [None]:
print(vectorizer.get_vocabulary()[:10])
print(loaded_vector.get_vocabulary()[:10])

['', '[UNK]', 'be', 'the', 'a', 'what', 'to', 'in', 'do', 'of']
['', '[UNK]', 'be', 'the', 'a', 'what', 'to', 'in', 'do', 'of']


As you can see, "be" gets represented as "2". Why not 0, given that "be" was the first word in the vocabulary? That's because index 0 is reserved for padding and index 1 is reserved for "out of vocabulary" tokens.
Creating a dict mapping words to their indices:


In [None]:
# Creating vocabulary with index values
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))
# printing index of flower
print(word_index['flower'])

2335


As you can see, we obtain the same encoding as above for our test sentence:

In [None]:
test = ['iran', 'life']
[word_index[w] for w in test]

[1230, 77]


#### Importing web embedding. 
We would be importing word embedding. The archive contains  text-encoded vectors of various sizes: 50-dimensional, 100-dimensional, 200-dimensional, 300-dimensional.  would be using 50D ones. Word embedding from glove has been used here. downloaded from [Glove](https://nlp.stanford.edu/projects/glove/)

Let's make a dict mapping words (strings) to their NumPy vector representation:

In [None]:
path_to_glove_file = os.path.join(
    os.path.expanduser("~"), "/content/drive/MyDrive/Word_Embeddings/glove.6B.50d.txt"
)

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


### Embedding Matrix

Now, let's prepare a corresponding embedding matrix that we can use in a Keras
`Embedding` layer. It's a simple NumPy matrix where entry at index `i` is the pre-trained
vector for the word of index `i` in our `vectorizer`'s vocabulary.

In [None]:
num_tokens = len(voc) + 2
embedding_dim = 50
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 26097 words (3903 misses)


### Evaluating if the Mapping of words has happend accurately for embedding matrix.

In [None]:
# in Word glove dictionary embedding of Iran as an example
print(embeddings_index['iran'])

# indexing number in vocb for iran
print(word_index['iran'])

#  embedding matrix of iran through Numerical indexing 
print(embedding_matrix[1230])

[-0.18997    0.11493    0.85566   -0.039811   0.10742   -0.44042
  1.2496     0.49928    0.58689    0.8321     0.027948  -0.85445
 -0.39854   -0.18763   -0.050099   0.95036    0.59861    0.25454
  0.6548     0.87505    0.82139   -0.0041283  0.9193    -0.033385
  0.1914    -3.0393     0.58703    0.23673    0.031058   0.17775
  2.4503    -0.35655   -0.68777   -0.43984    0.12271   -0.46345
 -0.29642    0.33648   -1.6442     0.23183   -0.019779   0.0057172
  0.94701   -1.2708     0.53767    0.80297   -0.70422    1.7059
 -0.64729   -0.97299  ]
1230
[-0.18997     0.11493     0.85566002 -0.039811    0.10742    -0.44042
  1.24960005  0.49928001  0.58688998  0.83209997  0.027948   -0.85444999
 -0.39853999 -0.18763    -0.050099    0.95036     0.59860998  0.25454
  0.6548      0.87505001  0.82138997 -0.0041283   0.91930002 -0.033385
  0.19140001 -3.03929996  0.58702999  0.23672999  0.031058    0.17775001
  2.45029998 -0.35655001 -0.68777001 -0.43983999  0.12271    -0.46345001
 -0.29642001  0.336

### Applying Vectorizer (Numerical Indexing) on Train & Test Data.

In [None]:
x_train = vectorizer(np.array([[s] for s in X_train['Text']])).numpy()
x_val = vectorizer(np.array([[s] for s in X_test['Text']])).numpy()

### Loading embedding Layer

Next, we load the pre-trained word embeddings matrix into an `Embedding` layer.

Note that we set `trainable=True` so as to fine tune the embedding as per contextual requirement.

In [None]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=True,
)

### Monitoring Metrics for Epochs

In [None]:
#https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model

from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

### Model 1 - Simple Bi-LSTM model

In [None]:
#  Import K to clear session for model.
from keras import backend as K
K.clear_session()

RANDOM_STATE = 42
# Define weight initializer with a random seed to ensure reproducibility
weight_initializer = tf.keras.initializers.GlorotNormal(seed=RANDOM_STATE)

def simple_LSTM():
  K.clear_session()
  model=Sequential()
  model.add(embedding_layer)
  model.add(Bidirectional(LSTM(64)))
  # Adding Dropout
  model.add(Dropout(0.2))
  model.add((Dense(64, activation= 'relu')))
  model.add(Dropout(0.2))
  model.add(Dense(1, activation='sigmoid',
                  kernel_initializer=weight_initializer))
  # compile the model
  optimzer = keras.optimizers.Adam(   #clipvalue=0.5,
                                   learning_rate= 0.0001) # clip value to avoid the gradient exploding
 
  model.compile(optimizer=optimzer, 
              loss='binary_crossentropy', 
              metrics=['acc',f1_m,precision_m, recall_m, tf.keras.metrics.AUC()])
  return model

In [None]:
# Model Initiation
simple_LSTM_Model = simple_LSTM()

# Batch Size 
BATCH_SIZE = 512 
NUM_STEPS = len(X_train.index) // int(BATCH_SIZE)

# early stopping
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor= 'val_auc', 
                                                 patience=5,
                                                 mode='max',
                                                 restore_best_weights=True)
# fit the model
simple_LSTM_Model.fit(x_train, y_train,
                      epochs=100,
                      batch_size= BATCH_SIZE  ,
                      steps_per_epoch = NUM_STEPS,
                      callbacks=[earlyStopping], 
                      validation_data=(x_val,y_valid),
                      verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


<keras.callbacks.History at 0x7f6844a8ead0>

### Model-1 Test predict Results  

In [None]:

# model results
train_pred = simple_LSTM_Model.predict(x_train)
test_pred = simple_LSTM_Model.predict(x_val)

train_pred_binary = np.where(train_pred> 0.49,1,0)
test_pred_binary = np.where(test_pred> 0.49,1,0)
# reshaping array 
train_pred_binary = train_pred_binary.reshape(180536,)
test_pred_binary = test_pred_binary.reshape(120358,)


# Accuracy
acc = sum(train_pred_binary == y_train)/  len(y_train)
test_acc = sum(test_pred_binary == y_valid)/  len(y_valid)
print('Accuracy of train model is {}'.format(acc))
print('Accuracy of test model is {}'.format(test_acc))


# Confusion Matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_valid,test_pred_binary))


#  Recall  , f1 , precision 
from sklearn.metrics import classification_report
print(classification_report(y_valid,test_pred_binary))

Accuracy of train model is 0.9427094873044711
Accuracy of test model is 0.9311304607919706
[[103889   4197]
 [  4092   8180]]
              precision    recall  f1-score   support

           0       0.96      0.96      0.96    108086
           1       0.66      0.67      0.66     12272

    accuracy                           0.93    120358
   macro avg       0.81      0.81      0.81    120358
weighted avg       0.93      0.93      0.93    120358



### Saving Model 1 

In [None]:
from google.colab import files

# Save the entire model to a HDF5 file.
# The '.h5' extension indicates that the model should be saved to HDF5.
model.save('model_1_final_20210911.h5') 
files.download('model_1_final_20210911.h5')

### MODEL 2 - Adding Dual Bi-LSTM Layer with GlobalAveragepooling1D 

GlobalAveragepooling1D is used to reduce dimentiality of data  in order to reduce noise from data.

In [None]:
LSTM_UNITS = 64
BATCH_SIZE = 512
DENSE_HIDDEN_UNITS = 2 * LSTM_UNITS
EPOCHS = 100

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=True)


#  Import K to clear session for model.
from keras import backend as K
K.clear_session()


def LSTM_with_Pooling():
  K.clear_session()
  model=Sequential()
  model.add(embedding_layer)
  model.add(Bidirectional(LSTM(LSTM_UNITS, return_sequences=True)))
  model.add(Dropout(0.4))
  model.add(Bidirectional(LSTM(LSTM_UNITS, return_sequences=True)))
  model.add( Dense(DENSE_HIDDEN_UNITS,  activation='relu') ) 
  model.add(Dropout(0.4))
  model.add(GlobalAveragePooling1D())
  model.add(Dense(1, activation='sigmoid'))
  # compile the model
  optimzer = keras.optimizers.Adam(    #clipvalue=0.5,
                                   learning_rate= 0.0001) # clip value to avoid the gradient exploding
  model.compile(optimizer=optimzer,
                loss='binary_crossentropy',
                metrics=['acc',f1_m,precision_m, recall_m, tf.keras.metrics.AUC()])
  return model

In [None]:

# early stopping
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor= 'val_recall_m', 
                                                 patience=20,
                                                 mode='max',
                                                 restore_best_weights=True)

model1 = LSTM_with_Pooling()
model1.fit(x_train, y_train,
                    epochs=EPOCHS, 
                    batch_size= BATCH_SIZE  ,
                    callbacks=[earlyStopping], #check,
                    validation_data=(x_val,y_valid), 
                    verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100


<keras.callbacks.History at 0x7fa5c4675950>

### Saving  Model 2

In [None]:
### Saving Distilbert Model

#  Saving Model weights
model1.save_weights('LSTM_pooling_final/my_model')

### Loading saved model.
# loading the model params on which it was trained
load_model = LSTM_with_Pooling()

#  loading the training weights back to model.
load_model.load_weights('LSTM_pooling_final/my_model')

from google.colab import files
!zip -r /content/LSTM_pooling_final.zip /content/LSTM_pooling_final
files.download('LSTM_pooling_final.zip')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f6824fbb110>

### Testing Model 2 on validation data.

In [None]:
# model results
train_pred1 = load_model.predict(x_train)
test_pred1 = load_model.predict(x_val)

print(train_pred1.shape, test_pred1.shape)

train_pred_binary1 = np.where(train_pred1> 0.49,1,0)
test_pred_binary1 = np.where(test_pred1> 0.49,1,0)
# reshaping array 
train_pred_binary1 = train_pred_binary1.reshape(180536,)
test_pred_binary1 = test_pred_binary1.reshape(120358,)


# Accuracy
acc = sum(train_pred_binary1 == y_train)/  len(y_train)
test_acc = sum(test_pred_binary1 == y_valid)/  len(y_valid)
print('Accuracy of train model is {}'.format(acc))
print('Accuracy of test model is {}'.format(test_acc))


# Confusion Matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_valid,test_pred_binary1))


#  Recall  , f1 , precision 
from sklearn.metrics import classification_report
print(classification_report(y_valid,test_pred_binary1))

Accuracy of train model is 0.9421721983427128
Accuracy of test model is 0.9333571511656891
[[103495   4591]
 [  3430   8842]]
              precision    recall  f1-score   support

           0       0.97      0.96      0.96    108086
           1       0.66      0.72      0.69     12272

    accuracy                           0.93    120358
   macro avg       0.81      0.84      0.83    120358
weighted avg       0.94      0.93      0.93    120358



### Random Sample Text to test model 2 results.

In [None]:
ques = ['will democrats win 2021', 
        'is he still alive', 
        'name famous black celebreties',
        'are republican not racists', 
        'distance to moon'
        'which race is the smartest', 
        'what are achivements by trump' ,
        'are republican racists',
        'is my job safe', 
        'are jews loyal', 
        'which race is not loyal',
        'why republican cry always', 
        'does trump hate gay', 
        'do we require another pandamic',
        'is america under dictatorship', 
        'are muslim terrorist', 
        'do you support taliban',
        'are white people', 
        'are you fool', 
        'are muslim extremist', 
        'is china facist country' ,
        'are democrats dumb', 
        'do blacks support gay',  
        'do americans support holocaust']
ques = preprocess_parallel(ques)

' '.join(ques[0])
# ques = ' '.join(preprocess_parallel(ques))

join_corpus =[]
for l in ques:
  join_corpus.append(' '.join(l))


testing = vectorizer(np.array([[s] for s in join_corpus])).numpy()

(model1.predict(testing))

### Model 3 - Hybrid model LSTM + Conv1D

In [None]:

# Pointers
LSTM_UNITS = 64
BATCH_SIZE = 128
DENSE_HIDDEN_UNITS = 2 * LSTM_UNITS
EPOCHS = 100


#  Import K to clear session for model.
from keras import backend as K

# Padding of sentence is done 40
maxlen = 40 

# Model 3
def BiLSTM_CNN(spatialdropout=0.2, rnn_units=128, filters=[100, 80, 30, 12], weight_decay=0.10):
  K.clear_session()
  x_input = Input(shape=(maxlen,))
  
  emb = Embedding(num_tokens,
                  embedding_dim, 
                  embeddings_initializer=keras.initializers.Constant(embedding_matrix), 
                  trainable=False, name='Embedding')(x_input)

  # adding  spatial drop out will nullify the embedding vectors with Zeros.
  x = SpatialDropout1D(rate=spatialdropout, seed=10000)(emb) 

  rnn = Bidirectional(LSTM(rnn_units, return_sequences=True, kernel_initializer=initializers.glorot_uniform(seed=123000), recurrent_initializer=initializers.Orthogonal(gain=1.0, seed=123000)))(x)
  
  # Adding 4 Channels of Conv1D layers
  x1 = Conv1D(filters=filters[0], activation='relu', kernel_size=1, padding='same', kernel_initializer=initializers.glorot_uniform(seed=110000))(rnn)
  x2 = Conv1D(filters=filters[1], activation='relu', kernel_size=1, padding='same', kernel_initializer=initializers.glorot_uniform(seed=120000))(rnn)
  x3 = Conv1D(filters=filters[2], activation='relu', kernel_size=1, padding='same', kernel_initializer=initializers.glorot_uniform(seed=130000))(rnn)
  x4 = Conv1D(filters=filters[3], activation='relu', kernel_size=1, padding='same', kernel_initializer=initializers.glorot_uniform(seed=140000))(rnn)

# Adding max pooling to select domnant feature
  x1 = GlobalMaxPooling1D()(x1)
  x2 = GlobalMaxPooling1D()(x2)
  x3 = GlobalMaxPooling1D()(x3)
  x4 = GlobalMaxPooling1D()(x4)

  c = concatenate([x1, x2, x3, x4])
  x = Dense(256, activation='relu', kernel_initializer=initializers.glorot_uniform(seed=111000))(c)
  x = Dropout(0.2, seed=10000)(x)
  x = BatchNormalization()(x)
  x_output = Dense(1, activation='sigmoid', kernel_initializer=initializers.glorot_uniform(seed=110000))(x)
 
  model = Model(inputs=x_input, outputs=x_output)
  model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(    #clipvalue=0.5, 
                                 learning_rate= 0.0001) # clip value to avoid the gradient exploding
                , metrics=['acc',f1_m,precision_m, recall_m, tf.keras.metrics.AUC()])
  return model

In [None]:
model2 = BiLSTM_CNN()
model2.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 40)]         0           []                               
                                                                                                  
 Embedding (Embedding)          (None, 40, 50)       1500100     ['input_1[0][0]']                
                                                                                                  
 spatial_dropout1d (SpatialDrop  (None, 40, 50)      0           ['Embedding[0][0]']              
 out1D)                                                                                           
                                                                                                  
 bidirectional (Bidirectional)  (None, 40, 256)      183296      ['spatial_dropout1d[0][0]']  

In [None]:
# 
model2 = BiLSTM_CNN()

earlyStopping = tf.keras.callbacks.EarlyStopping(monitor= 'val_auc', 
                                                 patience=20,
                                                 mode='max',
                                                 restore_best_weights=True)

model2.fit(x_train, y_train,
                    epochs=EPOCHS, 
                    batch_size= BATCH_SIZE  ,
                    callbacks=[earlyStopping], #check,
                    validation_data=(x_val,y_valid), 
                    verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100


<keras.callbacks.History at 0x7f51d5c335d0>

### Model 3 Testing results

In [None]:
# model results
train_pred2 = model2.predict(x_train)
test_pred2 = model2.predict(x_val)

train_pred_binary2 = np.where(train_pred2>= 0.48,1,0)
test_pred_binary2 = np.where(test_pred2>= 0.48,1,0)
# reshaping array 
train_pred_binary2 = train_pred_binary2.reshape(180536,)
test_pred_binary2 = test_pred_binary2.reshape(120358,)

# Accuracy




acc = sum(train_pred_binary2 == y_train)/  len(y_train)
test_acc = sum(test_pred_binary2 == y_valid)/  len(y_valid)
print('Accuracy of train model is {}'.format(acc))
print('Accuracy of test model is {}'.format(test_acc))


# Confusion Matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_valid,test_pred_binary2))


#  Recall  , f1 , precision 
from sklearn.metrics import classification_report
print(classification_report(y_valid,test_pred_binary2))

Accuracy of train model is 0.9457227367394868
Accuracy of test model is 0.9275993286694694
[[101831   6255]
 [  2459   9813]]
              precision    recall  f1-score   support

           0       0.98      0.94      0.96    108086
           1       0.61      0.80      0.69     12272

    accuracy                           0.93    120358
   macro avg       0.79      0.87      0.83    120358
weighted avg       0.94      0.93      0.93    120358



### Random Sample Text to test model 3 results.

In [None]:
ques = ['will democrats win 2021', 
        'is he still alive', 
        'name famous black celebreties',
        'are republican not racists', 
        'distance to moon'
        'which race is the smartest', 
        'what are achivements by trump' ,
        'are republican racists',
        'is india safe', 
        'are jews loyal', 
        'which race is not loyal',
        'why republican cry always', 
        'does trump hate gay', 
        'do we require another pandamic',
        'is america under dictatorship', 
        'are muslim terrorist', 
        'do you support taliban',
        'are white people', 
        'are you fool', 
        'are muslim extremist', 
        'is china facist country' ,
        'are democrats dumb', 
        'do blacks support gay',  
        'do americans support holocaust']
ques = preprocess_parallel(ques)

' '.join(ques[0])
# ques = ' '.join(preprocess_parallel(ques))

join_corpus =[]
for l in ques:
  join_corpus.append(' '.join(l))


testing = vectorizer(np.array([[s] for s in join_corpus])).numpy()

pd.DataFrame({
     'Inputs': ques
    ,'results': model2.predict(testing)})

### Saving Model 3

In [None]:
### Saving Distilbert Model

#  Saving Model weights
model2.save_weights('BiLSTM_CNN/my_model')

### Loading saved model.
# loading the model params on which it was trained
load_model2 = BiLSTM_CNN()

#  loading the training weights back to model.
load_model2.load_weights('BiLSTM_CNN/my_model')


from google.colab import files
!zip -r /content/BiLSTM_CNN_v5.zip /content/BiLSTM_CNN
files.download('BiLSTM_CNN_v5.zip')

# model results
train_pred2 = load_model2.predict(x_train)
test_pred2 = load_model2.predict(x_val)


print(train_pred2.shape, test_pred2.shape)

train_pred_binary2 = np.where(train_pred2>= 0.48,1,0)
test_pred_binary2 = np.where(test_pred2>= 0.48,1,0)
# reshaping array 
train_pred_binary2 = train_pred_binary2.reshape(180536,)
test_pred_binary2 = test_pred_binary2.reshape(120358,)

# Accuracy




acc = sum(train_pred_binary2 == y_train)/  len(y_train)
test_acc = sum(test_pred_binary2 == y_valid)/  len(y_valid)
print('Accuracy of train model is {}'.format(acc))
print('Accuracy of test model is {}'.format(test_acc))


# Confusion Matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_valid,test_pred_binary2))


#  Recall  , f1 , precision 
from sklearn.metrics import classification_report
print(classification_report(y_valid,test_pred_binary2))

  adding: content/BiLSTM_CNN/ (stored 0%)
  adding: content/BiLSTM_CNN/my_model.index (deflated 70%)
  adding: content/BiLSTM_CNN/checkpoint (deflated 40%)
  adding: content/BiLSTM_CNN/my_model.data-00000-of-00001 (deflated 16%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(180536, 1) (120358, 1)
Accuracy of train model is 0.9457227367394868
Accuracy of test model is 0.9275993286694694
[[101831   6255]
 [  2459   9813]]
              precision    recall  f1-score   support

           0       0.98      0.94      0.96    108086
           1       0.61      0.80      0.69     12272

    accuracy                           0.93    120358
   macro avg       0.79      0.87      0.83    120358
weighted avg       0.94      0.93      0.93    120358



### Loading final Model

In [None]:
import zipfile

with zipfile.ZipFile('/content/drive/MyDrive/Final Model/BiLSTM_CNN_v3_final.zip', 'r') as zip_ref:
  zip_ref.extractall('/')

In [None]:
# loading the model params on which it was trained
inference_model = BiLSTM_CNN()

#  loading the training weights back to model.
inference_model.load_weights('BiLSTM_CNN/my_model')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa6244ef890>

In [None]:
ques = ['will democrats win 2021', 
        'is he still alive', 
        'name famous black celebreties',
        'are republican not racists', 
        'distance to moon'
        'which race is the smartest', 
        'what are achivements by trump' ,
        'are republican racists',
        'is india safe', 
        'are jews loyal', 
        'which race is not loyal',
        'why republican cry always', 
        'does trump hate gay', 
        'do we require another pandamic',
        'is america under dictatorship', 
        'are muslim terrorist', 
        'do you support taliban',
        'are white people', 
        'are you fool', 
        'are muslim extremist', 
        'is china facist country' ,
        'are democrats dumb', 
        'do blacks support gay',  
        'do americans support holocaust']
ques = preprocess_parallel(ques)

' '.join(ques[0])
# ques = ' '.join(preprocess_parallel(ques))

join_corpus =[]
for l in ques:
  join_corpus.append(' '.join(l))


testing = vectorizer(np.array([[s] for s in join_corpus])).numpy()

(inference_model.predict(testing))

array([[0.19312404],
       [0.19258153],
       [0.18131801],
       [0.9749401 ],
       [0.3616696 ],
       [0.30859843],
       [0.9526248 ],
       [0.48837128],
       [0.80390453],
       [0.42273363],
       [0.92296475],
       [0.9560496 ],
       [0.01709131],
       [0.50699085],
       [0.97760195],
       [0.4158798 ],
       [0.93195975],
       [0.6864999 ],
       [0.8287997 ],
       [0.27979502],
       [0.95841295],
       [0.96839327],
       [0.91232324]], dtype=float32)

### End