### Build a sequential NLP classifier which uses the headline text to determine whether a headline is sarcastic

In [56]:
import warnings
import json
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Activation, Bidirectional, LSTM, Dense, Dropout, Flatten, Input
from functools import partial
from tensorflow.keras.layers import BatchNormalization
from keras.models import Model
warnings.filterwarnings('ignore')

#### 1. Read and explore the data

In [24]:
# The dataset is stored as JSON Lines (one record per line), hence lines=True.
df = pd.read_json(
    'Sarcasm_Headlines_Dataset.json',
    lines=True,
)
# Peek at five randomly chosen rows.
df.sample(n=5)

Unnamed: 0,article_link,headline,is_sarcastic
1741,https://politics.theonion.com/epa-chief-pruitt...,epa chief pruitt welcomes delegation of pollut...,1
10438,https://www.theonion.com/skittish-juniors-depa...,skittish juniors-department clerk calls securi...,1
2160,https://www.huffingtonpost.com/entry/unions-pl...,unions plot major push after landmark labor ru...,0
13896,https://politics.theonion.com/poll-89-of-ameri...,poll: 89% of americans believe obama has faile...,1
22501,https://www.theonion.com/401k-enrollment-form-...,401k enrollment form sits at bottom of desk dr...,1


In [25]:
# Show column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article_link  26709 non-null  object
 1   headline      26709 non-null  object
 2   is_sarcastic  26709 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 626.1+ KB


The expected data is loaded into the notebook.
The important columns, as per the problem statement, are "headline" and "is_sarcastic".
No column has missing values, and there are 26k+ records.

#### 2. Retain relevant columns

In [27]:
# Eliminate the non-important columns.
# Rebinding `df` (rather than dropping in place) and tolerating an
# already-removed column keeps this cell safe to run more than once:
# with inplace=True a second run raised KeyError.
df = df.drop(columns=['article_link'], errors='ignore')
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [28]:
# Count headlines that are missing (NaN / None).
df['headline'].isna().sum()

0

#### 3. Get length for each sentence

In [29]:
# Length of every headline in characters.
df['headline'].str.len()

0        78
1        84
2        79
3        84
4        64
         ..
26704    36
26705    23
26706    21
26707    60
26708    33
Name: headline, Length: 26709, dtype: int64

This gives us the length (in characters) of each headline.

In [31]:
# Summary statistics of the per-headline character lengths.
df['headline'].str.len().describe()

count    26709.000000
mean        60.910592
std         19.184470
min          7.000000
25%         48.000000
50%         61.000000
75%         73.000000
max        254.000000
Name: headline, dtype: float64

These lengths are character counts that include spaces; what we need is the number of words in each headline.

In [33]:
# Word count per headline: split on single spaces, then count the pieces.
df['len'] = df['headline'].str.split(" ").str.len()
df['len'].describe()

count    26709.000000
mean         9.847842
std          3.172099
min          2.000000
25%          8.000000
50%         10.000000
75%         12.000000
max         39.000000
Name: len, dtype: float64

The longest headline in this dataset has 39 words.

#### 4. Define parameters [1 Marks]

In [34]:
# Hyper-parameters shared by the tokenizer, the padding step and the model.
max_features = 10_000  # vocabulary cap handed to the Tokenizer (num_words)
maxlen = 30            # every headline is padded/truncated to this many tokens
embedding_size = 50    # width of a word vector (the GloVe file used is 50-d)

Set the parameters for the word embedding and the feature (vocabulary) count.
It is a trial-and-error approach to reach good results with optimum resource usage.

#### 5. Get indices for words

In [38]:
# Learn the vocabulary from every headline; `num_words` caps how many of the
# most frequent words are used when texts are turned into index sequences.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['headline'].tolist())

# Replace each headline by the list of its word indices.
X = tokenizer.texts_to_sequences(df['headline'])

print("Number of Samples:", len(X))
print(X[0])

Number of Samples: 26709
[307, 678, 3336, 2297, 47, 381, 2575, 5, 2576, 8433]


#### 6. Create features and labels

In [40]:
# Pad (or truncate) every index sequence to exactly `maxlen` entries so that
# all samples have the same length.
X = pad_sequences(X, maxlen=maxlen)
# Labels as a plain NumPy array.
y = df['is_sarcastic'].to_numpy()

print("Number of Labels: ", len(y))
print(y)

Number of Labels:  26709
[0 0 1 ... 0 0 0]


#### 7. Get vocabulary size

In [41]:
# (number of samples, padded sequence length)
X.shape

(26709, 30)

In [42]:
# One slot per distinct word seen by the tokenizer, plus one extra slot
# because Keras word indices start at 1 (index 0 is left free).
num_words = 1 + len(tokenizer.word_index)
print(num_words)

29657


#### 8. Create a weight matrix using GloVe embeddings

In [43]:
EMBEDDING_FILE = 'glove.6B.50d.txt'
embeddings = {}
# Each line of the file is "<token> <v1> <v2> ... <vN>", separated by single
# spaces. The `with` block guarantees the file handle is closed once the
# file has been read (the bare open() in a for-loop never closed it).
with open(EMBEDDING_FILE, 'r', encoding='utf-8') as glove_file:
    for line in glove_file:
        # Split once: first field is the token, the rest is its vector.
        word, *values = line.rstrip('\n').split(" ")
        embeddings[word] = np.asarray(values, dtype='float32')

In [48]:
# One row per tokenizer index. The width comes from `embedding_size` instead
# of a repeated literal, so the matrix follows the declared parameter.
# Rows that are never assigned below (words without a GloVe vector) stay zero.
embedding_matrix = np.zeros((num_words, embedding_size))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        # Copy the pre-trained vector into the row for this word's index.
        embedding_matrix[i] = embedding_vector

In [49]:
# Number of tokens for which a GloVe vector was loaded.
print(len(embeddings))

400000


In [50]:
# Sanity check: (vocabulary rows, embedding width).
print(embedding_matrix.shape)

(29657, 50)


In [51]:
# Hold out 30% of the samples; stratifying on y keeps the class ratio the
# same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=50,
)

In [52]:
# Shapes of the training and test feature arrays.
print(X_train.shape, X_test.shape)

(18696, 30) (8013, 30)


#### 9. Define and compile a Bidirectional LSTM model.

In [53]:
def rnn_model():
    # create model
    inputs = Input(name='inputs',shape=[maxlen])
    layer = Embedding(num_words,50,input_length=maxlen,  weights = [embedding_matrix])(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(128,name='Features')(layer)
    layer = Activation('relu')(layer)
    layer = Activation('LeakyReLU')(layer)
    layer = Dense(1,name='Out')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [57]:
# build the model
model = rnn_model()

#### 10. Fit the model and check the validation 

In [58]:
model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])

In [59]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 30)]              0         
                                                                 
 embedding_1 (Embedding)     (None, 30, 50)            1482850   
                                                                 
 lstm_1 (LSTM)               (None, 64)                29440     
                                                                 
 Features (Dense)            (None, 128)               8320      
                                                                 
 activation_2 (Activation)   (None, 128)               0         
                                                                 
 activation_3 (Activation)   (None, 128)               0         
                                                                 
 Out (Dense)                 (None, 1)                 129 

In [183]:
# Training schedule: mini-batches of 128 samples for 15 passes over the data.
batch_size, epochs = 128, 15

# The held-out split is evaluated after every epoch; the returned History
# object keeps the per-epoch metrics.
training_history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
