In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import tensorflow
import keras
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Embedding, Bidirectional
from keras.utils.all_utils import plot_model
from keras.callbacks import TensorBoard

In [81]:
# Hyper-Parameters
max_features = 5000
no_classes = 1
max_length = 100
batch_size = 32
embedding_size = 64
dropout_rate = 0.5
no_epochs = 10

In [82]:
data = pd.read_csv("Corona_NLP_test.csv")
data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [83]:
data.shape

(3798, 6)

In [84]:
data['Sentiment'] = LabelEncoder().fit_transform(data['Sentiment'])
data['Sentiment'].head()

0    0
1    4
2    1
3    2
4    3
Name: Sentiment, dtype: int32

In [85]:
data.isnull().sum()

UserName           0
ScreenName         0
Location         834
TweetAt            0
OriginalTweet      0
Sentiment          0
dtype: int64

In [86]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       3798 non-null   int64 
 1   ScreenName     3798 non-null   int64 
 2   Location       2964 non-null   object
 3   TweetAt        3798 non-null   object
 4   OriginalTweet  3798 non-null   object
 5   Sentiment      3798 non-null   int32 
dtypes: int32(1), int64(2), object(3)
memory usage: 163.3+ KB


In [87]:
# lets summarize the Dataset
data.describe()

Unnamed: 0,UserName,ScreenName,Sentiment
count,3798.0,3798.0,3798.0
mean,1899.5,46851.5,2.192206
std,1096.532489,1096.532489,1.380256
min,1.0,44953.0,0.0
25%,950.25,45902.25,1.0
50%,1899.5,46851.5,2.0
75%,2848.75,47800.75,3.0
max,3798.0,48750.0,4.0


In [88]:
# lets summarize Categorical data also
data.describe(include = 'object')

Unnamed: 0,Location,TweetAt,OriginalTweet
count,2964,3798,3798
unique,1717,15,3798
top,United States,13-03-2020,TRENDING: New Yorkers encounter empty supermar...
freq,75,1233,1


In [89]:
x = data[data.columns.difference(['Sentiment'])]
y = data['Sentiment']

### Cleaning the Reviews

In [90]:
def clean_data(text):
    return ' '.join(re.sub("(@[a-zA-Z0-9]+)|([^0-9A-Za-z])|(https://[\w.]+/[\w]+)", " ", text).split())

data['OriginalTweet'] = data['OriginalTweet'].apply(clean_data)

In [91]:
# as it is clear that the reviews have so many unnecassry things such as Stopwords, Punctuations, numbers etc
import string
# First lets remove Punctuations from the Reviews
def punctuation_removal(messy_str):
    clean_list = [char for char in messy_str if char not in string.punctuation]
    clean_str = ''.join(clean_list)
    return clean_str

data['OriginalTweet'] = data['OriginalTweet'].apply(punctuation_removal)

In [92]:
# Now lets Remove the Stopwords also

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop = stopwords.words('english')
stop.append("i'm")

stop_words = []

for item in stop: 
    new_item = punctuation_removal(item)
    stop_words.append(new_item) 

def stopwords_removal(messy_str):
    messy_str = word_tokenize(messy_str)
    return [word.lower() for word in messy_str 
            if word.lower() not in stop_words ]

data['OriginalTweet'] = data['OriginalTweet'].apply(stopwords_removal)

In [93]:
# lets remove the Numbers also

import re
def drop_numbers(list_text):
    list_text_new = []
    for i in list_text:
        if not re.search('\d', i):
            list_text_new.append(i)
    return ' '.join(list_text_new)

data['OriginalTweet'] = data['OriginalTweet'].apply(drop_numbers)

In [94]:
# function for stemming and Lemmatization 
import en_core_web_sm
nlp = en_core_web_sm.load()
corpus = []
for i in range(len(x)):
    Tweet = data['OriginalTweet'][i]
    stemmer = PorterStemmer()
    Tweet = ' '.join([stemmer.stem(word) for word in Tweet.split()])
    Tweet = nlp(Tweet)
    Tweet = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.Tweet for word in Tweet])
    corpus.append(Tweet) 

In [95]:
corpus[0]

'trend new yorker encount empti supermarket shelv pictur wegman brooklyn sell onlin grocer foodkick maxdeliveri coronaviru fear shopper stock'

In [96]:
on_hot_r = [one_hot(corpus[0], max_features) for word in corpus]
on_hot_r[0]

[3189,
 2022,
 2978,
 647,
 1543,
 554,
 3294,
 683,
 2657,
 605,
 1095,
 4050,
 2909,
 2049,
 1181,
 4969,
 1278,
 1837,
 1455]

In [97]:
input = pad_sequences(on_hot_r,maxlen= max_length, padding='pre' )
input

array([[   0,    0,    0, ..., 1278, 1837, 1455],
       [   0,    0,    0, ..., 1278, 1837, 1455],
       [   0,    0,    0, ..., 1278, 1837, 1455],
       ...,
       [   0,    0,    0, ..., 1278, 1837, 1455],
       [   0,    0,    0, ..., 1278, 1837, 1455],
       [   0,    0,    0, ..., 1278, 1837, 1455]])

In [98]:
final_x = np.array(input)
final_y = np.array(y).reshape(-1,1)

In [99]:
# splitting the data into training and testing sets

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(final_x, final_y, test_size = 0.3, random_state = 15)

### Build Model

In [100]:
# Design Neural Network Architecture with LSTM
print('Building LSTM Model..')

model = Sequential()
# Add Embedding layer
model.add(Embedding(max_features, embedding_size, input_length=max_length))
# Add Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(dropout_rate))
# Output Layer 
model.add(Dense(no_classes, activation='sigmoid'))

Building LSTM Model..


In [101]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 100, 64)           320000    
                                                                 
 bidirectional_5 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dropout_5 (Dropout)         (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 1)                 129       
                                                                 
Total params: 386,177
Trainable params: 386,177
Non-trainable params: 0
_________________________________________________________________


In [102]:
plot_model(model)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [103]:
# TensorBoard
tensorboard = TensorBoard('/LSTM')

# Train!
print('Training the model..')
model.fit(x_train, y_train, batch_size=batch_size, verbose=1, epochs=no_epochs, validation_data=[x_test, y_test], callbacks = [tensorboard])

Training the model..
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21147a0a288>