In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import tensorflow
import keras
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import sequential
from keras.layers import Dense, LSTM, Dropout, Embedding

In [2]:
data = pd.read_csv("Corona_NLP_test.csv")
data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [3]:
data.shape

(3798, 6)

In [4]:
data['Sentiment'] = LabelEncoder().fit_transform(data['Sentiment'])
data['Sentiment'].head()

0    0
1    4
2    1
3    2
4    3
Name: Sentiment, dtype: int32

In [5]:
data.isnull().sum()

UserName           0
ScreenName         0
Location         834
TweetAt            0
OriginalTweet      0
Sentiment          0
dtype: int64

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       3798 non-null   int64 
 1   ScreenName     3798 non-null   int64 
 2   Location       2964 non-null   object
 3   TweetAt        3798 non-null   object
 4   OriginalTweet  3798 non-null   object
 5   Sentiment      3798 non-null   int32 
dtypes: int32(1), int64(2), object(3)
memory usage: 163.3+ KB


In [7]:
# lets summarize the Dataset
data.describe()

Unnamed: 0,UserName,ScreenName,Sentiment
count,3798.0,3798.0,3798.0
mean,1899.5,46851.5,2.192206
std,1096.532489,1096.532489,1.380256
min,1.0,44953.0,0.0
25%,950.25,45902.25,1.0
50%,1899.5,46851.5,2.0
75%,2848.75,47800.75,3.0
max,3798.0,48750.0,4.0


In [8]:
# lets summarize Categorical data also
data.describe(include = 'object')

Unnamed: 0,Location,TweetAt,OriginalTweet
count,2964,3798,3798
unique,1717,15,3798
top,United States,13-03-2020,TRENDING: New Yorkers encounter empty supermar...
freq,75,1233,1


In [14]:
x = data[data.columns.difference(['Sentiment'])]
y = data['Sentiment']

In [15]:
# splitting the data into training and testing sets

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 15)

### Cleaning the Reviews

In [20]:
def clean_data(text):
    return ' '.join(re.sub("(@[a-zA-Z0-9]+)|([^0-9A-Za-z])|(https://[\w.]+/[\w]+)", " ", text).split())

data['OriginalTweet'] = data['OriginalTweet'].apply(clean_data)

In [17]:
# as it is clear that the reviews have so many unnecassry things such as Stopwords, Punctuations, numbers etc
import string
# First lets remove Punctuations from the Reviews
def punctuation_removal(messy_str):
    clean_list = [char for char in messy_str if char not in string.punctuation]
    clean_str = ''.join(clean_list)
    return clean_str

data['OriginalTweet'] = data['OriginalTweet'].apply(punctuation_removal)

In [18]:
# Now lets Remove the Stopwords also

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop = stopwords.words('english')
stop.append("i'm")

stop_words = []

for item in stop: 
    new_item = punctuation_removal(item)
    stop_words.append(new_item) 

def stopwords_removal(messy_str):
    messy_str = word_tokenize(messy_str)
    return [word.lower() for word in messy_str 
            if word.lower() not in stop_words ]

data['OriginalTweet'] = data['OriginalTweet'].apply(stopwords_removal)

In [19]:
# lets remove the Numbers also

import re
def drop_numbers(list_text):
    list_text_new = []
    for i in list_text:
        if not re.search('\d', i):
            list_text_new.append(i)
    return ' '.join(list_text_new)

data['OriginalTweet'] = data['OriginalTweet'].apply(drop_numbers)

In [32]:
# function for stemming and Lemmatization 

corpus = []
for i in range(len(x)):
    Tweet = data['OriginalTweet'][i]
    stemmer = PorterStemmer()
    Tweet = ' '.join([stemmer.stem(word) for word in Tweet.split()])
    Tweet = nlp(Tweet)
    Tweet = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.Tweet for word in Tweet])
    corpus.append(Tweet) 

In [33]:
corpus[0]

'trend new yorker encount empti supermarket shelf pictur wegman brooklyn soldout onlin grocer foodkick maxdeliveri coronavirusfear shopper stock'

In [36]:
max_voc = 50000
on_hot_r = [one_hot(corpus[0], max_voc) for word in corpus]
on_hot_r[0]

[42078,
 42614,
 3852,
 7632,
 8325,
 16898,
 39379,
 16247,
 19755,
 42429,
 24663,
 30942,
 39963,
 4026,
 3099,
 31628,
 32354,
 15267]

In [38]:
send_lenght = 30 
input = pad_sequences(on_hot_r,maxlen= send_lenght, padding='pre' )
input

array([[    0,     0,     0, ..., 31628, 32354, 15267],
       [    0,     0,     0, ..., 31628, 32354, 15267],
       [    0,     0,     0, ..., 31628, 32354, 15267],
       ...,
       [    0,     0,     0, ..., 31628, 32354, 15267],
       [    0,     0,     0, ..., 31628, 32354, 15267],
       [    0,     0,     0, ..., 31628, 32354, 15267]])

In [43]:
final_x = np.array(input)
final_y = np.array(y).reshape(-1,1)