In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout, Embedding

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the tweet dataset into a DataFrame and display it.
# NOTE(review): the path is an absolute Colab location and the file name ends in
# "_test" — confirm this is the file intended for both training and evaluation.
df = pd.read_csv('/content/Corona_NLP_test.csv')
df

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...,...,...,...,...
3793,3794,48746,Israel ??,16-03-2020,Meanwhile In A Supermarket in Israel -- People...,Positive
3794,3795,48747,"Farmington, NM",16-03-2020,Did you panic buy a lot of non-perishable item...,Negative
3795,3796,48748,"Haverford, PA",16-03-2020,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
3796,3797,48749,,16-03-2020,Gov need to do somethings instead of biar je r...,Extremely Negative


In [None]:
# Keep only the tweet text and its sentiment label. Selecting the wanted columns
# (rather than dropping the others in place) lets this cell be re-run without a
# KeyError, and .copy() gives later column assignments a frame of their own.
df = df[['OriginalTweet', 'Sentiment']].copy()
df

Unnamed: 0,OriginalTweet,Sentiment
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,When I couldn't find hand sanitizer at Fred Me...,Positive
2,Find out how you can protect yourself and love...,Extremely Positive
3,#Panic buying hits #NewYork City as anxious sh...,Negative
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
...,...,...
3793,Meanwhile In A Supermarket in Israel -- People...,Positive
3794,Did you panic buy a lot of non-perishable item...,Negative
3795,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
3796,Gov need to do somethings instead of biar je r...,Extremely Negative


In [None]:
# Count tweets per sentiment class to see how balanced the labels are.
df['Sentiment'].value_counts()

Negative              1041
Positive               947
Neutral                619
Extremely Positive     599
Extremely Negative     592
Name: Sentiment, dtype: int64

In [None]:
# Count missing values per column before any text processing.
df.isnull().sum()

OriginalTweet    0
Sentiment        0
dtype: int64

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps the sentiment strings to integer class ids.
le = LabelEncoder()


In [None]:
# Replace the sentiment strings with integer codes. The column is overwritten, so
# the original strings are only recoverable through the fitted encoder.
df['Sentiment']=le.fit_transform(df['Sentiment'])

In [None]:
# Display the frame to confirm Sentiment now holds integer codes.
df

Unnamed: 0,OriginalTweet,Sentiment
0,TRENDING: New Yorkers encounter empty supermar...,0
1,When I couldn't find hand sanitizer at Fred Me...,4
2,Find out how you can protect yourself and love...,1
3,#Panic buying hits #NewYork City as anxious sh...,2
4,#toiletpaper #dunnypaper #coronavirus #coronav...,3
...,...,...
3793,Meanwhile In A Supermarket in Israel -- People...,4
3794,Did you panic buy a lot of non-perishable item...,2
3795,Asst Prof of Economics @cconces was on @NBCPhi...,3
3796,Gov need to do somethings instead of biar je r...,0


In [None]:
def _tweet_cleaning_resources():
  """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
  cached = getattr(_tweet_cleaning_resources, '_cached', None)
  if cached is None:
    # A set gives constant-time membership tests; the list form is scanned per token.
    cached = (frozenset(stopwords.words('english')), WordNetLemmatizer())
    _tweet_cleaning_resources._cached = cached
  return cached


def clean_tweet(text):
  """Normalise one tweet into a space-separated string of lemmatised words.

  The text is lower-cased and tokenised, then tokens that are not purely
  alphabetic or that are English stop words are dropped, and the remaining
  tokens are lemmatised.

  Args:
    text: Raw tweet text. Non-string values (e.g. a missing value read as NaN)
      are treated as an empty tweet.

  Returns:
    The cleaned tweet as a single string; '' when nothing survives filtering
    or when `text` is not a string.
  """
  if not isinstance(text, str):
    return ''
  stop, lemma = _tweet_cleaning_resources()
  tokens = word_tokenize(text.lower())
  kept = [t for t in tokens if t.isalpha() and t not in stop]
  return ' '.join(lemma.lemmatize(t) for t in kept)

In [None]:
# Clean every tweet; the raw text in OriginalTweet is overwritten by the cleaned text.
df['OriginalTweet'] = df['OriginalTweet'].map(clean_tweet)

In [None]:
# Display the frame to inspect the cleaned tweet text.
df

Unnamed: 0,OriginalTweet,Sentiment
0,trending new yorkers encounter empty supermark...,0
1,could find hand sanitizer fred meyer turned am...,4
2,find protect loved one coronavirus,1
3,panic buying hit newyork city anxious shopper ...,2
4,toiletpaper dunnypaper coronavirus coronavirus...,3
...,...,...
3793,meanwhile supermarket israel people dance sing...,4
3794,panic buy lot item echo need food donation als...,2
3795,asst prof economics cconces nbcphiladelphia ta...,3
3796,gov need somethings instead biar je rakyat ass...,0


In [None]:
# Features are the cleaned tweet strings; targets are the encoded sentiment ids.
x, y = df['OriginalTweet'], df['Sentiment']

In [None]:
# Hold out 30% of the tweets for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=1,
)

In [None]:
# Token count of every cleaned tweet, stored next to the text for inspection.
sentlen = [len(word_tokenize(tweet)) for tweet in df["OriginalTweet"]]

df["SentLen"] = sentlen
df.head()

Unnamed: 0,OriginalTweet,Sentiment,SentLen
0,trending new yorkers encounter empty supermark...,0,18
1,could find hand sanitizer fred meyer turned am...,4,16
2,find protect loved one coronavirus,1,5
3,panic buying hit newyork city anxious shopper ...,2,26
4,toiletpaper dunnypaper coronavirus coronavirus...,3,20


In [None]:
# Length, in tokens, of the longest cleaned tweet.
max(sentlen)

39

In [None]:
# Sequence length taken from the 95th percentile of tweet lengths, so only the
# longest ~5% of tweets exceed it. np.quantile returns a float; cast once here so
# the value can be used directly wherever an integer length is required.
max_len = int(np.quantile(sentlen,0.95))

In [None]:
# Word-level tokenizer whose vocabulary is learned from the training tweets only.
tok = Tokenizer(char_level=False,split=' ')
tok.fit_on_texts(X_train)
# Preview the first index -> word entries instead of echoing the whole
# vocabulary, which would flood the notebook output.
dict(list(tok.index_word.items())[:20])

{1: 'http',
 2: 'coronavirus',
 3: 'food',
 4: 'store',
 5: 'grocery',
 6: 'people',
 7: 'stock',
 8: 'supermarket',
 9: 'amp',
 10: 'shopping',
 11: 'online',
 12: 'price',
 13: 'panic',
 14: 'need',
 15: 'paper',
 16: 'get',
 17: 'toilet',
 18: 'like',
 19: 'time',
 20: 'buying',
 21: 'go',
 22: 'u',
 23: 'local',
 24: 'home',
 25: 'going',
 26: 'covid',
 27: 'please',
 28: 'supply',
 29: 'retail',
 30: 'buy',
 31: 'shelf',
 32: 'coronaviruspandemic',
 33: 'day',
 34: 'today',
 35: 'work',
 36: 'consumer',
 37: 'week',
 38: 'hand',
 39: 'everyone',
 40: 'good',
 41: 'help',
 42: 'due',
 43: 'one',
 44: 'thing',
 45: 'demand',
 46: 'coronaoutbreak',
 47: 'virus',
 48: 'pandemic',
 49: 'stay',
 50: 'shop',
 51: 'keep',
 52: 'still',
 53: 'would',
 54: 'coronapocalypse',
 55: 'went',
 56: 'even',
 57: 'water',
 58: 'empty',
 59: 'make',
 60: 'take',
 61: 'know',
 62: 'think',
 63: 'many',
 64: 'leave',
 65: 'item',
 66: 'see',
 67: 'way',
 68: 'every',
 69: 'panicbuying',
 70: 'health',

In [None]:
# Number of distinct words the tokenizer learned from the training tweets.
vocab_len=len(tok.index_word)
vocab_len

8072

In [None]:
# Map each training tweet to its list of integer word ids.
seqtrain = tok.texts_to_sequences(X_train)
# Preview a few sequences rather than echoing every training tweet.
seqtrain[:3]

[[425,
  1021,
  1882,
  2438,
  82,
  1310,
  150,
  19,
  246,
  3502,
  3503,
  120,
  11,
  10,
  451,
  3504,
  3505,
  3506,
  59,
  473,
  1539,
  1540,
  22,
  2],
 [3507,
  73,
  25,
  10,
  213,
  3508,
  31,
  916,
  26,
  426,
  114,
  521,
  13,
  917,
  284,
  3509,
  3,
  115,
  1],
 [1541,
  474,
  34,
  3510,
  55,
  8,
  1542,
  2439,
  1543,
  308,
  19,
  475,
  2440,
  3511,
  1311,
  3512,
  1883,
  700,
  344],
 [3513, 3514, 645, 918, 36, 247, 132, 1, 476, 2441, 53, 18, 41],
 [293,
  61,
  2442,
  3515,
  3516,
  184,
  43,
  2442,
  427,
  22,
  7,
  3,
  9,
  377,
  25,
  23,
  1160,
  184,
  90,
  22,
  88],
 [3517,
  345,
  363,
  477,
  2443,
  192,
  273,
  3,
  133,
  51,
  7,
  1884,
  2,
  274,
  6,
  69,
  273,
  3],
 [60, 1161, 58, 31, 56, 21, 5, 4, 1544, 3518, 1],
 [1885,
  1886,
  3519,
  2444,
  1887,
  42,
  1162,
  2444,
  2445,
  478,
  3520,
  3521,
  919,
  259,
  193,
  700,
  36,
  364,
  14,
  771],
 [43,
  5,
  4,
  3522,
  2446,
  94,
  11

In [None]:
# Bring every training sequence to the same length (max_len). The displayed
# output shows zeros filled in at the front of shorter sequences.
seqmattrain = sequence.pad_sequences(seqtrain,maxlen= int(max_len))
seqmattrain

array([[   0,    0,    0, ..., 1540,   22,    2],
       [   0,    0,    0, ...,    3,  115,    1],
       [   0,    0,    0, ..., 1883,  700,  344],
       ...,
       [   0,    0,    0, ...,  498, 1890,    2],
       [   0,    0,    0, ..., 3137,   91,    1],
       [   0,    0,    0, ...,  118, 8072,    1]], dtype=int32)

In [None]:
# Encode the held-out tweets (X_test, not X_train) with the tokenizer fitted on
# the training split, then pad them to the same length as the training matrix so
# predictions made from this matrix line up row-for-row with y_test.
seqtest = tok.texts_to_sequences(X_test)
seqmattest = sequence.pad_sequences(seqtest, maxlen= int(max_len))

In [None]:
# Re-display the vocabulary size used to size the embedding layer below.
vocab_len

8072

In [None]:
# Re-display the sequence length used as the model's input length below.
max_len

28.0

In [None]:
# Simple recurrent classifier: embedding -> SimpleRNN -> dense head -> 5-way softmax.
rnn = Sequential()

# vocab_len + 1 rows: word ids start at 1 and id 0 is the padding value, which
# mask_zero=True tells downstream layers to skip.
rnn.add(Embedding(vocab_len+1,20, input_length=int(max_len), mask_zero=True))
rnn.add(SimpleRNN(units=16, activation="tanh"))
rnn.add(Dense(units=16, activation="relu"))
rnn.add(Dropout(0.2))

# One output unit per sentiment class.
rnn.add(Dense(units=5, activation="softmax"))

# Targets are integer class ids, hence the sparse variant of cross-entropy.
rnn.compile(optimizer="adam", loss="sparse_categorical_crossentropy",metrics=['accuracy'])

rnn.fit(seqmattrain, y_train, batch_size=25, epochs=10)

# predict() returns one probability per class for every row. The predicted label
# is the index of the largest probability; thresholding at 0.5 would instead
# produce a boolean matrix (possibly with no True in a row), which is not a
# valid multiclass prediction.
yprob = rnn.predict(seqmattest)

ypred = np.argmax(yprob, axis=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from sklearn.metrics import classification_report
# Label the report rows with the original sentiment names instead of the opaque
# integer codes. Passing `labels` explicitly keeps the rows aligned with
# `target_names` even if some class is absent from y_test or ypred.
class_ids = list(range(len(le.classes_)))
class_names = [str(c) for c in le.classes_]
print(classification_report(y_test, ypred, labels=class_ids, target_names=class_names))

ValueError: ignored