# Create a classification model to predict the sentiment either (Positive or Negative) based on Covid Tweets

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Corona_NLP (2).csv',encoding = 'latin1')
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [3]:
 df.shape

(41157, 6)

## I Have sliced the data into 20000 rows as my kernel is crashing and getting a ''memory error''.

In [4]:
df = df[:20000]

In [5]:
df.isnull().sum()

UserName            0
ScreenName          0
Location         4313
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [6]:
df['Sentiment'].value_counts()

Positive              5425
Negative              4978
Neutral               3576
Extremely Positive    3038
Extremely Negative    2983
Name: Sentiment, dtype: int64

In [7]:
df['Sentiment']=df['Sentiment'].replace({'Extremely Positive':'Positive','Extremely Negative':'Negative'})
df['Sentiment'].value_counts()

Positive    8463
Negative    7961
Neutral     3576
Name: Sentiment, dtype: int64

In [8]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report

In [10]:
lb = WordNetLemmatizer()
sw = stopwords.words('english')
print(sw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
corpus = []
for i in df['OriginalTweet']:
  txt = re.sub('[^A-Za-z0-9]',' ',i) 
  txt = txt.lower()
  txt = word_tokenize(txt)
  txt = [i for i in txt if i not in sw]
  txt = [lb.lemmatize(i) for i in txt]
  txt = ' '.join(txt)
  corpus.append(txt)

In [12]:
print(corpus)



In [13]:
print(corpus[:10])

['menyrbie phil gahan chrisitv http co ifz9fan2pa http co xx6ghgfzcc http co i2nlzdxno8', 'advice talk neighbour family exchange phone number create contact list phone number neighbour school employer chemist gp set online shopping account po adequate supply regular med order', 'coronavirus australia woolworth give elderly disabled dedicated shopping hour amid covid 19 outbreak http co binca9vp8p', 'food stock one empty please panic enough food everyone take need stay calm stay safe covid19france covid 19 covid19 coronavirus confinement confinementotal confinementgeneral http co zrlg0z520j', 'ready go supermarket covid19 outbreak paranoid food stock litteraly empty coronavirus serious thing please panic cause shortage coronavirusfrance restezchezvous stayathome confinement http co usmualq72n', 'news region first confirmed covid 19 case came sullivan county last week people flocked area store purchase cleaning supply hand sanitizer food toilet paper good tim dodson report http co cfxch7

In [14]:
cv = CountVectorizer()

In [15]:
res = cv.fit_transform(corpus).toarray()

In [16]:
print(res.shape)

(20000, 41442)


In [17]:
print(type(res))

<class 'numpy.ndarray'>


In [18]:
x = res
y = df['Sentiment']
print(len(x))
print(y.shape)

20000
(20000,)


In [19]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(15000, 41442)
(5000, 41442)
(15000,)
(5000,)


In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

**Applying MultiNomial Naive Bayes Algorithm**

In [21]:
m1 = MultinomialNB()
m1.fit(x_train,y_train)

MultinomialNB()

In [22]:
print('Training score ',m1.score(x_train,y_train))
print('Testing score ',m1.score(x_test,y_test))

Training score  0.8422666666666667
Testing score  0.6716


In [23]:
ypred_m1=m1.predict(x_test)
print(ypred_m1)

['Negative' 'Positive' 'Negative' ... 'Negative' 'Negative' 'Positive']


In [24]:
cm_m1=confusion_matrix(y_test,ypred_m1)
print(cm_m1)
print(classification_report(y_test,ypred_m1))

[[1505   47  464]
 [ 258  195  397]
 [ 424   52 1658]]
              precision    recall  f1-score   support

    Negative       0.69      0.75      0.72      2016
     Neutral       0.66      0.23      0.34       850
    Positive       0.66      0.78      0.71      2134

    accuracy                           0.67      5000
   macro avg       0.67      0.58      0.59      5000
weighted avg       0.67      0.67      0.65      5000



**Applying RandomForestClassfier Algorithm**

In [25]:
m2 = RandomForestClassifier(n_estimators=80,criterion='gini',max_depth=7)
m2.fit(x_train,y_train)

RandomForestClassifier(max_depth=7, n_estimators=80)

In [26]:
print('Training Score',m2.score(x_train,y_train))
print('Testing Score',m2.score(x_test,y_test))

Training Score 0.5583333333333333
Testing Score 0.5528


In [27]:
ypred_m2 = m2.predict(x_test)
print(ypred_m2)

['Positive' 'Positive' 'Positive' ... 'Positive' 'Positive' 'Positive']


In [28]:
cm_m2 = confusion_matrix(y_test,ypred_m2)
print(cm_m2)
print(classification_report(y_test,ypred_m2,zero_division=0))

[[ 717    0 1299]
 [  27    0  823]
 [  87    0 2047]]
              precision    recall  f1-score   support

    Negative       0.86      0.36      0.50      2016
     Neutral       0.00      0.00      0.00       850
    Positive       0.49      0.96      0.65      2134

    accuracy                           0.55      5000
   macro avg       0.45      0.44      0.38      5000
weighted avg       0.56      0.55      0.48      5000



**Appplying KNN Classification Algorithm**

In [29]:
m3 = KNeighborsClassifier(n_neighbors=250)
m3.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=250)

In [30]:
print('Training Score',m3.score(x_train,y_train))
print('Testing Score',m3.score(x_test,y_test))

Training Score 0.19186666666666666
Testing Score 0.18


In [31]:
ypred_m3 = m3.predict(x_test)
print(ypred_m3)

['Neutral' 'Neutral' 'Neutral' ... 'Neutral' 'Neutral' 'Neutral']


In [32]:
cm_m3 = confusion_matrix(y_test,ypred_m3)
print(cm_m3)
print(classification_report(y_test,ypred_m3,zero_division=0))

[[  44 1972    0]
 [   0  850    0]
 [   9 2119    6]]
              precision    recall  f1-score   support

    Negative       0.83      0.02      0.04      2016
     Neutral       0.17      1.00      0.29       850
    Positive       1.00      0.00      0.01      2134

    accuracy                           0.18      5000
   macro avg       0.67      0.34      0.11      5000
weighted avg       0.79      0.18      0.07      5000



# As I Have used only 20000 rows of data, the precision is quite lower.

# In The above three models Multinomial Navie Bayes Algorithm gives high precision.