# **Sentiment Analysis on COVID-19 related Tweets (Machine Learning: TF-IDF)**


In [None]:
# Mount Google Drive into the Colab runtime; the dataset CSV read later in
# this notebook is located under this mount point.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


# **Import Libraries**

In [None]:
import base64

import numpy as np 
import pandas as pd

#Plotly imports
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objects as go
import plotly.tools as tls
import re
import nltk
import string


# Other imports
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from matplotlib import pyplot as plt
%matplotlib inline
%matplotlib notebook

# **Reading CSV File**


In [None]:
# All dataset variants sit in the same Drive folder; only the file name differs.
DATA_DIR = '/content/gdrive/MyDrive/Colab Notebooks/'

# Alternative inputs that were previously listed (commented out) in this cell:
#   'Text_Blob_PolarityAnalysis.csv'             - TextBlob
#   'COVID-19MasterFinalDataset.csv'             - Original 45,007
#   '36kCleanedCOVID-19MasterFinalDataset.csv'   - 36k Tweets
DATASET_FILE = 'SentimentAnalysisCOVID-19MasterFinalDataset.csv'

train = pd.read_csv(DATA_DIR + DATASET_FILE)
train.head(10)

Unnamed: 0,Date,Tweets,Translated,Polarity,Sentiment,length
0,Mar,Me to COVID-19: pic.twitter.com/OLX9LTjTsW,me to covid,0.0,Neutral,11
1,May,So many realizations during ECQ because of COV...,so many realizations during ecq because of cov...,0.3,Positive,153
2,Mar,Murag sure pko na magka covid kesa magbalik mi...,it's like a covid covid kesa back to cejay hah...,0.1,Positive,51
3,Apr,"While we are all fighting against COVID-19, me...",while we are all fighting against covid meanwh...,0.0,Neutral,112
4,Mar,"Taong bahay, dahil sa covid-19😴",person house because of covid,0.0,Neutral,29
5,Mar,"mamamatay ata ako sa stress, hindi sa covid.",I will die in stress not in covid,0.0,Neutral,33
6,Jun,I want to join kaso may pangamba pa rin ng COV...,I Want To Join Case There is still a covid,0.0,Neutral,42
7,Mar,"So gi confiscate ang akuang alcohol na 75mL, u...",So confiscate the Akuang Alcohol ML then the w...,0.0,Neutral,62
8,Apr,I choose you to be a positive from covid 19 ch...,i choose you to be a positive from covid char...,0.227273,Positive,53
9,Apr,"Jgh, then ligo. Grabe covid stop na. 🤷🏻‍♂️🙏",jgh then ligo grabe covid stop na 🤷,0.0,Neutral,35


In [None]:
# Dimensions of the loaded dataset as a (rows, columns) tuple.
train.shape

(44709, 6)

# **Select the Translated Tweets and Their Sentiment Labels**

In [None]:
# Keep only the translated tweet text and its sentiment label.
# .copy() makes new_data an independent DataFrame instead of a slice of
# `train`, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
new_data = train[['Translated', 'Sentiment']].copy()
new_data.head(20)

Unnamed: 0,Translated,Sentiment
0,me to covid,Neutral
1,so many realizations during ecq because of cov...,Positive
2,it's like a covid covid kesa back to cejay hah...,Positive
3,while we are all fighting against covid meanwh...,Neutral
4,person house because of covid,Neutral
5,I will die in stress not in covid,Neutral
6,I Want To Join Case There is still a covid,Neutral
7,So confiscate the Akuang Alcohol ML then the w...,Neutral
8,i choose you to be a positive from covid char...,Positive
9,jgh then ligo grabe covid stop na 🤷,Neutral


In [None]:
def convert_label(sentiment):
  """Map a sentiment label to its numeric code.

  Parameters
  ----------
  sentiment : str
      One of "Negative", "Neutral" or "Positive" (case-sensitive).

  Returns
  -------
  int or None
      -1 for "Negative", 0 for "Neutral", 1 for "Positive";
      None for any other value.
  """
  # A plain lookup replaces the former loop over the global `new_data`, which
  # returned on its first iteration and only made the function depend on
  # notebook state that is unrelated to the conversion itself.
  label_codes = {"Negative": -1, "Neutral": 0, "Positive": 1}
  return label_codes.get(sentiment)

# Add the numeric "Analysis" column via assign(), which returns a new
# DataFrame rather than writing into what may be a slice of `train`
# (in-place column assignment here raised SettingWithCopyWarning).
new_data = new_data.assign(Analysis=new_data["Sentiment"].apply(convert_label))
new_data.head()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Translated,Sentiment,Analysis
0,me to covid,Neutral,0
1,so many realizations during ecq because of cov...,Positive,1
2,it's like a covid covid kesa back to cejay hah...,Positive,1
3,while we are all fighting against covid meanwh...,Neutral,0
4,person house because of covid,Neutral,0


# **Data Definition**

In [None]:
# Summarise the working DataFrame: its dimensions, its column labels, and
# then the per-column dtype / non-null report.
print(f'Dataset size: {new_data.shape}')
print(f'Columns are: {new_data.columns}')
new_data.info()

Dataset size: (44709, 3)
Columns are: Index(['Translated', 'Sentiment', 'Analysis'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44709 entries, 0 to 44708
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Translated  44709 non-null  object
 1   Sentiment   44709 non-null  object
 2   Analysis    44709 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


# **Data Analysis**

# **Data Cleaning**

In [None]:
# Model inputs are the translated tweet text; targets are the sentiment label strings.
X, y = new_data['Translated'], new_data['Sentiment']

In [None]:
def _clean_tweet(text):
    """Normalise one tweet for vectorisation.

    Steps, in order: replace every non-word character with a space, drop
    single letters surrounded by whitespace, drop a single letter at the very
    start, collapse whitespace runs, strip a leading 'b ' prefix, lowercase.

    Parameters
    ----------
    text : object
        The raw tweet; it is converted with str() first, so non-string
        values (e.g. NaN) are processed as their string form.

    Returns
    -------
    str
        The cleaned, lowercased tweet.
    """
    # Remove all the special characters
    cleaned = re.sub(r'\W', ' ', str(text))

    # Remove all single characters
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)

    # Remove single characters from the start. The caret must be an unescaped
    # anchor: the previous pattern r'\^[a-zA-Z]\s+' looked for a literal '^',
    # which can never be present after the \W substitution above, so this
    # step silently did nothing.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)

    # Substitute multiple spaces with a single space
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Remove a prefixed 'b'
    cleaned = re.sub(r'^b\s+', '', cleaned)

    # Convert to lowercase
    return cleaned.lower()


# Iterate over the values directly instead of indexing X by position: this
# does not rely on X having a default 0..n-1 index, and it no longer reuses
# `new_data` as the loop variable (which overwrote the DataFrame with an int).
processed_tweets = [_clean_tweet(tweet) for tweet in X]

# **Data Preprocessing**

# **Stop-words**

In [None]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus that stopwords.words() reads from.
nltk.download('stopwords')

# English stop words, de-duplicated into a set.
stop_words_senti = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## **TF-IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# scikit-learn validates `stop_words` as 'english', a list, or None; a set is
# rejected by recent releases. sorted() yields a list with a stable order.
tfidfconverter = TfidfVectorizer(
    max_features=2000,  # keep only the 2000 most frequent terms
    min_df=5,           # ignore terms found in fewer than 5 tweets
    max_df=0.7,         # ignore terms found in more than 70% of tweets
    stop_words=sorted(stop_words_senti),
)
# Alternative with default settings:
#tfidfconverter = TfidfVectorizer()

# NOTE: X is rebound here from the raw tweet Series to the dense TF-IDF
# matrix. Re-run the cell that defines X and y before re-running the cleaning
# cell, otherwise the cleaning step would receive matrix rows instead of text.
X = tfidfconverter.fit_transform(processed_tweets).toarray()

# **Train Test Split**

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical across runs.
split_options = dict(test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest (seeded) on the training split and predict
# labels for the held-out split.
random_forest = RandomForestClassifier(n_estimators=100, random_state=50).fit(X_train, y_train)
predictions_randomforest = random_forest.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
for report in (
    confusion_matrix(y_test, predictions_randomforest),
    classification_report(y_test, predictions_randomforest),
):
    print(report)

# Overall accuracy on the test predictions, followed by the estimator's own
# score on the training and test splits.
print('Accuracy Score: ', accuracy_score(y_test, predictions_randomforest))
print('Train Accuracy Score: ', random_forest.score(X_train, y_train))
print('Test Accuracy Score: ', random_forest.score(X_test, y_test))

[[1141  173  253]
 [  14 3493   53]
 [  88  203 3524]]
              precision    recall  f1-score   support

    Negative       0.92      0.73      0.81      1567
     Neutral       0.90      0.98      0.94      3560
    Positive       0.92      0.92      0.92      3815

    accuracy                           0.91      8942
   macro avg       0.91      0.88      0.89      8942
weighted avg       0.91      0.91      0.91      8942

Accuracy Score:  0.9123238649071796
Train Accuracy Score:  0.9942964184863142
Test Accuracy Score:  0.9123238649071796


# **Multinomial Naive Bayes**

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with a small smoothing parameter (alpha = 0.003).
mnb = MultinomialNB(alpha=0.003)
mnb.fit(X_train, y_train)

predictions_mnb = mnb.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
for report in (
    confusion_matrix(y_test, predictions_mnb),
    classification_report(y_test, predictions_mnb),
):
    print(report)

# Overall accuracy on the test predictions, followed by the estimator's own
# score on the training and test splits.
print('Accuracy Score: ', accuracy_score(y_test, predictions_mnb))
print('Train Accuracy Score: ', mnb.score(X_train, y_train))
print('Test Accuracy Score: ', mnb.score(X_test, y_test))


[[ 616  247  704]
 [  18 2683  859]
 [  43  308 3464]]
              precision    recall  f1-score   support

    Negative       0.91      0.39      0.55      1567
     Neutral       0.83      0.75      0.79      3560
    Positive       0.69      0.91      0.78      3815

    accuracy                           0.76      8942
   macro avg       0.81      0.68      0.71      8942
weighted avg       0.78      0.76      0.74      8942

Accuracy Score:  0.7563184969805412
Train Accuracy Score:  0.7886599379316129
Test Accuracy Score:  0.7563184969805412


# **Linear Support Vector Machine**

In [None]:
from sklearn.svm import LinearSVC

# Hyper-parameters for the linear SVM, gathered in one place.
# NOTE(review): scikit-learn's LinearSVC documentation says that `loss` and
# `penalty` are ignored when multi_class='crammer_singer' - confirm whether
# they are meant to have an effect here.
svc_params = {
    'C': 1.0,
    'loss': 'squared_hinge',
    'max_iter': 10000,
    'penalty': 'l2',
    'multi_class': 'crammer_singer',
    'random_state': 500,
}
linSVC = LinearSVC(**svc_params).fit(X_train, y_train)

predictions_linSVC = linSVC.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
for report in (
    confusion_matrix(y_test, predictions_linSVC),
    classification_report(y_test, predictions_linSVC),
):
    print(report)

# Overall accuracy on the test predictions, followed by the estimator's own
# score on the training and test splits.
print('Accuracy Score: ', accuracy_score(y_test, predictions_linSVC))
print('Train Accuracy Score: ', linSVC.score(X_train, y_train))
print('Test Accuracy Score: ', linSVC.score(X_test, y_test))


Liblinear failed to converge, increase the number of iterations.



[[1083  254  230]
 [  26 3492   42]
 [ 123  352 3340]]
              precision    recall  f1-score   support

    Negative       0.88      0.69      0.77      1567
     Neutral       0.85      0.98      0.91      3560
    Positive       0.92      0.88      0.90      3815

    accuracy                           0.89      8942
   macro avg       0.89      0.85      0.86      8942
weighted avg       0.89      0.89      0.88      8942

Accuracy Score:  0.8851487363006039
Train Accuracy Score:  0.8909609416501244
Test Accuracy Score:  0.8851487363006039


# **Multi-class Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised from 100 to 1000: with 100 iterations the lbfgs solver
# stopped at its iteration limit before converging, so the reported
# coefficients were not a converged fit.
# multi_class='multinomial' is no longer passed: the argument is deprecated in
# recent scikit-learn, and the default lbfgs solver already fits a multinomial
# model when the target has more than two classes.
# Class-weighted alternative:
#logRes = LogisticRegression(C=0.6, class_weight='balanced', max_iter=1000, random_state=0)
logRes = LogisticRegression(C=0.6, max_iter=1000, random_state=0)
logRes.fit(X_train, y_train)

predictions_logRes = logRes.predict(X_test)

# Confusion matrix and per-class precision/recall/F1 on the held-out split.
print(confusion_matrix(y_test, predictions_logRes))
print(classification_report(y_test, predictions_logRes))

# Overall accuracy on the test predictions, followed by the estimator's own
# score on the training and test splits.
print('Accuracy Score: ', accuracy_score(y_test, predictions_logRes))
print('Train Accuracy Score: ', logRes.score(X_train, y_train))
print('Test Accuracy Score: ', logRes.score(X_test, y_test))


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



[[ 927  339  301]
 [  15 3449   96]
 [  95  381 3339]]
              precision    recall  f1-score   support

    Negative       0.89      0.59      0.71      1567
     Neutral       0.83      0.97      0.89      3560
    Positive       0.89      0.88      0.88      3815

    accuracy                           0.86      8942
   macro avg       0.87      0.81      0.83      8942
weighted avg       0.87      0.86      0.86      8942

Accuracy Score:  0.8627823753075374
Train Accuracy Score:  0.879833365951855
Test Accuracy Score:  0.8627823753075374


# **Adaboost Classifier**

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# random_state is now fixed so this model is reproducible, like every other
# estimator and the train/test split in this notebook.
# NOTE: the recorded run of this cell ended in a KeyboardInterrupt, so no
# results were captured; fitting 300 estimators on the dense TF-IDF matrix
# may take a long time.
abc = AdaBoostClassifier(n_estimators=300, learning_rate=1, random_state=0)
abc.fit(X_train, y_train)

predictions_abc = abc.predict(X_test)

# Confusion matrix and per-class precision/recall/F1 on the held-out split.
print(confusion_matrix(y_test, predictions_abc))
print(classification_report(y_test, predictions_abc))

# Overall accuracy on the test predictions, followed by the estimator's own
# score on the training and test splits.
print('Accuracy Score: ', accuracy_score(y_test, predictions_abc))
print('Train Accuracy Score: ', abc.score(X_train, y_train))
print('Test Accuracy Score: ', abc.score(X_test, y_test))

KeyboardInterrupt: ignored