In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [1]:
from zipfile import ZipFile 

zip_file_path = r'datasets\Corona_NLP_train.csv.zip'
# Link: https://www.kaggle.com/datasets/datatattle/covid-19-nlp-text-classification?resource=download&select=Corona_NLP_train.csv

extract_to_path = 'datasets' 

with ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_path)

print(f"Contents of '{zip_file_path}' extracted to '{extract_to_path}'")

Contents of 'datasets\Corona_NLP_train.csv.zip' extracted to 'datasets'


In [3]:
data_train = pd.read_csv(r'datasets\Corona_NLP_train.csv', encoding='ISO-8859-1')
data_test = pd.read_csv(r'datasets\Corona_NLP_test.csv', encoding='ISO-8859-1')

In [4]:
data_train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [5]:
data_test.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [6]:
sentiments = data_train['Sentiment'].unique()
print(sentiments)

['Neutral' 'Positive' 'Extremely Negative' 'Negative' 'Extremely Positive']


In [7]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,5), )
vectorizer.fit(data_train['OriginalTweet'])

le = LabelEncoder()
le.fit(data_train['Sentiment'])

def processing_pipeline(data):
    data = data.dropna(subset=['OriginalTweet', 'Sentiment'])
    data = data.drop(columns=['UserName', 'ScreenName', 'Location', 'TweetAt'], errors='ignore')
    data['label'] = le.transform(data['Sentiment'])
    X = vectorizer.transform(data['OriginalTweet'])
    y = data['label']
    return X, y


In [9]:
X_train, y_train = processing_pipeline(data_train)
X_test, y_test = processing_pipeline(data_train)

In [10]:
print(X_train[2])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 60 stored elements and shape (1, 2290478)>
  Coords	Values
  (0, 13475)	0.032022053468199356
  (0, 30777)	0.08194209675863416
  (0, 31082)	0.10562270586548271
  (0, 31099)	0.15829455996806083
  (0, 128046)	0.06905042831269106
  (0, 128566)	0.08431720972217066
  (0, 128567)	0.08436102183373943
  (0, 128675)	0.11004459911679594
  (0, 128686)	0.12424690060071743
  (0, 182039)	0.0935457139527707
  (0, 182594)	0.1524234690632159
  (0, 182598)	0.15829455996806083
  (0, 182599)	0.15829455996806083
  (0, 182600)	0.15829455996806083
  (0, 229632)	0.15829455996806083
  (0, 423269)	0.027114568703151584
  (0, 424380)	0.14238677286883272
  (0, 424390)	0.15829455996806083
  (0, 424391)	0.15829455996806083
  (0, 424392)	0.15829455996806083
  (0, 477031)	0.03257480199570882
  (0, 477041)	0.03310740402945268
  (0, 489038)	0.08216659570253353
  (0, 489236)	0.10562270586548271
  (0, 489247)	0.15829455996806083
  :	:
  (0, 677068)	0.07996686784

In [11]:
model = MultinomialNB()
y_pred = model.fit(X_train, y_train).predict(X_test)

In [12]:

print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 41157 points : 4571


In [13]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8889374832956727
              precision    recall  f1-score   support

           0       1.00      0.54      0.70      5481
           1       1.00      0.79      0.88      6624
           2       0.92      0.99      0.95      9917
           3       0.99      0.92      0.96      7713
           4       0.76      1.00      0.86     11422

    accuracy                           0.89     41157
   macro avg       0.93      0.85      0.87     41157
weighted avg       0.91      0.89      0.88     41157

