In [4]:
import pandas as pd
import numpy as np
import plotly.express as px
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import LinearSVC
from gensim.models import Word2Vec
import contractions


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional,Embedding, Dropout,BatchNormalization, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import BertTokenizerFast, RobertaTokenizerFast, TFRobertaModel, TFBertModel

In [5]:
data = pd.read_csv(r'../../data/Corona_NLP_train.csv', encoding= 'ISO-8859-1')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [7]:
# filter the data to only conbtain the tweets and the sentiment
data = data[['OriginalTweet', 'Sentiment']]
data.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [8]:
# plot the sentiment distribution using plotly
px.histogram(data, x='Sentiment', title='Sentiment Distribution')

In [9]:
# The data seems balanced, we can now proceed to clean the data

In [10]:
# Initialize the tokenizer and other resources
tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [11]:
# function to clean the data
def preprocess_tweet(tweet):
    # Remove accents
    tweet = unidecode(tweet)
    # make the tweet to lower case
    tweet = tweet.lower()
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    # Remove mentions
    tweet = re.sub(r'@\w+', '', tweet)
    # Replace hastags with the space
    tweet = re.sub(r'#', ' ', tweet)
    # Remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove digits
    tweet = re.sub(r'\d+', '', tweet)
    # Tokenize tweet
    tokens = tokenizer.tokenize(tweet)
    # Remove stop words and do lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back to string
    tweet = ' '.join(tokens)
    # Remove multiple spaces with a single space
    tweet = re.sub(r'\s+', ' ', tweet)
    # Strip leading/trailing spaces
    tweet = tweet.strip()
    return tweet

In [12]:
# Apply preprocessing to the 'OriginalTweet	' column
data['cleaned_tweet'] = data['OriginalTweet'].apply(preprocess_tweet)

In [13]:
# Check the empty values in the cleaned_tweet column
data[data['cleaned_tweet'] == ''].shape[0]

34

In [14]:
# Remove the empty values
data = data[data['cleaned_tweet'] != '']

In [15]:
data.shape

(41123, 3)

In [16]:
# Take only the cleaned tweets and the sentiment
data = data[['cleaned_tweet', 'Sentiment']]

In [17]:
# find the length of the tokens from the cleaned tweets
data['tweet_length'] = data['cleaned_tweet'].apply(lambda x: len(x.split()))

In [18]:
data = data.sort_values(by=['tweet_length'], ascending=False)
data.head(20)

Unnamed: 0,cleaned_tweet,Sentiment,tweet_length
35866,mdoc horhn m m brown mississippi prisoner goin...,Extremely Negative,40
21439,u oil rig drop hugely dallas fed energy survey...,Extremely Positive,39
7600,domino fall coronavirus gt world healthemergen...,Extremely Positive,39
10514,new play cannabis sale price skyrocketing also...,Positive,38
16632,bir ddettir sa permarketlerin lojistik hizmeti...,Neutral,38
29288,war drug firing ig amp capt cg u lost battle v...,Extremely Negative,38
37156,nisan cumartesi itibariyle bbanin ta hizmetler...,Neutral,38
27005,supermercados econo confirman que un empleado ...,Neutral,37
9027,la escena se repite alrededor del mundo desde ...,Neutral,37
13168,mar climate change locust crisis covid threate...,Negative,37


In [19]:
# Filter the data such that the first row starts from 13th row
data = data.iloc[10:]
data.head()

Unnamed: 0,cleaned_tweet,Sentiment,tweet_length
31813,big story wash post abt walmart worker death a...,Neutral,37
25209,impact covid somalia food supply chain covid a...,Extremely Negative,37
19861,anonymous donor gave u certificate tampt pet f...,Negative,37
8374,wake news breakfast school run daily food shop...,Extremely Negative,37
39866,president trump last day rtad call dr faucis f...,Extremely Negative,37


In [20]:
# let's suffle the data and reset the index
data = data.sample(frac=1)
# reset the index because the index was not a good representation of the data
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,cleaned_tweet,Sentiment,tweet_length
0,facing short term financial issue contact bank...,Negative,18
1,covid pandemic created issue wedding industry ...,Extremely Positive,26
2,apparently take pandemic lower insulin price l...,Negative,14
3,kantar report diagnosis consumer response covi...,Neutral,10
4,food redistribution organisation across englan...,Positive,19


In [21]:
# Sentiment Column Analysis
data['Sentiment'].value_counts()

Sentiment
Positive              11419
Negative               9915
Neutral                7678
Extremely Positive     6622
Extremely Negative     5479
Name: count, dtype: int64

In [22]:
# convert the sentiments to only three categories|
data['Sentiment'] = data['Sentiment'].map({'Extremely Negative':0,'Negative':0,'Neutral':1,'Positive':2,'Extremely Positive':2})

In [23]:
data['Sentiment'].value_counts()

Sentiment
2    18041
0    15394
1     7678
Name: count, dtype: int64

In [24]:
# Split the data into X and y
X = data['cleaned_tweet']
y = data['Sentiment']


In [25]:
# split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(30834,) (10279,) (30834,) (10279,)


In [26]:
# initialise the tf idf vectorizer
tfidf = TfidfVectorizer(max_features=1000)
tf_x_train = tfidf.fit_transform(X_train)
tf_x_test = tfidf.transform(X_test)

In [27]:
# Apply min max scaler to the data
scaler = MaxAbsScaler()
scaler.fit(tf_x_train)
tf_x_train = scaler.transform(tf_x_train)
tf_x_test = scaler.transform(tf_x_test)

In [28]:
# create a function to fit different models
def fit_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print('Accuracy: ', accuracy)
    print('\nPrecision: ', precision)
    print('\nRecall: ', recall)
    print('\nF1 Score: ', f1)
    print('\nConfusion Matrix: \n', confusion_matrix(y_test, y_pred))
    print('Classification Report: \n', classification_report(y_test, y_pred))

In [29]:
# Fit a logistic regression model to the data
logreg = LogisticRegression(max_iter = 1000)   
fit_model(logreg, tf_x_train, y_train, tf_x_test, y_test)

Accuracy:  0.7295456756493822

Precision:  0.7304853381145662

Recall:  0.7295456756493822

F1 Score:  0.7299730801964431

Confusion Matrix: 
 [[2805  396  643]
 [ 398 1224  315]
 [ 660  368 3470]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.73      0.73      0.73      3844
           1       0.62      0.63      0.62      1937
           2       0.78      0.77      0.78      4498

    accuracy                           0.73     10279
   macro avg       0.71      0.71      0.71     10279
weighted avg       0.73      0.73      0.73     10279



In [30]:
# fit a random forest model to the data
rf = RandomForestClassifier()
fit_model(rf, tf_x_train, y_train, tf_x_test, y_test)

Accuracy:  0.7068781009825859

Precision:  0.707808651149755

Recall:  0.7068781009825859

F1 Score:  0.7072770257916302

Confusion Matrix: 
 [[2693  409  742]
 [ 392 1181  364]
 [ 684  422 3392]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.71      0.70      0.71      3844
           1       0.59      0.61      0.60      1937
           2       0.75      0.75      0.75      4498

    accuracy                           0.71     10279
   macro avg       0.69      0.69      0.69     10279
weighted avg       0.71      0.71      0.71     10279



In [31]:
# fit the naive bayes model to the data
nb = MultinomialNB()
fit_model(nb, tf_x_train, y_train, tf_x_test, y_test)

Accuracy:  0.6434478062068295

Precision:  0.6406996484360175

Recall:  0.6434478062068295

F1 Score:  0.6247272684278374

Confusion Matrix: 
 [[2605  162 1077]
 [ 568  494  875]
 [ 841  142 3515]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.65      0.68      0.66      3844
           1       0.62      0.26      0.36      1937
           2       0.64      0.78      0.71      4498

    accuracy                           0.64     10279
   macro avg       0.64      0.57      0.58     10279
weighted avg       0.64      0.64      0.62     10279



In [32]:
# fit the linear svc model to the data
svc = LinearSVC()
fit_model(svc, tf_x_train, y_train, tf_x_test, y_test)





Accuracy:  0.7329506761358109

Precision:  0.7360021037452354

Recall:  0.7329506761358109

F1 Score:  0.734138404404522

Confusion Matrix: 
 [[2782  438  624]
 [ 364 1283  290]
 [ 625  404 3469]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.74      0.72      0.73      3844
           1       0.60      0.66      0.63      1937
           2       0.79      0.77      0.78      4498

    accuracy                           0.73     10279
   macro avg       0.71      0.72      0.71     10279
weighted avg       0.74      0.73      0.73     10279



In [33]:
# fit the xgboost model to the data
xgb_model = xgb.XGBClassifier()
fit_model(xgb_model, tf_x_train, y_train, tf_x_test, y_test)

Accuracy:  0.7223465317637903

Precision:  0.7259504474049613

Recall:  0.7223465317637903

F1 Score:  0.7233608912410466

Confusion Matrix: 
 [[2674  451  719]
 [ 286 1295  356]
 [ 609  433 3456]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.75      0.70      0.72      3844
           1       0.59      0.67      0.63      1937
           2       0.76      0.77      0.77      4498

    accuracy                           0.72     10279
   macro avg       0.70      0.71      0.71     10279
weighted avg       0.73      0.72      0.72     10279



In [34]:
# save a contractions dict to` be used in the next section
contractions_dict = {
    "ain't": "am not / are not / is not / has not / have not",
    "aren't": "are not",                               
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'll": "he shall / he will",
    "he's": "he is",
    "I'd": "I had / I would",
    "I'll": "I shall / I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it had / it would",
    "it'll": "it shall / it will",
    "it's": "it is",
    "let's": "let us",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she had / she would",
    "she'll": "she shall / she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they had / they would",
    "they'll": "they shall / they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'll": "we shall / we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall / what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who had / who would",
    "who'll": "who shall / who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you had / you would",
    "you'll": "you shall / you will",
    "you're": "you are",
    "you've": "you have"
}

In [35]:
# create a class to  preprocess the data to create a pipeline
class PreProcessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Initialize components for text processing
        self.tokenizer = TweetTokenizer()
        # Example custom stopwords list (keeping negations like 'not', 'no', etc.)
        custom_stopwords = set(stopwords.words('english')) - {'not', 'no', 'never'}
        self.stop_words = custom_stopwords
        self.lemmatizer = WordNetLemmatizer()
        
    # Function to expand contractions using the 'contractions' library
    def expanding_words(self, tweet):
        return contractions.fix(tweet)
        
    def preprocess_tweet(self, tweet):
        # Remove accents
        tweet = unidecode(tweet)
        # Make the tweet lowercase
        tweet = tweet.lower()
        # Remove URLs
        tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
        # Remove mentions
        tweet = re.sub(r'@\w+', '', tweet)
        # Replace hashtags with space
        tweet = re.sub(r'#', ' ', tweet)
        # Remove punctuation
        tweet = tweet.translate(str.maketrans('', '', string.punctuation))
        # Remove digits
        tweet = re.sub(r'\d+', '', tweet)
        # Tokenize tweet
        tokens = self.tokenizer.tokenize(tweet)
        # Remove stop words and apply lemmatization
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens if word not in self.stop_words]
        # Join tokens back to string
        tweet = ' '.join(tokens)
        # Remove multiple spaces with a single space
        tweet = re.sub(r'\s+', ' ', tweet)
        return tweet
        
    def fit(self, X, y=None):
        # No fitting needed, just return self
        return self
    
    def transform(self, X, y=None):
        # First, expand contractions, then preprocess tweet
        X_expanded = X.apply(self.expanding_words)
        # Apply preprocessing to the expanded data
        return X_expanded.apply(self.preprocess_tweet)


In [36]:
# import the testing data
test_data = pd.read_csv(r'../../data/Corona_NLP_test.csv', encoding= 'ISO-8859-1')

In [37]:
test_data = test_data[['OriginalTweet', 'Sentiment']]

In [38]:
# split the data to X and y
X_test = test_data['OriginalTweet']
y_test = test_data['Sentiment']

In [39]:
# convert the sentiments to only three categories
y_test = y_test.map({'Extremely Negative':0,'Negative':0,'Neutral':1,'Positive':2,'Extremely Positive':2})

In [40]:
X_test.head

<bound method NDFrame.head of 0       TRENDING: New Yorkers encounter empty supermar...
1       When I couldn't find hand sanitizer at Fred Me...
2       Find out how you can protect yourself and love...
3       #Panic buying hits #NewYork City as anxious sh...
4       #toiletpaper #dunnypaper #coronavirus #coronav...
                              ...                        
3793    Meanwhile In A Supermarket in Israel -- People...
3794    Did you panic buy a lot of non-perishable item...
3795    Asst Prof of Economics @cconces was on @NBCPhi...
3796    Gov need to do somethings instead of biar je r...
3797    I and @ForestandPaper members are committed to...
Name: OriginalTweet, Length: 3798, dtype: object>

In [41]:
#create the pipeline with linearsvc model
pipeline = Pipeline([
    ('preprocessor', PreProcessor()),
    ('tfidf', TfidfVectorizer(max_features=1000)),
    ('scaler', MaxAbsScaler()),
    ('model', LinearSVC())
])

In [42]:
# Find the accuracy of the pipeline on the test data
pipeline.fit(X_train, y_train)





In [43]:
# calculate the classification report
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.70      0.73      1633
           1       0.57      0.60      0.59       619
           2       0.75      0.79      0.77      1546

    accuracy                           0.72      3798
   macro avg       0.69      0.70      0.69      3798
weighted avg       0.72      0.72      0.72      3798

