In [307]:
import pandas as pd
import numpy as np
import plotly.express as px
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import LinearSVC
from gensim.models import Word2Vec
import contractions
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional,Embedding, Dropout,BatchNormalization, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import BertTokenizerFast, RobertaTokenizerFast, TFRobertaModel, TFBertModel

In [308]:
# Reading data from two directories up
data = pd.read_csv(r'../../data/Corona_NLP_train.csv', encoding= 'ISO-8859-1')

In [309]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [310]:
# filter the data to only conbtain the tweets and the sentiment
data = data[['OriginalTweet', 'Sentiment']]
data.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [311]:
# plot the sentiment distribution using plotly
px.histogram(data, x='Sentiment', title='Sentiment Distribution')

In [312]:
# The data seems balanced, we can now proceed to clean the data

In [313]:
# Initialize the tokenizer and other resources
tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english')) - {'not', 'no', 'never'}
lemmatizer = WordNetLemmatizer()

In [314]:
# function to clean the data
def preprocess_tweet(tweet):
    # Remove accents
    tweet = unidecode(tweet)
    # make the tweet to lower case
    tweet = tweet.lower()
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    # Remove mentions
    tweet = re.sub(r'@\w+', '', tweet)
    # Replace hastags with the space
    tweet = re.sub(r'#', ' ', tweet)
    # Remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove digits
    tweet = re.sub(r'\d+', '', tweet)
    # Tokenize tweet
    tokens = tokenizer.tokenize(tweet)
    # Remove stop words and do lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back to string
    tweet = ' '.join(tokens)
    # Remove multiple spaces with a single space
    tweet = re.sub(r'\s+', ' ', tweet)
    # Strip leading/trailing spaces
    tweet = tweet.strip()
    return tweet

In [315]:
# Apply preprocessing to the 'OriginalTweet	' column
data['cleaned_tweet'] = data['OriginalTweet'].apply(preprocess_tweet)

In [316]:
# Check the empty values in the cleaned_tweet column
data[data['cleaned_tweet'] == ''].shape[0]

34

In [317]:
# Remove the empty values
data = data[data['cleaned_tweet'] != '']

In [318]:
data.shape

(41123, 3)

In [319]:
# Take only the cleaned tweets and the sentiment
data = data[['cleaned_tweet', 'Sentiment']]

In [320]:
# find the length of the tokens from the cleaned tweets
data['tweet_length'] = data['cleaned_tweet'].apply(lambda x: len(x.split()))

In [321]:
data = data.sort_values(by=['tweet_length'], ascending=False)
data.head(20)

Unnamed: 0,cleaned_tweet,Sentiment,tweet_length
27745,no mall no movie no supermarket no amazon no o...,Extremely Negative,44
35866,mdoc horhn m m brown mississippi prisoner goin...,Extremely Negative,40
21439,u oil rig not drop hugely dallas fed energy su...,Extremely Positive,40
7600,domino fall coronavirus gt world healthemergen...,Extremely Positive,39
29288,war drug firing ig amp capt cg u lost battle v...,Extremely Negative,39
10514,new play cannabis sale price skyrocketing also...,Positive,38
10023,extremely let tesco no delivery no phonecall n...,Extremely Negative,38
37156,nisan cumartesi itibariyle bbanin ta hizmetler...,Neutral,38
28699,wen every w panic buying wks ago told every st...,Extremely Negative,38
16632,bir ddettir sa permarketlerin lojistik hizmeti...,Neutral,38


In [322]:
# Filter the data such that the first row starts from 13th row
data = data.iloc[10:]
data.head()

Unnamed: 0,cleaned_tweet,Sentiment,tweet_length
13168,mar climate change locust crisis covid threate...,Negative,37
31813,big story wash post abt walmart worker death a...,Neutral,37
8374,wake news breakfast school run daily food shop...,Extremely Negative,37
9027,la escena se repite alrededor del mundo desde ...,Neutral,37
40700,amazon home low wage poor working condition am...,Extremely Positive,37


In [323]:
# let's suffle the data and reset the index
data = data.sample(frac=1)
# reset the index because the index was not a good representation of the data
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,cleaned_tweet,Sentiment,tweet_length
0,quite bizarre wuhan report new case complete u...,Extremely Negative,30
1,home decor trendy fashion local shop boutique ...,Neutral,12
2,unlike consumer mobile ascom mobile device pur...,Neutral,23
3,covid may spell end private profit healthcare ...,Positive,15
4,great toiletpaper panic via ht toiletpaper pan...,Positive,12


In [324]:
# Sentiment Column Analysis
data['Sentiment'].value_counts()

Sentiment
Positive              11419
Negative               9916
Neutral                7680
Extremely Positive     6622
Extremely Negative     5476
Name: count, dtype: int64

In [325]:
# convert the sentiments to only three categories|
data['Sentiment'] = data['Sentiment'].map({'Extremely Negative':0,'Negative':0,'Neutral':1,'Positive':2,'Extremely Positive':2})

In [326]:
data['Sentiment'].value_counts()

Sentiment
2    18041
0    15392
1     7680
Name: count, dtype: int64

In [327]:
# Split the data into X and y
X = data['cleaned_tweet']
y = data['Sentiment']


In [328]:
# split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(30834,) (10279,) (30834,) (10279,)


In [329]:
## Training our own word2vec embeddings
## create the story using the cleaned tweets
story = []
# iterate over each row in the dataframe
for index, row in data.iterrows():
    # split the tweet into sentences
    raw_sentences = sent_tokenize(row['cleaned_tweet'])
    # append the sentences to the story
    for sentence in raw_sentences:
        # sentence = simple_preprocess(sentence)
        story.append(sentence)


In [330]:
# create a gensim model
model = Word2Vec(vector_size=300, window=5, min_count=1, workers=4)


In [331]:
# Build the vocabulary
model.build_vocab(story)

In [332]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(4508087, 26419235)

In [333]:
len(model.wv.key_to_index)

27

In [334]:
[sentence for sentence in X_train]

['consumer psychologist detail business need know covid business management psychology success health culture',
 'connect live consumer panel lunch today get question answered whatas going consumer right mrx webinar coronavirus',
 'paul krugman bash trump scheme boost oil profit stonewalling post office aid via',
 'trending outofcontrol teenager coughing grocery store produce coronavirus',
 'need stock food preparation worst case scenario covid betwinnervirtual',
 'need cut another mmbbld fix oil price cure covid',
 'coronavirus poll going grocery store food food delivered lazyvulcanpolls',
 'ohio lockdown least april th reasses situation travel grocery store certain people still go work like law enforcement healthcare etc ohiolockdown coronavirus pandemic',
 'commemorative earring coronavirus toiletpaper',
 'point time three seashell demolition man invented response toilet paper hoarding madness coronavirus toiletpaper',
 'world gone little crazy amount emptyshelved supermarket pictur

In [335]:
# create features for the X_train, X_test using the word2vec embeddings
x_train_word2 = np.array([np.mean([model.wv[word] for word in sentence if word in model.wv] or [np.zeros(100)], axis=0) for sentence in X_train])
x_test_word2 = np.array([np.mean([model.wv[word] for word in sentence if word in model.wv] or [np.zeros(100)], axis=0) for sentence in X_test])

In [336]:
print(x_train_word2)

[[-3.7058596e-02 -3.0994322e-02  8.3870944e-03 ... -1.3036024e-02
  -4.0876202e-02  7.8900442e-02]
 [-3.0395171e-02 -2.6520470e-02 -7.9622697e-03 ... -8.8595198e-03
  -4.9609497e-02  8.4194988e-02]
 [-3.9508767e-02 -3.2794036e-02 -1.5287841e-02 ... -1.9530347e-05
  -4.9790360e-02  6.5771021e-02]
 ...
 [-2.7701473e-02 -1.8611019e-02 -3.2857468e-03 ... -1.3021737e-02
  -2.7716286e-02  6.3945882e-02]
 [-3.3871822e-02 -2.4512006e-02  1.1593896e-02 ...  3.2961229e-03
  -2.9511815e-02  8.4891744e-02]
 [-3.8540415e-02 -9.1845184e-05  1.0828469e-03 ... -1.7626368e-02
  -3.6644034e-02  5.1747572e-02]]


In [338]:
# Find rows where all values are non-zero
rows_all_non_zero = x_train_word2[np.all(x_train_word2 == 0, axis=1)]
print(rows_all_non_zero)

[]


In [339]:
# check the shape of the x_train_word2 and x_test_word2
print("The shape of the x_train_word2 is: ", x_train_word2.shape)
print("The shape of the x_test_word2 is: ", x_test_word2.shape)

The shape of the x_train_word2 is:  (30834, 300)
The shape of the x_test_word2 is:  (10279, 300)


In [342]:
# create a function to fit different models
def fit_model(model, X_train, y_train, X_test, y_test):
    print('Fitting the Model \n')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print('Accuracy: ', accuracy)
    print('\nPrecision: ', precision)
    print('\nRecall: ', recall)
    print('\nF1 Score: ', f1)
    print('\nConfusion Matrix: \n', confusion_matrix(y_test, y_pred))
    print('Classification Report: \n', classification_report(y_test, y_pred, zero_division=0))

In [343]:
# Fit a logistic regression model to the data
logreg = LogisticRegression(max_iter = 1000)   
fit_model(logreg, x_train_word2, y_train, x_test_word2, y_test)

Fitting the Model 

Accuracy:  0.5010215001459286

Precision:  0.4991411072722821

Recall:  0.5010215001459286

F1 Score:  0.47362460142082236

Confusion Matrix: 
 [[1584  129 2143]
 [ 538  284 1047]
 [1118  154 3282]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.49      0.41      0.45      3856
           1       0.50      0.15      0.23      1869
           2       0.51      0.72      0.60      4554

    accuracy                           0.50     10279
   macro avg       0.50      0.43      0.42     10279
weighted avg       0.50      0.50      0.47     10279



In [344]:
# fit a random forest model to the data
rf = RandomForestClassifier()
fit_model(rf, x_train_word2, y_train, x_test_word2, y_test)

Fitting the Model 

Accuracy:  0.5070532152933165

Precision:  0.5037836838067045

Recall:  0.5070532152933165

F1 Score:  0.4934685532389234

Confusion Matrix: 
 [[1718  203 1935]
 [ 518  461  890]
 [1247  274 3033]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.49      0.45      0.47      3856
           1       0.49      0.25      0.33      1869
           2       0.52      0.67      0.58      4554

    accuracy                           0.51     10279
   macro avg       0.50      0.45      0.46     10279
weighted avg       0.50      0.51      0.49     10279



In [346]:
# fit the naive bayes model to the data
nb = MultinomialNB()
fit_model(nb, x_train_word2, y_train, x_test_word2, y_test)

Fitting the Model 



ValueError: Negative values in data passed to MultinomialNB (input X)

In [347]:
# fit the linear svc model to the data
svc = LinearSVC()
fit_model(svc, x_train_word2, y_train, x_test_word2, y_test)

Fitting the Model 







Accuracy:  0.49751921393131626

Precision:  0.49473539370695635

Recall:  0.49751921393131626

F1 Score:  0.4665161569874982

Confusion Matrix: 
 [[1615  105 2136]
 [ 577  226 1066]
 [1154  127 3273]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.48      0.42      0.45      3856
           1       0.49      0.12      0.19      1869
           2       0.51      0.72      0.59      4554

    accuracy                           0.50     10279
   macro avg       0.49      0.42      0.41     10279
weighted avg       0.49      0.50      0.47     10279



In [348]:
# fit the xgboost model to the data
xgb_model = xgb.XGBClassifier()
fit_model(xgb_model, x_train_word2, y_train, x_test_word2, y_test)

Fitting the Model 

Accuracy:  0.5006323572331939

Precision:  0.4957598947264716

Recall:  0.5006323572331939

F1 Score:  0.4919925498271207

Confusion Matrix: 
 [[1746  285 1825]
 [ 510  536  823]
 [1349  341 2864]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.48      0.45      0.47      3856
           1       0.46      0.29      0.35      1869
           2       0.52      0.63      0.57      4554

    accuracy                           0.50     10279
   macro avg       0.49      0.46      0.46     10279
weighted avg       0.50      0.50      0.49     10279

