In [50]:
import pandas as pd
import numpy as np
import plotly.express as px
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import LinearSVC
from gensim.models import Word2Vec
import contractions

from transformers import BertTokenizerFast, RobertaTokenizerFast, TFRobertaModel, TFBertModel


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional,Embedding, Dropout,BatchNormalization, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import BertTokenizerFast, RobertaTokenizerFast, TFRobertaModel, TFBertModel

In [51]:
data = pd.read_csv(r'../../../data/Corona_NLP_train.csv', encoding= 'ISO-8859-1')

In [52]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [53]:
# filter the data to only conbtain the tweets and the sentiment
data = data[['OriginalTweet', 'Sentiment']]
data.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [54]:
# plot the sentiment distribution using plotly
px.histogram(data, x='Sentiment', title='Sentiment Distribution')

In [55]:
# The data seems balanced, we can now proceed to clean the data

In [56]:
# Initialize the tokenizer and other resources
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [57]:
# function to clean the data
def preprocess_tweet(tweet):
    # Remove accents
    tweet = unidecode(tweet)
    # make the tweet to lower case
    tweet = tweet.lower()
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    # Remove mentions
    tweet = re.sub(r'@\w+', '', tweet)
    # Replace hastags with the space
    tweet = re.sub(r'#', ' ', tweet)
    # Remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove digits
    tweet = re.sub(r'\d+', '', tweet)
    # Tokenize tweet
    tokens = tokenizer.tokenize(tweet)
    # There is no need to remove stop words since the model will learn the importance of each word
    # There is no need for lemmatization in Bert Model
    # fix the contractions in each token of the tweet
    tokens = [contractions.fix(token) for token in tokens]
    # Join tokens back to string
    tweet = ' '.join(tokens)
    # Remove multiple spaces with a single space
    tweet = re.sub(r'\s+', ' ', tweet)
    # Strip leading/trailing spaces
    tweet = tweet.strip()
    return tweet

In [58]:
# Apply preprocessing to the 'OriginalTweet	' column
data['cleaned_tweet'] = data['OriginalTweet'].apply(preprocess_tweet)

In [59]:
data

Unnamed: 0,OriginalTweet,Sentiment,cleaned_tweet
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,and and
1,advice Talk to your neighbours family to excha...,Positive,advice talk to your neighbours family to excha...
2,Coronavirus Australia: Woolworths to give elde...,Positive,corona ##virus australia wool ##worth ##s to g...
3,My food stock is not the only one which is emp...,Positive,my food stock is not the only one which is emp...
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,me ready to go at supermarket during the co ##...
...,...,...,...
41152,Airline pilots offering to stock supermarket s...,Neutral,airline pilots offering to stock supermarket s...
41153,Response to complaint not provided citing COVI...,Extremely Negative,response to complaint not provided citing co #...
41154,You know itÂs getting tough when @KameronWild...,Positive,you know it ##as getting tough when is ratio #...
41155,Is it wrong that the smell of hand sanitizer i...,Neutral,is it wrong that the smell of hand san ##iti #...


In [60]:
# Check the empty values in the cleaned_tweet column
data[data['cleaned_tweet'] == ''].shape[0]

31

In [61]:
# Remove the empty values
data = data[data['cleaned_tweet'] != '']

In [62]:
data.shape

(41126, 3)

In [63]:
# Take only the cleaned tweets and the sentiment
data = data[['cleaned_tweet', 'Sentiment']]

In [64]:
# let's suffle the data and reset the index
data = data.sample(frac=1)
# reset the index because the index was not a good representation of the data
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,cleaned_tweet,Sentiment
0,we are currently blasting high prices with the...,Positive
1,so while were leading in the co ##vid infectio...,Extremely Positive
2,houston based call ##on petroleum cp ##e facin...,Negative
3,k will be wiped off the average value of a uk ...,Positive
4,co ##vid has really driven up the prices of tu...,Neutral


In [65]:
# Sentiment Column Analysis
data['Sentiment'].value_counts()

Sentiment
Positive              11420
Negative               9916
Neutral                7685
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64

In [66]:
# convert the sentiments to only three categories|
data['Sentiment'] = data['Sentiment'].map({'Extremely Negative':0,'Negative':0,'Neutral':1,'Positive':2,'Extremely Positive':2})

In [67]:
data['Sentiment'].value_counts()

Sentiment
2    18044
0    15397
1     7685
Name: count, dtype: int64

In [68]:
# Split the data into X and y
X = data['cleaned_tweet']
y = data['Sentiment']


In [69]:
# split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(30844,) (10282,) (30844,) (10282,)


In [91]:
# Bert Sentiment Analysis
MAX_LEN = 128
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [94]:
def tokenize(data, max_len=MAX_LEN):
    # Initialize the tokenizer and other resources
    input_ids = []
    attention_masks = []
    for i in range(len(data)):
        encoded = tokenizer.encode_plus(
            data.iloc[i],
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True)
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    print(len(input_ids), len(attention_masks))
    return np.array(input_ids), np.array(attention_masks)

In [95]:
# Apply the tokenizer to the training and testing data
x_train_ids, x_train_masks = tokenize(X_train)
x_test_ids, x_test_masks = tokenize(X_test)

30844 30844
10282 10282


In [104]:
# Count non-zero values in the first row of both arrays
count_ids_non_zero = np.count_nonzero(x_train_ids[0] > 0)
count_masks_non_zero = np.count_nonzero(x_train_masks[0] > 0)
print(count_ids_non_zero, count_masks_non_zero)

64 64


In [105]:
# Initialize the Bert model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClas

In [107]:
# Create the model
input_ids = tf.keras.layers.Input(shape=(MAX_LEN,), name='input_ids', dtype='int32')
attention_masks = tf.keras.layers.Input(shape=(MAX_LEN,), name='attention_mask', dtype='int32')
embeddings = embeddings = bert_model([input_ids,attention_masks])[1]
output = tf.keras.layers.Dense(3, activation="softmax")(embeddings)
model = tf.keras.models.Model(inputs = [input_ids,attention_masks], outputs = output)

# compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rater=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


The `lr` argument is deprecated, use `learning_rate` instead.



In [108]:
# print the summary of the model
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                         

In [111]:
# run the model on gpu
with tf.device('/GPU:0'):
    history = model.fit([x_train_ids, x_train_masks], y_train, validation_data=([x_test_ids, x_test_masks], y_test), epochs=4, batch_size=8)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [113]:
# import the testing data
test_data = pd.read_csv(r'../../../data/Corona_NLP_test.csv', encoding= 'ISO-8859-1')

In [114]:
# filter the data to only conbtain the tweets and the sentiment
test_data = test_data[['OriginalTweet', 'Sentiment']]

In [115]:
# split the data into X and y
X_test = test_data['OriginalTweet']
y_test = test_data['Sentiment']

In [116]:
# Apply the preprocessing to the test data
X_test = X_test.apply(preprocess_tweet)

In [117]:
# convert the sentiments to only three categories|
y_test = y_test.map({'Extremely Negative':0,'Negative':0,'Neutral':1,'Positive':2,'Extremely Positive':2})

In [118]:
# Apply the tokenizer to the testing data
x_test_ids, x_test_masks = tokenize(X_test)

3798 3798


In [119]:
# Evaluate the model
model.evaluate([x_test_ids, x_test_masks], y_test)



[0.43801721930503845, 0.8765139579772949]

In [120]:
# Create a classification report
y_pred = model.predict([x_test_ids, x_test_masks])
y_pred = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.86      0.89      1633
           1       0.88      0.79      0.83       619
           2       0.84      0.93      0.88      1546

    accuracy                           0.88      3798
   macro avg       0.88      0.86      0.87      3798
weighted avg       0.88      0.88      0.88      3798

