In [15]:
import pandas as pd
import numpy as np
import plotly.express as px
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import LinearSVC
from gensim.models import Word2Vec
import contractions

from transformers import BertTokenizerFast, RobertaTokenizerFast, TFRobertaModel, TFBertModel


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional,Embedding, Dropout,BatchNormalization, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

from transformers import BertTokenizerFast, RobertaTokenizerFast, TFRobertaModel, TFBertModel,AutoTokenizer, TFAutoModel

In [16]:
# Load the training tweets; the CSV is read with ISO-8859-1 (Latin-1) encoding
data = pd.read_csv(r'../../data/Corona_NLP_train.csv', encoding= 'ISO-8859-1')

In [17]:
# Inspect column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [18]:
# Filter the data to only contain the tweets and the sentiment.
# .copy() makes an independent frame, so assigning to its columns later
# does not write into a slice of the raw frame (SettingWithCopyWarning).
data = data[['OriginalTweet', 'Sentiment']].copy()
data.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [19]:
# Plot how many tweets fall into each sentiment label (interactive Plotly histogram)
px.histogram(data, x='Sentiment', title='Sentiment Distribution')

In [20]:
# The classes are not evenly balanced (see the counts below), but every class is well represented; we can now proceed to clean the data

In [21]:
# Sentiment column analysis: number of tweets per sentiment label
data['Sentiment'].value_counts()

Sentiment
Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64

In [22]:
# Convert the sentiments to only three categories: 0 = negative, 1 = neutral, 2 = positive.
# Values that are not keys of the mapping (e.g. labels already converted by an earlier
# run of this cell) are passed through unchanged, so re-running the cell does not turn
# the column into NaN. assign() builds a new frame instead of writing into the old one.
sentiment_to_class = {'Extremely Negative':0,'Negative':0,'Neutral':1,'Positive':2,'Extremely Positive':2}
data = data.assign(
    Sentiment=data['Sentiment'].map(lambda label: sentiment_to_class.get(label, label))
)
# Fail loudly if the column holds anything other than the three target classes
assert data['Sentiment'].isin([0, 1, 2]).all(), "unexpected sentiment label"

In [23]:
# Class counts after collapsing the labels into three categories
data['Sentiment'].value_counts()

Sentiment
2    18046
0    15398
1     7713
Name: count, dtype: int64

In [24]:
# Split the data into X (raw tweet text) and y (sentiment class)
X = data['OriginalTweet']
y = data['Sentiment']

In [25]:
# Custom Transformer for Preprocessing and Tokenization
class BertTransformer(BaseEstimator, TransformerMixin):
    """Clean raw tweets and tokenize them for a Hugging Face encoder.

    The transformer is stateless with respect to the data (``fit`` is a no-op);
    it loads the pretrained tokenizer and TF model once at construction time and
    exposes them as ``self.tokenizer`` and ``self.model``.
    """

    # (tokenizer class, TF model class) for each supported ``tokenizer_type``.
    # Any other value falls back to the Auto* classes, which pick the concrete
    # classes from the checkpoint named by ``model_name``.
    _BACKENDS = {
        'bert': (BertTokenizerFast, TFBertModel),
        'roberta': (RobertaTokenizerFast, TFRobertaModel),
    }

    # Compiled once; the patterns are the same ones applied to every tweet.
    _URL_RE = re.compile(r'http\S+|www\S+|https\S+')
    _MENTION_RE = re.compile(r'@\w+')
    _DIGITS_RE = re.compile(r'\d+')
    _SPACES_RE = re.compile(r'\s+')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, model_name='bert-base-uncased', tokenizer_type='bert', max_len=128):
        """
        Args:
        - model_name: Hugging Face model name (e.g., 'bert-base-uncased', 'roberta-base').
        - tokenizer_type: Tokenizer type to match the model (e.g., 'bert', 'roberta').
          It selects the tokenizer/model classes and must agree with ``model_name``;
          unrecognized values let the Auto* classes resolve them from ``model_name``.
        - max_len: Maximum token length for input sequences.
        """
        self.model_name = model_name
        self.tokenizer_type = tokenizer_type
        self.max_len = max_len
        tokenizer_cls, model_cls = self._BACKENDS.get(
            str(tokenizer_type).lower(), (AutoTokenizer, TFAutoModel)
        )
        self.tokenizer = tokenizer_cls.from_pretrained(model_name)
        self.model = model_cls.from_pretrained(model_name)

    def preprocess_text(self, text):
        """Return a lower-cased, ASCII-only version of ``text`` without URLs,
        @mentions, punctuation or digits, with whitespace collapsed.

        Missing values (None / NaN) are treated as empty text; other non-string
        values are converted with ``str``.
        """
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)
        text = unidecode(text)  # Normalize Unicode to plain ASCII
        text = text.lower()  # Convert to lowercase
        text = self._URL_RE.sub('', text)  # Remove URLs
        text = self._MENTION_RE.sub('', text)  # Remove mentions
        # Must run before punctuation stripping so "#a#b" becomes "a b", not "ab"
        text = text.replace('#', ' ')
        text = text.translate(self._PUNCT_TABLE)  # Remove punctuation
        text = self._DIGITS_RE.sub('', text)  # Remove digits
        text = self._SPACES_RE.sub(' ', text).strip()  # Remove extra spaces
        return text

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Clean and tokenize every text in ``X``.

        Args:
        - X: iterable of texts (pandas Series, list, NumPy array, ...) or a single string.

        Returns:
        - (input_ids, attention_masks): two integer arrays of shape (len(X), max_len).
        """
        if isinstance(X, str):
            X = [X]  # a bare string would otherwise be iterated character by character
        texts = [self.preprocess_text(text) for text in X]
        if not texts:
            # The tokenizer cannot encode an empty batch; return correctly shaped empties
            empty = np.empty((0, self.max_len), dtype=np.int64)
            return empty, empty.copy()
        # One batched call instead of one encode_plus call per row
        encoded = self.tokenizer(
            texts,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True
        )
        return np.array(encoded['input_ids']), np.array(encoded['attention_mask'])

In [26]:
# create a function to build the model
def build_bert_model(bert_model, max_len, num_classes, learning_rate=1e-5):
    """Wrap a pretrained encoder in a compiled Keras classifier.

    Args:
    - bert_model: pretrained TF encoder (e.g. ``BertTransformer.model``); it is called
      with ``[input_ids, attention_masks]`` and element 1 of its output is used.
    - max_len: token length of each input sequence.
    - num_classes: number of target classes (size of the softmax layer).
    - learning_rate: Adam learning rate; defaults to the previous fixed value 1e-5.

    Returns:
    - Keras model taking ``[input_ids, attention_masks]`` and returning class
      probabilities, compiled with sparse categorical cross-entropy and accuracy.
    """
    input_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    attention_masks = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name='attention_masks')
    # Element 1 of the encoder output is the pooled sequence representation
    # (per the Transformers docs for TFBertModel); element 0 would be per-token states.
    embeddings = bert_model([input_ids, attention_masks])[1]
    output = tf.keras.layers.Dense(num_classes, activation='softmax')(embeddings)
    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)
    # Sparse loss: targets are integer class ids, not one-hot vectors
    model.compile(
        tf.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [27]:
# Train, Test Split and Pipeline Integration
def create_pipeline(data, labels, model_name='bert-base-uncased', tokenizer_type='bert', max_len=128, batch_size=32, epochs=4,
                    test_size=0.2, random_state=42):
    """Fine-tune a pretrained encoder as a tweet classifier and report hold-out metrics.

    Steps: split the data, clean + tokenize it with ``BertTransformer``, build the
    Keras model with ``build_bert_model``, train with early stopping, then print a
    classification report for the hold-out split.

    Args:
    - data: pandas Series of raw tweet texts.
    - labels: integer class labels aligned with ``data``.
    - model_name: Hugging Face model name passed to ``BertTransformer``.
    - tokenizer_type: tokenizer type passed to ``BertTransformer``.
    - max_len: maximum token length for input sequences.
    - batch_size: training batch size.
    - epochs: maximum number of epochs (early stopping may end training sooner).
    - test_size: fraction of the data held out for validation (default 0.2).
    - random_state: seed for the train/hold-out split (default 42).

    Returns:
    - (model, transformer, history): the trained Keras model, the transformer used
      to encode the text, and the Keras ``History`` object from ``fit``.

    Note: the hold-out split is used both for early stopping / best-weight
    restoration and for the printed report, so that report is an optimistic
    estimate; use a separate test set for the final evaluation.
    """
    # Train / hold-out split
    X_train, X_val, y_train, y_val = train_test_split(
        data, labels, test_size=test_size, random_state=random_state
    )

    # Create the transformer and encode both splits
    transformer = BertTransformer(model_name=model_name, tokenizer_type=tokenizer_type, max_len=max_len)
    X_train_ids, X_train_masks = transformer.fit_transform(X_train)
    X_val_ids, X_val_masks = transformer.transform(X_val)

    # Hand Keras plain NumPy targets rather than pandas objects carrying a shuffled index
    y_train = np.asarray(y_train)
    y_val = np.asarray(y_val)

    # Build the classifier on top of the encoder loaded by the transformer
    num_classes = len(np.unique(labels))
    model = build_bert_model(transformer.model, max_len, num_classes)

    # Stop training when the validation loss has not improved for 2 epochs
    # and roll back to the best weights seen
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=2,
        restore_best_weights=True
    )

    # Use the first GPU when one is visible, otherwise fall back to the CPU
    device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'
    with tf.device(device):
        history = model.fit(
            [X_train_ids, X_train_masks], y_train,
            validation_data=([X_val_ids, X_val_masks], y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=[early_stopping],
        )

    # Evaluate on the hold-out split
    y_pred = np.argmax(model.predict([X_val_ids, X_val_masks]), axis=1)
    report = classification_report(y_val, y_pred)
    print("Classification Report:\n", report)
    return model, transformer, history

In [29]:
# Fine-tune bert-base-uncased end to end. batch_size=4 overrides the function default of 32;
# epochs=10 is only an upper bound — early stopping on val_loss ended the recorded run after epoch 4.
model, transformer, history = create_pipeline(X, y, model_name='bert-base-uncased', tokenizer_type='bert', max_len=128, batch_size=4, epochs=10)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.94      0.91      3062
           1       0.92      0.82      0.87      1553
           2       0.92      0.92      0.92      3617

    accuracy                           0.91      8232
   macro avg       0.91      0.89      0.90      8232
weighted avg       0.91      0.91      0.91      8232



In [113]:
# Import the held-out testing data (read with the same ISO-8859-1 encoding as the training file)
# NOTE(review): this path climbs three directory levels ('../../../data') while the training
# CSV is read from '../../data' — confirm which depth matches this notebook's location.
test_data = pd.read_csv(r'../../../data/Corona_NLP_test.csv', encoding= 'ISO-8859-1')

In [114]:
# Filter the data to only contain the tweets and the sentiment
test_data = test_data[['OriginalTweet', 'Sentiment']]

In [115]:
# Split the test data into X (raw tweet text) and y (sentiment label)
X_test = test_data['OriginalTweet']
y_test = test_data['Sentiment']

In [116]:
# Apply the preprocessing to the test data
X_test = X_test.apply(preprocess_tweet)

In [117]:
# convert the sentiments to only three categories|
y_test = y_test.map({'Extremely Negative':0,'Negative':0,'Neutral':1,'Positive':2,'Extremely Positive':2})

In [118]:
# Apply the tokenizer to the testing data
x_test_ids, x_test_masks = tokenize(X_test)

3798 3798


In [119]:
# Evaluate the model on the held-out test set; the result is [loss, accuracy] because the
# model was compiled with sparse categorical cross-entropy and metrics=['accuracy']
model.evaluate([x_test_ids, x_test_masks], y_test)



[0.43801721930503845, 0.8765139579772949]

In [120]:
# Classification report on the test set: take the most probable class for each tweet
y_pred = np.argmax(model.predict([x_test_ids, x_test_masks]), axis=1)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.86      0.89      1633
           1       0.88      0.79      0.83       619
           2       0.84      0.93      0.88      1546

    accuracy                           0.88      3798
   macro avg       0.88      0.86      0.87      3798
weighted avg       0.88      0.88      0.88      3798

