In [1]:
import pandas as pd
import numpy as np
import math
import kagglehub
import tensorflow as tf
import matplotlib.pyplot as plt
import re, string
import emoji
import nltk
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from transformers import BertTokenizerFast
from transformers import TFBertModel
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from keras.models import load_model
from tensorflow import keras
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Fetch the Kaggle sentiment dataset and derive the paths of its two CSV files.
path = kagglehub.dataset_download('abhi8923shriv/sentiment-analysis-dataset')
train_dataset = f"{path}/train.csv"
test_dataset = f"{path}/test.csv"

# Both files are read with the same ISO-8859-1 encoding.
train_df, test_df = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (train_dataset, test_dataset)
)



In [3]:
# Keep only the model input ("text") and the target ("sentiment").
# Both splits use the same rule: a row is dropped only when one of these two
# columns is missing. Previously the test split dropped rows with a NaN in *any*
# column (including metadata columns that are never used), while the train split
# did not guard against a missing label at all.
train = train_df.dropna(subset=["text", "sentiment"])[["text", "sentiment"]]
test = test_df.dropna(subset=["text", "sentiment"])[["text", "sentiment"]]

In [23]:
# Integer-encode the sentiment labels; the encoder is fitted on the training labels.
label_encoder = LabelEncoder()
x = train["text"].values
y = label_encoder.fit_transform(train["sentiment"].values)

# Stratified 70/30 train/validation split with a fixed seed.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.3,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# The test labels reuse the encoder fitted above.
x_test = test["text"].values
y_test = label_encoder.transform(test["sentiment"])

# One-hot encode all three label arrays (3 classes) for categorical cross-entropy.
y_train, y_valid, y_test = (
    to_categorical(labels, num_classes=3)
    for labels in (y_train, y_valid, y_test)
)

In [5]:
# Load the pretrained fast tokenizer for the "roberta-base" checkpoint; it is used
# by the token-length scan and by tokenize_roberta in the cells below.
tokenizer_roberta = RobertaTokenizerFast.from_pretrained("roberta-base")

In [6]:
# Token count (as returned by `encode`, capped at 512 by truncation) for every
# training text, used to see how long the tokenized inputs get.
token_lens = [
    len(tokenizer_roberta.encode(txt, max_length=512, truncation=True))
    for txt in x_train
]
max_length = np.max(token_lens)
max_length

112

In [7]:
def tokenize_roberta(data, max_len=128):
    """Tokenize texts into fixed-length id and attention-mask arrays.

    Each text is encoded with the notebook-level ``tokenizer_roberta``, padded to
    ``max_len`` and truncated to ``max_len``, so every row has the same length.

    Args:
        data: Iterable of strings to tokenize.
        max_len: Length every encoded sequence is padded/truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of NumPy arrays with one row per
        text and ``max_len`` columns.
    """
    input_ids = []
    attention_masks = []
    # Iterate over the items directly rather than by position, so the function
    # does not depend on `data` supporting 0-based integer indexing.
    for text in data:
        encoded = tokenizer_roberta.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            # Without explicit truncation, a text longer than max_len produces a
            # longer row, and the rows can no longer be stacked into a 2-D array.
            truncation=True,
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

In [8]:
# Tokenize each split (train, validation, test - in that order) with the default
# max_len of tokenize_roberta.
(
    (train_input_ids, train_attention_masks),
    (val_input_ids, val_attention_masks),
    (test_input_ids, test_attention_masks),
) = (tokenize_roberta(split_texts) for split_texts in (x_train, x_valid, x_test))

In [24]:
def train_model(bert_model, max_len=128, train_inputs=None, train_labels=None,
                val_inputs=None, val_labels=None, epochs=10, batch_size=64,
                save_path="model_roberta.keras"):
    """Build, train and save a 3-class classifier on top of a transformer encoder.

    The encoder is called inside a Lambda layer and the hidden state of the first
    token feeds a Dense(64, relu) -> Dropout(0.3) -> Dense(3, softmax) head.

    Args:
        bert_model: Encoder called as ``bert_model(input_ids=..., attention_mask=...)``
            whose result exposes ``last_hidden_state``.
        max_len: Sequence length of the token-id and attention-mask inputs.
        train_inputs: ``[input_ids, attention_masks]`` for training. Defaults to the
            notebook-level ``train_input_ids`` / ``train_attention_masks``.
        train_labels: One-hot training labels. Defaults to the notebook-level ``y_train``.
        val_inputs: ``[input_ids, attention_masks]`` for validation. Defaults to the
            notebook-level ``val_input_ids`` / ``val_attention_masks``.
        val_labels: One-hot validation labels. Defaults to the notebook-level ``y_valid``.
        epochs: Maximum number of training epochs.
        batch_size: Batch size passed to ``fit``.
        save_path: File the trained model is saved to.

    Returns:
        Tuple ``(model, history)``: the trained Keras model and the object
        returned by ``model.fit``.
    """
    # Resolve the data lazily so the original call `train_model(roberta_model)`
    # still trains on the notebook-level arrays, while callers may now pass
    # their own data explicitly instead of relying on hidden global state.
    if train_inputs is None:
        train_inputs = [train_input_ids, train_attention_masks]
    if train_labels is None:
        train_labels = y_train
    if val_inputs is None:
        val_inputs = [val_input_ids, val_attention_masks]
    if val_labels is None:
        val_labels = y_valid

    input_ids = Input(shape=(max_len,), dtype='int32')
    attention_masks = Input(shape=(max_len,), dtype='int32')

    def roberta_encoding(inputs):
        """Run the encoder and return the first token's hidden state per sequence."""
        ids, mask = inputs
        outputs = bert_model(input_ids=ids, attention_mask=mask)
        return outputs.last_hidden_state[:, 0, :]

    # NOTE(review): variables used inside a Lambda layer are generally not tracked
    # by Keras, so the encoder weights are presumably not updated by `fit` and only
    # the head below is trained - confirm if full fine-tuning was intended.
    # output_shape=(768,) is assumed to match the encoder's hidden size - verify
    # when passing a different encoder.
    roberta_output = tf.keras.layers.Lambda(roberta_encoding, output_shape=(768,))([input_ids, attention_masks])

    dense = tf.keras.layers.Dense(64, activation='relu')(roberta_output)
    dropout = tf.keras.layers.Dropout(0.3)(dense)
    output = tf.keras.layers.Dense(3, activation=tf.nn.softmax)(dropout)
    model = Model(inputs=[input_ids, attention_masks], outputs=output)

    # Exponentially decaying learning rate: 1e-4 * exp(-0.1 * epoch).
    lr_scheduler = LearningRateScheduler(lambda x: 1e-4 * math.exp(-0.1 * x))

    # Stop once val_loss has not improved for 5 epochs and restore the best weights.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])

    history = model.fit(train_inputs, train_labels,
                        epochs=epochs,
                        validation_data=(val_inputs, val_labels),
                        batch_size=batch_size,
                        verbose=1,
                        callbacks=[lr_scheduler, early_stopping])

    # NOTE(review): the saved model contains a Lambda layer wrapping a local
    # function; reloading it may need extra handling - confirm before relying on it.
    model.save(save_path)

    return model, history

In [10]:
# Load the pretrained TensorFlow RoBERTa encoder ("roberta-base"); it is passed to
# train_model below as the feature extractor under the classification head.
roberta_model = TFRobertaModel.from_pretrained('roberta-base')




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.weight', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

In [22]:
# Sanity check of the training-label array's shape.
# NOTE(review): this cell's execution count (22) is lower than that of the cell that
# one-hot encodes the labels (23), so the recorded output presumably shows the
# labels before one-hot encoding - re-run top to bottom to refresh it.
y_train.shape

(19236,)

In [25]:
# Train the classifier on top of the RoBERTa encoder using train_model's defaults;
# returns the trained model and the fit history. This is the long-running cell
# (see the per-epoch timings in the log below).
model, history = train_model(roberta_model)

Epoch 1/10
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2763s[0m 9s/step - accuracy: 0.3706 - loss: 1.1218 - val_accuracy: 0.4106 - val_loss: 1.0616 - learning_rate: 1.0000e-04
Epoch 2/10
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2757s[0m 9s/step - accuracy: 0.4267 - loss: 1.0649 - val_accuracy: 0.4431 - val_loss: 1.0368 - learning_rate: 9.0484e-05
Epoch 3/10
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2760s[0m 9s/step - accuracy: 0.4738 - loss: 1.0306 - val_accuracy: 0.5082 - val_loss: 1.0136 - learning_rate: 8.1873e-05
Epoch 4/10
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2762s[0m 9s/step - accuracy: 0.5147 - loss: 1.0088 - val_accuracy: 0.5067 - val_loss: 0.9936 - learning_rate: 7.4082e-05
Epoch 5/10
[1m301/301[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2753s[0m 9s/step - accuracy: 0.5297 - loss: 0.9866 - val_accuracy: 0.5492 - val_loss: 0.9753 - learning_rate: 6.7032e-05
Epoch 6/10
[1m301/301[0m [32m━━━━━━━━

In [26]:
# Predict on the tokenized test set; the model ends in a 3-unit softmax layer, so
# each row of `preds` holds the scores for the three sentiment classes.
preds = model.predict([test_input_ids, test_attention_masks])

[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m381s[0m 3s/step


In [35]:
# Reduce the predicted scores and the one-hot test labels to integer class ids,
# then print per-class precision / recall / F1.
y_pred = preds.argmax(axis=1)
y_true = y_test.argmax(axis=1)
report = classification_report(y_true=y_true, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.62      0.44      0.52      1001
           1       0.52      0.74      0.61      1430
           2       0.68      0.49      0.57      1103

    accuracy                           0.58      3534
   macro avg       0.61      0.56      0.57      3534
weighted avg       0.60      0.58      0.57      3534

