In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import models
import tensorflow as tf
import pandas as pd 
import numpy as np
import kagglehub
import torch 

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Test

In [2]:
# "cuda:0" if torch.cuda.is_available() else 
device =  "cpu"
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-large-cased")
embed_model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-large-cased").to(device)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at google-bert/bert-large-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with anot

In [3]:
tokens = tokenizer(["this is a test", "test2"], padding=True, return_tensors='pt').to(device)
tokens

{'input_ids': tensor([[ 101, 1142, 1110,  170, 2774,  102],
        [ 101, 2774, 1477,  102,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0]])}

In [4]:
embedding = embed_model(**tokens, output_hidden_states=True)
last_hidden = embedding['hidden_states'][-1].cpu().detach().numpy()
last_hidden.reshape(last_hidden.shape[0], last_hidden.shape[1] * last_hidden.shape[2]).shape

(2, 6144)

In [5]:
def get_last_hidden_state(l_of_text):
    tokens = tokenizer(l_of_text, padding="max_length", max_length=60, truncation=True, return_tensors='pt')
    embedding = embed_model(**tokens, output_hidden_states=True)
    last_hidden = embedding['hidden_states'][-1].detach().numpy()
    return last_hidden.reshape(last_hidden.shape[0], last_hidden.shape[1] * last_hidden.shape[2])

## General Setup

In [6]:
# Load Data
path = kagglehub.dataset_download('abhi8923shriv/sentiment-analysis-dataset')
train_dataset = path+'/train.csv'
test_dataset = path+'/test.csv'
train_df = pd.read_csv(train_dataset, encoding='ISO-8859-1')
test_df = pd.read_csv(test_dataset, encoding='ISO-8859-1')

In [7]:
# Basic Preprocessing (need to update with preprocessor)
train = train_df.dropna(subset = "text")[["text", "sentiment"]]
test = test_df.dropna()[["text", "sentiment"]]

x = train["text"].values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train["sentiment"].values)
x_train, x_valid, y_train, y_valid = train_test_split(x, y, stratify=y, random_state=42, test_size=0.3, shuffle=True)

y_train = tf.keras.utils.to_categorical(y_train, num_classes=3)
y_valid = tf.keras.utils.to_categorical(y_valid, num_classes=3)

## Using Embedding

In [8]:
#### Vectorise X by embedding model
x_train_transformed = embed_model(**tokenizer(x_train.tolist(), padding="max_length", max_length=60, truncation=True, return_tensors='pt').to(device))
x_valid_transformed =  embed_model(**tokenizer(x_valid.tolist(), padding="max_length", max_length=60, truncation=True, return_tensors='pt').to(device))

RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 4727439360 bytes.

In [None]:
print(x_train_transformed.shape, x_valid_transformed.shape)

(100, 61440) (100, 61440)


In [None]:
# base layers
input_layer = ("input", {"shape": (x_train_transformed.shape[1], )})
dense_256 = ("dense", {"units": 256, "kernel_regularizer": tf.keras.regularizers.l2(0.01), "activation":"relu"})
dropout = ("dropout", {}) 
dense_128 = ("dense", {"units": 128, "kernel_regularizer": tf.keras.regularizers.l2(0.01), "activation":"relu"})
dense_64 = ("dense", {"units": 64, "kernel_regularizer": tf.keras.regularizers.l2(0.01), "activation":"relu"})
output_layer = ("output", {"units": 3, "activation": "softmax"})

In [None]:
l = [input_layer, dense_256, dropout, dense_64, dense_64, dropout, output_layer]
model = models.TfModel(l, batch_size=32, dropout_rate=0.4)

In [None]:
model.fit(x_train_transformed, y_train)

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 111ms/step - accuracy: 0.3575 - loss: 8.1112 - learning_rate: 1.0000e-04
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step - accuracy: 0.5098 - loss: 7.8767 - learning_rate: 9.0484e-05
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 0.6011 - loss: 7.6941 - learning_rate: 8.1873e-05
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step - accuracy: 0.7378 - loss: 7.3543 - learning_rate: 7.4082e-05
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 0.7265 - loss: 7.3643 - learning_rate: 6.7032e-05
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step - accuracy: 0.7818 - loss: 7.2393 - learning_rate: 6.0653e-05
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 0.7130 - loss: 7.3290 - lear

In [None]:
preds = model.predict(x_valid_transformed)
print(classification_report(y_valid, preds))

              precision    recall  f1-score   support

           0       0.52      0.44      0.48        36
           1       0.49      0.62      0.55        34
           2       0.58      0.50      0.54        30

   micro avg       0.52      0.52      0.52       100
   macro avg       0.53      0.52      0.52       100
weighted avg       0.52      0.52      0.52       100
 samples avg       0.52      0.52      0.52       100



## Using last hidden state

In [None]:
#### Vectorise X by embedding model
x_train_transformed = get_last_hidden_state(x_train[:100].tolist())
x_valid_transformed =  get_last_hidden_state(x_valid[:100].tolist())

In [None]:
print(x_train_transformed.shape, x_valid_transformed.shape)

(100, 61440) (100, 61440)


In [None]:
# base layers
input_layer = ("input", {"shape": (x_train_transformed.shape[1], )})
dense_256 = ("dense", {"units": 256, "kernel_regularizer": tf.keras.regularizers.l2(0.01), "activation":"relu"})
dropout = ("dropout", {}) 
dense_128 = ("dense", {"units": 128, "kernel_regularizer": tf.keras.regularizers.l2(0.01), "activation":"relu"})
dense_64 = ("dense", {"units": 64, "kernel_regularizer": tf.keras.regularizers.l2(0.01), "activation":"relu"})
output_layer = ("output", {"units": 3, "activation": "softmax"})

In [None]:
l = [input_layer, dense_256, dropout, dense_64, dense_64, dropout, output_layer]
model = models.TfModel(l, batch_size=32, dropout_rate=0.4)

In [None]:
model.fit(x_train_transformed, y_train[:100])

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 111ms/step - accuracy: 0.3575 - loss: 8.1112 - learning_rate: 1.0000e-04
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step - accuracy: 0.5098 - loss: 7.8767 - learning_rate: 9.0484e-05
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 0.6011 - loss: 7.6941 - learning_rate: 8.1873e-05
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step - accuracy: 0.7378 - loss: 7.3543 - learning_rate: 7.4082e-05
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 0.7265 - loss: 7.3643 - learning_rate: 6.7032e-05
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step - accuracy: 0.7818 - loss: 7.2393 - learning_rate: 6.0653e-05
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 0.7130 - loss: 7.3290 - lear

In [None]:
preds = model.predict(x_valid_transformed)
print(classification_report(y_valid[:100], preds))

              precision    recall  f1-score   support

           0       0.52      0.44      0.48        36
           1       0.49      0.62      0.55        34
           2       0.58      0.50      0.54        30

   micro avg       0.52      0.52      0.52       100
   macro avg       0.53      0.52      0.52       100
weighted avg       0.52      0.52      0.52       100
 samples avg       0.52      0.52      0.52       100

