In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset
import pickle


In [2]:
# Load IMDB dataset
df = pd.read_csv("../../data/raw/IMDB Dataset.csv")

# Check structure
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Encode sentiment labels: positive -> 1, negative -> 0
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['review'].values, df['label'].values, test_size=0.3, random_state=42
)

In [4]:
# Use DeBERTa-v3-small for speed and performance balance

model_name = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Tokenize data
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=256)




In [5]:
class IMDbDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
print(model.config) 

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.41.1",
  "type_vocab_size": 0,
  "vocab_size": 128100
}



In [7]:
# Force CPU usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

cuda


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=T

In [8]:
# Further reduce batch size and enable gradient accumulation to avoid OOM

training_args = TrainingArguments(
    output_dir="./deberta-imdb-checkpoints",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    fp16=True, 
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    save_total_limit=3
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)



In [9]:
trainer.train()
predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=1)

print("Validation Accuracy:", accuracy_score(val_labels, preds))
print(classification_report(val_labels, preds, target_names=["Negative", "Positive"]))


  0%|          | 0/52500 [00:00<?, ?it/s]

{'loss': 0.7827, 'grad_norm': 0.07908125966787338, 'learning_rate': 4.952857142857143e-05, 'epoch': 0.03}
{'loss': 0.6052, 'grad_norm': 58.32316207885742, 'learning_rate': 4.905238095238095e-05, 'epoch': 0.06}
{'loss': 0.6208, 'grad_norm': 138.674072265625, 'learning_rate': 4.857619047619048e-05, 'epoch': 0.09}
{'loss': 0.6335, 'grad_norm': 0.17224936187267303, 'learning_rate': 4.8100952380952385e-05, 'epoch': 0.11}
{'loss': 0.5046, 'grad_norm': 0.05213278904557228, 'learning_rate': 4.762476190476191e-05, 'epoch': 0.14}
{'loss': 0.5645, 'grad_norm': 0.1248655915260315, 'learning_rate': 4.714857142857143e-05, 'epoch': 0.17}
{'loss': 0.5568, 'grad_norm': 36.40721893310547, 'learning_rate': 4.667238095238096e-05, 'epoch': 0.2}
{'loss': 0.5298, 'grad_norm': 135.52699279785156, 'learning_rate': 4.6196190476190474e-05, 'epoch': 0.23}
{'loss': 0.6405, 'grad_norm': 0.11017625778913498, 'learning_rate': 4.5721904761904765e-05, 'epoch': 0.26}
{'loss': 0.5564, 'grad_norm': 0.17866413295269012, 'l

  0%|          | 0/15000 [00:00<?, ?it/s]

{'eval_loss': 0.5072982311248779, 'eval_runtime': 270.2299, 'eval_samples_per_second': 55.508, 'eval_steps_per_second': 55.508, 'epoch': 1.0}
{'loss': 0.3003, 'grad_norm': 0.010231323540210724, 'learning_rate': 3.286857142857143e-05, 'epoch': 1.03}
{'loss': 0.2797, 'grad_norm': 0.032950349152088165, 'learning_rate': 3.239238095238095e-05, 'epoch': 1.06}
{'loss': 0.3434, 'grad_norm': 0.018723268061876297, 'learning_rate': 3.1918095238095244e-05, 'epoch': 1.09}
{'loss': 0.3368, 'grad_norm': 0.020533226430416107, 'learning_rate': 3.144190476190476e-05, 'epoch': 1.11}
{'loss': 0.355, 'grad_norm': 0.2256549596786499, 'learning_rate': 3.096571428571429e-05, 'epoch': 1.14}
{'loss': 0.3695, 'grad_norm': 0.5653647780418396, 'learning_rate': 3.048952380952381e-05, 'epoch': 1.17}
{'loss': 0.2713, 'grad_norm': 0.025481009855866432, 'learning_rate': 3.0013333333333333e-05, 'epoch': 1.2}
{'loss': 0.2737, 'grad_norm': 0.06420562416315079, 'learning_rate': 2.9538095238095236e-05, 'epoch': 1.23}
{'loss

  0%|          | 0/15000 [00:00<?, ?it/s]

{'eval_loss': 0.4727267622947693, 'eval_runtime': 293.7466, 'eval_samples_per_second': 51.064, 'eval_steps_per_second': 51.064, 'epoch': 2.0}
{'loss': 0.1931, 'grad_norm': 27.44675636291504, 'learning_rate': 1.6210476190476193e-05, 'epoch': 2.03}
{'loss': 0.1501, 'grad_norm': 0.03052656538784504, 'learning_rate': 1.5734285714285715e-05, 'epoch': 2.06}
{'loss': 0.1611, 'grad_norm': 0.00967489741742611, 'learning_rate': 1.5258095238095237e-05, 'epoch': 2.09}
{'loss': 0.1605, 'grad_norm': 0.04285068437457085, 'learning_rate': 1.4781904761904763e-05, 'epoch': 2.11}
{'loss': 0.1667, 'grad_norm': 0.005992744117975235, 'learning_rate': 1.4305714285714287e-05, 'epoch': 2.14}
{'loss': 0.1651, 'grad_norm': 0.04180140793323517, 'learning_rate': 1.382952380952381e-05, 'epoch': 2.17}
{'loss': 0.1549, 'grad_norm': 0.025427747517824173, 'learning_rate': 1.3353333333333335e-05, 'epoch': 2.2}
{'loss': 0.1314, 'grad_norm': 0.00918477401137352, 'learning_rate': 1.2877142857142857e-05, 'epoch': 2.23}
{'lo

  0%|          | 0/15000 [00:00<?, ?it/s]

{'eval_loss': 0.48863184452056885, 'eval_runtime': 263.6066, 'eval_samples_per_second': 56.903, 'eval_steps_per_second': 56.903, 'epoch': 3.0}
{'train_runtime': 15669.2219, 'train_samples_per_second': 6.701, 'train_steps_per_second': 3.351, 'train_loss': 0.3227773905436198, 'epoch': 3.0}


  0%|          | 0/15000 [00:00<?, ?it/s]

Validation Accuracy: 0.9154
              precision    recall  f1-score   support

    Negative       0.90      0.93      0.92      7411
    Positive       0.93      0.90      0.92      7589

    accuracy                           0.92     15000
   macro avg       0.92      0.92      0.92     15000
weighted avg       0.92      0.92      0.92     15000

