#### Installing pkgs

In [1]:
!pip install datasets
!pip install evaluate
!pip install transformers
!pip install seaborn
!pip install kaggle



In [2]:
import os
import sys
import warnings
import gc
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

from sklearn.metrics import classification_report
from datasets import Dataset
import evaluate
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
warnings.simplefilter("ignore")

In [3]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)



Using device: cuda


In [5]:
import os
import json

from google.colab import files
uploaded = files.upload()  # Upload kaggle.json manually when prompted

# Read the uploaded kaggle.json
with open("kaggle.json", "r") as f:
    kaggle_creds = json.load(f)

# Set credentials as environment variables
os.environ["KAGGLE_USERNAME"] = kaggle_creds["username"]
os.environ["KAGGLE_KEY"] = kaggle_creds["key"]

# Now you can use Kaggle API via kagglehub (safer) or direct download:
!pip install kagglehub --quiet
import kagglehub

# Downloads and unzips dataset
DATA_DIR = kagglehub.dataset_download('abhi8923shriv/sentiment-analysis-dataset')
print("✅ Dataset downloaded to:", DATA_DIR)


Saving kaggle.json to kaggle (1).json
✅ Dataset downloaded to: /kaggle/input/sentiment-analysis-dataset


In [7]:
# train_dataset = pd.read_csv(os.path.join(DATA_DIR, "train.csv"), ...)
# test_dataset = pd.read_csv(os.path.join(DATA_DIR, "test.csv"), ...)

train_dataset = pd.read_csv(
    os.path.join(DATA_DIR, "train.csv"),
    encoding="unicode_escape",
    usecols=["text", "sentiment"]
)

test_dataset = pd.read_csv(
    os.path.join(DATA_DIR, "test.csv"),
    encoding="unicode_escape",
    usecols=["text", "sentiment"]
)

train_dataset.dropna(inplace=True)
test_dataset.dropna(inplace=True)

label_map = {"positive": 2, "neutral": 1, "negative": 0}
id2label = {v: k for k, v in label_map.items()}
train_dataset["label"] = train_dataset["sentiment"].map(label_map)
test_dataset["label"] = test_dataset["sentiment"].map(label_map)

train = Dataset.from_pandas(train_dataset[["text", "label"]])
test = Dataset.from_pandas(test_dataset[["text", "label"]])

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=3
)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train = train.map(tokenize_function, batched=True)
tokenized_test = test.map(tokenize_function, batched=True)


Map:   0%|          | 0/27480 [00:00<?, ? examples/s]

Map:   0%|          | 0/3534 [00:00<?, ? examples/s]

In [9]:
training_args = TrainingArguments(
    output_dir="./bert_finetuning",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    load_best_model_at_end=True,
    # evaluation_strategy="steps",
    max_steps=120,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    logging_steps=50,
    save_steps=100,
    learning_rate=1e-4,
    logging_dir="./logs",
    report_to = "none"
)

metric = evaluate.load("accuracy")



In [10]:
def compute_metrics(eval_predictions):
    logits, labels = eval_predictions
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Load from a checkpoint if needed
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=3,id2label=id2label,label2id=label_map
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
tokenized_train

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 27480
})

In [12]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
0,0.545,0.511707,0.791454


TrainOutput(global_step=120, training_loss=0.616297705968221, metrics={'train_runtime': 1901.39, 'train_samples_per_second': 8.078, 'train_steps_per_second': 0.063, 'total_flos': 4041422096302080.0, 'train_loss': 0.616297705968221, 'epoch': 0.5581395348837209})

In [15]:

model = AutoModelForSequenceClassification.from_pretrained(
    "./bert_finetuning/checkpoint-120", num_labels=3
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

gc.collect()
torch.cuda.empty_cache()

texts = test_dataset["text"].to_list()
all_predictions = []

for i in tqdm(range(0, len(texts), 32), unit="batch"):
    batch_texts = texts[i:i + 32]
    batch_inputs = tokenizer(
        batch_texts,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():
        model = model.to(DEVICE)
        outputs = model(**batch_inputs)

    batch_predictions = outputs.logits.argmax(dim=-1).tolist()
    all_predictions.extend(batch_predictions)

    del batch_inputs
    del outputs
    torch.cuda.empty_cache()


100%|██████████| 111/111 [01:46<00:00,  1.04batch/s]


In [16]:

print(
    classification_report(
        y_true=test_dataset["label"].to_list(),
        y_pred=all_predictions,
        target_names=["negative", "neutral", "positive"]
    )
)


              precision    recall  f1-score   support

    negative       0.76      0.82      0.79      1001
     neutral       0.76      0.76      0.76      1430
    positive       0.87      0.80      0.84      1103

    accuracy                           0.79      3534
   macro avg       0.80      0.80      0.80      3534
weighted avg       0.79      0.79      0.79      3534

