# 🧠 Sentiment Model Training – BERT (HuggingFace Transformers)
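This notebook fine-tunes a DistilBERT model (`distilbert-base-uncased`) for binary sentiment classification on a small subset of the IMDB movie-review dataset using the HuggingFace `Trainer` API, then saves the fine-tuned model and tokenizer to disk.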

## 🔧 Step 1: Install and import required libraries

In [None]:
!pip install transformers datasets scikit-learn --quiet

In [None]:

import inspect

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding


## 📦 Step 2: Load Dataset

In [None]:

# Load IMDB movie review dataset
dataset = load_dataset("imdb")
# IMDB also ships an unlabeled "unsupervised" split that this notebook never
# uses; drop it so it is not shuffled here or tokenized by the later
# dataset.map(...) call. pop(..., None) is a no-op if the split is absent.
dataset.pop("unsupervised", None)
dataset = dataset.shuffle(seed=42)  # fixed seed -> same subset on every run
dataset["train"] = dataset["train"].select(range(3000))  # keep it small for demo
dataset["test"] = dataset["test"].select(range(1000))


## 🧼 Step 3: Tokenization

In [None]:

# Name of the pretrained checkpoint; the same name is reused when the model is loaded.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

def tokenize_function(example):
    """Run the module-level tokenizer over the "text" field of `example`.

    Truncation is enabled; no padding is requested here.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True)

# Apply the tokenizer to every split; batched=True passes a batch of rows per call.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)


## 🧠 Step 4: Define Model and Trainer

In [None]:

# Take the class names from the dataset's label feature so the saved config
# carries readable labels instead of the generic LABEL_0 / LABEL_1.
label_names = dataset["train"].features["label"].names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)
# Tokenization only truncates, so padding is left to the collator at batch time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    """Turn evaluation output into classification metrics.

    `eval_pred` unpacks into (predictions, labels). The predicted class is the
    arg-max of the predictions along axis 1. Returns a dict with the keys
    "accuracy", "f1", "precision" and "recall" (binary averaging).
    """
    scores, gold = eval_pred
    guessed = np.argmax(scores, axis=1)
    accuracy = accuracy_score(gold, guessed)
    # Result is (precision, recall, f1, support); support is not reported.
    prf = precision_recall_fscore_support(gold, guessed, average='binary')
    return {
        "accuracy": accuracy,
        "f1": prf[2],
        "precision": prf[0],
        "recall": prf[1],
    }


In [None]:

# transformers renamed `evaluation_strategy` -> `eval_strategy` (TrainingArguments)
# and `tokenizer` -> `processing_class` (Trainer); newer releases reject the old
# names. Pick whichever keyword the installed version accepts so the cell runs
# on both old and new releases.
training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in training_arg_names else "evaluation_strategy"
)

trainer_arg_names = inspect.signature(Trainer.__init__).parameters
tokenizer_key = (
    "processing_class" if "processing_class" in trainer_arg_names else "tokenizer"
)

training_args = TrainingArguments(
    output_dir="./model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    **{eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_key: tokenizer},
)

trainer.train()


## 💾 Step 5: Save Model

In [None]:

# Write the trained model and then the tokenizer into the same directory.
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to("model/sentiment-bert")
print("✅ Model and tokenizer saved.")
