
Step 0: Import Required Libraries


In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import BertTokenizer, BertForSequenceClassification

Step 1: Generate Dataset

In [4]:
# 50 positive and 50 negative sentences (total = 100)
sentences = [
    "I love this movie", "This film was amazing", "I enjoyed every moment",
    "The acting was great", "What a fantastic experience",
    "Absolutely wonderful movie", "The story was touching", "I liked the characters",
    "Very entertaining film", "This movie made me happy",

    "I hate this movie", "This film was terrible", "I disliked every moment",
    "The acting was awful", "What a boring experience",
    "Absolutely horrible movie", "The story was weak", "I hated the characters",
    "Very disappointing film", "This movie made me angry"
] * 5

labels = ["positive"] * 50 + ["negative"] * 50

df = pd.DataFrame({
    "sentence": sentences,
    "sentiment": labels
})


Step 2: Encode Labels

In [5]:
label_encoder = LabelEncoder()

# positive → 1, negative → 0
df["label"] = label_encoder.fit_transform(df["sentiment"])


Step 3: Train–Test Split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    df["sentence"].values,
    df["label"].values,
    test_size=0.2,
    random_state=42
)


Step 4: Load BERT Tokenizer

In [7]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Step 5: Tokenization + Padding (BERT Style)

In [8]:
def bert_tokenize(texts, labels, max_len=64):
    """
    Converts text data into BERT input format
    """
    encodings = tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt"
    )
    return encodings, torch.tensor(labels)


In [9]:
train_encodings, train_labels = bert_tokenize(X_train, y_train)
test_encodings, test_labels = bert_tokenize(X_test, y_test)


Step 6: Create PyTorch Dataset

In [10]:
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Returns one training example
        """
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item


In [11]:
train_dataset = SentimentDataset(train_encodings, train_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)


Step 7: Load BERT Classification Model

In [12]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step 8: Training Setup

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

optimizer = AdamW(model.parameters(), lr=2e-5)


Step 9: Train BERT Model

In [14]:
epochs = 3  # BERT needs fewer epochs

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} | Training Loss: {total_loss:.4f}")


Epoch 1 | Training Loss: 7.0570
Epoch 2 | Training Loss: 6.8590
Epoch 3 | Training Loss: 6.9153


Step 10: Evaluate Model

In [15]:
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(batch["labels"].numpy())


In [16]:
accuracy = accuracy_score(true_labels, predictions)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.25


Step 11: Predict on New Sentences

In [17]:
def predict_sentiment_bert(text):
    """
    Predict sentiment using trained BERT model
    """
    model.eval()

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=64
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Positive" if prediction == 1 else "Negative"


In [20]:
print(predict_sentiment_bert("I really enjoyed this movie"))
print(predict_sentiment_bert("This film was very boring"))


Positive
Negative
