In [1]:
# ===============================
# Step 0: Install & Import Libraries
# ===============================
# Make sure to install: transformers, torch, sklearn, pandas
# pip install transformers torch scikit-learn pandas

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score



Step 1: Generate Dataset

In [2]:
# ===============================
# Step 1: Generate Dataset
# ===============================
sentences = [
    "I love this movie", "This film was amazing", "I enjoyed every moment",
    "The acting was great", "What a fantastic experience",
    "Absolutely wonderful movie", "The story was touching", "I liked the characters",
    "Very entertaining film", "This movie made me happy",
    
    "I hate this movie", "This film was terrible", "I disliked every moment",
    "The acting was awful", "What a boring experience",
    "Absolutely horrible movie", "The story was weak", "I hated the characters",
    "Very disappointing film", "This movie made me angry"
] * 5  # 100 samples

labels = ["positive"] * 50 + ["negative"] * 50

df = pd.DataFrame({"sentence": sentences, "sentiment": labels})

Step 2: Encode Labels

In [3]:


# ===============================
# Step 2: Encode Labels
# ===============================
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["sentiment"])  # positive=1, negative=0



Step 3: Trainâ€“Test Split

In [4]:
# ===============================
# Step 3: Train-Test Split
# ===============================
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["sentence"], df["label"], test_size=0.2, random_state=42
)


Step 4: Load BERT Tokenizer

In [5]:

# ===============================
# Step 4: Load BERT Tokenizer
# ===============================
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize and encode sequences
max_len = 32  # maximum sequence length
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=max_len)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=max_len)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Step 5: Create PyTorch Dataset

In [6]:
# ===============================
# Step 5: Create PyTorch Dataset
# ===============================
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = SentimentDataset(train_encodings, list(train_labels))
test_dataset = SentimentDataset(test_encodings, list(test_labels))

Step 6: Load BERT Classification Model

In [8]:
# ===============================
# Step 6: Load Pretrained BERT Model
# ===============================
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Use GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [9]:
# ===============================
# Step 7: DataLoader and Optimizer
# ===============================
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

optimizer = AdamW(model.parameters(), lr=5e-5)

Step 8: Train BERT Model

In [10]:
# ===============================
# Step 8: Training Loop
# ===============================
model.train()
epochs = 3

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Loss: {loss.item():.4f}")

Epoch 1/3
Loss: 0.7094
Epoch 2/3
Loss: 0.7371
Epoch 3/3
Loss: 0.7351


Step 09: Evaluate Model

In [11]:

# ===============================
# Step 9: Evaluation
# ===============================
model.eval()
preds = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        
        preds.extend(predictions.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, preds)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.6000


Step 10: Predict on New Sentences

In [12]:
# ===============================
# Step 10: Predict New Sentences
# ===============================
def predict_sentiment(text):
    model.eval()
    encoding = tokenizer(text, truncation=True, padding=True, max_length=max_len, return_tensors="pt")
    encoding = {key: val.to(device) for key, val in encoding.items()}
    with torch.no_grad():
        output = model(**encoding)
        prediction = torch.argmax(output.logits, dim=-1).item()
    return "Positive" if prediction == 1 else "Negative"

# Test
print(predict_sentiment("I really enjoyed this movie"))
print(predict_sentiment("This film was very boring"))


Positive
Positive
