In [5]:
# Step 1: Install required libraries (run once)
!pip install transformers datasets scikit-learn torch

# Step 2: Import necessary modules
from transformers import BertTokenizer, BertForSequenceClassification
import torch.optim as optim  # Using PyTorch's native AdamW
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Step 3: Toy sentiment corpus -- texts[i] is annotated by labels[i]
texts = [
    "I love this movie",
    "This film was terrible",
    "What an amazing experience",
    "Worst movie I've ever seen",
]
labels = [1, 0, 1, 0]  # 1 = positive, 0 = negative

# Wrap both columns in a Hugging Face Dataset ("text" and "label" columns)
columns = {"text": texts, "label": labels}
data = Dataset.from_dict(columns)

# Step 4: Load the tokenizer that belongs to the pre-trained BERT checkpoint
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 128 tokens
    (``padding="max_length"``, ``truncation=True``, ``max_length=128``).

    Args:
        examples: a batch of dataset rows exposing a "text" entry.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    sentences = examples["text"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(sentences, **encoding_options)

# Tokenize the whole dataset; batched=True passes a batch of rows to
# tokenize_function per call instead of one row at a time.
tokenized_datasets = data.map(tokenize_function, batched=True)

# Step 5: Split into train and validation sets
# A fixed seed makes the split reproducible, so the validation accuracy printed
# during training is comparable from one run to the next.
# NOTE: with the 4-example demo data, test_size=0.25 leaves a single validation
# example, so the reported accuracy can only be 0.00 or 1.00.
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.25, seed=42)
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

# Step 6: Create PyTorch DataLoaders
def collate_batch(examples):
    """Turn a list of tokenized dataset rows into one batch of tensors.

    Args:
        examples: list of rows, each providing "input_ids", "attention_mask"
            and "label" entries.

    Returns:
        dict with "input_ids" and "attention_mask" (one stacked row per
        example) and "labels" (one entry per example). Note the key is
        "labels" (plural), which is the name the training loop reads.
    """
    return {
        'input_ids': torch.stack([torch.tensor(ex['input_ids']) for ex in examples]),
        'attention_mask': torch.stack([torch.tensor(ex['attention_mask']) for ex in examples]),
        'labels': torch.tensor([ex['label'] for ex in examples]),
    }


# shuffle=True draws a new batch order every epoch (a one-off dataset shuffle
# would feed identical batches each epoch); the seeded generator keeps that
# sequence of orders reproducible.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
    collate_fn=collate_batch,
)
# Evaluation order does not matter, so no shuffling here.
eval_dataloader = DataLoader(eval_dataset, batch_size=2, collate_fn=collate_batch)

# Step 7: Load pre-trained BERT configured for 2-label sequence classification
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Step 8: Pick the device (GPU when available, otherwise CPU), move the model
# onto it and create the optimizer
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# PyTorch's own AdamW over all model parameters
optimizer = optim.AdamW(params=model.parameters(), lr=2e-5)

# Step 9: Training loop -- every epoch runs one optimisation pass over the
# training batches, then measures accuracy on the validation batches
for epoch in range(100):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Move every tensor of the batch onto the model's device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # Passing labels makes the model return the loss alongside its outputs
        outputs = model(
            labels=batch["labels"],
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Accumulate the scalar loss to report the epoch average below
        total_loss += loss.item()

    print(f"Epoch {epoch + 1} - Loss: {total_loss / len(train_dataloader)}")

    # ---- validation pass ----
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():  # no gradients are needed for evaluation
        for batch in eval_dataloader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            # No labels are passed here: only the logits are used
            outputs = model(
                attention_mask=batch["attention_mask"],
                input_ids=batch["input_ids"],
            )
            logits = outputs.logits
            # Predicted class = index of the largest logit
            preds = logits.argmax(dim=-1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch["labels"].cpu().numpy())

    acc = accuracy_score(true_labels, predictions)
    print(f"Validation Accuracy: {acc:.2f}")



Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Loss: 0.7721593677997589
Validation Accuracy: 0.00
Epoch 2 - Loss: 0.6144320517778397
Validation Accuracy: 0.00
Epoch 3 - Loss: 0.5967933535575867
Validation Accuracy: 0.00
Epoch 4 - Loss: 0.6077208817005157
Validation Accuracy: 0.00
Epoch 5 - Loss: 0.5585470199584961
Validation Accuracy: 0.00
Epoch 6 - Loss: 0.5034503042697906
Validation Accuracy: 0.00
Epoch 7 - Loss: 0.5427271127700806
Validation Accuracy: 0.00
Epoch 8 - Loss: 0.4485441744327545
Validation Accuracy: 0.00
Epoch 9 - Loss: 0.3630189299583435
Validation Accuracy: 1.00
Epoch 10 - Loss: 0.4601408541202545
Validation Accuracy: 1.00
Epoch 11 - Loss: 0.3757236748933792
Validation Accuracy: 1.00
Epoch 12 - Loss: 0.23819904029369354
Validation Accuracy: 1.00
Epoch 13 - Loss: 0.24410536140203476
Validation Accuracy: 1.00
Epoch 14 - Loss: 0.1763073429465294
Validation Accuracy: 1.00
Epoch 15 - Loss: 0.18064124137163162
Validation Accuracy: 1.00
Epoch 16 - Loss: 0.1473505198955536
Validation Accuracy: 1.00
Epoch 17 - Los