In [32]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score

In [33]:
train_data = [
    ("I really enjoyed this movie!", 1),
    ("The plot was confusing.", 0),
    ("The acting was superb!", 1),
    ("The movie kept me on the edge of my seat.", 1),
    ("The ending was predictable.", 0),
    ("The characters were well-developed.", 1),
    ("I wouldn't recommend this film.", 0),
    ("The special effects were amazing.", 1),
    ("I loved the twists and turns in the story.", 1),
    ("The cinematography was breathtaking.", 1),
    ("The plot lacked originality.", 0),
    ("The movie failed to capture my interest.", 0),
    ("A masterpiece of storytelling.", 1),
    ("The film had a lackluster ending.", 0),
    ("I was on the edge of my seat throughout.", 1),
    ("The script was poorly written.", 0),
]

test_data = [
    ("Amazing film, highly recommend!", 1),
    ("Waste of time, terrible acting.", 0),
    ("One of the best movies I've seen.", 1),
    ("The script was weak.", 0),
    ("Incredible cinematography!", 1),
    ("The movie left me disappointed.", 0),
    ("A must-watch for all movie lovers.", 1),
    ("The plot lacked depth.", 0),
    ("I couldn't take my eyes off the screen.", 1),
    ("The performances were lackluster.", 0),
    ("A cinematic experience like no other.", 1),
    ("The story failed to engage me.", 0),
    ("A film that will stay with you long after.", 1),
    ("The movie was forgettable.", 0),
    ("I was moved by the emotional scenes.", 1),
    ("The dialogue felt forced.", 0),
]

In [34]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Tokenize and preprocess the data
def preprocess_data(texts, labels):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
    print(inputs)
    input_ids = inputs["input_ids"].squeeze()
    attention_mask = inputs["attention_mask"].squeeze()
    return input_ids, attention_mask, labels

# Preprocess the training and testing data
train_dataset = torch.utils.data.TensorDataset(*[torch.tensor(t) for t in preprocess_data(*zip(*train_data))])
test_dataset = torch.utils.data.TensorDataset(*[torch.tensor(t) for t in preprocess_data(*zip(*test_data))])

# Compile and fit the model
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

epochs = 2
for epoch in range(epochs):
    model.train()
    for input_ids, attention_mask, labels in train_dataset:
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0), labels=labels.unsqueeze(0))
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_labels = []
    val_preds = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_dataset:
            outputs = model(input_ids=input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).item()  # Use item() for scalar tensor
            val_labels.append(labels.item())  # Use item() for scalar tensor
            val_preds.append(preds)


    val_accuracy = accuracy_score(val_labels, val_preds)
    print(f"Epoch {epoch + 1}/{epochs}, Validation Accuracy: {val_accuracy:.4f}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

{'input_ids': tensor([[  101,  1045,  2428,  5632,  2023,  3185,   999,   102,     0,     0,
             0,     0,     0],
        [  101,  1996,  5436,  2001, 16801,  1012,   102,     0,     0,     0,
             0,     0,     0],
        [  101,  1996,  3772,  2001, 21688,   999,   102,     0,     0,     0,
             0,     0,     0],
        [  101,  1996,  3185,  2921,  2033,  2006,  1996,  3341,  1997,  2026,
          2835,  1012,   102],
        [  101,  1996,  4566,  2001, 21425,  1012,   102,     0,     0,     0,
             0,     0,     0],
        [  101,  1996,  3494,  2020,  2092,  1011,  2764,  1012,   102,     0,
             0,     0,     0],
        [  101,  1045,  2876,  1005,  1056, 16755,  2023,  2143,  1012,   102,
             0,     0,     0],
        [  101,  1996,  2569,  3896,  2020,  6429,  1012,   102,     0,     0,
             0,     0,     0],
        [  101,  1045,  3866,  1996, 21438,  1998,  4332,  1999,  1996,  2466,
          1012,   102,     

In [35]:
new_examples = [
    "What a fantastic movie!",
    "A disappointing experience.",
    "Absolutely loved it!",
    "Couldn't stand it.",
    "Brilliant storytelling.",
    "Complete waste of time.",
    "Captivating from start to finish.",
    "Regret watching it.",
    "Highly recommend to everyone.",
    "Not worth the hype.",
]

inputs = tokenizer(new_examples, padding=True, truncation=True, max_length=128, return_tensors="pt")

input_ids = inputs["input_ids"].squeeze()
attention_mask = inputs["attention_mask"].squeeze()

# Ensure input_ids and attention_mask have batch dimension
if len(input_ids.shape) == 1:
    input_ids = input_ids.unsqueeze(0)
if len(attention_mask.shape) == 1:
    attention_mask = attention_mask.unsqueeze(0)

outputs = model(input_ids=input_ids, attention_mask=attention_mask)
predicted_labels = torch.argmax(outputs.logits, dim=1).numpy()

print("Predicted Labels:", predicted_labels)

Predicted Labels: [1 0 1 1 1 0 1 1 1 0]
