## 0. Install and import the required libraries

In [10]:
# Install required libraries
# !pip install transformers datasets torch numpy scikit-learn accelerate     ## uncomment if running in a new environment

# Import necessary libraries
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score
import pickle


## 1. Load & sample dataset

In [11]:
# Load dataset
# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# Only load .pkl files that come from a trusted source.
SEED = 42                # shuffle seed, fixed so the sampled subsets are reproducible
SUBSET_SIZE = 1000       # maximum number of examples kept per split for the quick demo
NUM_DEMO_EXAMPLES = 5    # number of test examples printed before/after finetuning

# Context managers guarantee the file handles are closed even if unpickling fails
with open("train_data.pkl", "rb") as train_file:
    train_data = pickle.load(train_file)
with open("test_data.pkl", "rb") as test_file:
    test_data = pickle.load(test_file)

# Create Dataset objects
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)

# Select subsets for a quick demo (up to SUBSET_SIZE examples each).
# min(...) keeps the selection valid when a split holds fewer rows than requested.
train_dataset = train_dataset.shuffle(seed=SEED).select(range(min(SUBSET_SIZE, len(train_dataset))))
test_dataset = test_dataset.shuffle(seed=SEED).select(range(min(SUBSET_SIZE, len(test_dataset))))

# Pick the first few test examples to show predictions before and after finetuning
examples = test_dataset.select(range(min(NUM_DEMO_EXAMPLES, len(test_dataset))))


## 2. Load and prepare the model and tokenizer

In [12]:
# Load the DistilBERT model and tokenizer
# The checkpoint name lives in one place so the tokenizer and the model
# can never be loaded from two different checkpoints by accident.
MODEL_NAME = 'distilbert-base-uncased'
NUM_LABELS = 2  # binary sentiment: 0 = negative, 1 = positive

print("Loading the DistilBERT model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The base checkpoint has no classification head, so those weights are newly
# initialised here; predictions are not meaningful until the model is finetuned.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# Function to predict sentiment for a single example
def predict(model, tokenizer, example):
    """Predict the sentiment label for a single example.

    The forward pass always runs in evaluation mode (dropout disabled) so the
    result is deterministic, even when the function is called while the model
    is still in training mode. The model's previous mode is restored afterwards.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Callable that turns the text into a mapping of input tensors.
        example: Mapping with a ``'text'`` entry holding the text to classify.

    Returns:
        int: Index of the highest-scoring class (0 = negative, 1 = positive).
    """
    text = example['text']
    # Tokenize the text and prepare it for the model
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Ensure inputs are on the same device as the model (CPU or GPU)
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Remember the current mode so this helper has no lasting side effect on training
    was_training = model.training
    model.eval()
    try:
        # Make prediction without computing gradients (faster)
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).item()  # 0 = negative, 1 = positive
    return predicted_label


Loading the DistilBERT model and tokenizer...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 3. Tokenize dataset

In [13]:
# Tokenize both splits and expose them to the Trainer as PyTorch tensors
print("\nTokenizing the datasets...")


def tokenize_function(examples):
    """Tokenize a batch of rows.

    Reads the ``'text'`` entry of the batch and passes it to the module-level
    ``tokenizer`` with truncation enabled and ``padding='max_length'``.
    """
    batch_texts = examples['text']
    return tokenizer(batch_texts, padding='max_length', truncation=True)


# Columns handed to the model; everything else (e.g. the raw text) is hidden
TENSOR_COLUMNS = ('input_ids', 'attention_mask', 'label')

# batched=True passes a batch of rows to tokenize_function per call
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# set_format works in place, so the same call is applied to each split in turn
for tokenized_split in (tokenized_train, tokenized_test):
    tokenized_split.set_format('pt', columns=list(TENSOR_COLUMNS))



Tokenizing the datasets...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## 4. Setup trainer with parameters

In [None]:
# Define a function to compute accuracy during evaluation (needed for Trainer)
def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation result.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row
            is the argmax of the logits over the last axis.

    Returns:
        dict: ``{'accuracy': <accuracy_score of labels vs. predicted classes>}``.
    """
    class_scores, gold_labels = eval_pred
    # Highest-scoring class per row
    predicted_classes = np.argmax(class_scores, axis=-1)
    accuracy = accuracy_score(gold_labels, predicted_classes)
    return {'accuracy': accuracy}

# Training configuration, collected in one mapping so every knob is visible at a glance
training_config = {
    'output_dir': './results',            # Output directory passed to the Trainer
    'num_train_epochs': 1,                # A single epoch keeps the demo quick
    'per_device_train_batch_size': 4,     # Batch size for training
    'per_device_eval_batch_size': 4,      # Batch size for evaluation
    'logging_dir': './logs',              # Directory for training logs
    'eval_strategy': 'no',                # Evaluation is triggered manually via trainer.evaluate()
}
training_args = TrainingArguments(**training_config)

# Wire the model, data splits and metric function into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,      # used by trainer.evaluate()
    compute_metrics=compute_metrics,  # adds 'eval_accuracy' to the evaluation results
)


## 5. Finetuning & evaluation 

### Pre finetune evaluation

In [15]:
# Baseline: score the model on the test subset before any training has happened
print("\nEvaluating model before finetuning...")
pre_eval = trainer.evaluate()
print(f"Pre-finetuning accuracy: {pre_eval['eval_accuracy']:.4f}")

# Collect the baseline predictions for the demo examples first, then report them
print("\nPre-finetuning predictions:")
pre_predictions = [predict(model, tokenizer, row) for row in examples]
for example, predicted_label in zip(examples, pre_predictions):
    # Only the first 100 characters of the text are shown
    print(f"Text: {example['text'][:100]}...")  # Show first 100 characters
    print(f"True label: {example['label']}, Predicted label: {predicted_label}")



Evaluating model before finetuning...


Pre-finetuning accuracy: 0.4880

Pre-finetuning predictions:
Text: CONGO is probably the worst big-budget movie of the 1990s. It is so bad that it is watchable over an...
True label: 0, Predicted label: 0
Text: The acting was terrible, the cheesy, fake, CHEAP green screen effects were ridiculous, and the creat...
True label: 0, Predicted label: 0
Text: This movie is a sleeper - I've watched every miniseries that was ever on TV, some many times, and th...
True label: 1, Predicted label: 0
Text: Before I had seen this film, I had heard some negative comments about it. However, when watching it ...
True label: 0, Predicted label: 0
Text: As i watched "Wirey Spindell" i couldnt but laugh at what was taking place on screen. Wirey sure got...
True label: 1, Predicted label: 0


### Finetuning (training the model)

In [16]:
# Finetune the model
# Runs the training loop of the `trainer` built above, i.e. on `tokenized_train`
# with the settings held in `training_args`.
print("\nFinetuning the model...")
# Kept as the cell's final bare expression so the notebook displays the returned TrainOutput
trainer.train()



Finetuning the model...


Step,Training Loss


TrainOutput(global_step=250, training_loss=0.5616519775390625, metrics={'train_runtime': 182.0922, 'train_samples_per_second': 5.492, 'train_steps_per_second': 1.373, 'total_flos': 132467398656000.0, 'train_loss': 0.5616519775390625, 'epoch': 1.0})

### Post finetune evaluation & predictions

In [17]:
# Score the finetuned model on the same test subset used for the baseline
print("\nEvaluating model after finetuning...")
post_eval = trainer.evaluate()
print(f"Post-finetuning accuracy: {post_eval['eval_accuracy']:.4f}")

# Re-run the same demo examples so the predictions can be compared with the baseline
print("\nPost-finetuning predictions:")
post_predictions = [predict(model, tokenizer, row) for row in examples]
for example, predicted_label in zip(examples, post_predictions):
    # Only the first 100 characters of the text are shown
    print(f"Text: {example['text'][:100]}...")
    print(f"True label: {example['label']}, Predicted label: {predicted_label}")



Evaluating model after finetuning...
Post-finetuning accuracy: 0.8710

Post-finetuning predictions:
Text: CONGO is probably the worst big-budget movie of the 1990s. It is so bad that it is watchable over an...
True label: 0, Predicted label: 0
Text: The acting was terrible, the cheesy, fake, CHEAP green screen effects were ridiculous, and the creat...
True label: 0, Predicted label: 0
Text: This movie is a sleeper - I've watched every miniseries that was ever on TV, some many times, and th...
True label: 1, Predicted label: 1
Text: Before I had seen this film, I had heard some negative comments about it. However, when watching it ...
True label: 0, Predicted label: 1
Text: As i watched "Wirey Spindell" i couldnt but laugh at what was taking place on screen. Wirey sure got...
True label: 1, Predicted label: 1


- Pre finetune score:  0.4880
- Post finetune score: 0.8710