In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Load the data, label encoder, BERT model, and tokenizer

In [3]:
# Read the prompt/task-type table and shuffle it once with a fixed seed so the
# row order is the same on every run; the index is rebuilt as 0..n-1.
df = (
    pd.read_csv('classification_data_all.csv')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df

Unnamed: 0,prompt,task type
0,Produce animated content synced to this file.,ATV
1,I need a printed version of this narration.,STT
2,Provide a text output of this announcement photo.,ITT
3,Say the following status update.,TTS
4,"Tell me more about: I uploaded a file, now what?",UNK
...,...,...
595,Please read this aloud.,TTS
596,Convert this vocal recording into motion graph...,ATV
597,Tell me more about: Can you think like a person?,UNK
598,Make a text document from this conversation.,STT


In [4]:
# Distinct values of the 'task type' column, as a plain Python list.
possible_labels = df['task type'].unique().tolist()
print(f"Possible labels: {possible_labels}")

Possible labels: ['ATV', 'STT', 'ITT', 'TTS', 'UNK', 'VTT', 'TTI', 'ITA', 'TTV', 'ATI']


In [5]:
# Number of distinct task types found in the data (length of possible_labels).
n_labels = len(possible_labels)
print(f"Number of labels: {n_labels}")

Number of labels: 10


In [None]:
# Load the previously fitted label encoder (it is used below via .transform only).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts from a trusted source.
label_encoder = joblib.load('saved_models/label_encoder.joblib')

FileNotFoundError: [Errno 2] No such file or directory: 'label_encoder.joblib'

In [None]:
# Swap the string 'task type' column for its integer encoding, appended as
# 'task_type_encoded'. The cell consumes 'task type', so it must run exactly
# once after the data is loaded.
df = (
    df
    .assign(task_type_encoded=label_encoder.transform(df['task type']))
    .drop(columns='task type')
)
df

Unnamed: 0,prompt,task_type_encoded
0,Produce animated content synced to this file.,1
1,I need a printed version of this narration.,4
2,Provide a text output of this announcement photo.,3
3,Say the following status update.,6
4,"Tell me more about: I uploaded a file, now what?",8
...,...,...
595,Please read this aloud.,6
596,Convert this vocal recording into motion graph...,1
597,Tell me more about: Can you think like a person?,8
598,Make a text document from this conversation.,4


In [None]:
# Keep features and target as single-column DataFrames.
X = df[['prompt']]
y = df[['task_type_encoded']]

# Hold out 10% of the rows for testing, stratified on the encoded task type.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

In [None]:
# Inspect the training prompts produced by the split above.
X_train

Unnamed: 0,prompt
274,Turn this logline into a dynamic video scene.
397,Capture the headline from this newspaper image.
371,Draw and animate scenes for this recorded voice.
408,Turn the visual elements into sound.
411,Can you explain: Make this file useful.
...,...
438,Can you write out this chat recording?
394,Identify text from this business card image.
415,What’s shown in this painting? Say it aloud.
238,I need subtitles for this recording.


In [None]:
# Inspect the encoded training labels produced by the split above.
y_train

Unnamed: 0,task_type_encoded
274,7
397,3
371,1
408,2
411,8
...,...
438,4
394,3
415,2
238,4


In [None]:
# === 1. Load your data ===
# Flatten the single-column train/test frames into plain Python lists.
X_train_texts, X_test_texts = (
    frame['prompt'].tolist() for frame in (X_train, X_test)
)
y_train_labels, y_test_labels = (
    frame['task_type_encoded'].tolist() for frame in (y_train, y_test)
)

# The classifier head below is sized by the distinct labels seen in training.
num_labels = len(set(y_train_labels))
print(f"Number of labels: {num_labels}")

# === 2. Tokenization ===
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

class PromptDataset(Dataset):
    """Map-style dataset pairing pre-tokenized prompts with their labels.

    All texts are tokenized once, up front, by calling ``tokenizer`` with
    ``truncation=True``, ``padding=True`` and the given ``max_length``.

    Args:
        texts: Sequence of prompt strings.
        labels: Sequence of labels, aligned with ``texts``.
        tokenizer: Callable returning a mapping of field name to a
            per-sample indexable collection (e.g. a Hugging Face tokenizer).
        max_length: Maximum token length forwarded to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.labels = labels
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Seed PyTorch so the random initialisation of the classification head and the
# DataLoader shuffle order are repeatable on a fresh kernel (42 matches the
# random_state used for shuffling and splitting the data).
torch.manual_seed(42)

# === 3. Datasets and loaders ===
train_dataset = PromptDataset(X_train_texts, y_train_labels, tokenizer)
test_dataset = PromptDataset(X_test_texts, y_test_labels, tokenizer)

# Only the training batches are shuffled; evaluation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# === 4. Load Model ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model.to(device)

# === 5. Optimizer ===
# torch.optim.AdamW is used instead of transformers.AdamW, which is deprecated.
# eps and weight_decay are pinned to the values transformers.AdamW used by
# default (1e-6 and 0.0), so the optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# === 6. Training Loop with Early Stopping ===
# NOTE(review): early stopping and checkpoint selection below are driven by
# accuracy on the test set, so the reported test accuracy is optimistically
# biased. Consider carving out a separate validation split for this purpose.
epochs = 50  # Maximum number of epochs
patience = 5  # Number of epochs to wait for improvement before stopping
best_accuracy = 0.0
best_epoch = None  # 1-based epoch of the best checkpoint, None until one is saved
epochs_no_improve = 0

# torch.save does not create missing directories, so make sure the checkpoint
# folder exists before the first save.
checkpoint_dir = 'bert_fine_tuned'
os.makedirs(checkpoint_dir, exist_ok=True)

print(f"Training on {device}")

for epoch in range(epochs):
    # --- Training phase ---
    model.train()  # Re-enable training mode (evaluation below switches it off)
    loop = tqdm(train_loader, desc=f"Epoch {epoch + 1} (Training)")
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)  # the batch carries 'labels', so a loss is returned
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loop.set_postfix(loss=loss.item())

    # === 7. Evaluation after each epoch ===
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient calculations for evaluation
        eval_loop = tqdm(test_loader, desc=f"Epoch {epoch + 1} (Evaluation)")
        for batch in eval_loop:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            labels = batch['labels'].cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels)

    current_accuracy = accuracy_score(all_labels, all_preds)
    print(f"\nEpoch {epoch + 1} - Test Accuracy: {current_accuracy:.4f}")

    # Check for early stopping: only a strict improvement resets the counter.
    if current_accuracy > best_accuracy:
        best_accuracy = current_accuracy
        best_epoch = epoch + 1
        epochs_no_improve = 0
        # Each new best epoch is written to its own file.
        checkpoint_path = f'{checkpoint_dir}/best_model_bert_epoch_{epoch+1}.pt'
        torch.save(model.state_dict(), checkpoint_path)
        print(f"New best accuracy! Model saved to {checkpoint_path}")
    else:
        epochs_no_improve += 1
        print(f"No improvement for {epochs_no_improve} epochs.")

    if epochs_no_improve >= patience:
        print(f"Stopping early after {epoch + 1} epochs due to no improvement in test accuracy for {patience} consecutive epochs.")
        break  # Exit the training loop

print("\n--- Training Finished ---")
if best_epoch is not None:
    print(f"Best test accuracy: {best_accuracy:.4f} (epoch {best_epoch})")

**The maximum test-set accuracy was reached at epoch 6, so the weights saved for that epoch (`bert_fine_tuned/best_model_bert_epoch_6.pt`) are the ones used later.**