In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# local .py files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import (
    RobertaTokenizer, 
    RobertaForSequenceClassification, 
    RobertaConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoConfig,
    TrainingArguments, 
    Trainer, 
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import evaluate
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

import pandas as pd
import joblib
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from transformers import TrainerCallback, DataCollatorWithPadding
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed for reproducibility.
# Seeds the global PyTorch and NumPy RNGs; Python's built-in `random` module is
# not seeded in this cell. SEED is also reused as random_state for the
# train/validation/test split in main().
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Notebook-wide configuration
# Pretrained encoder to fine-tune (Hugging Face Hub id)
MODEL_CHECKPOINT = "FacebookAI/roberta-base"

# Token budget per joke: the tokenizer truncates/pads every text to this length
MAX_LEN = 64

# Regression targets. The list order fixes each target's output index, because
# main() builds id2label/label2id by enumerating this list.
TARGET_COLUMNS = ['humor', 'offensiveness', 'sentiment']
NUM_TARGETS = len(TARGET_COLUMNS)

In [None]:
# Trainer callback that keeps the loss curves in memory for later plotting
class LossHistory(TrainerCallback):
    """
    Collects training and evaluation losses as (global_step, loss) pairs.

    Attributes are plain lists so they can be read after training:
        train_losses: one entry per log event whose payload contains "loss".
        eval_losses: one entry per evaluation whose metrics contain "eval_loss".
    """

    def __init__(self):
        self.train_losses = []
        self.eval_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the training loss of a log event, if it carries one."""
        if logs is None or "loss" not in logs:
            return
        step_and_loss = (state.global_step, logs["loss"])
        self.train_losses.append(step_and_loss)

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Record the evaluation loss of an evaluation run, if it carries one."""
        if metrics is None or "eval_loss" not in metrics:
            return
        step_and_loss = (state.global_step, metrics["eval_loss"])
        self.eval_losses.append(step_and_loss)

def plot_loss_history(loss_history, save_path="loss_plot.png"):
    """
    Draw the recorded training and evaluation losses against the global step
    and write the figure to disk.

    Args:
        loss_history: Object exposing `train_losses` and `eval_losses`, each a
            list of (global_step, loss) pairs (e.g. a LossHistory callback).
        save_path: Destination image file for the figure.
    """
    recorded_train = loss_history.train_losses
    recorded_eval = loss_history.eval_losses

    # Split each list of pairs into parallel x/y sequences; an empty history
    # still yields two empty sequences so the curve is simply drawn empty.
    train_steps, train_loss_values = zip(*recorded_train) if recorded_train else ([], [])
    eval_steps, eval_loss_values = zip(*recorded_eval) if recorded_eval else ([], [])

    plt.figure(figsize=(10, 6))
    curves = (
        (train_steps, train_loss_values, "Training Loss"),
        (eval_steps, eval_loss_values, "Evaluation Loss"),
    )
    for steps, losses, curve_label in curves:
        plt.plot(steps, losses, label=curve_label, marker='o')

    plt.xlabel("Global Step")
    plt.ylabel("Loss")
    plt.title("Training and Evaluation Loss Over Time")
    plt.legend()
    plt.tight_layout()
    plt.savefig(save_path, bbox_inches="tight")
    print(f"Loss plot saved as '{save_path}'.")

# Load the labeled jokes from a Parquet file and prepare the regression targets
def load_data(file_path, nrows=None):
    """
    Read the labeled jokes and return a cleaned DataFrame ready for splitting.

    Args:
        file_path: Path of the Parquet file. It must contain a 'joke' column
            and every column listed in TARGET_COLUMNS.
        nrows: Optional cap on the number of leading rows to keep. Falsy values
            (None or 0) keep the whole file.

    Returns:
        pd.DataFrame: rows with a non-missing, non-blank, unique 'joke' string
        and float-typed target columns. The original index labels are kept.

    Raises:
        KeyError: if 'joke' or one of TARGET_COLUMNS is absent from the file.
    """
    df = pd.read_parquet(file_path)
    if nrows:
        df = df.head(nrows)

    # Remove missing jokes BEFORE the string cast: astype(str) would turn
    # None/NaN into the literal text 'None'/'nan', which the blank-joke filter
    # below cannot detect and which would then be used as training text.
    df = df.dropna(subset=['joke'])

    # assign()/astype() return new frames, so the slice produced by head() is
    # never written to in place.
    df = df.assign(joke=df['joke'].astype(str))
    # Targets are continuous scores for regression, hence float (not int).
    df = df.astype({column: float for column in TARGET_COLUMNS})

    # Keep the first occurrence of each joke text, then drop whitespace-only jokes.
    df = df.drop_duplicates(subset=['joke'])
    df = df[df['joke'].str.strip() != '']

    return df


# Custom dataset for multi-target regression on joke text
class JokeDataset(Dataset):
    """
    Tokenizes jokes on access and pairs each one with its float target vector.

    Args:
        jokes: Positionally indexable sequence of joke texts (main() passes a
            NumPy array taken from the 'joke' column).
        targets: Positionally indexable sequence with one row of numeric
            targets per joke.
        tokenizer: Callable Hugging Face tokenizer.
        max_len: Length every joke is truncated/padded to.
    """

    def __init__(self, jokes, targets, tokenizer, max_len):
        self.jokes = jokes
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.jokes)

    def __getitem__(self, idx):
        """
        Return one example as a dict with 1-D 'input_ids' and 'attention_mask'
        tensors of length max_len and a float32 'labels' tensor.
        """
        joke = str(self.jokes[idx])
        targets = np.asarray(self.targets[idx], dtype=np.float32)

        # Call the tokenizer directly; `encode_plus` is the deprecated spelling
        # of the same single-text encoding and accepts the same arguments.
        encoding = self.tokenizer(
            joke,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1; flatten()
        # removes it so the collator can stack examples into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(targets, dtype=torch.float)
        }
    

class CustomDataset(Dataset):
    """
    Dataset over already-tokenized inputs.

    Args:
        encodings: Mapping with 'input_ids' and 'attention_mask' entries, each
            indexable by example position.
        labels: Per-example labels, indexable by position; its length defines
            the dataset length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the model inputs and label of the example at position idx."""
        example = {
            field: self.encodings[field][idx]
            for field in ('input_ids', 'attention_mask')
        }
        example['labels'] = self.labels[idx]
        return example

# Metrics callback for the Trainer: regression errors overall and per target
def compute_metrics(eval_pred):
    """
    Compute MSE, MAE, RMSE and R2 over all targets and for each target column.

    Args:
        eval_pred: Pair (predictions, labels) of 2-D arrays with one column per
            entry of TARGET_COLUMNS.

    Returns:
        dict: keys "mse", "mae", "rmse", "r2" for the overall scores, followed
        by "<target>_mse", "<target>_mae", "<target>_rmse", "<target>_r2" for
        every target in TARGET_COLUMNS.
    """
    predictions, references = eval_pred

    # Scores over the full (samples x targets) arrays
    overall_mse = mean_squared_error(references, predictions)
    metrics = dict(
        mse=overall_mse,
        mae=mean_absolute_error(references, predictions),
        rmse=np.sqrt(overall_mse),
        r2=r2_score(references, predictions),
    )

    # Scores for each individual target column
    for column, target in enumerate(TARGET_COLUMNS):
        y_true = references[:, column]
        y_pred = predictions[:, column]
        target_mse = mean_squared_error(y_true, y_pred)
        metrics.update({
            f"{target}_mse": target_mse,
            f"{target}_mae": mean_absolute_error(y_true, y_pred),
            f"{target}_rmse": np.sqrt(target_mse),
            f"{target}_r2": r2_score(y_true, y_pred),
        })

    return metrics

def plot_probability_distributions(probabilities, labels, split_name, target_columns):
    """
    Draw one histogram (with a KDE overlay) per target, side by side, then save
    and show the figure.

    Args:
        probabilities: 2-D array indexed as [:, i], holding one column of
            predicted values per target.
        labels: Actual labels; accepted for symmetry but not used by the plot.
        split_name (str): Name of the split (e.g. "Train" or "Test"); used in
            the panel titles and, lower-cased, in the output file name.
        target_columns (list): Target names, one per panel.
    """
    panel_count = len(target_columns)
    plt.figure(figsize=(5 * panel_count, 5))

    for column, target_name in enumerate(target_columns):
        # Panels are numbered from 1 in matplotlib's subplot grid
        plt.subplot(1, panel_count, column + 1)
        sns.histplot(probabilities[:, column], bins=50, kde=True, color='skyblue')
        plt.title(f"{split_name} Set - {target_name}")
        plt.xlabel("Predicted Probability")
        plt.ylabel("Frequency")
        # The x-axis is fixed to [0, 1]; values outside that range are not visible
        plt.xlim(0, 1)

    plt.tight_layout()
    output_file = f"{split_name.lower()}_probability_distributions.png"
    plt.savefig(output_file)
    plt.show()

    
def main(data_path, nrows):
    """
    Fine-tune MODEL_CHECKPOINT as a multi-target regressor on the labeled jokes.

    Steps: load and clean the data, split it 80/10/10 into train/validation/
    test, fine-tune with early stopping on validation RMSE, write the loss
    plot, report test metrics, and save the model together with its tokenizer.

    Args:
        data_path: Parquet file passed to load_data.
        nrows: Row cap passed to load_data (None for the full file).

    Returns:
        Trainer: the trainer after training and test evaluation.

    Side effects:
        Writes checkpoints to './results_reg', the figure 'loss_plot.png' and
        the final model/tokenizer to './joke_regression_model'.
    """
    # Load data
    print(f"Loading data from {data_path}")
    df = load_data(data_path, nrows=nrows)

    # Split data into train, validation, and test sets (80%, 10%, 10%)
    train_df, temp_df = train_test_split(df, train_size=0.8, random_state=SEED)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED)

    print(f"Train set: {len(train_df)} samples")
    print(f"Validation set: {len(val_df)} samples")
    print(f"Test set: {len(test_df)} samples")

    # Initialize tokenizer. JokeDataset already pads every example to MAX_LEN,
    # so the collator mainly stacks examples into batch tensors.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def _build_dataset(frame):
        # .values drops the pandas index so JokeDataset can index by position.
        return JokeDataset(
            jokes=frame['joke'].values,
            targets=frame[TARGET_COLUMNS].values,
            tokenizer=tokenizer,
            max_len=MAX_LEN
        )

    # Create datasets
    train_dataset = _build_dataset(train_df)
    val_dataset = _build_dataset(val_df)
    test_dataset = _build_dataset(test_df)

    # Configure the model for multi-target regression: one output per target
    config = AutoConfig.from_pretrained(MODEL_CHECKPOINT)
    config.num_labels = NUM_TARGETS
    config.problem_type = "regression"

    # Add id to label and label to id mappings to the model config
    # (set after num_labels, which would otherwise reset them)
    config.id2label = {i: t for i, t in enumerate(TARGET_COLUMNS)}
    config.label2id = {t: i for i, t in enumerate(TARGET_COLUMNS)}
    print("Mapping id to label:", config.id2label)
    print("Mapping label to id:", config.label2id)

    # Initialize model; with problem_type="regression" the sequence-classification
    # head is trained with a mean-squared-error loss on the float labels.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT,
        config=config
    )

    # The whole network is fine-tuned: parameters are trainable by default.
    # To train only the regression head instead, set requires_grad = False on
    # model.base_model.parameters() here.

    # Instantiate loss history callback
    loss_history = LossHistory()

    # Set up training arguments. The best checkpoint is the one with the lowest
    # validation "rmse" as reported by compute_metrics.
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version.
    training_args = TrainingArguments(
        output_dir='./results_reg',
        num_train_epochs=10,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=150,
        weight_decay=0.1,
        logging_dir='./logs',
        logging_steps=150,
        eval_steps=150,
        save_steps=150,
        evaluation_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="rmse",
        greater_is_better=False,
        save_total_limit=2,
        learning_rate=5.0e-5
        # fp16=True,  # Uncomment if using mixed precision
    )

    # Training stops early after 3 consecutive evaluations without improvement
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3), loss_history],
        data_collator=data_collator,
    )

    # Train the model
    print("Starting training...")
    trainer.train()
    plot_loss_history(loss_history, save_path="loss_plot.png")

    # Evaluate on test set
    print("Evaluating on test set...")
    test_results = trainer.evaluate(test_dataset)
    print("Test results:", test_results)

    # Save model, plus the tokenizer files so the directory is self-contained
    print("Saving model...")
    model_dir = "./joke_regression_model"
    trainer.save_model(model_dir)
    tokenizer.save_pretrained(model_dir)

    return trainer


def predict_joke_ratings(joke_text, model_path="./joke_regression_model"):
    """
    Score a single joke with a saved regression model.

    Args:
        joke_text: The text to score.
        model_path: Directory holding the saved model and its config.

    Returns:
        dict: maps each target name from the saved config's id2label to the
        predicted score as a Python float, one entry per model output.

    Note:
        The tokenizer and model are loaded from disk on every call.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
    config = AutoConfig.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path, config=config)
    model.eval()  # inference mode: disables dropout

    encoding = tokenizer(
        joke_text,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    with torch.no_grad():
        logits = model(**encoding).logits

    scores = logits.cpu().flatten().tolist()

    # Name the outputs from the model's own width and label map rather than the
    # notebook-level NUM_TARGETS, which may not match the saved model.
    return {config.id2label[index]: float(score) for index, score in enumerate(scores)}


In [None]:
# Location of the labeled-jokes Parquet file; replace with your actual file path
data_path = "../data/labeled_jokes_regression_mistral:latest.parquet"
# nrows=None trains on the whole file; main() returns the fitted Trainer
results = main(data_path=data_path, nrows=None)

Loading data from ../data/labeled_jokes_regression_mistral:latest.parquet
Train set: 45275 samples
Validation set: 5659 samples
Test set: 5660 samples


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping id to label: {0: 'humor', 1: 'offensiveness', 2: 'sentiment'}
Mapping label to id: {'humor': 0, 'offensiveness': 1, 'sentiment': 2}




Starting training...




Step,Training Loss,Validation Loss,Mse,Mae,Rmse,R2,Humor Mse,Humor Mae,Humor Rmse,Humor R2,Offensiveness Mse,Offensiveness Mae,Offensiveness Rmse,Offensiveness R2,Sentiment Mse,Sentiment Mae,Sentiment Rmse,Sentiment R2
15,2486.3352,2324.278076,2324.11377,36.954533,48.209063,-1.040609,3793.015625,51.680367,61.587463,-1.932648,326.263519,12.406184,18.062766,-0.858078,2853.068848,46.77702,53.414126,-0.331107
30,2340.4125,2162.59082,2162.445557,35.996502,46.502103,-0.882707,3440.713867,50.215233,58.657599,-1.660259,299.601166,11.727624,17.308991,-0.706236,2747.020752,46.046532,52.412029,-0.28163
45,2135.1651,2071.711914,2071.573486,35.415401,45.514541,-0.780703,3242.548584,49.34222,56.943381,-1.507043,277.020569,11.232904,16.643935,-0.577639,2695.159912,45.671345,51.91493,-0.257435
60,2085.3457,2012.130371,2011.996948,34.99535,44.855289,-0.708851,3125.855469,48.81303,55.90935,-1.416819,258.584991,10.815034,16.080578,-0.472648,2651.560303,45.357727,51.493303,-0.237093




In [None]:
# Example inference: score one piece of text with the model saved by main()
sample = "Fuck"
print("Predicted ratings:", predict_joke_ratings(joke_text=sample))

Predicted ratings: {'humor': 2.2463624477386475, 'offensiveness': 0.4515058398246765, 'sentiment': 1.2660059928894043}
