In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from tqdm import tqdm

In [3]:
TARGET_COLUMNS = ['humor', 'offensiveness', 'sentiment']
MAX_LEN = 128

In [4]:
def load_labelled_data(file_path, nrows=None):
    # Load dataset from a Parquet file
    df = pd.read_parquet(file_path)
    if nrows:
        df = df.head(nrows)
    
    # Cast the target columns to int for classification purposes.
    df[TARGET_COLUMNS] = df[TARGET_COLUMNS].astype(int)
    
    # Drop rows where any value in the target columns isn't 0 or 1.
    # This creates a boolean mask that checks for binary values.
    df = df[df[TARGET_COLUMNS].isin([0, 1]).all(axis=1)]

    df[TARGET_COLUMNS] = df[TARGET_COLUMNS].astype(float)
    
    # Ensure that the 'joke' column is of type string.
    df['joke'] = df['joke'].astype(str)

    # drop duplicates
    df = df.drop_duplicates(subset=['joke'])

    df = df[df['joke'].apply(lambda x: isinstance(x, str))]
    # drop empty jokes
    df = df[df['joke'].str.strip() != '']
    return df


In [5]:
data = pd.read_csv("../data/rJokesData/data/preprocessed.csv.gz")
labeled_df = load_labelled_data("../data/labeled_jokes_classification_mistral:latest.parquet")

data = data[["date", "joke", "score"]]
data['date'] = pd.to_datetime(data['date'], unit='s')

data[TARGET_COLUMNS] = None

# this replaces every column in data for which there is a row in labeled_df
data.loc[labeled_df.index] = labeled_df

unlabeled_df = data[data[TARGET_COLUMNS].isnull().all(axis=1)]
unlabeled_df.drop(columns=TARGET_COLUMNS, inplace=True)
unlabeled_df = unlabeled_df[unlabeled_df['joke'].apply(lambda x: isinstance(x, str))]
# unlabeled_df = unlabeled_df.head(1000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled_df.drop(columns=TARGET_COLUMNS, inplace=True)


In [6]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification

def predict_joke_ratings_bulk(
    jokes: list[str],
    model_path: str = "../traning_encoder/joke_classification_model",
    batch_size: int = 128,
    threshold: float = 0.5,
    device: str = None
) -> pd.DataFrame:
    """
    Predict binary ratings for multiple jokes in bulk.

    Args:
        jokes: List of joke strings.
        model_path: Path or checkpoint name of your saved model.
        batch_size: How many jokes to process per forward-pass.
        threshold: Probability threshold for converting to 0/1.
        device: 'cuda' or 'cpu'. If None, automatically picks cuda if available.

    Returns:
        DataFrame of shape (len(jokes), n_labels) with columns = label names.
    """
    # 1. Set up device
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # 2. Load tokenizer, config, id2label, and model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    config = AutoConfig.from_pretrained(model_path)
    id2label = config.id2label  # e.g. {0: "humor", 1: "offensiveness", 2: "sentiment"}
    model = AutoModelForSequenceClassification.from_pretrained(model_path, config=config)
    model.to(device)
    model.eval()

    all_preds = []
    # 3. Process in batches
    for i in tqdm(range(0, len(jokes), batch_size)):
        batch_texts = jokes[i : i + batch_size]
        # 3a. Tokenize
        encodings = tokenizer(
            batch_texts,
            truncation=True,
            padding="longest",
            max_length=MAX_LEN,
            return_tensors="pt"
        ).to(device)
        # 3b. Forward pass
        with torch.no_grad():
            outputs = model(**encodings)
            logits = outputs.logits  # shape (batch_size, n_labels)
        # 3c. Sigmoid → probs → binary
        probs = torch.sigmoid(logits).cpu().numpy()
        batch_preds = (probs > threshold).astype(int)
        all_preds.append(batch_preds)

    # 4. Concatenate and build DataFrame
    all_preds = np.vstack(all_preds)  # shape (len(jokes), n_labels)
    # Map columns via id2label in correct order
    columns = [id2label[i] for i in range(all_preds.shape[1])]
    return pd.DataFrame(all_preds, columns=columns)


# Suppose you have a DataFrame `df` with a 'joke' column:
jokes = unlabeled_df["joke"].tolist()
preds_df = predict_joke_ratings_bulk(jokes)
for col in preds_df.columns:
    unlabeled_df[col] = preds_df[col].values
prediction_df = unlabeled_df
# prediction_df = pd.concat([unlabeled_df, preds_df], axis=1)

prediction_df["source"] = "predicted"
labeled_df["source"] = "label"

df_combined = pd.concat([labeled_df, prediction_df], axis=0, ignore_index=True)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 4048/4048 [2:19:30<00:00,  2.07s/it]  


In [7]:
prediction_df.shape, labeled_df.shape, df_combined.shape

((518124, 7), (55278, 7), (573402, 7))

In [8]:
df_combined

Unnamed: 0,date,joke,score,humor,offensiveness,sentiment,source
0,2019-10-07 02:57:10,Anti-dandruff shampoo with sulfur causes hair ...,1.0,1.0,0.0,1.0,label
1,2015-03-23 17:59:44,I use to be addicted to the Hokie Pokie.... bu...,3.0,1.0,0.0,1.0,label
2,2016-03-24 13:09:05,A man goes into a job interview A man goes int...,11956.0,1.0,0.0,1.0,label
3,2015-02-20 02:35:30,What is the one food that diminishes a woman's...,76.0,1.0,0.0,0.0,label
4,2016-06-19 20:21:42,Rick Astley will let you borrow any movie from...,81.0,1.0,0.0,1.0,label
...,...,...,...,...,...,...,...
573397,2019-12-31 23:31:07,A German joke A German is driving his car in B...,16.0,1.0,0.0,1.0,predicted
573398,2019-12-31 23:37:28,My wife has asked me to help her with her diet...,4.0,1.0,0.0,1.0,predicted
573399,2019-12-31 23:45:39,Me arguing with my dad Me: I hate you motherfu...,2.0,0.0,0.0,1.0,predicted
573400,2019-12-31 23:54:35,"Early I know it's early, but a very happy new ...",21.0,1.0,0.0,1.0,predicted


In [9]:
df_combined.to_parquet("classified_data.parquet", index=False)

In [10]:
has_non_str = any(not isinstance(x, str) for x in jokes)
has_non_str

False

In [11]:
bad_indices = [i for i, x in enumerate(jokes) if not isinstance(x, str)]
bad_values  = [x for x in jokes           if not isinstance(x, str)]

print(bad_indices)  # → [1, 3, 4]
print(bad_values)   # → [None, 123, nan]

[]
[]
