In [1]:
!pip install transformers datasets scikit-learn pandas torch

import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
import torch
import time



In [9]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the labelled corpus. The rest of the notebook reads a "text" column
# (tokenizer input) and a "humor" column (class label), so fail early and
# clearly if either one is absent.
df = pd.read_csv("dataset.csv")
missing_cols = {"text", "humor"} - set(df.columns)
if missing_cols:
    raise KeyError(f"dataset.csv is missing required column(s): {sorted(missing_cols)}")

# Cast the label column to integer class ids.
df["humor"] = df["humor"].astype(int)
print(df.head())

# Stratified 80/20 split with a fixed random_state so the split is repeatable
# and both parts keep the same label balance.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["humor"])

# preserve_index=False: after the shuffle the pandas index is no longer a plain
# range, and from_pandas would otherwise store it as an extra
# "__index_level_0__" column in each Dataset.
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

                                                text  humor
0  Joe biden rules out 2020 bid: 'guys, i'm not r...      0
1  Watch: darvish gave hitter whiplash with slow ...      0
2  What do you call a turtle without its shell? d...      1
3      5 reasons the 2016 election feels so personal      0
4  Pasco police shot mexican migrant from behind,...      0


In [10]:
# Tokenizer that matches the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_fn(batch):
    """Tokenize the "text" entries of a batch.

    Every sequence is truncated and padded to exactly 64 tokens, so all
    examples end up with the same length.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=64,
    )


# Tokenize each split in batches and rename the label column from "humor" to
# "labels" (the column name selected for the torch format below).
train_ds = train_ds.map(tokenize_fn, batched=True).rename_column("humor", "labels")
test_ds = test_ds.map(tokenize_fn, batched=True).rename_column("humor", "labels")

# Expose only the model inputs and labels, returned as torch tensors.
# set_format works in place, so the loop updates both datasets.
for split_ds in (train_ds, test_ds):
    split_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])




Map:   0%|          | 0/160000 [00:00<?, ? examples/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [11]:
# The classification head ('classifier.weight' / 'classifier.bias') is not part
# of the pretrained checkpoint and is freshly initialised at load time, so seed
# torch's global RNG first to make that initialisation repeatable across runs.
torch.manual_seed(42)

# Two output labels: humor vs. not humor. The model is moved to the selected device.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)

def compute_metrics(p: EvalPrediction):
    """Turn raw evaluation output into classification metrics.

    Args:
        p: Evaluation result whose ``predictions`` hold per-class scores and
            whose ``label_ids`` hold the reference labels.

    Returns:
        A dict with "accuracy", "f1", "precision" and "recall", each computed
        by the corresponding scikit-learn scorer with its default settings.
    """
    y_true = p.label_ids
    # The predicted class is the index of the highest score on the last axis.
    y_pred = p.predictions.argmax(-1)

    scorers = (
        ("accuracy", accuracy_score),
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    return {metric_name: scorer(y_true, y_pred) for metric_name, scorer in scorers}

# Training configuration. Evaluation and checkpointing both happen once per epoch.
training_args = TrainingArguments(
    output_dir="./bert-humor",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    # Mixed precision is only usable on a GPU. Enable it conditionally so the
    # notebook still runs when PyTorch falls back to the CPU instead of
    # failing while building the arguments.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# The held-out split doubles as the per-epoch evaluation set; compute_metrics
# supplies accuracy / F1 / precision / recall alongside the evaluation loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
# Quick sanity check of the accelerator PyTorch will use.
cuda_ready = torch.cuda.is_available()
device_label = torch.cuda.get_device_name(0) if cuda_ready else "CPU"
print("PyTorch sees device:", device_label)
print("CUDA available:", cuda_ready)


In [12]:
# Reset the CUDA peak-memory counter so the figure reported below covers this
# training run only, not earlier GPU work or a previous run of this cell.
cuda_available = torch.cuda.is_available()
if cuda_available:
    torch.cuda.reset_peak_memory_stats()

# perf_counter is a monotonic clock, so the measured duration cannot be skewed
# by system clock adjustments during a long training run.
start_time = time.perf_counter()
trainer.train()
end_time = time.perf_counter()

# Peak is read right after training (before the final evaluation); report 0 GB
# when no GPU is in use.
peak_gpu_memory_gb = torch.cuda.max_memory_allocated() / 1024**3 if cuda_available else 0.0

# Final evaluation on the held-out split.
results = trainer.evaluate()

print(f"Training time: {end_time - start_time:.2f} seconds")
print(f"Peak GPU memory usage: {peak_gpu_memory_gb:.2f} GB")
print("Performance metrics:", results)

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0555,0.040557,0.98825,0.988231,0.989867,0.9866
2,0.0168,0.052498,0.9888,0.988797,0.989093,0.9885
3,0.0001,0.067358,0.9894,0.989359,0.993198,0.98555


Training time: 2607.64 seconds
Peak GPU memory usage: 1.85 GB
Performance metrics: {'eval_loss': 0.06735849380493164, 'eval_accuracy': 0.9894, 'eval_f1': 0.9893590322742559, 'eval_precision': 0.9931976216869898, 'eval_recall': 0.98555, 'eval_runtime': 36.1856, 'eval_samples_per_second': 1105.411, 'eval_steps_per_second': 34.544, 'epoch': 3.0}
