In [None]:
import pandas as pd

# Read both CSVs and bring them onto a shared two-column schema: text, label.
df1 = (
    pd.read_csv("/content/test_data.csv")  # columns: text, sarcastic
    .rename(columns={"sarcastic": "label"})
    .loc[:, ["text", "label"]]
)
df2 = (
    pd.read_csv("/content/sarcasm_data.csv")  # columns: tweet, sarcastic, ...
    .rename(columns={"tweet": "text", "sarcastic": "label"})
    .loc[:, ["text", "label"]]
)

# Stack the two sources, discard rows missing either field, then make the
# labels integer-typed.
df = (
    pd.concat([df1, df2], ignore_index=True)
    .dropna(subset=["text", "label"])
    .astype({"label": int})
)


In [None]:
# Coerce text to stripped strings, then keep only rows longer than 3 characters.
df = (
    df.assign(text=df["text"].astype(str).str.strip())
    .loc[lambda frame: frame["text"].str.len() > 3]
)


In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/10/10 split: hold out 20% first, then cut that hold-out in half
# to obtain the validation and test sets.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df["label"],
)


In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification

# Pretrained checkpoint shared by the tokenizer and the classifier.
model_name = "bert-base-uncased"

# Two-label sequence classifier and its matching fast tokenizer.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = BertTokenizerFast.from_pretrained(model_name)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def encode_batch(texts, labels=None):
    """Tokenize a batch of texts into PyTorch tensors, optionally attaching labels.

    Args:
        texts: Iterable of strings; it is materialised with ``list`` before
            being handed to the notebook-level ``tokenizer``.
        labels: Optional iterable of integer class ids, one per text.

    Returns:
        The tokenizer output, padded to the longest text in the batch and
        truncated to 128 tokens. When ``labels`` is given, a ``"labels"``
        entry holding a ``torch.long`` tensor is added.
    """
    # Local import: the notebook's `import torch` lives in a later cell, so a
    # fresh-kernel call to this function would otherwise raise NameError.
    import torch

    enc = tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    if labels is not None:
        # list() iterates values positionally, which avoids label-based lookup
        # problems when `labels` is a pandas Series whose index is not 0..n-1
        # (as is the case after filtering / train_test_split).
        enc["labels"] = torch.tensor(list(labels), dtype=torch.long)
    return enc


In [None]:
from datasets import Dataset, DatasetDict

# preserve_index=False keeps the (shuffled, non-contiguous) pandas index from
# being stored as an extra column in each dataset.
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds   = Dataset.from_pandas(val_df, preserve_index=False)
test_ds  = Dataset.from_pandas(test_df, preserve_index=False)

def tokenize_fn(batch):
    """Tokenize the ``text`` field of a batch, padding/truncating to 128 tokens."""
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

# Tokenize each split exactly once (this cell previously contained the whole
# pipeline twice, so every split was mapped two times), then drop the raw text
# and expose the remaining columns as torch tensors.
encoded = DatasetDict({
    "train": train_ds.map(tokenize_fn, batched=True),
    "validation": val_ds.map(tokenize_fn, batched=True),
    "test": test_ds.map(tokenize_fn, batched=True),
}).remove_columns(["text"])
encoded.set_format("torch")


Map:   0%|          | 0/3892 [00:00<?, ? examples/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

Map:   0%|          | 0/3892 [00:00<?, ? examples/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

In [None]:
!pip install evaluate
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate # Import the evaluate library

metric = evaluate.load("f1")  # evaluate.load is used here instead of load_metric

def compute_metrics(eval_pred):
    """Return accuracy and F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax over the last axis of ``logits``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    accuracy = (predicted == labels).mean()
    f1_score = metric.compute(predictions=predicted, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1_score}

# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the checkpoint with the best validation F1 when training finishes.
training_args = TrainingArguments(
    output_dir="./checkpoints/bert-sarcasm",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Without this, an installed wandb triggers an interactive API-key prompt
    # that blocks "Run All"; the tuning cells below disable reporting the same way.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    # `tokenizer=` is deprecated on Trainer; `processing_class` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
# Final score on the held-out test split (displayed as the cell output).
trainer.evaluate(encoded["test"])

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33matiar1103[0m ([33matiar1103-brac-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.498974,0.784394,0.132231
2,No log,0.506496,0.786448,0.324675
3,0.460000,0.557902,0.765914,0.4




{'eval_loss': 0.520974338054657,
 'eval_accuracy': 0.7946611909650924,
 'eval_f1': 0.4444444444444444,
 'eval_runtime': 177.3303,
 'eval_samples_per_second': 2.746,
 'eval_steps_per_second': 0.09,
 'epoch': 3.0}

In [None]:
import torch

def predict_sarcasm(texts):
    """Classify texts with the notebook-level ``model`` and ``tokenizer``.

    Args:
        texts: A list of strings to classify (passed straight to the tokenizer).

    Returns:
        A ``(labels, probs)`` tuple of NumPy arrays: ``probs`` holds the softmax
        over the model's logits for each text, and ``labels`` is the argmax of
        ``probs`` along the last axis.
    """
    # Inference mode: turns dropout off so repeated calls give identical output
    # even if the model was last left in training mode.
    model.eval()
    enc = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
    # Inputs must live on the same device as the model (e.g. after GPU training).
    enc = enc.to(model.device)
    with torch.no_grad():
        outputs = model(**enc)
        probs = outputs.logits.softmax(dim=-1).cpu().numpy()
    labels = probs.argmax(axis=-1)
    return labels, probs

# Smoke test: run a single sentence through the classifier and show the
# predicted label alongside the class probabilities.
example = "Oh great, another Monday meeting at 7am."
label, probs = predict_sarcasm(texts=[example])
print(label, probs)


[1] [[0.13323194 0.86676806]]


In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side in one
# directory.
output_dir = "./sarcasm_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"Model and tokenizer saved to {output_dir}")

Model and tokenizer saved to ./sarcasm_model


## Hyperparameter Tuning

### Subtask:
Implement hyperparameter tuning for the `TrainingArguments` (e.g., learning rate, batch size, number of epochs) to potentially improve model performance using techniques like Grid Search or Randomized Search, perhaps with libraries like `Ray Tune` or `Optuna`.


**Reasoning**:
The subtask requires implementing hyperparameter tuning. I will use the `optuna` library, which is a suitable choice for this task, and define an objective function that encapsulates the training and evaluation logic, allowing Optuna to explore different hyperparameter combinations. Note that `optuna` must be installed before this cell can run.



In [None]:
import optuna
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate
import torch

def model_init():
    """Build a fresh two-label BERT classifier from the ``model_name`` checkpoint."""
    fresh_model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
    )
    return fresh_model

# compute_metrics is defined again here in case the earlier definition is no
# longer available in the kernel.
metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    """Return accuracy and F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax over the last axis of ``logits``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    accuracy = (predicted == labels).mean()
    f1_score = metric.compute(predictions=predicted, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1_score}

def objective(trial):
    """Optuna objective: fine-tune with sampled hyperparameters, return validation F1.

    Args:
        trial: Optuna trial used to sample the learning rate, training batch
            size, number of epochs and weight decay.

    Returns:
        The ``eval_f1`` value reported by ``trainer.evaluate`` on the
        validation split once training has finished.
    """
    # Search space for this trial.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)

    # Each trial writes to its own checkpoint directory.
    training_args = TrainingArguments(
        output_dir=f"./checkpoints/bert-sarcasm-trial-{trial.number}",
        eval_strategy="epoch",
        save_strategy="epoch",
        # Cap the checkpoints kept per trial so a multi-trial search does not
        # fill the disk; the best checkpoint is still retained for
        # load_best_model_at_end.
        save_total_limit=1,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=32, # Keep this constant for evaluation consistency
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none" # Disable logging to external platforms for trials
    )

    trainer = Trainer(
        model_init=model_init, # Fresh model for every trial
        args=training_args,
        train_dataset=encoded["train"],
        eval_dataset=encoded["validation"],
        # `tokenizer=` is deprecated on Trainer; `processing_class` is its replacement.
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Score the trial by F1 on the validation split.
    metrics = trainer.evaluate(encoded["validation"])
    return metrics["eval_f1"]


**Reasoning**:
The previous cell failed because the `optuna` library was not installed. I need to install `optuna` before it can be imported and used.



In [None]:
import optuna
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate
import torch

def model_init():
    """Build a fresh two-label BERT classifier from the ``model_name`` checkpoint."""
    fresh_model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
    )
    return fresh_model

# compute_metrics is defined again here in case the earlier definition is no
# longer available in the kernel.
metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    """Return accuracy and F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax over the last axis of ``logits``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    accuracy = (predicted == labels).mean()
    f1_score = metric.compute(predictions=predicted, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1_score}

def objective(trial):
    """Optuna objective: fine-tune with sampled hyperparameters, return validation F1.

    Args:
        trial: Optuna trial used to sample the learning rate, training batch
            size, number of epochs and weight decay.

    Returns:
        The ``eval_f1`` value reported by ``trainer.evaluate`` on the
        validation split once training has finished.
    """
    # Search space for this trial.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)

    # Each trial writes to its own checkpoint directory.
    training_args = TrainingArguments(
        output_dir=f"./checkpoints/bert-sarcasm-trial-{trial.number}",
        eval_strategy="epoch",
        save_strategy="epoch",
        # Cap the checkpoints kept per trial so a multi-trial search does not
        # fill the disk; the best checkpoint is still retained for
        # load_best_model_at_end.
        save_total_limit=1,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=32, # Keep this constant for evaluation consistency
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none" # Disable logging to external platforms for trials
    )

    trainer = Trainer(
        model_init=model_init, # Fresh model for every trial
        args=training_args,
        train_dataset=encoded["train"],
        eval_dataset=encoded["validation"],
        # `tokenizer=` is deprecated on Trainer; `processing_class` is its replacement.
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Score the trial by F1 on the validation split.
    metrics = trainer.evaluate(encoded["validation"])
    return metrics["eval_f1"]


**Reasoning**:
The error `ModuleNotFoundError: No module named 'optuna'` indicates that the `optuna` library is not installed. I need to install it before proceeding with the hyperparameter tuning setup.



In [None]:
import subprocess
import sys

# Install into the interpreter that runs this kernel. check=True raises
# CalledProcessError when pip fails, so the success message below is only
# printed if the installation actually succeeded (a `!` shell line would
# continue regardless of pip's exit status).
subprocess.run([sys.executable, "-m", "pip", "install", "optuna"], check=True)
print("Optuna installed successfully.")

Optuna installed successfully.


**Reasoning**:
Since Optuna has been successfully installed, I will now execute the code block that defines the `objective` function. This function is essential for the hyperparameter tuning process as it outlines how Optuna will evaluate different sets of hyperparameters.



In [None]:
!pip install optuna
import optuna
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate
import torch

def model_init():
    """Build a fresh two-label BERT classifier from the ``model_name`` checkpoint."""
    fresh_model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
    )
    return fresh_model

# compute_metrics is defined again here in case the earlier definition is no
# longer available in the kernel.
metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    """Return accuracy and F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax over the last axis of ``logits``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    accuracy = (predicted == labels).mean()
    f1_score = metric.compute(predictions=predicted, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1_score}

def objective(trial):
    """Optuna objective: fine-tune with sampled hyperparameters, return validation F1.

    Args:
        trial: Optuna trial used to sample the learning rate, training batch
            size, number of epochs and weight decay.

    Returns:
        The ``eval_f1`` value reported by ``trainer.evaluate`` on the
        validation split once training has finished.
    """
    # Search space for this trial.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)

    # Each trial writes to its own checkpoint directory.
    training_args = TrainingArguments(
        output_dir=f"./checkpoints/bert-sarcasm-trial-{trial.number}",
        eval_strategy="epoch",
        save_strategy="epoch",
        # Cap the checkpoints kept per trial so a multi-trial search does not
        # fill the disk; the best checkpoint is still retained for
        # load_best_model_at_end.
        save_total_limit=1,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=32, # Keep this constant for evaluation consistency
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none" # Disable logging to external platforms for trials
    )

    trainer = Trainer(
        model_init=model_init, # Fresh model for every trial
        args=training_args,
        train_dataset=encoded["train"],
        eval_dataset=encoded["validation"],
        # `tokenizer=` is deprecated on Trainer; `processing_class` is its replacement.
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Score the trial by F1 on the validation split.
    metrics = trainer.evaluate(encoded["validation"])
    return metrics["eval_f1"]


Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


**Reasoning**:
The `objective` function has been defined and Optuna is installed. Now I will create an Optuna study and run the optimization process to find the best hyperparameters. This will involve defining the number of trials and executing the study.



In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every run; an unseeded study explores a different set of trials each time.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5) # Run 5 trials for demonstration

# Report the best-scoring trial and the hyperparameters that produced it.
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-12-07 07:18:49,712] A new study created in memory with name: no-name-c3841d6d-871e-4142-84ef-843d88e049ef
  trainer = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.515984,0.780287,0.0




[I 2025-12-07 08:50:10,808] Trial 0 finished with value: 0.0 and parameters: {'learning_rate': 1.0576442688240266e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 1, 'weight_decay': 0.22032445499338968}. Best is trial 0 with value: 0.0.
  trainer = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.496644,0.786448,0.087719
2,No log,0.512503,0.792608,0.348387
3,0.460000,0.661178,0.776181,0.417112




[I 2025-12-07 13:13:25,982] Trial 1 finished with value: 0.41711229946524064 and parameters: {'learning_rate': 3.6433270858144685e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.27376258547490484}. Best is trial 1 with value: 0.41711229946524064.
  trainer = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
