In [19]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding checkpoint that the rest of the notebook fine-tunes.
# NOTE(review): trust_remote_code=True permits running model code shipped with the
# checkpoint repository; consider pinning a specific revision — TODO confirm policy.
model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)

In [20]:
# Tally the elements of every parameter that is currently flagged for gradient updates.
trainable_params = sum(
    param.numel()
    for param in model.parameters()
    if param.requires_grad
)
print(f"Trainable parameters: {trainable_params}")

Trainable parameters: 136776192


In [21]:
# Set the sequence-length and lower-casing attributes on the first module of the pipeline.
# NOTE(review): collate_fn calls model.tokenizer(...) directly with its own max_length=512,
# so confirm whether these two attributes are consulted on that code path.
transformer_module = model[0]
transformer_module.max_seq_length = 512
transformer_module.do_lower_case = True

In [22]:
# Fetch the sarcasm-headlines dataset repository into the current working directory.
# NOTE(review): git refuses to clone into an existing non-empty directory, so a re-run of
# this cell will print an error instead of refreshing the data — confirm this is acceptable.
!git clone https://github.com/rishabhmisra/News-Headlines-Dataset-For-Sarcasm-Detection.git

Cloning into 'News-Headlines-Dataset-For-Sarcasm-Detection'...
remote: Enumerating objects: 75, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 75 (delta 6), reused 0 (delta 0), pack-reused 62 (from 1)[K
Receiving objects: 100% (75/75), 3.65 MiB | 28.76 MiB/s, done.
Resolving deltas: 100% (35/35), done.


In [23]:
import pandas as pd
import json


# Location of the headlines file inside the cloned dataset repository.
file_path = 'News-Headlines-Dataset-For-Sarcasm-Detection/Sarcasm_Headlines_Dataset.json'

# The file is parsed as one JSON document per line.
# - encoding is pinned to UTF-8 so decoding does not depend on the platform locale;
# - whitespace-only lines (e.g. a trailing blank line) are skipped instead of
#   crashing json.loads with a JSONDecodeError.
with open(file_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# One row per parsed record; columns come from the record keys.
df = pd.DataFrame(data)

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, then split that hold-out in half into validation and test.
# Both splits are stratified on the 'is_sarcastic' column and use a fixed random_state.
train_data, temp_data = train_test_split(
    df,
    test_size=0.2,
    stratify=df['is_sarcastic'],
    random_state=42,
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    stratify=temp_data['is_sarcastic'],
    random_state=42,
)

In [25]:
import torch
# Report whether CUDA is visible, then choose the device string used for the model and tensors.
has_cuda = torch.cuda.is_available()
print("CUDA Available:", has_cuda)

if has_cuda:
    gpu_name = torch.cuda.get_device_name(0)
else:
    gpu_name = "No GPU detected"
print("Device Name:", gpu_name)

device = 'cuda' if has_cuda else 'cpu'

CUDA Available: True
Device Name: Tesla T4


In [26]:
import torch
from torch.utils.data import DataLoader
from torch import nn
from torch.optim import AdamW

from sklearn.metrics import precision_recall_fscore_support
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer, InputExample

import os
import zipfile

from transformers import AutoTokenizer, AutoModel
from peft import PeftModel, PrefixTuningConfig
import torch
import torch.nn as nn
import torch.optim as optim

In [30]:
def prepare_data(df, text_column, label_column):
    """Convert DataFrame rows into single-text ``InputExample`` objects.

    Args:
        df: DataFrame holding the text and label columns.
        text_column: Name of the column whose value becomes the only entry of ``texts``.
        label_column: Name of the column whose value becomes ``label``.

    Returns:
        A list with one ``InputExample`` per row, in row order (empty for an empty frame).
    """
    # Walk the two columns directly instead of using ``iterrows()``: building a Series
    # per row is slow, and it coerces every row to a single common dtype, which turns
    # integer labels into floats when all columns of the frame are numeric.
    return [
        InputExample(texts=[text], label=label)
        for text, label in zip(df[text_column], df[label_column])
    ]

# Wrap the training and validation splits as example lists:
# the 'headline' column supplies the text, 'is_sarcastic' supplies the label.
train_examples = prepare_data(
    train_data,
    text_column="headline",
    label_column="is_sarcastic",
)
validation_examples = prepare_data(
    val_data,
    text_column="headline",
    label_column="is_sarcastic",
)

In [31]:
def collate_fn(batch):
    """Collate a list of examples into ``(tokenized_inputs, labels)``.

    Each example contributes its first text (``texts[0]``) and its ``label``.
    The texts are passed to ``model.tokenizer`` with padding and truncation
    enabled, ``max_length=512`` and PyTorch tensors requested; the labels are
    returned as a float tensor.
    """
    sentences = []
    targets = []
    for item in batch:
        sentences.append(item.texts[0])
        targets.append(item.label)

    encoded = model.tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    target_tensor = torch.tensor(targets, dtype=torch.float)

    return encoded, target_tensor

# Batches of 16 for both loaders; only the training loader shuffles its examples.
train_dataloader = DataLoader(
    train_examples,
    shuffle=True,
    batch_size=16,
    collate_fn=collate_fn,
)
validation_dataloader = DataLoader(
    validation_examples,
    shuffle=False,
    batch_size=16,
    collate_fn=collate_fn,
)

In [32]:
# Grab the underlying transformer held by the first SentenceTransformer module.
base_transformer = model._first_module().auto_model

# Prefix-tuning configuration with 20 virtual tokens; the encoder hidden size is
# read from the transformer's own config.
prefix_config = PrefixTuningConfig(
    task_type=None,
    num_virtual_tokens=20,
    encoder_hidden_size=base_transformer.config.hidden_size,
)

# Wrap the transformer with PEFT and put the wrapped model back into the pipeline.
# NOTE(review): with task_type=None and the generic PeftModel wrapper, the forward pass
# appears to delegate straight to the base model without injecting the learned prefix
# (the task-specific wrappers such as PeftModelForFeatureExtraction do the injection).
# If so, the prefix parameters never receive gradients and only the classifier head is
# actually trained — TODO confirm against the PEFT documentation for the installed version.
# NOTE(review): this cell is not idempotent — running it twice would wrap an
# already-wrapped model; restart the kernel before re-running.
peft_model = PeftModel(base_transformer, prefix_config)
model._first_module().auto_model = peft_model

In [33]:
# Re-count the gradient-enabled parameters now that the transformer has been wrapped.
trainable_params = sum(
    param.numel()
    for param in model.parameters()
    if param.requires_grad
)
print(f"Trainable parameters: {trainable_params}")

Trainable parameters: 368640


In [34]:
# Show the module layout of the pipeline after the transformer has been wrapped.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: PeftModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [35]:
class ClassificationModel(nn.Module):
    """Sentence encoder followed by a single-output linear layer.

    Args:
        base_model: Encoder module. It must expose
            ``get_sentence_embedding_dimension()`` and, when called with the
            tokenized inputs, return a mapping containing ``"sentence_embedding"``.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        embedding_dim = base_model.get_sentence_embedding_dimension()
        # One output unit: the forward pass yields a single logit per input sentence.
        self.classifier = nn.Linear(embedding_dim, 1)

    def forward(self, tokenized_inputs):
        """Return the classifier logits for the given tokenized inputs."""
        encoder_output = self.base_model(tokenized_inputs)
        sentence_embedding = encoder_output["sentence_embedding"]
        return self.classifier(sentence_embedding)

In [36]:
# Build the classifier around the embedding pipeline, then move it to the selected device.
classification_model = ClassificationModel(model)
classification_model = classification_model.to(device)

In [37]:
# Hand the optimizer only the parameters that are flagged as trainable.
# The selection is materialised as a list rather than a one-shot ``filter`` iterator:
# AdamW consumes the iterable, so an iterator would be empty for any later use
# (inspection, counting, gradient clipping), silently operating on nothing.
trainable_params = [
    param for param in classification_model.parameters() if param.requires_grad
]
optimizer = AdamW(trainable_params, lr=0.0001)

# Binary cross-entropy computed directly on raw logits (the model outputs one logit per example).
loss_fn = nn.BCEWithLogitsLoss()

In [38]:
# Training driver: for each epoch, run one optimisation pass over the training loader,
# one no-grad pass over the validation loader, record loss and per-class
# precision/recall/F1 for both, and write three checkpoint files.
metrics_per_epoch = []
num_epochs = 20
checkpoint_dir = "model_checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in range(num_epochs):
    # ---- Training phase ----
    classification_model.train()
    all_train_predictions, all_train_labels = [], []
    # Loss accumulators are reset every epoch and averaged over the number of batches below.
    train_loss = 0.0
    val_loss = 0.0

    for batch in train_dataloader:
        tokenized_inputs, labels = batch
        tokenized_inputs = {key: val.to(device) for key, val in tokenized_inputs.items()}
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = classification_model(tokenized_inputs)
        # squeeze(-1) removes only the trailing size-1 logit dimension. An argument-less
        # squeeze() would also collapse the batch dimension when a batch holds a single
        # example, producing a 0-d tensor that no longer matches `labels` and that
        # cannot be iterated by list.extend() below.
        loss = loss_fn(logits.squeeze(-1), labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        probabilities = torch.sigmoid(logits).squeeze(-1)
        predictions = (probabilities > 0.5).long()

        all_train_predictions.extend(predictions.cpu().numpy())
        all_train_labels.extend(labels.cpu().numpy())

    train_loss /= len(train_dataloader)
    # average=None -> one value per class for each metric.
    train_precision, train_recall, train_f1, train_support = precision_recall_fscore_support(
        all_train_labels, all_train_predictions, average=None, zero_division = 0
    )

    # ---- Validation phase ----
    classification_model.eval()
    all_val_predictions, all_val_labels = [], []
    with torch.no_grad():
        for batch in validation_dataloader:
            tokenized_inputs, labels = batch
            tokenized_inputs = {key: val.to(device) for key, val in tokenized_inputs.items()}
            labels = labels.to(device)

            logits = classification_model(tokenized_inputs)
            # Same shape handling as in the training phase (safe for single-example batches).
            loss = loss_fn(logits.squeeze(-1), labels)
            val_loss += loss.item()

            probabilities = torch.sigmoid(logits).squeeze(-1)
            predictions = (probabilities > 0.5).long()

            all_val_predictions.extend(predictions.cpu().numpy())
            all_val_labels.extend(labels.cpu().numpy())

    val_loss /= len(validation_dataloader)
    val_precision, val_recall, val_f1, val_support = precision_recall_fscore_support(
        all_val_labels, all_val_predictions, average=None, zero_division=0
    )

    # Save metrics for this epoch (the precision/recall/F1 entries are per-class arrays).
    epoch_metrics = {
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "train_precision": train_precision,
        "train_recall": train_recall,
        "train_f1": train_f1,
        "val_precision": val_precision,
        "val_recall": val_recall,
        "val_f1": val_f1
    }
    metrics_per_epoch.append(epoch_metrics)

    # Checkpoints: the wrapped encoder, the classifier head, and the combined model.
    # NOTE(review): the first two are subsets of the third, and all three are written
    # every epoch — confirm the disk usage is intended.
    base_model_path = os.path.join(checkpoint_dir, f"epoch{epoch + 1}_gist_model.pth")
    torch.save(classification_model.base_model.state_dict(), base_model_path)

    classifier_path = os.path.join(checkpoint_dir, f"epoch{epoch + 1}_classifier_weights.pth")
    torch.save(classification_model.classifier.state_dict(), classifier_path)

    model_path = os.path.join(checkpoint_dir, f"epoch{epoch + 1}_classification_model.pth")
    torch.save(classification_model.state_dict(), model_path)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Training -> Loss: {train_loss}")
    print(f"  Precision: {train_precision}    Recall: {train_recall}    F1: {train_f1}")
    print(f"Validation -> Loss: {val_loss}")
    print(f"  Precision: {val_precision}    Recall: {val_recall}    F1: {val_f1}")
    print("============================================================================================================")

Epoch 1/20
Training -> Loss: 0.5712124103020322
  Precision: [0.7205448  0.70056972]    Recall: [0.7325659  0.68772348]    F1: [0.72650563 0.69408717]
Validation -> Loss: 0.48074808616877934
  Precision: [0.78719276 0.78571429]    Recall: [0.81187458 0.75862069]    F1: [0.79934319 0.77192982]
Epoch 2/20
Training -> Loss: 0.46814678596875453
  Precision: [0.79928405 0.7806671 ]    Recall: [0.80088422 0.7789493 ]    F1: [0.80008333 0.77980725]
Validation -> Loss: 0.4310585240412025
  Precision: [0.79848389 0.81626271]    Recall: [0.84322882 0.76595745]    F1: [0.82024659 0.79031037]
Epoch 3/20
Training -> Loss: 0.4344083760207054
  Precision: [0.81420307 0.79239606]    Recall: [0.81006006 0.79682773]    F1: [0.81212628 0.79460571]
Validation -> Loss: 0.4095871570057043
  Precision: [0.80805538 0.83110762]    Recall: [0.85657105 0.77622891]    F1: [0.83160622 0.80273141]
Epoch 4/20
Training -> Loss: 0.416644605054962
  Precision: [0.82348523 0.80564464]    Recall: [0.82307307 0.80608783] 

In [39]:
# Persist the per-epoch metrics table without the index column.
# NOTE(review): the precision/recall/F1 columns hold per-class arrays, so the CSV will
# contain their string form — confirm downstream readers expect that.
metrics_csv_path = "gte_headlines_prompt_epoch_metrics.csv"
metrics_df = pd.DataFrame(metrics_per_epoch)
metrics_df.to_csv(metrics_csv_path, index=False)