### Embedding Model

In [1]:
from sentence_transformers import SentenceTransformer

# Hub identifier of the sentence-embedding checkpoint used as the encoder.
EMBEDDING_MODEL_NAME = "avsolatorio/GIST-small-Embedding-v0"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/68.0k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [2]:
# Fetch the sarcasm-headlines dataset repository into the current working directory.
!git clone https://github.com/rishabhmisra/News-Headlines-Dataset-For-Sarcasm-Detection.git

Cloning into 'News-Headlines-Dataset-For-Sarcasm-Detection'...
remote: Enumerating objects: 75, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 75 (delta 6), reused 0 (delta 0), pack-reused 62 (from 1)[K
Receiving objects: 100% (75/75), 3.65 MiB | 29.91 MiB/s, done.
Resolving deltas: 100% (35/35), done.


In [3]:
import pandas as pd
import json


# The dataset file holds one JSON object per line, so it is parsed line by line.
file_path = 'News-Headlines-Dataset-For-Sarcasm-Detection/Sarcasm_Headlines_Dataset.json'
# Pin the encoding: without it open() falls back to a platform-dependent default,
# which can mis-decode non-ASCII headlines.
with open(file_path, 'r', encoding='utf-8') as file:
    # Ignore stray blank lines instead of letting json.loads raise on them.
    data = [json.loads(line) for line in file if line.strip()]

df = pd.DataFrame(data)

In [4]:
# Preview the first rows to check the parsed columns.
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [5]:
# Count the examples per class to check the label balance.
df["is_sarcastic"].value_counts()

Unnamed: 0_level_0,count
is_sarcastic,Unnamed: 1_level_1
0,14985
1,13634


### Data splitting

In [6]:
from sklearn.model_selection import train_test_split

# 80/10/10 split: hold out 20% of the rows, then halve that holdout into
# validation and test. Both steps stratify on the label column.
train_data, temp_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["is_sarcastic"],
)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
    stratify=temp_data["is_sarcastic"],
)

### CPU or CUDA

In [7]:
import torch
# Query CUDA availability once and reuse the answer for both the report and
# the device choice.
cuda_available = torch.cuda.is_available()
if cuda_available:
    gpu_name = torch.cuda.get_device_name(0)
    device = 'cuda'
else:
    gpu_name = "No GPU detected"
    device = 'cpu'

print("CUDA Available:", cuda_available)
print("Device Name:", gpu_name)

CUDA Available: True
Device Name: Tesla T4


In [8]:
from torch.utils.data import DataLoader
from torch import nn
from torch.optim import AdamW
from sklearn.metrics import precision_recall_fscore_support
from sentence_transformers import SentenceTransformer, InputExample
import os
import zipfile

In [9]:
def prepare_data(df, text_column, label_column):
    """Convert a DataFrame into a list of single-text ``InputExample`` objects.

    Args:
        df: DataFrame holding one example per row.
        text_column: Name of the column containing the input text.
        label_column: Name of the column containing the label.

    Returns:
        A list with one ``InputExample`` per row, in row order. Each example's
        ``texts`` is a one-element list and its ``label`` is the row's label.
    """
    # Walk the two needed columns directly: ``iterrows`` would build a full
    # Series for every row, which is needlessly slow on large frames.
    return [
        InputExample(texts=[text], label=label)
        for text, label in zip(df[text_column], df[label_column])
    ]

# Build example lists for the training and validation splits from the
# headline text and its sarcasm label.
TEXT_COLUMN, LABEL_COLUMN = "headline", "is_sarcastic"
train_examples = prepare_data(train_data, TEXT_COLUMN, LABEL_COLUMN)
validation_examples = prepare_data(val_data, TEXT_COLUMN, LABEL_COLUMN)

In [10]:
def collate_fn(batch):
    """Turn a list of examples into a tokenized batch and a float label tensor.

    Args:
        batch: Sequence of examples exposing ``texts`` (only the first entry
            is used) and ``label``.

    Returns:
        A ``(tokenized, labels)`` pair: the tokenizer output for the batch
        texts as PyTorch tensors, and the labels as a ``torch.float`` tensor.
    """
    sentences = []
    targets = []
    for item in batch:
        sentences.append(item.texts[0])
        targets.append(item.label)

    # Padding and truncation are delegated to the tokenizer, capped at 512 tokens.
    encoded = model.tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    # Float labels are what the BCE-with-logits loss used for training expects.
    return encoded, torch.tensor(targets, dtype=torch.float)

# Mini-batches of 16 for both loaders; only the training loader shuffles.
BATCH_SIZE = 16
train_dataloader = DataLoader(
    train_examples,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
validation_dataloader = DataLoader(
    validation_examples,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

In [11]:
class ClassificationModel(nn.Module):
    """Sentence encoder followed by a linear head that emits one logit per input."""

    def __init__(self, base_model):
        """Store the encoder and size the linear head from its embedding width.

        Args:
            base_model: Encoder module exposing
                ``get_sentence_embedding_dimension()`` and returning a mapping
                with a ``"sentence_embedding"`` entry when called.
        """
        super().__init__()
        self.base_model = base_model
        embedding_dim = base_model.get_sentence_embedding_dimension()
        self.classifier = nn.Linear(embedding_dim, 1)

    def forward(self, tokenized_inputs):
        """Encode the tokenized batch and return the classifier's raw logits."""
        encoder_output = self.base_model(tokenized_inputs)
        return self.classifier(encoder_output["sentence_embedding"])

In [None]:
# Disabled LoRA experiment kept for reference: the whole snippet is wrapped in
# a string literal, so none of it is executed.
"""
from peft import LoraConfig, get_peft_model

target_modules = ['query', 'key', 'value']  # 'dense'

lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=target_modules
)

for layer in model[0].auto_model.encoder.layer:
    layer = get_peft_model(layer, lora_config)
"""

In [12]:
# Wrap the loaded encoder with the classification head, then move the whole
# module to the selected device.
classification_model = ClassificationModel(model)
classification_model = classification_model.to(device)

In [13]:
# Binary cross-entropy computed on raw logits; every parameter of the model
# (encoder and head) is handed to the optimizer.
LEARNING_RATE = 1e-5
loss_fn = nn.BCEWithLogitsLoss()
optimizer = AdamW(classification_model.parameters(), lr=LEARNING_RATE)

In [14]:
def _run_epoch(dataloader, training):
    """Run one full pass over ``dataloader`` and collect loss and predictions.

    Args:
        dataloader: Iterable yielding ``(tokenized_inputs, labels)`` batches.
        training: When True the model is put in train mode and the optimizer
            is stepped after every batch; when False the model is put in eval
            mode and no gradients are tracked.

    Returns:
        ``(mean_loss, predictions, labels)``: the loss averaged over batches,
        and two lists holding one predicted class / one label per example.
    """
    classification_model.train(training)
    total_loss = 0.0
    epoch_predictions, epoch_labels = [], []

    with torch.set_grad_enabled(training):
        for tokenized_inputs, labels in dataloader:
            tokenized_inputs = {key: val.to(device) for key, val in tokenized_inputs.items()}
            labels = labels.to(device)

            if training:
                optimizer.zero_grad()

            # squeeze(-1) removes only the trailing logit dimension. A bare
            # squeeze() would also drop the batch dimension for a batch of one
            # example, breaking the loss shape check and the extend() below.
            logits = classification_model(tokenized_inputs).squeeze(-1)
            loss = loss_fn(logits, labels)

            if training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            # Threshold the sigmoid probability at 0.5 to get the predicted class.
            predictions = (torch.sigmoid(logits) > 0.5).long()

            epoch_predictions.extend(predictions.cpu().numpy())
            epoch_labels.extend(labels.cpu().numpy())

    return total_loss / len(dataloader), epoch_predictions, epoch_labels


metrics_per_epoch = []
num_epochs = 20
checkpoint_dir = "model_checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in range(num_epochs):
    # Training phase
    train_loss, all_train_predictions, all_train_labels = _run_epoch(
        train_dataloader, training=True
    )
    # average=None keeps one precision/recall/F1 value per class.
    train_precision, train_recall, train_f1, train_support = precision_recall_fscore_support(
        all_train_labels, all_train_predictions, average=None, zero_division=0
    )

    # Validation phase
    val_loss, all_val_predictions, all_val_labels = _run_epoch(
        validation_dataloader, training=False
    )
    val_precision, val_recall, val_f1, val_support = precision_recall_fscore_support(
        all_val_labels, all_val_predictions, average=None, zero_division=0
    )

    # Save metrics for this epoch
    epoch_metrics = {
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "train_precision": train_precision,
        "train_recall": train_recall,
        "train_f1": train_f1,
        "val_precision": val_precision,
        "val_recall": val_recall,
        "val_f1": val_f1
    }
    metrics_per_epoch.append(epoch_metrics)

    # Three checkpoints per epoch: encoder only, classifier head only, and the
    # combined model (whose state dict covers both sub-modules).
    base_model_path = os.path.join(checkpoint_dir, f"epoch{epoch + 1}_gist_model.pth")
    torch.save(classification_model.base_model.state_dict(), base_model_path)

    classifier_path = os.path.join(checkpoint_dir, f"epoch{epoch + 1}_classifier_weights.pth")
    torch.save(classification_model.classifier.state_dict(), classifier_path)

    model_path = os.path.join(checkpoint_dir, f"epoch{epoch + 1}_classification_model.pth")
    torch.save(classification_model.state_dict(), model_path)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Training -> Loss: {train_loss}")
    print(f"  Precision: {train_precision}    Recall: {train_recall}    F1: {train_f1}")
    print(f"Validation -> Loss: {val_loss}")
    print(f"  Precision: {val_precision}    Recall: {val_recall}    F1: {val_f1}")
    print("============================================================================================================")

Epoch 1/10
Training -> Loss: 0.5103168432984129
  Precision: [0.86734953 0.84280816]    Recall: [0.85468802 0.8563308 ]    F1: [0.86097223 0.84951567]
Validation -> Loss: 0.4422040406885094
  Precision: [0.8878327  0.92367601]    Recall: [0.93462308 0.8701394 ]    F1: [0.91062723 0.8961088 ]
Epoch 2/10
Training -> Loss: 0.4080780850141553
  Precision: [0.92045455 0.91779684]    Recall: [0.92567568 0.91207481]    F1: [0.92305773 0.91492688]
Validation -> Loss: 0.3835649079117695
  Precision: [0.89885496 0.93333333]    Recall: [0.94262842 0.88334556]    F1: [0.92022143 0.90765172]
Epoch 3/10
Training -> Loss: 0.34530065547620226
  Precision: [0.93798579 0.93000915]    Recall: [0.93618619 0.93197029]    F1: [0.93708513 0.93098869]
Validation -> Loss: 0.3533397189565211
  Precision: [0.86901914 0.96134454]    Recall: [0.96931288 0.83932502]    F1: [0.91643015 0.89620055]
Epoch 4/10
Training -> Loss: 0.2925079349951674
  Precision: [0.9460357  0.94534562]    Recall: [0.95053387 0.94040524] 

In [17]:
# Tabulate the collected per-epoch metrics (one row per epoch) and write them
# to CSV without the index column.
METRICS_CSV_PATH = "headlines_gist_FT_epoch_metrics.csv"
metrics_df = pd.DataFrame(metrics_per_epoch)
metrics_df.to_csv(METRICS_CSV_PATH, index=False)

In [None]:
# Archive the checkpoint directory into a single zip file.
# NOTE(review): the absolute /content paths assume the notebook's working
# directory is /content, while checkpoints are written to the relative
# "model_checkpoints" directory — confirm before running elsewhere.
!zip -r /content/model_checkpoints.zip /content/model_checkpoints