In [None]:
from sentence_transformers import SentenceTransformer

# Hub identifier of the embedding backbone, named once so it is easy to swap.
MODEL_NAME = 'Alibaba-NLP/gte-base-en-v1.5'

# trust_remote_code=True allows the loader to use the custom Python files
# (configuration / modeling code) that ship with the checkpoint repository.
model = SentenceTransformer(MODEL_NAME, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/72.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [None]:
# Tally the element count of every parameter that still requires gradients.
trainable_params = 0
for param in model.parameters():
    if param.requires_grad:
        trainable_params += param.numel()

print(f"Trainable parameters: {trainable_params}")

Trainable parameters: 136776192


In [None]:
# Set the sequence-length limit and the lower-casing flag on the first module
# of the SentenceTransformer pipeline.
first_module = model[0]
first_module.max_seq_length = 512
first_module.do_lower_case = True

In [None]:
!kaggle datasets download coldn00ldes/sarcasm-corpus-v2oraby-et-al
!unzip sarcasm-corpus-v2oraby-et-al.zip

Dataset URL: https://www.kaggle.com/datasets/coldn00ldes/sarcasm-corpus-v2oraby-et-al
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading sarcasm-corpus-v2oraby-et-al.zip to /content
 96% 1.00M/1.04M [00:00<00:00, 1.09MB/s]
100% 1.04M/1.04M [00:00<00:00, 1.13MB/s]
Archive:  sarcasm-corpus-v2oraby-et-al.zip
  inflating: GEN-sarc-notsarc.csv    
  inflating: HYP-sarc-notsarc.csv    
  inflating: RQ-sarc-notsarc.csv     


In [None]:
import pandas as pd

# Numeric codes for the values expected in the `class` column.
class_mapping = {'notsarc': 0, 'sarc': 1}

# Read the GEN subset and attach the encoded label as an extra column.
# Values of `class` that are missing from the mapping end up as NaN.
df = pd.read_csv("GEN-sarc-notsarc.csv").assign(
    class_encoded=lambda frame: frame['class'].map(class_mapping)
)

In [None]:
import torch
# Query CUDA once and reuse the answer for both the report and the device choice.
cuda_available = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_available else "No GPU detected"

print("CUDA Available:", cuda_available)
print("Device Name:", gpu_name)

# Device string used when moving the model and the batches.
device = 'cuda' if cuda_available else 'cpu'

CUDA Available: True
Device Name: Tesla T4


In [None]:
from sklearn.model_selection import train_test_split

# First cut: keep 80 % for training and hold out 20 %, stratified on the encoded label.
train_data, temp_data = train_test_split(
    df,
    test_size=0.2,
    stratify=df['class_encoded'],
    random_state=42,
)

# Second cut: halve the hold-out into validation and test, again stratified,
# reusing the same seed.
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    stratify=temp_data['class_encoded'],
    random_state=42,
)

In [None]:
from torch.utils.data import DataLoader
from torch import nn
from torch.optim import AdamW
from sklearn.metrics import precision_recall_fscore_support
from sentence_transformers import SentenceTransformer, InputExample
import os
import zipfile

In [None]:
def prepare_data(df, text_column, label_column):
    """Wrap every row of ``df`` in an ``InputExample``.

    Args:
        df: DataFrame to convert.
        text_column: Name of the column whose value becomes the example's single text.
        label_column: Name of the column whose value becomes the example's label.

    Returns:
        A list holding one ``InputExample`` per row, in row order.
    """
    return [
        InputExample(texts=[record[text_column]], label=record[label_column])
        for _, record in df.iterrows()
    ]

# Convert the training and validation frames with identical column settings.
train_examples, validation_examples = (
    prepare_data(split_frame, text_column="text", label_column="class_encoded")
    for split_frame in (train_data, val_data)
)

In [None]:
def collate_fn(batch):
    """Turn a list of examples into tokenizer output plus a float label tensor.

    Args:
        batch: Sequence of examples, each exposing ``texts`` (the first entry is
            used) and ``label``.

    Returns:
        A ``(tokenized, labels)`` pair: the output of ``model.tokenizer`` for the
        batch texts (padded, truncated to 512 tokens, PyTorch tensors) and a
        ``torch.float`` tensor holding the labels in batch order.
    """
    batch_texts, batch_labels = [], []
    for example in batch:
        batch_texts.append(example.texts[0])
        batch_labels.append(example.label)

    encoded = model.tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    # Float labels because the loss used for training expects float targets.
    return encoded, torch.tensor(batch_labels, dtype=torch.float)

# Settings shared by both loaders; only the shuffling differs.
loader_options = dict(batch_size=16, collate_fn=collate_fn)

train_dataloader = DataLoader(train_examples, shuffle=True, **loader_options)
validation_dataloader = DataLoader(validation_examples, shuffle=False, **loader_options)

In [None]:
class ClassificationModel(nn.Module):
    """Sentence encoder followed by a linear head that emits a single logit.

    ``base_model`` must provide ``get_sentence_embedding_dimension()`` and, when
    called with tokenized inputs, return a mapping that contains the key
    ``"sentence_embedding"``.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        embedding_dim = base_model.get_sentence_embedding_dimension()
        # One output unit: the head produces one logit per embedded input.
        self.classifier = nn.Linear(embedding_dim, 1)

    def forward(self, tokenized_inputs):
        """Encode ``tokenized_inputs`` and return the classifier logits."""
        encoder_output = self.base_model(tokenized_inputs)
        return self.classifier(encoder_output["sentence_embedding"])

In [None]:
from peft import LoraConfig, get_peft_model

# Names of the sub-modules that receive LoRA adapters ("qkv_proj" is currently excluded).
target_modules = ["up_gate_proj", "down_proj"] # "qkv_proj"

lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=target_modules
)

# get_peft_model modifies the module it is given in place, so the encoder layers held
# by the model receive their adapters directly. The wrapper it returns is not needed
# here; assigning it to the loop variable would not change what the layer list contains.
# NOTE(review): only the encoder layers are handed to PEFT, so parameters outside them
# (e.g. embeddings) presumably stay trainable — confirm this is intended.
for layer in model[0].auto_model.encoder.layer:
    get_peft_model(layer, lora_config)

In [None]:
# Build the encoder + linear-head classifier, then move its weights to `device`.
classification_model = ClassificationModel(model)
classification_model = classification_model.to(device)

In [None]:
# Binary cross-entropy evaluated directly on raw logits.
loss_fn = nn.BCEWithLogitsLoss()

# AdamW over every parameter exposed by the classifier wrapper.
optimizer = AdamW(params=classification_model.parameters(), lr=1e-5)

In [None]:
# Fine-tuning loop: for every epoch run one training pass and one validation pass,
# record loss and per-class precision / recall / F1, and write checkpoints to disk.
metrics_per_epoch = []
num_epochs = 10
checkpoint_dir = "model_checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in range(num_epochs):
    # ---- Training phase ----
    classification_model.train()
    all_train_predictions, all_train_labels = [], []
    train_loss = 0.0
    val_loss = 0.0

    for batch in train_dataloader:
        tokenized_inputs, labels = batch
        tokenized_inputs = {key: val.to(device) for key, val in tokenized_inputs.items()}
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = classification_model(tokenized_inputs)
        # squeeze(-1) removes only the trailing logit dimension. A bare squeeze() would
        # also drop the batch dimension when the last batch holds a single example,
        # breaking both the loss (shape mismatch with `labels`) and the extend() below.
        loss = loss_fn(logits.squeeze(-1), labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        probabilities = torch.sigmoid(logits).squeeze(-1)
        predictions = (probabilities > 0.5).long()

        all_train_predictions.extend(predictions.cpu().numpy())
        all_train_labels.extend(labels.cpu().numpy())

    # Mean loss per batch; metrics are per-class arrays because average=None.
    train_loss /= len(train_dataloader)
    train_precision, train_recall, train_f1, train_support = precision_recall_fscore_support(
        all_train_labels, all_train_predictions, average=None, zero_division = 0
    )

    # ---- Validation phase (no gradient tracking, no weight updates) ----
    classification_model.eval()
    all_val_predictions, all_val_labels = [], []
    with torch.no_grad():
        for batch in validation_dataloader:
            tokenized_inputs, labels = batch
            tokenized_inputs = {key: val.to(device) for key, val in tokenized_inputs.items()}
            labels = labels.to(device)

            logits = classification_model(tokenized_inputs)
            # Same single-example safety as in the training pass.
            loss = loss_fn(logits.squeeze(-1), labels)
            val_loss += loss.item()

            probabilities = torch.sigmoid(logits).squeeze(-1)
            predictions = (probabilities > 0.5).long()

            all_val_predictions.extend(predictions.cpu().numpy())
            all_val_labels.extend(labels.cpu().numpy())

    val_loss /= len(validation_dataloader)
    val_precision, val_recall, val_f1, val_support = precision_recall_fscore_support(
        all_val_labels, all_val_predictions, average=None, zero_division=0
    )

    # Save metrics for this epoch
    epoch_metrics = {
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "train_precision": train_precision,
        "train_recall": train_recall,
        "train_f1": train_f1,
        "val_precision": val_precision,
        "val_recall": val_recall,
        "val_f1": val_f1
    }
    metrics_per_epoch.append(epoch_metrics)

    # Three checkpoints per epoch: encoder only, linear head only, and the full wrapper.
    # The full state dict already contains the other two, so this trades disk space
    # for the convenience of loading either part on its own.
    base_model_path = os.path.join(checkpoint_dir, f"epoch{epoch + 1}_gist_model.pth")
    torch.save(classification_model.base_model.state_dict(), base_model_path)

    classifier_path = os.path.join(checkpoint_dir, f"epoch{epoch + 1}_classifier_weights.pth")
    torch.save(classification_model.classifier.state_dict(), classifier_path)

    model_path = os.path.join(checkpoint_dir, f"epoch{epoch + 1}_classification_model.pth")
    torch.save(classification_model.state_dict(), model_path)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Training -> Loss: {train_loss}")
    print(f"  Precision: {train_precision}    Recall: {train_recall}    F1: {train_f1}")
    print(f"Validation -> Loss: {val_loss}")
    print(f"  Precision: {val_precision}    Recall: {val_recall}    F1: {val_f1}")
    print("============================================================================================================")

Epoch 1/10
Training -> Loss: 0.663796893018155
  Precision: [0.58990895 0.59186047]    Recall: [0.59624233 0.58550613]    F1: [0.59305873 0.58866615]
Validation -> Loss: 0.5927880050205603
  Precision: [0.69677419 0.67836257]    Recall: [0.66257669 0.71165644]    F1: [0.67924528 0.69461078]
Epoch 2/10
Training -> Loss: 0.5186541793536554
  Precision: [0.76340694 0.74925373]    Recall: [0.74233129 0.76993865]    F1: [0.75272162 0.75945537]
Validation -> Loss: 0.500372374203147
  Precision: [0.75213675 0.79401993]    Recall: [0.80981595 0.73312883]    F1: [0.77991137 0.76236045]
Epoch 3/10
Training -> Loss: 0.45561941269716605
  Precision: [0.78582317 0.78935185]    Recall: [0.79064417 0.7845092 ]    F1: [0.7882263  0.78692308]
Validation -> Loss: 0.47811049077568984
  Precision: [0.7592068  0.80602007]    Recall: [0.82208589 0.7392638 ]    F1: [0.78939617 0.7712    ]
Epoch 4/10
Training -> Loss: 0.43002888866545963
  Precision: [0.81235248 0.79693343]    Recall: [0.79179448 0.81710123] 

In [None]:
# One row per epoch; written without the index column.
metrics_df = pd.DataFrame(data=metrics_per_epoch)
metrics_df.to_csv(path_or_buf="epoch_metrics.csv", index=False)

In [None]:
!zip -r /content/model_checkpoints/epoch8_gist_model.zip /content/model_checkpoints/epoch8_gist_model
!zip -r /content/model_checkpoints/epoch18_gist_model.zip /content/model_checkpoints/epoch18_gist_model
!zip -r /content/model_checkpoints/epoch20_gist_model.zip /content/model_checkpoints/epoch20_gist_model