Import Libraries and Define Helper Functions

In [17]:
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA device count:", torch.cuda.device_count())
    print("CUDA device name:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available.")


PyTorch version: 2.3.0+cu118
CUDA available: True
CUDA device count: 1
CUDA device name: NVIDIA GeForce RTX 2060


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import re
import nltk
from nltk.corpus import stopwords

# Ensure NLTK stopwords are downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = text.lower()
        text = ' '.join(word for word in text.split() if word not in stop_words)
        return text
    return ""


[nltk_data] Downloading package stopwords to C:\Users\Abdelrhman
[nltk_data]     Mersal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Load and Preprocess the Dataset

In [19]:
df = pd.read_csv('train-balanced-sarcasm.csv')
df['clean_comment'] = df['comment'].apply(clean_text)

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(df['clean_comment'], df['label'], test_size=0.2, random_state=42)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

# Tokenize the data
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=128)




config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Create Dataset Objects

In [20]:
class SarcasmDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SarcasmDataset(train_encodings, train_labels.tolist())
val_dataset = SarcasmDataset(val_encodings, val_labels.tolist())


Load the Model and Define Training Arguments

In [21]:
model = AutoModelForSequenceClassification.from_pretrained('huawei-noah/TinyBERT_General_4L_312D', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)


pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define a Compute Metrics Function

In [22]:
import numpy as np


def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='binary')
    acc = accuracy_score(labels, pred)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}


Initialize the Trainer and Train the Model

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()


  0%|          | 0/151626 [00:00<?, ?it/s]

{'loss': 0.6933, 'grad_norm': 0.487682580947876, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}
{'loss': 0.6925, 'grad_norm': 0.2390039563179016, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}
{'loss': 0.6929, 'grad_norm': 0.21623438596725464, 'learning_rate': 3e-06, 'epoch': 0.0}
{'loss': 0.6928, 'grad_norm': 0.3293416202068329, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}
{'loss': 0.693, 'grad_norm': 0.3480598032474518, 'learning_rate': 5e-06, 'epoch': 0.0}
{'loss': 0.693, 'grad_norm': 0.9209429621696472, 'learning_rate': 6e-06, 'epoch': 0.0}
{'loss': 0.6935, 'grad_norm': 0.5342502593994141, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.0}
{'loss': 0.6933, 'grad_norm': 0.32238471508026123, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.0}
{'loss': 0.6935, 'grad_norm': 0.3651547133922577, 'learning_rate': 9e-06, 'epoch': 0.0}
{'loss': 0.6929, 'grad_norm': 0.25225040316581726, 'learning_rate': 1e-05, 'epoch': 0.0}
{'loss': 0.6933, 'grad_norm': 0.367494

  0%|          | 0/12636 [00:00<?, ?it/s]

{'eval_loss': 0.5743440389633179, 'eval_accuracy': 0.7033131189220739, 'eval_f1': 0.6748875277792834, 'eval_precision': 0.7462480820866897, 'eval_recall': 0.615983614667643, 'eval_runtime': 133.9049, 'eval_samples_per_second': 1509.773, 'eval_steps_per_second': 94.365, 'epoch': 1.0}
{'loss': 0.593, 'grad_norm': 3.5430874824523926, 'learning_rate': 3.3440969786800416e-05, 'epoch': 1.0}
{'loss': 0.4634, 'grad_norm': 2.095125913619995, 'learning_rate': 3.343766128925532e-05, 'epoch': 1.0}
{'loss': 0.6285, 'grad_norm': 3.6997225284576416, 'learning_rate': 3.343435279171023e-05, 'epoch': 1.0}
{'loss': 0.5484, 'grad_norm': 3.2135515213012695, 'learning_rate': 3.3431044294165136e-05, 'epoch': 1.0}
{'loss': 0.5321, 'grad_norm': 2.3526248931884766, 'learning_rate': 3.342773579662004e-05, 'epoch': 1.0}
{'loss': 0.5222, 'grad_norm': 3.798366069793701, 'learning_rate': 3.3424427299074945e-05, 'epoch': 1.0}
{'loss': 0.4831, 'grad_norm': 3.3528060913085938, 'learning_rate': 3.342111880152985e-05, 'e

  0%|          | 0/12636 [00:00<?, ?it/s]

{'eval_loss': 0.5697119235992432, 'eval_accuracy': 0.7088086028313366, 'eval_f1': 0.6872944963533892, 'eval_precision': 0.7419802502551869, 'eval_recall': 0.6401163596065937, 'eval_runtime': 132.6489, 'eval_samples_per_second': 1524.069, 'eval_steps_per_second': 95.259, 'epoch': 2.0}
{'loss': 0.5173, 'grad_norm': 2.9566314220428467, 'learning_rate': 1.671982319389119e-05, 'epoch': 2.0}
{'loss': 0.6248, 'grad_norm': 3.5906755924224854, 'learning_rate': 1.6716514696346098e-05, 'epoch': 2.0}
{'loss': 0.5501, 'grad_norm': 4.513851642608643, 'learning_rate': 1.6713206198801002e-05, 'epoch': 2.0}
{'loss': 0.5307, 'grad_norm': 4.85325813293457, 'learning_rate': 1.6709897701255906e-05, 'epoch': 2.0}
{'loss': 0.5051, 'grad_norm': 3.775388240814209, 'learning_rate': 1.670658920371081e-05, 'epoch': 2.0}
{'loss': 0.6393, 'grad_norm': 6.087200164794922, 'learning_rate': 1.6703280706165715e-05, 'epoch': 2.0}
{'loss': 0.405, 'grad_norm': 2.239900827407837, 'learning_rate': 1.669997220862062e-05, 'epo

  0%|          | 0/12636 [00:00<?, ?it/s]

{'eval_loss': 0.5837897062301636, 'eval_accuracy': 0.7084326741390738, 'eval_f1': 0.7002344421446625, 'eval_precision': 0.7203649642674033, 'eval_recall': 0.6811984247917202, 'eval_runtime': 133.171, 'eval_samples_per_second': 1518.094, 'eval_steps_per_second': 94.886, 'epoch': 3.0}
{'train_runtime': 6216.9345, 'train_samples_per_second': 390.221, 'train_steps_per_second': 24.389, 'train_loss': 0.5497450465702396, 'epoch': 3.0}


TrainOutput(global_step=151626, training_loss=0.5497450465702396, metrics={'train_runtime': 6216.9345, 'train_samples_per_second': 390.221, 'train_steps_per_second': 24.389, 'total_flos': 8696529514874880.0, 'train_loss': 0.5497450465702396, 'epoch': 3.0})

Evaluate the Model

In [24]:
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")


  0%|          | 0/12636 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.5837897062301636, 'eval_accuracy': 0.7084326741390738, 'eval_f1': 0.7002344421446625, 'eval_precision': 0.7203649642674033, 'eval_recall': 0.6811984247917202, 'eval_runtime': 133.9851, 'eval_samples_per_second': 1508.869, 'eval_steps_per_second': 94.309, 'epoch': 3.0}


Save Models

In [25]:
model.save_pretrained('tinybert_sarcasm_detector')
tokenizer.save_pretrained('tinybert_sarcasm_detector')


('tinybert_sarcasm_detector\\tokenizer_config.json',
 'tinybert_sarcasm_detector\\special_tokens_map.json',
 'tinybert_sarcasm_detector\\vocab.txt',
 'tinybert_sarcasm_detector\\added_tokens.json',
 'tinybert_sarcasm_detector\\tokenizer.json')