### Installing and importing the required modules

In [None]:
!pip install evaluate bitsandbytes

In [None]:
import os
import torch
import random
import kagglehub
import numpy as np
import pandas as pd
from evaluate import load
from typing import Dict, Any
from datasets import Dataset
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, TrainingArguments, Trainer

### Constants, hyperparameters and model configurations

In [None]:
# Fraction of the samples held out for the test split
test_size = 0.2

# Maximum number of tokens per input sequence
max_length = 128

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hugging Face model ID of the pretrained checkpoint
model_id = "distilbert-base-uncased"

# Path to save the trained model to
model_path = "/content/drive/MyDrive/Colab Notebooks/FineTuningLLM/saved_models/spam_mails_classifier"

In [None]:
# Report which device (CUDA GPU or CPU) was selected for this run
print(f"Detected device: {device}")

### Data loading

In [None]:
# Download the "venky73/spam-mails-dataset" dataset through kagglehub;
# `path` holds the value returned by the download call and is used below
# as the directory that contains the CSV file
path = kagglehub.dataset_download("venky73/spam-mails-dataset")

In [None]:
# Read "spam_ham_dataset.csv" from the downloaded dataset directory into a pandas DataFrame
dataset = pd.read_csv(os.path.join(path, "spam_ham_dataset.csv"))

In [None]:
# Drop, in place, the rows that contain null values
dataset.dropna(inplace=True)

In [None]:
# Preview the first rows of the DataFrame
dataset.head()

### Tokenizer

In [None]:
# Load the pretrained tokenizer that belongs to the checkpoint named by model_id
tokenizer = AutoTokenizer.from_pretrained(model_id)

### Preprocess data

In [None]:
# Instantiate the label encoder
label_encoder = LabelEncoder()

# Encode the values of the "label" column as integer class IDs.
# The column is replaced with a plain assignment instead of
# `dataset.loc[:, "label"] = ...`: recent pandas versions write `.loc[:, col]`
# in place and keep the column's existing dtype, whereas a plain assignment
# swaps in the encoded array so the labels end up with an integer dtype.
dataset["label"] = label_encoder.fit_transform(dataset["label"])

# Extract and print the total number of classes
num_classes = len(label_encoder.classes_)
print(f"Total number of classes: {num_classes}")

In [None]:
# Wrap the pandas DataFrame in a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(dataset)

# Split into a training part and a held-out test part
splits = hf_dataset.train_test_split(test_size=test_size)
train_dataset = splits["train"]
test_dataset = splits["test"]

# Report the size of each split
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of test samples: {len(test_dataset)}")

In [None]:
# Preprocess the dataset
def preprocess(examples: Dict[str, Any]) -> Dict[str, Any]:
    """Tokenize the "text" entry of a batch of examples.

    Every sequence is truncated or padded to exactly `max_length` tokens.

    Args:
        examples: Mapping of dataset columns; only the "text" entry is read.

    Returns:
        The output of the tokenizer for the given texts.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Apply the tokenization to both splits in batched mode
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(preprocess, batched=True)
    for split in (train_dataset, test_dataset)
)

# Display the sequence length of the first tokenized training sample
first_sample = tokenized_train_dataset[0]
print(f"Sequence length: {len(first_sample['input_ids'])}")

In [None]:
# Pick a random tokenized training sample and decode its token IDs back to text
print(tokenizer.decode(random.choice(tokenized_train_dataset)['input_ids']))

### Building the model

In [None]:
# Load the pretrained checkpoint as a sequence-classification model
# configured with one label per class found by the label encoder
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels = num_classes
)

In [None]:
# Move the model to the target device
# (the trailing semicolon suppresses the notebook's output of the returned model)
model.to(device);

In [None]:
# Display the model (the notebook renders the representation of the object)
model

### Training the model

In [None]:
# Load the "accuracy" metric through the `load` function of the evaluate library
accuracy_metric = load("accuracy")

# Define a custom function to compute the metrics
def compute_metrics(eval_pred: Any) -> Dict[str, Any]:
    """Compute the accuracy of the model predictions.

    Args:
        eval_pred: Pair-like object that unpacks into `(logits, labels)`.

    Returns:
        The value returned by `accuracy_metric.compute` for the predicted
        class IDs and the reference labels.
    """
    # Unpack the logits and the labels from the output of the model
    logits, labels = eval_pred

    # The predicted class of each sample is the index of its largest logit
    predictions = np.argmax(logits, axis=-1)

    # Compute and return the accuracy
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
# Define the training arguments: evaluate and checkpoint once per epoch, keep
# at most two checkpoints, and reload the checkpoint with the best accuracy
# when training ends
training_args = TrainingArguments(
    output_dir = "./output",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    # NOTE(review): 3e-4 is well above the learning rates commonly used when
    # fine-tuning all weights of a BERT-style model — confirm it is intended
    learning_rate = 3e-4,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    num_train_epochs = 20,
    weight_decay = 0.01,
    logging_dir = "./logs",
    logging_strategy = "steps",
    logging_steps = 10,
    save_total_limit = 2,
    load_best_model_at_end = True,
    metric_for_best_model = "accuracy",
    greater_is_better = True,
    report_to = "none",
    # Mixed-precision (fp16) training needs a GPU, so it is only enabled when
    # CUDA is available; this keeps the CPU fallback selected for `device` usable
    fp16 = torch.cuda.is_available()
)

In [None]:
# Instantiate the trainer with the model, the training arguments, the tokenized
# train/test splits and the accuracy callback used at evaluation time
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_test_dataset,
    compute_metrics = compute_metrics
)

# Train the model
trainer.train()

### Save the model

In [None]:
# Save the fine-tuned model to the destination path
# NOTE(review): model_path points into Google Drive, but no drive.mount(...)
# call is visible in this file — confirm the drive is mounted before saving
model.save_pretrained(model_path)

### Load the fine-tuned model

In [None]:
# Clear the GPU memory cache before the model is loaded again below
torch.cuda.empty_cache()

In [None]:
# Define the quantization configurations of the model (only for CUDA devices):
# 4-bit NF4 weights with double quantization and bfloat16 compute.
# Without CUDA the configuration is None, so the fine-tuned model is loaded
# unquantized and the CPU fallback selected for `device` keeps working.
if torch.cuda.is_available():
    quantization_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = 'nf4',
        bnb_4bit_compute_dtype = torch.bfloat16,
        bnb_4bit_use_double_quant = True
    )
else:
    quantization_config = None

In [None]:
# Load the fine-tuned model back from the directory it was saved to,
# applying the quantization configuration defined above
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    low_cpu_mem_usage = True,
    quantization_config = quantization_config
)

In [None]:
# Move the fine-tuned model to the target device.
# A bitsandbytes-quantized model is already placed on the GPU while it is
# loaded, and calling `.to()` on it raises an error with older
# bitsandbytes/transformers versions, so the move is skipped in that case.
if not getattr(model, "is_quantized", False):
    model.to(device)

### Inference

In [None]:
# Tokenizer settings: truncate/pad every sample to max_length and return PyTorch tensors
tokenizer_options = {
    "truncation": True,
    "padding": "max_length",
    "max_length": max_length,
    "return_tensors": "pt",
}

# Tokenize a handful of hand-written sample e-mails
encoded_samples = tokenizer(
    [
        """
        Subject: urgent action required ; account suspension notice
        This is a final warning regarding your account ending in #19872.
        Please override your account settings to avoid deactivation by clicking the secure link provided. Failure to act within 24 hours will result in the suspension of services.
        """,
        """
        Subject: exclusive investment opportunity ; guaranteed profits
        Dear Customer,
        We’ve identified a high-yield opportunity in cryptocurrency trading. To override your financial status, deposit $500 to start earning 300% daily profits. This offer is available for a limited time only. Act now!
        """,
        """
        Subject: overdue payment ; meter #892134
        Please note that your payment for account #892134 is overdue.
        To avoid service interruption, override the pending charges by clicking here and submitting your details. Our records show this must be resolved within 12 hours.
        """,
        """
        Subject: quarterly reporting update ; meter # : 772839
        As part of our quarterly review, please override the system to include the corrected readings from meter #772839. Forward the updated numbers to the finance team before 5 PM today for accurate reporting.
        """,
        """
        Subject: project progress review ; data consolidation
        Hi Team,
        This is a follow-up to Monday’s meeting regarding data consolidation for the project. Kindly override any outdated entries in the shared dashboard with the updated metrics shared earlier. Let’s finalize by EOD for the client review.
        """
    ],
    **tokenizer_options
)

# Move the encoded batch to the target device
inputs = encoded_samples.to(device)

In [None]:
# Run the forward pass without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

    # Class probabilities (softmax over the last dimension) and the
    # index of the highest-scoring class for each sample
    probabilities = logits.softmax(dim=-1)
    predictions = logits.argmax(dim=-1)

# Bring both results back to the host as NumPy arrays
probabilities = probabilities.cpu().numpy()
predictions = predictions.cpu().numpy()

# Map the predicted class IDs back to the original category names
predicted_categories = label_encoder.inverse_transform(predictions)

In [None]:
# Print, for every sample, the predicted category and the probability of that category
for sample_number, (category, probs) in enumerate(zip(predicted_categories, probabilities), start=1):
    print(f"Sample {sample_number} --> Predicted label: {category} | Probability: {np.max(probs)}")