### Installing and importing the required modules

In [8]:
import sys
import torch
import numpy as np
import pandas as pd
from pathlib import Path
from evaluate import load
from datasets import Dataset
from typing import Dict, Any
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForSequenceClassification, TrainingArguments, Trainer, EvalPrediction

# Add the parent directory to the system path
sys.path.append(str(Path().resolve().parent))

# Import local dependencies
from src.utils import get_device, set_seed

### Constants, hyperparameters and model configurations

In [None]:
# Reproducibility and data-split settings
seed = 42  # Seed for reproducibility
test_size = 0.2  # Fraction of the full dataset held out as the test set
validation_size = 0.1  # Fraction of the remaining training data held out for validation

# Pretrained checkpoint that gets fine-tuned (a BERT encoder)
model_id = "bert-base-uncased"

# Both locations live two levels above the current working directory
project_root = Path().resolve().parent.parent
dataset_path = project_root / "datasets" / "iphone_products.csv"  # Path to the dataset
model_path = project_root / "saved_models" / "iphone_products_classifier"  # Path to save the trained model to

In [5]:
# Set the seed for reproducibility.
# `set_seed` is the project helper imported from src.utils; which random number
# generators it seeds is defined there (presumably Python, NumPy and PyTorch — TODO confirm).
set_seed(seed)

In [6]:
# Get the device available on the system.
# `get_device` is the project helper imported from src.utils; the value is later
# passed to `.to(...)` for the model and the input tensors.
device = get_device()

# Print the detected device
print(f"Detected device: {device}")

Detected device: mps


### Data loading

In [12]:
# Read the comma-separated dataset into a pandas DataFrame.
# Rows that cannot be parsed are skipped instead of aborting the load.
dataset = pd.read_csv(dataset_path, delimiter=",", on_bad_lines="skip")

In [None]:
# Basic cleanup: drop rows missing any of the required columns, then make sure
# each of those columns holds strings
text_columns = ["product", "title", "label"]
dataset = dataset.dropna(subset=text_columns).copy()
for column in text_columns:
    dataset[column] = dataset[column].astype(str)

In [None]:
# Show a subset of the samples (the first rows of the cleaned DataFrame)
dataset.head()

### Tokenizer

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint named by `model_id`.
# padding_side="right" places padding tokens after the real tokens.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="right")

### Preprocess data

In [None]:
# Build the model input text: "<product> - <title>" for every row
dataset["summary"] = dataset["product"].str.cat(dataset["title"], sep=" - ")

In [None]:
# Instantiate the label encoder
label_encoder = LabelEncoder()

# Encode the target column ("label") into numeric class ids, stored as int64
encoded_labels = label_encoder.fit_transform(dataset["label"])
dataset["labels"] = encoded_labels.astype("int64")

# Extract and print the total number of classes
num_classes = len(label_encoder.classes_)
print(f"Total number of classes: {num_classes}")

In [None]:
# Convert the Pandas DataFrame to a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(dataset)

# First split: hold out `test_size` of all samples as the test set
outer_split = hf_dataset.train_test_split(test_size=test_size, seed=seed)
test_dataset = outer_split["test"]

# Second split: hold out `validation_size` of the remaining training samples
# (a fraction of the training portion, not of the full dataset)
inner_split = outer_split["train"].train_test_split(test_size=validation_size, seed=seed)
train_dataset = inner_split["train"]
valid_dataset = inner_split["test"]

# Print the number of training, validation and test samples
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(valid_dataset)}")
print(f"Number of test samples: {len(test_dataset)}")

In [None]:
# Preprocess the dataset
def preprocess(examples: Dict[str, Any], max_length: int = 48) -> Dict[str, Any]:
    """Tokenize a batch of examples to a fixed sequence length.

    Args:
        examples: Batch whose "summary" entry holds the texts to tokenize.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        The output of the module-level `tokenizer` for the batch.
    """
    texts = examples["summary"]
    encoded = tokenizer(texts, truncation=True, padding="max_length", max_length=max_length)
    return encoded
    
# Apply the preprocessing to every split, dropping the raw text columns once tokenized
raw_columns = ["product", "title", "label", "summary"]
tokenized_train, tokenized_valid, tokenized_test = (
    split.map(preprocess, batched=True, remove_columns=raw_columns)
    for split in (train_dataset, valid_dataset, test_dataset)
)

# Confirm sequence length
print(f"Sequence length: {len(tokenized_train[0]['input_ids'])}")

### Building the model

In [None]:
# Load the pretrained checkpoint with a classification head sized to the number of classes
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_classes)

# Move the model to the detected device
model = model.to(device)

In [None]:
# Display the model (as the last expression of the cell, its repr becomes the cell output)
model

In [None]:
# Count the elements of every parameter tensor and print the total
parameter_counts = [tensor.numel() for tensor in model.parameters()]
total_params = sum(parameter_counts)
print(f"Total number of parameters in the model: {total_params}")

### Training the model

In [None]:
# Load the accuracy metric through `evaluate.load`.
# The same metric object is used by `compute_metrics` during training and by the
# test-set evaluation later in the notebook.
accuracy_metric = load("accuracy")

# Define the compute_metrics function
def compute_metrics(eval_pred: EvalPrediction) -> Dict[str, float]:
    """Compute accuracy from the outputs collected during an evaluation pass.

    Args:
        eval_pred: Evaluation outputs; `predictions` holds the logits (possibly
            wrapped in a tuple) and `label_ids` holds the reference labels.

    Returns:
        The values reported by `accuracy_metric`, each converted to float.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids

    # When the predictions arrive as a tuple, the logits are its first element
    scores = scores[0] if isinstance(scores, tuple) else scores

    # The predicted class is the index of the largest logit
    predicted_ids = np.argmax(scores, axis=-1)
    out = accuracy_metric.compute(predictions=predicted_ids, references=references)

    # Safety check
    assert out is not None, "Metrics computation failed."

    # Convert all metric values to float
    return {name: float(value) for name, value in out.items()}

In [None]:
# Mixed precision settings: CUDA-only features are enabled only when CUDA is
# available and the detected device is a CUDA device
device_name = str(device).lower()
use_cuda = torch.cuda.is_available() and "cuda" in device_name
use_pin_memory = use_cuda
bf16 = bool(use_cuda and torch.cuda.is_bf16_supported())

# Define the training arguments
training_args = TrainingArguments(
    # Optimisation
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and checkpointing: evaluate and save once per epoch, keep at
    # most two checkpoints, and restore the one with the best validation accuracy
    output_dir="./checkpoints/iphone_products_classifier",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    # Logging: every 50 steps, no external reporting integration
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,
    report_to="none",
    # Hardware-dependent switches computed from the detected device
    dataloader_pin_memory=use_pin_memory,
    bf16=bf16,
)

In [None]:
# Build the trainer: it trains on the tokenized training split and scores the
# tokenized validation split with `compute_metrics`
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
)

# Run the training and keep the object it returns
trainer_output = trainer.train()

# Print the training results
print(trainer_output)

### Save the model

In [None]:
# Save the tokenizer next to the model so the saved directory is self-contained
# and can be loaded for inference without access to the original checkpoint
tokenizer.save_pretrained(model_path)

# Save the fine-tuned model to the destination path (this is the full model,
# not an adapter). Kept as the last statement so the cell produces no output.
model.save_pretrained(model_path)

### Load the fine-tuned model

In [None]:
# Release cached GPU memory before reloading the model (no-op without CUDA)
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Define the quantization configurations of the model (only for CUDA devices)
if use_cuda:
    # Compute in bfloat16 where the GPU supports it, float16 otherwise
    compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
    )
else:
    # No quantization off CUDA; the reload step loads the model normally for None
    quantization_config = None

In [None]:
# Extra `from_pretrained` arguments; only populated when quantization is enabled
reload_kwargs = {}
if quantization_config is not None:
    reload_kwargs.update(quantization_config=quantization_config, device_map="auto")

# Reload the fine-tuned model
model = AutoModelForSequenceClassification.from_pretrained(model_path, **reload_kwargs)

# A quantized model is already placed on its device by device_map="auto", and
# calling `.to()` on a bitsandbytes-quantized model raises on many library
# versions, so only move the model explicitly when it is not quantized
if quantization_config is None:
    model = model.to(device)

# Set the model to evaluation mode
model.eval()

### Evaluation

In [None]:
# Evaluate the model on the test set, one batch at a time
eval_batch_size = training_args.per_device_eval_batch_size

# Columns that must not be passed to the model's forward call
non_input_columns = ("labels", "__index_level_0__")

predictions = []
for start in range(0, len(tokenized_test), eval_batch_size):
    # Slicing the dataset yields a dict mapping column name -> list of values
    rows = tokenized_test[start : start + eval_batch_size]
    model_inputs = {
        name: torch.tensor(values).to(device)
        for name, values in rows.items()
        if name not in non_input_columns
    }

    # Forward pass without tracking gradients
    with torch.no_grad():
        batch_logits = model(**model_inputs).logits

    # The predicted class is the index of the largest logit
    predictions.extend(batch_logits.argmax(dim=-1).cpu().numpy())

In [None]:
# Score the collected predictions against the test labels
test_labels = tokenized_test["labels"]
eval_metrics = accuracy_metric.compute(predictions=predictions, references=test_labels)

# Display the test accuracy
assert eval_metrics is not None, "Evaluation metrics are not available."
print(f"Test Accuracy: {eval_metrics['accuracy']:.4f}")

### Inference

In [None]:
# Inference on sample inputs (the trailing numbers are presumably the expected label ids)
inference_inputs = [
    "Iphone 12 pro 256 danneggiato",  # 1
    "Cover antigraffio per iPhone",  # 0
    "Drone per iphone con custodia",  # 0
    "iPhone 13 mini 500TB Rosa",  # 1
    "Set di pellicole per iphone 15 pro max",  # 0
]

# Tokenize the whole batch of sample inputs as PyTorch tensors, padded to the
# longest sample, then move them to the detected device
inputs = tokenizer(inference_inputs, padding=True, truncation=True, return_tensors="pt")
inputs = inputs.to(device)

In [None]:
# Perform inference without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)

# The predicted class id is the index of the largest logit of each sample
predictions = outputs.logits.argmax(dim=-1).cpu().numpy()

# Convert the predicted labels to the corresponding categories
predicted_categories = label_encoder.inverse_transform(predictions)

In [None]:
# Display each sample with its predicted label id and category, numbered from 1
samples = zip(inference_inputs, predictions, predicted_categories)
for sample_number, (text, label_id, category) in enumerate(samples, start=1):
    print(f"Sample {sample_number} --> Input: {text} | Predicted label: {label_id} | Predicted Category: {category}")