### Installing and importing the required modules

In [21]:
import torch
import numpy as np
import pandas as pd
from evaluate import load
from typing import Dict, Any
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from transformers.models.bert.modeling_bert import BertForSequenceClassification
from transformers import AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer

# Import local dependencies
from utils import get_device

### Constants, hyperparameters and model configurations

In [24]:
seed = 42 # Seed for reproducibility
test_size = 0.2 # Train-test split percentage
model_id = "bert-base-uncased" # The model ID of the Llama model
dataset_path = "./datasets/iphone_products.csv" # The path to the dataset
model_path = "./saved_models/iphone_products_classifier" # Path to save the trained model to

In [6]:
# Get the device available on the system
device = get_device()

# Print the detected device
print(f"Detected device: {device}")

Detected device: mps


### Data loading

In [7]:
# Load the dataset into a pandas DataFrame
dataset = pd.read_csv(
    dataset_path,
    delimiter = ",",
    on_bad_lines = "skip"  # Skip problematic lines if necessary
)

In [8]:
# Drop null values
dataset.dropna(inplace=True)

In [9]:
# Show a subset of the samples
dataset.head()

Unnamed: 0,product,title,label
0,iphone,Cover magsafe in pelle iPhone 12 mini,derived_product
1,iphone,Cover GUESS Hard x Iphone 12 e 12 PRO,derived_product
2,iphone,Display iphone 12 pro max,derived_product
3,iphone,display x IPHONE 12 pro max nuovo,derived_product
4,iphone,Iphone 12 pro 256 danneggiato,target_product


### Tokenizer

In [10]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, paddind_side="left")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

### Preprocess data

In [11]:
# Concatenate the product and its title
dataset["summary"] = dataset["product"] + " - " + dataset["title"]

In [12]:
# Instantiate  the label encoder
label_encoder = LabelEncoder()

# Encode the target column (category_description) into numeric labels
dataset.loc[:, "label"] = label_encoder.fit_transform(dataset["label"])

# Extract and print the total number of classes
num_classes = len(label_encoder.classes_)
print(f"Total number of classes: {num_classes}")

Total number of classes: 2


In [13]:
# Convert the Pandas DataFrame to a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(dataset)

# Train-test split
train_dataset, test_dataset = hf_dataset.train_test_split(test_size=test_size, seed=seed).values()

In [14]:
# Preprocess the dataset
def preprocess(examples: Dict[str, Any], max_length: int = 48) -> Dict[str, Any]:
    # Tokenize the input sequences
    return tokenizer(
        examples["summary"],
        truncation = True,
        padding = "max_length",
        max_length = max_length
    )

# Tokenize the dataset
tokenized_train_dataset = train_dataset.map(preprocess, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess, batched=True)

# Remove unnecessary columns
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["product", "title"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["product", "title"])

# Display the sequence length
print(f"Sequence length: {len(tokenized_train_dataset[0]['input_ids'])}")

Map:   0%|          | 0/468 [00:00<?, ? examples/s]

Map:   0%|          | 0/117 [00:00<?, ? examples/s]

Sequence length: 48


### Building the model

In [25]:
# Load the model
model = BertForSequenceClassification.from_pretrained(
    model_id,
    num_labels = num_classes
)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Move the model to the taret device
model.to(device);

In [27]:
# Display the model
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

### Trainig the model

In [28]:
# Load the accuracy metric
accuracy_metric = load("accuracy")

# Define a custum function to compute the metrics
def compute_metrics(eval_pred: torch.Tensor) -> torch.Tensor:
    # Extract the logits and the lables from the output of the model
    logits, labels = eval_pred

    # Extract the predictions for each sample
    predictions = np.argmax(logits, axis=-1)

    # Compute and return the accuarcy
    return accuracy_metric.compute(predictions=predictions, references=labels)

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir = "./iphone_products_classifier",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 3e-4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 20,
    weight_decay = 0.01,
    logging_dir = "./logs",
    logging_strategy = "steps",
    logging_steps = 10,
    save_total_limit = 2,
    load_best_model_at_end = True,
    metric_for_best_model = "accuracy",
    greater_is_better = True,
    report_to = "none",
    pin_memory = False,
    fp16 = True
)

In [30]:
# Instantiate the trainer to train the model
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_test_dataset,
    compute_metrics = compute_metrics
)

# Trainin the model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.0,,0.735043
2,0.0,,0.735043
3,0.0,,0.735043
4,0.0,,0.735043
5,0.0,,0.735043
6,0.0,,0.735043




KeyboardInterrupt: 

### Save the model

In [None]:
# Saving the adapter to the destination path
model.save_pretrained(model_path)

### Load the fine-tuned model

In [None]:
# Define the quantization configurations of the model (only for CUDA devices)
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.bfloat16,
    bnb_4bit_use_double_quant = True
)

In [None]:
# Load the fine-tuned model
model = BertForSequenceClassification.from_pretrained(
    model_path,
    low_cpu_mem_usage = True,
    quantization_config = quantization_config
)

In [None]:
# Move the fine-tuned model to the target device
model.to(device);

### Inference

In [None]:
# Tokenize a sample input
inputs = tokenizer(
    [
        "Iphone 12 pro 256 danneggiato", # 1
        "Cover antigraffio per iPhone", # 0
        "Drone per iphone con custodia", # 0
        "iPhone 13 mini 500TB Rosa", # 1
        "Set di pellicole per iphone 15 pro max" # 0
    ],
    padding = True,
    truncation = True,
    return_tensors="pt"
  ).to(device)

In [None]:
# Perform inference
with torch.no_grad():
    # Compute the output of the model
    outputs = model(**inputs)

    # Extract the predictions
    predictions = torch.argmax(outputs.logits, dim=-1)

# Convert the predictions to a numpy array
predictions = predictions.cpu().numpy()

# Convert the predicted labels to the corresponding categories
predicted_categories = label_encoder.inverse_transform(predictions)

In [None]:
# Display the predicted categories
for idx, predicted_category in enumerate(predicted_categories):
    print(f"Sample {idx + 1} --> Predicted label: {predictions[idx]} | Predicted Category: {predicted_category}")