<a href="https://colab.research.google.com/github/Adithyan773/IKEA_recomendation_system/blob/main/IKEA_Finetuned_distilBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.50.0-py3-none-any.whl.metadata (39 kB)
Downloading transformers-4.50.0-py3-none-any.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.49.0
    Uninstalling transformers-4.49.0:
      Successfully uninstalled transformers-4.49.0
Successfully installed transformers-4.50.0


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import numpy as np
import pandas as pd
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback, DistilBertModel
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import torch

# Read the dataset CSV from the Colab filesystem
df = pd.read_csv('/content/ikea_data_img_fixed.csv')

# Discard rows that lack any field needed to build the input text or the target
required_columns = ['name', 'short_description', 'image_description', 'category']
df = df.dropna(subset=required_columns)
print("Rows after dropping NaN in critical columns:", len(df))

# Report how many rows fall into each category
print("Class distribution:\n", df['category'].value_counts())

# Integer-encode the categories in order of first appearance
unique_categories = df['category'].unique()
id_to_category = dict(enumerate(unique_categories))
category_to_id = {category: label_id for label_id, category in id_to_category.items()}
df['label'] = df['category'].map(category_to_id)

# Build the model input: name, short description and image description, space-separated
df['text'] = (
    df['name']
    + ' ' + df['short_description']
    + ' ' + df['image_description']
)

# 80/20 train/validation split, stratified on the label with a fixed seed
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# Wrap the two columns the model needs in Hugging Face Dataset objects
model_columns = ['text', 'label']
train_dataset = Dataset.from_pandas(train_df[model_columns])
val_dataset = Dataset.from_pandas(val_df[model_columns])

# Tokenizer matching the pretrained checkpoint that is fine-tuned below
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the text
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Batch mapping that contains a ``'text'`` entry.

    Returns:
        Whatever ``tokenizer`` returns for that batch of texts.
    """
    encode_options = dict(padding='max_length', truncation=True, max_length=128)
    return tokenizer(examples['text'], **encode_options)

# Tokenize both splits in batches
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch: expose only the columns used for training, as tensors
tensor_columns = ['input_ids', 'attention_mask', 'label']
train_dataset.set_format(type='torch', columns=tensor_columns)
val_dataset.set_format(type='torch', columns=tensor_columns)

# Single device choice shared by the class weights and the model
device = 'cuda' if torch.cuda.is_available() else 'cpu'

num_labels = len(unique_categories)

# Compute class weights ("balanced" = inversely proportional to class frequency in the train split)
labels = train_df['label'].values
train_classes = np.unique(labels)
if len(train_classes) != num_labels:
    # The weight vector must have one entry per model output; fail here with a clear
    # message instead of a size-mismatch error inside the loss function.
    raise ValueError(
        f"Training split contains {len(train_classes)} of {num_labels} classes; "
        "every class needs at least one training example to compute class weights."
    )
class_weights = compute_class_weight('balanced', classes=train_classes, y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Load DistilBERT model for classification.
# id2label / label2id store the category mapping in the model config, so a checkpoint
# saved from this model can translate predicted ids back to category names.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    id2label=id_to_category,
    label2id=category_to_id,
)
model.to(device)  # Ensure model is on the correct device

# Compute metrics with per-class F1 scores
def compute_metrics(pred):
    """Compute accuracy, weighted precision/recall/F1 and per-class F1 for an evaluation pass.

    Scores are computed over the full, fixed list of label ids taken from the
    module-level ``id_to_category``. Without an explicit ``labels=`` list,
    scikit-learn only returns entries for classes that occur in the true labels
    or the predictions, so the per-class array changes length between epochs
    and pairing it positionally with category names mislabels the scores.

    ``zero_division=0`` is used so that a class that is never predicted gets
    precision 0 rather than 1, which would inflate the weighted precision.
    A class with neither true nor predicted samples is reported with F1 0.0.

    Args:
        pred: Object exposing ``label_ids`` (true label ids) and ``predictions``
            (per-class scores; the argmax over the last axis is the predicted id).

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1`` (weighted
        averages) and ``per_class_f1`` (list with one entry per label id, in
        ascending id order).
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Fixed label order so index i of every per-class array is always label id i.
    all_label_ids = sorted(id_to_category)

    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, labels=all_label_ids, average='weighted', zero_division=0
    )
    per_class_f1 = precision_recall_fscore_support(
        labels, preds, labels=all_label_ids, average=None, zero_division=0
    )[2]

    per_class_f1_dict = {
        id_to_category[label_id]: f1_score
        for label_id, f1_score in zip(all_label_ids, per_class_f1)
    }
    print("Per-class F1 scores:", per_class_f1_dict)

    metrics = {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'per_class_f1': per_class_f1.tolist()
    }
    return metrics

# Define weighted trainer
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run; its output must expose ``.logits``.
            inputs: Batch mapping containing ``"labels"`` plus the model inputs.
                ``"labels"`` is popped, so the model is called without labels.
            return_outputs: If true, return ``(loss, outputs)`` instead of the loss alone.
            num_items_in_batch: Accepted as part of the method signature; not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # The weights were placed on a device when they were created; move them to
        # wherever the logits actually are so a different training device (e.g. MPS)
        # does not cause a device-mismatch error. No-op when the devices already match.
        weights = class_weights.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=30,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch; the two strategies must match for
    # load_best_model_at_end to work.
    eval_strategy='epoch',
    save_strategy='epoch',
    # Without a limit, one checkpoint per epoch accumulates in output_dir for up to
    # 30 epochs. The best checkpoint is still retained because
    # load_best_model_at_end is enabled.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=2e-5,
    warmup_steps=100,
    report_to="none"
)

# Initialize trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop once validation accuracy has not improved for 10 consecutive evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
)

# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained('./fine_tuned_distilbert_v3')
tokenizer.save_pretrained('./fine_tuned_distilbert_v3')

print("Fine-tuning completed.")

Rows after dropping NaN in critical columns: 1024
Class distribution:
 category
Bookcases & shelving units              225
Chairs                                  166
Tables & desks                          115
Beds                                     88
Cabinets & cupboards                     85
Chests of drawers & drawer units         66
Children's furniture                     55
Wardrobes                                52
Sofas & armchairs                        38
Outdoor furniture                        34
TV & media furniture                     33
Bar furniture                            24
Trolleys                                 18
Nursery furniture                        10
Café furniture                            9
Sideboards, buffets & console tables      4
Room dividers                             2
Name: count, dtype: int64


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/205 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Per Class F1
1,2.8249,2.833921,0.063415,0.927142,0.063415,0.009595,"[0.15873015873015872, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.10666666666666667, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2,2.7937,2.785726,0.063415,0.817019,0.063415,0.016876,"[0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11428571428571428, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.09523809523809523]"
3,2.6883,2.617741,0.307317,0.786529,0.307317,0.261166,"[0.16326530612244897, 0.0, 0.5675675675675675, 0.21052631578947367, 0.0, 0.11428571428571428, 0.5581395348837209, 0.42105263157894735, 0.0, 0.5, 0.0, 0.2222222222222222, 0.0, 0.0, 0.0, 0.26666666666666666]"
4,2.3769,2.284667,0.580488,0.662827,0.580488,0.568526,"[0.38095238095238093, 0.8125, 0.6575342465753424, 0.7058823529411765, 0.0, 0.10810810810810811, 0.8275862068965517, 0.5217391304347826, 0.6666666666666666, 0.38095238095238093, 0.0, 0.5, 0.7636363636363637, 0.8, 0.5333333333333333, 0.6666666666666666]"
5,2.0466,1.937377,0.668293,0.728001,0.668293,0.651134,"[0.5, 0.8571428571428571, 0.7228915662650602, 0.6857142857142857, 0.14285714285714285, 0.15789473684210525, 0.96, 0.7058823529411765, 0.6666666666666666, 0.5882352941176471, 0.0, 0.6666666666666666, 0.8461538461538461, 1.0, 0.5263157894736842, 0.8695652173913043]"
6,1.8314,1.653263,0.668293,0.758913,0.668293,0.67598,"[0.6666666666666666, 0.8823529411764706, 0.6756756756756757, 0.5555555555555556, 0.0, 0.391304347826087, 0.9166666666666666, 0.7058823529411765, 0.6666666666666666, 0.7368421052631579, 0.0, 0.0, 0.631578947368421, 0.9019607843137255, 0.8571428571428571, 0.48, 0.8695652173913043]"
7,1.3682,1.394216,0.668293,0.751837,0.668293,0.678966,"[0.47619047619047616, 0.8947368421052632, 0.7532467532467533, 0.5714285714285714, 0.0, 0.34782608695652173, 0.8888888888888888, 0.7058823529411765, 0.6666666666666666, 0.7777777777777778, 0.0, 0.6666666666666666, 0.8260869565217391, 1.0, 0.45454545454545453, 0.9]"
8,1.157,1.212375,0.726829,0.810473,0.726829,0.749584,"[1.0, 0.9142857142857143, 0.7341772151898734, 0.6285714285714286, 0.0, 0.6037735849056604, 0.96, 0.7058823529411765, 0.6666666666666666, 0.7777777777777778, 0.0, 0.0, 0.6666666666666666, 0.92, 1.0, 0.43478260869565216, 0.9]"
9,0.9585,1.0854,0.736585,0.824849,0.736585,0.762314,"[1.0, 0.9444444444444444, 0.759493670886076, 0.5454545454545454, 0.18181818181818182, 0.6415094339622641, 0.9166666666666666, 0.8421052631578947, 0.6666666666666666, 0.7777777777777778, 0.0, 0.0, 0.6666666666666666, 0.9166666666666666, 1.0, 0.4166666666666667, 0.9]"
10,0.7159,1.015619,0.75122,0.833555,0.75122,0.773588,"[1.0, 0.8947368421052632, 0.72, 0.5882352941176471, 0.25, 0.75, 0.9166666666666666, 0.8421052631578947, 0.6666666666666666, 0.875, 0.0, 0.0, 0.7058823529411765, 0.9019607843137255, 1.0, 0.4, 0.9]"


Per-class F1 scores: {'Bar furniture': np.float64(0.15873015873015872), 'Beds': np.float64(0.0), 'Bookcases & shelving units': np.float64(0.0), 'Cabinets & cupboards': np.float64(0.0), 'Café furniture': np.float64(0.0), 'Chairs': np.float64(0.0), 'Chests of drawers & drawer units': np.float64(0.0), "Children's furniture": np.float64(0.10666666666666667), 'Nursery furniture': np.float64(0.0), 'Outdoor furniture': np.float64(0.0), 'Room dividers': np.float64(0.0), 'Sideboards, buffets & console tables': np.float64(0.0), 'Sofas & armchairs': np.float64(0.0), 'Tables & desks': np.float64(0.0), 'Trolleys': np.float64(0.0), 'TV & media furniture': np.float64(0.0), 'Wardrobes': np.float64(0.0)}
Per-class F1 scores: {'Bar furniture': np.float64(0.25), 'Beds': np.float64(0.0), 'Bookcases & shelving units': np.float64(0.0), 'Cabinets & cupboards': np.float64(0.0), 'Café furniture': np.float64(0.0), 'Chairs': np.float64(0.0), 'Chests of drawers & drawer units': np.float64(0.0), "Children's furnit