<a href="https://colab.research.google.com/github/Adithyan773/IKEA_recomendation_system/blob/main/IKEA_Finetuned_distilBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade transformers

In [None]:
!pip install datasets

In [None]:
import numpy as np
import pandas as pd
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback, DistilBertModel
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import torch

# Read the product catalogue from the Colab filesystem.
df = pd.read_csv('/content/ikea_data_img_fixed.csv')

# A row is only usable if every text field and the target category are
# present, so rows with a NaN in any of these columns are discarded.
required_columns = ['name', 'short_description', 'image_description', 'category']
df = df.dropna(subset=required_columns)
print("Rows after dropping NaN in critical columns:", len(df))

# Report how many rows each category has.
print("Class distribution:\n", df['category'].value_counts())

# Give every category an integer id (in order of first appearance in the
# frame) and keep the reverse lookup for turning ids back into names.
unique_categories = df['category'].unique()
category_to_id = dict(zip(unique_categories, range(len(unique_categories))))
id_to_category = dict(enumerate(unique_categories))
df['label'] = df['category'].map(category_to_id)

# The model input is the three text fields joined by single spaces.
name_and_summary = df['name'] + ' ' + df['short_description']
df['text'] = name_and_summary + ' ' + df['image_description']

# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# Create Hugging Face Dataset objects from the two columns the model needs.
# After the shuffled split the frames no longer have a plain range index, so
# without preserve_index=False the old row numbers would be carried along as
# an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df[['text', 'label']], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[['text', 'label']], preserve_index=False)

# Load the tokenizer that matches the DistilBERT checkpoint fine-tuned below.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the text
def tokenize_function(examples):
    """Tokenize the ``text`` entries of a batch with the module-level tokenizer.

    Args:
        examples: Mapping with a ``'text'`` entry holding the strings to
            encode (this function is passed to ``Dataset.map`` with
            ``batched=True``).

    Returns:
        The tokenizer output, with every sequence padded or truncated to
        exactly 128 tokens.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,
    )
    return encoded

# Tokenize both splits in batches (train first, then validation).
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

# Expose only the tensors needed for training, as PyTorch tensors.
torch_columns = ('input_ids', 'attention_mask', 'label')
for split in (train_dataset, val_dataset):
    split.set_format(type='torch', columns=list(torch_columns))

# Resolve the compute device once so the class weights and the model cannot
# end up on different devices.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Compute class weights from the training labels only. 'balanced' gives rarer
# classes a larger weight so the loss is not dominated by frequent categories.
labels = train_df['label'].values
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Load DistilBERT model for classification. The id <-> category mapping is
# stored in the model config so that a checkpoint written with
# save_pretrained() can translate predicted ids back into category names
# without needing this script's in-memory dictionaries.
num_labels = len(unique_categories)
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    id2label={idx: str(cat) for idx, cat in id_to_category.items()},
    label2id={str(cat): idx for cat, idx in category_to_id.items()},
)
model.to(device)  # Ensure model is on the correct device

# Compute metrics with per-class F1 scores
def compute_metrics(pred):
    """Compute accuracy, weighted precision/recall/F1 and per-class F1.

    Args:
        pred: Evaluation output exposing ``predictions`` (per-class scores;
            the predicted class is the argmax over the last axis) and
            ``label_ids`` (the true class ids).

    Returns:
        A dict with ``accuracy``, ``precision``, ``recall`` and ``f1``
        (support-weighted averages) plus ``per_class_f1``, a list with one F1
        score per known class id in ascending id order.

    Side effects:
        Prints the per-class F1 scores keyed by category name.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Score every known class id explicitly. Without `labels=`, scikit-learn
    # only reports classes that occur in `labels` or `preds`, so position i of
    # the result would stop corresponding to class id i whenever a class is
    # missing from an evaluation, and scores would be printed under the wrong
    # category names.
    all_label_ids = sorted(id_to_category)

    acc = accuracy_score(labels, preds)
    # zero_division=1: a score that is undefined (0/0) is reported as 1.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=1
    )
    per_class_f1 = precision_recall_fscore_support(
        labels, preds, labels=all_label_ids, average=None, zero_division=1
    )[2]
    per_class_f1_dict = {
        id_to_category[label_id]: score
        for label_id, score in zip(all_label_ids, per_class_f1)
    }
    print("Per-class F1 scores:", per_class_f1_dict)
    metrics = {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'per_class_f1': per_class_f1.tolist()
    }
    return metrics

# Define weighted trainer
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Batch dict; must contain ``"labels"``, which is removed
                from the dict before the remaining entries are passed to the
                model.
            return_outputs: If true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for signature compatibility with the
                base class; not used here.

        Returns:
            The scalar loss, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        # Labels are taken out so the model does not also compute its own
        # (unweighted) loss.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # `class_weights` was placed on a device when the script started;
        # move it to wherever the logits actually live so the loss never
        # fails with a device mismatch (a no-op when they already match).
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Training arguments. Evaluation and checkpointing both happen once per epoch,
# which is what lets the best checkpoint (by validation accuracy) be restored
# at the end of training.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=30,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy='epoch',
    save_strategy='epoch',
    # Without a limit, one checkpoint per epoch for up to 30 epochs accumulates
    # on disk. With load_best_model_at_end=True the best checkpoint is always
    # retained in addition to the most recent one, so nothing needed is lost.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=2e-5,
    warmup_steps=100,
    report_to="none"
)

# Stop early when the tracked metric has not improved for 10 evaluations in a row.
early_stopping = EarlyStoppingCallback(early_stopping_patience=10)

# Wire the model, data, metrics and callback into the class-weighted trainer.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Run fine-tuning.
trainer.train()

# Write the fine-tuned model and then its tokenizer to the same directory so
# the two can be reloaded together.
export_dir = './fine_tuned_distilbert_v3'
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)

print("Fine-tuning completed.")