<a href="https://colab.research.google.com/github/Devansh-Shukla-16/Smart-pricing-challenge/blob/main/Smart_Product_Pricing_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas==2.2.2 numpy==1.26.4 pillow==10.3.0 opencv-python==4.10.0.84 \
scikit-learn==1.5.2 tqdm==4.66.5 requests==2.32.3 transformers==4.44.2 tokenizers==0.19.1 torch torchvision


In [None]:
import gc
import os
from io import BytesIO

import numpy as np
import pandas as pd
import requests
import torch
from PIL import Image
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from torch import nn
from torchvision import models, transforms
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

📁 Step 3: Load your dataset (adjust paths)

In [None]:
# Read both splits from the working directory (train first, then test).
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Quick sanity check: (rows, columns) of each split, then a preview of the training rows.
print(train_df.shape, test_df.shape)
train_df.head()


🖼️ Step 4: Define image download helper

In [None]:
def download_image(url):
    """Fetch an image over HTTP and decode it as an RGB PIL image.

    This is a best-effort helper: every failure is reported as ``None`` so a
    long embedding loop can substitute a placeholder and keep going.

    Args:
        url: Image URL. Non-string or blank values (for example NaN read from
            a CSV) are skipped without issuing a request.

    Returns:
        A ``PIL.Image.Image`` in RGB mode, or ``None`` when the URL is missing,
        the request fails or returns an HTTP error status, or the payload
        cannot be decoded as an image.
    """
    if not isinstance(url, str) or not url.strip():
        return None
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx responses as failures instead of trying to decode an
        # error page as image bytes.
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGB")
    except Exception:
        # `Exception` (rather than a bare `except`) keeps the best-effort
        # behaviour but still lets KeyboardInterrupt / SystemExit stop the loop.
        return None


🔤 Step 5 – Text Embeddings (using DistilBERT for speed)

🧠 Step 5: Generate text embeddings (optimized with batching ✅)

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and encoder are loaded from the same DistilBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
text_model = AutoModel.from_pretrained("distilbert-base-uncased")
text_model = text_model.to(device)
# Inference only: switch off dropout and other training-time behaviour.
text_model.eval()

def get_text_embeddings(texts, batch_size=16):
    """Embed texts with the module-level encoder using masked mean pooling.

    Uses the module-level ``tokenizer``, ``text_model`` and ``device``.

    Args:
        texts: Iterable of texts (pandas Series, NumPy array, list, ...).
            Missing values (None / NaN) are embedded as the empty string and
            non-string values are converted with ``str``.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        A NumPy array with one row per input text and one column per hidden
        unit of ``text_model``. An empty input yields an array with zero rows.
    """
    # Materialise once so positional batching works for any iterable and
    # missing values never reach the tokenizer.
    cleaned = ["" if pd.isna(t) else str(t) for t in texts]
    if not cleaned:
        return np.zeros((0, text_model.config.hidden_size), dtype=np.float32)

    pooled_batches = []
    for i in tqdm(range(0, len(cleaned), batch_size)):
        enc = tokenizer(
            cleaned[i:i + batch_size],
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=128,
        ).to(device)
        with torch.no_grad():
            hidden = text_model(**enc).last_hidden_state
            # Average over real tokens only. A plain mean over the sequence
            # axis would also average padding positions, making a text's
            # embedding depend on the longest text in its batch.
            mask = enc["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        pooled_batches.append(pooled.cpu())
    return torch.cat(pooled_batches).numpy()

# Embed only the first 5000 rows of each split (train first, then test);
# raise the cap if memory allows.
train_text_emb, test_text_emb = (
    get_text_embeddings(frame["catalog_content"][:5000])
    for frame in (train_df, test_df)
)


🖼️ Step 6 – Image Embeddings (using ResNet-50)

🏞️ Step 6: Generate image embeddings (optimized with batching ✅)

In [None]:
# `pretrained=True` is deprecated in current torchvision releases; the explicit
# IMAGENET1K_V1 weights enum selects the same checkpoint that flag used to load.
img_model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
# Replace the classification head with a pass-through so the network outputs
# the 2048-dimensional pooled feature vector instead of class logits.
img_model.fc = nn.Identity()
img_model = img_model.to(device)
img_model.eval()

img_preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # The ImageNet-pretrained weights expect inputs normalized with the
    # ImageNet channel statistics; feeding raw [0, 1] pixels degrades features.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def get_image_embeddings(df, batch_size=8):
    """Download each row's image and embed it with the module-level image model.

    Uses the module-level ``download_image``, ``img_preprocess``, ``img_model``
    and ``device``.

    Args:
        df: DataFrame with an ``image_link`` column holding image URLs.
        batch_size: Number of rows downloaded and pushed through the model in
            a single forward pass.

    Returns:
        A float32 NumPy array of shape ``(len(df), 2048)``. Rows whose image
        could not be downloaded or decoded are left as all zeros.
    """
    n_rows = len(df)
    # Pre-filled with zeros so failed downloads need no special handling and
    # the result keeps a single dtype regardless of how many images failed.
    embeddings = np.zeros((n_rows, 2048), dtype=np.float32)
    if n_rows == 0:
        return embeddings

    links = df["image_link"].tolist()
    for start in tqdm(range(0, n_rows, batch_size)):
        tensors, rows = [], []
        for offset, url in enumerate(links[start:start + batch_size]):
            img = download_image(url)
            if img is not None:
                tensors.append(img_preprocess(img))
                rows.append(start + offset)
        if not tensors:
            continue  # every download in this batch failed; rows stay zero
        batch = torch.stack(tensors).to(device)
        with torch.no_grad():
            embeddings[rows] = img_model(batch).cpu().numpy()
    return embeddings

# Embed images for the first 5000 rows of each split (train first, then test).
train_img_emb, test_img_emb = (
    get_image_embeddings(frame[:5000])
    for frame in (train_df, test_df)
)


⚡ Step 7 – Chunked Image Embeddings (features are combined in Step 9 and the model is trained in Step 10)

🏞️ Step 7: Chunked image embeddings (saves each batch)

In [None]:
def generate_image_embeddings(df, name_prefix, batch_size=8, chunk_size=1000,
                              out_dir="/content/embeddings"):
    """Embed images chunk by chunk and save each chunk to its own ``.npy`` file.

    Each chunk is written as ``<out_dir>/<name_prefix>_<start>_<end>.npy``,
    where ``start``/``end`` are the positional row bounds of the chunk.

    Args:
        df: DataFrame with an ``image_link`` column holding image URLs.
        name_prefix: File-name prefix identifying the split (e.g. ``"train_img"``).
        batch_size: Forwarded to ``get_image_embeddings``.
        chunk_size: Number of rows embedded and saved per file.
        out_dir: Directory receiving the chunk files; created if missing.

    Returns:
        None. Results are written to disk only.
    """
    # np.save does not create parent directories.
    os.makedirs(out_dir, exist_ok=True)
    for start in range(0, len(df), chunk_size):
        end = min(start + chunk_size, len(df))
        print(f"Image {start}-{end}")
        # Reuse the shared embedding routine instead of duplicating its loop.
        chunk_emb = get_image_embeddings(df.iloc[start:end], batch_size=batch_size)
        np.save(os.path.join(out_dir, f"{name_prefix}_{start}_{end}.npy"), chunk_emb)
        # Release the chunk before starting the next one; clearing the CUDA
        # cache once per chunk avoids the per-image overhead.
        del chunk_emb
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


⚙️ Step 8: Generate embeddings for train & test

In [None]:
# get_text_embeddings takes the text column (not the whole DataFrame) and
# returns the array instead of writing it, so save the result here. The file
# names follow the "<prefix>_<start>_<end>.npy" pattern of the image chunks so
# that the "train_text" / "test_text" prefixes can be found in Step 9.
os.makedirs("/content/embeddings", exist_ok=True)
np.save(
    f"/content/embeddings/train_text_0_{len(train_df)}.npy",
    get_text_embeddings(train_df["catalog_content"]),
)
np.save(
    f"/content/embeddings/test_text_0_{len(test_df)}.npy",
    get_text_embeddings(test_df["catalog_content"]),
)

generate_image_embeddings(train_df, "train_img")
generate_image_embeddings(test_df, "test_img")


🧾 Step 9 – Merge Embeddings (test predictions and the CSV are generated in Step 11)

🧩 Step 9: Merge embeddings from all chunks

In [None]:
def load_all_embeddings(prefix, emb_dir="/content/embeddings"):
    """Load every saved chunk for ``prefix`` and stack them in row order.

    Chunk files are expected to be named ``<prefix>_<start>_<end>.npy``. They
    are ordered by the numeric ``start`` value: a plain string sort would place
    ``..._10000_11000.npy`` before ``..._1000_2000.npy`` and silently misalign
    the stacked rows with the DataFrame.

    Args:
        prefix: File-name prefix identifying the split (e.g. ``"train_img"``).
        emb_dir: Directory containing the chunk files.

    Returns:
        A 2-D NumPy array with all chunks stacked vertically.

    Raises:
        ValueError: If no ``.npy`` file in ``emb_dir`` starts with ``prefix``.
    """
    def _order_key(fname):
        # "<prefix>_<start>_<end>.npy" -> start; files that do not follow the
        # pattern sort after the numbered chunks, by name.
        stem = fname[len(prefix):].rsplit(".", 1)[0]
        first = stem.strip("_").split("_")[0]
        try:
            return (0, int(first), fname)
        except ValueError:
            return (1, 0, fname)

    files = sorted(
        (f for f in os.listdir(emb_dir) if f.startswith(prefix) and f.endswith(".npy")),
        key=_order_key,
    )
    if not files:
        raise ValueError(f"No embedding files starting with {prefix!r} found in {emb_dir!r}")
    arrays = [np.load(os.path.join(emb_dir, f)) for f in files]
    return np.vstack(arrays)

# Reload every saved chunk, in the order: train text, train image, test text, test image.
train_text_emb, train_img_emb, test_text_emb, test_img_emb = (
    load_all_embeddings(chunk_prefix)
    for chunk_prefix in ("train_text", "train_img", "test_text", "test_img")
)

# Feature matrix per split: text columns first, image columns appended to their right.
train_features = np.concatenate((train_text_emb, train_img_emb), axis=1)
test_features = np.concatenate((test_text_emb, test_img_emb), axis=1)

# Keep only as many labels as there are feature rows.
train_labels = train_df["price"].values[:train_features.shape[0]]


⚖️ Step 10: Normalize features and train model

In [None]:
# Learn the scaling statistics from the training features only, then apply
# the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(train_features)
train_scaled = scaler.transform(train_features)
test_scaled = scaler.transform(test_features)

# Random forest regressor with a fixed seed, using every available core.
model = RandomForestRegressor(
    n_estimators=250,
    max_depth=25,
    n_jobs=-1,
    random_state=42,
)
model.fit(train_scaled, train_labels)


🧾 Step 11: Predict and save output

In [None]:
test_preds = model.predict(test_scaled)

# Build the submission frame: ids first, then predictions floored at zero.
out = pd.DataFrame({"sample_id": test_df["sample_id"]}).assign(
    price=np.maximum(test_preds, 0)
)
out.to_csv("test_out.csv", index=False)
print("‚úÖ test_out.csv generated successfully!")


📊 Step 12 (Optional): Validate with a small sample

In [None]:
# NOTE: these 1000 rows were part of the data the model was fitted on, so this
# is an in-sample score and will look better than performance on unseen data.
# It is a smoke test of the pipeline, not a held-out validation estimate.
val_pred = model.predict(train_scaled[:1000])
val_true = train_labels[:1000]

# SMAPE = mean(|pred - true| / ((|pred| + |true|) / 2)) * 100.
# When prediction and target are both zero the term is 0/0; count it as zero
# error instead of letting a NaN poison the mean.
smape_denom = (np.abs(val_pred) + np.abs(val_true)) / 2
smape_terms = np.divide(
    np.abs(val_pred - val_true),
    smape_denom,
    out=np.zeros_like(smape_denom, dtype=float),
    where=smape_denom != 0,
)
smape = np.mean(smape_terms) * 100
print(f"In-sample (training rows) SMAPE: {smape:.2f}%")
