In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

data_folder_path = "/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/68e8d1d70b66d_student_resource/student_resource/dataset"
output_folder_path = f"{data_folder_path}/output_csv_folder"  # Removed .csv

# Check if Data is present
if not os.path.exists(data_folder_path):
    raise FileNotFoundError(f"The path {data_folder_path} does not exist.")
else:
    print("Data is present")

# Create the folder if it doesn't exist
os.makedirs(output_folder_path, exist_ok=True)
print(f"Folder created at: {output_folder_path}")

# You can then save files like this:
#csv_file_path = f"{output_folder_path}/your_data.csv"
# df.to_csv(csv_file_path)  # Example for pandas

Data is present
Folder created at: /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/68e8d1d70b66d_student_resource/student_resource/dataset/output_csv_folder


In [50]:
from PIL import Image, ImageOps
import requests
from io import BytesIO
import pandas as pd

train = pd.read_csv(f"{data_folder_path}/train.csv")
test = pd.read_csv(f"{data_folder_path}/test.csv")
sample_test = pd.read_csv(f"{data_folder_path}/sample_test.csv")
sample_test_out = pd.read_csv(f"{data_folder_path}/sample_test_out.csv")

sample_test.head()

Unnamed: 0,sample_id,catalog_content,image_link
0,217392,Item Name: Gift Basket Village Gourmet Meat an...,https://m.media-amazon.com/images/I/91GB1wC6Ob...
1,209156,"Item Name: NPG Dried Lotus Seeds 16 Oz, Uncook...",https://m.media-amazon.com/images/I/81VnzF1vkv...
2,262333,Item Name: Annies Homegrown Macaroni and Chees...,https://m.media-amazon.com/images/I/51aCDMHMnI...
3,295979,Item Name: Bear Creek Country Kitchens Creamy ...,https://m.media-amazon.com/images/I/71dzRyLGPi...
4,50604,Item Name: Japanese Kelp Kombu Umami Soup Stoc...,https://m.media-amazon.com/images/I/71Yu21cGwr...


In [52]:
#Replace with your image URL
import requests
from PIL import Image
from io import BytesIO
from IPython.display import display

image_url = sample_test.iloc[6]['image_link']

#Download the image
response = requests.get(image_url)
img = Image.open(BytesIO(response.content)).convert("RGB")
display(img)

Output hidden; open in https://colab.research.google.com to view.

In [5]:
import re

def clean_item_name(text):
    # remove "Item Name:" (case insensitive, optional leading/trailing spaces)
    text = re.sub(r'(?i)^item name:\s*', '', text.strip())
    # normalize spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

train['catalog_content'] = train['catalog_content'].apply(clean_item_name)
test['catalog_content'] = test['catalog_content'].apply(clean_item_name)
train.head()

Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"La Victoria Green Taco Sauce Mild, 12 Ounce (P...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Salerno Cookies, The Original Butter Cookies, ...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Bear Creek Hearty Soup Bowl, Creamy Chicken wi...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Judee’s Blue Cheese Powder 11.25 oz - Gluten-F...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"kedem Sherry Cooking Wine, 12.7 Ounce - 12 per...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


# Numerical Features

In [6]:
import re
import pandas as pd

UNIT_CONVERSIONS = {
    'kg': 1000, 'kilogram': 1000, 'kilograms': 1000,
    'g': 1, 'gram': 1, 'grams': 1,
    'mg': 0.001, 'milligram': 0.001, 'milligrams': 0.001,
    'lb': 453.6, 'lbs': 453.6, 'pound': 453.6, 'pounds': 453.6,
    'oz': 28.35, 'ounce': 28.35, 'ounces': 28.35,

    'l': 1000, 'liter': 1000, 'liters': 1000, 'litre': 1000, 'litres': 1000,
    'ml': 1, 'milliliter': 1, 'milliliters': 1,
}

def safe_float(x):
    try:
        return float(x)
    except:
        return None

def extract_numeric_features_universal(text):
    if not isinstance(text, str):
        return pd.Series({'total_weight_g': 0, 'pack_qty': 1, 'pieces': 0, 'percent_value': 0})

    text = text.lower()
    total_weight_g = 0
    pack_qty = 1
    pieces = 0
    percent_value = 0

    # (a) Handle (2 x 14.1 oz) or similar
    match_multi = re.findall(r'(\d+(?:\.\d+)?)\s*[x×]\s*([\d\.]+)\s*([a-z%]+)', text)
    for n, qty, unit in match_multi:
        n, qty = safe_float(n), safe_float(qty)
        if n is None or qty is None:
            continue
        unit = unit.strip().lower()
        if unit in UNIT_CONVERSIONS:
            total_weight_g += n * qty * UNIT_CONVERSIONS[unit]

    # (b) Individual numeric-unit pairs
    matches = re.findall(r'(\d+(?:\.\d+)?)\s*([a-z%]+)', text)
    for val, unit in matches:
        val = safe_float(val)
        if val is None:
            continue
        unit = unit.strip().lower()
        if unit in UNIT_CONVERSIONS:
            total_weight_g = max(total_weight_g, val * UNIT_CONVERSIONS[unit])
        elif unit == '%':
            percent_value = max(percent_value, val)

    # (c) Pack quantities
    match_pack = re.search(r'(pack\s*of\s*(\d+))|(\b(\d+)\s*per\s*case)', text)
    if match_pack:
        nums = re.findall(r'\d+', match_pack.group())
        pack_qty = int(nums[0]) if nums else 1

    # (d) Pieces or counts
    match_pieces = re.search(r'(\d+)\s*(pieces|pcs|count|tabs|capsules|tablets)', text)
    if match_pieces:
        pieces = int(match_pieces.group(1))

    return pd.Series({
        'total_weight_g': total_weight_g,
        'pack_qty': pack_qty,
        'pieces': pieces,
        'percent_value': percent_value
    })


In [7]:
train_feats = train['catalog_content'].apply(extract_numeric_features_universal)
train = pd.concat([train, train_feats], axis=1)
train.head()

Unnamed: 0,sample_id,catalog_content,image_link,price,total_weight_g,pack_qty,pieces,percent_value
0,33127,"La Victoria Green Taco Sauce Mild, 12 Ounce (P...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,340.2,6.0,0.0,0.0
1,198967,"Salerno Cookies, The Original Butter Cookies, ...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12,226.8,4.0,0.0,0.0
2,261251,"Bear Creek Hearty Soup Bowl, Creamy Chicken wi...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97,53.865,6.0,0.0,0.0
3,55858,Judee’s Blue Cheese Powder 11.25 oz - Gluten-F...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34,318.9375,1.0,0.0,0.0
4,292686,"kedem Sherry Cooking Wine, 12.7 Ounce - 12 per...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49,360.045,12.0,0.0,0.0


In [8]:
# Check missing
print('catalog_content nulls', train['catalog_content'].isna().sum())
print('image_link nulls', train['image_link'].isna().sum())
print('price nulls', train['price'].isna().sum())

catalog_content nulls 0
image_link nulls 0
price nulls 0


In [9]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Summary
train.describe()

# Correlation with price
corr = train.corr(numeric_only=True)['price'].sort_values(ascending=False)
print(corr)

price             1.000000
pack_qty          0.073031
pieces            0.042730
percent_value     0.007811
total_weight_g    0.005829
sample_id        -0.025731
Name: price, dtype: float64


In [10]:
# Prepare target
import numpy as np
train['target'] = np.log1p(train['price'])
train.head()

Unnamed: 0,sample_id,catalog_content,image_link,price,total_weight_g,pack_qty,pieces,percent_value,target
0,33127,"La Victoria Green Taco Sauce Mild, 12 Ounce (P...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,340.2,6.0,0.0,0.0,1.773256
1,198967,"Salerno Cookies, The Original Butter Cookies, ...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12,226.8,4.0,0.0,0.0,2.647592
2,261251,"Bear Creek Hearty Soup Bowl, Creamy Chicken wi...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97,53.865,6.0,0.0,0.0,1.088562
3,55858,Judee’s Blue Cheese Powder 11.25 oz - Gluten-F...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34,318.9375,1.0,0.0,0.0,3.444895
4,292686,"kedem Sherry Cooking Wine, 12.7 Ounce - 12 per...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49,360.045,12.0,0.0,0.0,4.211979


# Downloading Images

In [12]:
'''
import re
import os
import pandas as pd
import multiprocessing
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib
# SPECIFY YOUR CUSTOM SAVE LOCATION
CUSTOM_SAVE_PATH = "/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/68e8d1d70b66d_student_resource/student_resource/dataset/product_images"

def download_image(image_link, savefolder):
    if(isinstance(image_link, str)):
        filename = Path(image_link).name
        image_save_path = os.path.join(savefolder, filename)
        if(not os.path.exists(image_save_path)):
            try:
                urllib.request.urlretrieve(image_link, image_save_path)
            except Exception as ex:
                print('Warning: Not able to download - {}\n{}'.format(image_link, ex))
        else:
            return
    return

def download_images(image_links, download_folder=CUSTOM_SAVE_PATH):
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    results = []
    download_image_partial = partial(download_image, savefolder=download_folder)
    with multiprocessing.Pool(100) as pool:
        for result in tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)):
            results.append(result)
        pool.close()
        pool.join()

In [13]:
'''
image_urls = train['image_link'].tolist()
download_images(image_urls)  # Automatically uses your custom path

 52%|█████▏    | 38841/75000 [15:01<12:42, 47.43it/s]

HTTP Error 404: Not Found


 59%|█████▊    | 43955/75000 [16:53<08:40, 59.65it/s]

<urlopen error retrieval incomplete: got only 81324 out of 129013 bytes>


100%|██████████| 75000/75000 [28:11<00:00, 44.33it/s]


In [13]:
import pandas as pd
from urllib.parse import urlparse
import os
from pathlib import Path  # ADD THIS IMPORT

# DEFINE test_path FIRST
test_path = Path("/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/68e8d1d70b66d_student_resource/student_resource/dataset/product_images")

train['image_exists'] = True  # Assume all exist

# Or be more precise - mark the known missing one
known_missing = ['51mjZYDYjyL.jpg']  # From your previous output
train['image_exists'] = ~train['image_fname'].isin(known_missing)

print(f"📊 Assuming {train['image_exists'].sum()}/{len(train)} images exist")


📊 Assuming 74999/75000 images exist


In [12]:
import pandas as pd
from urllib.parse import urlparse
import os
from pathlib import Path  # ADD THIS IMPORT

# DEFINE test_path FIRST
test_path = Path("/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/68e8d1d70b66d_student_resource/student_resource/dataset/product_images")

# Your dataframe: train (already loaded)
# Ensure 'image_link' exists
if 'image_link' not in train.columns:
    raise ValueError("train DataFrame doesn't have column 'image_link'")

def filename_from_url(url):
    try:
        return Path(urlparse(url).path).name
    except:
        return ""

train['image_fname'] = train['image_link'].fillna("").apply(filename_from_url)

# Check existence
train['image_exists'] = train['image_fname'].apply(lambda n: (test_path / n).exists() if n else False)

# Summary
total = len(train)
missing = (~train['image_exists']).sum()
print(f"Total rows: {total}")
print(f"Images missing: {missing} ({missing/total*100:.2f}%)")

# Show some missing examples
print("\nExamples of missing image filenames (first 20):")
print(train.loc[~train['image_exists'], ['image_link','image_fname']].head(20).to_string(index=False))

OSError: [Errno 5] Input/output error: '/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/68e8d1d70b66d_student_resource/student_resource/dataset/product_images/51mo8htwTHL.jpg'

In [14]:
train = train[train['image_exists']].reset_index(drop=True)
print(f"Final dataset size after filtering: {len(train)}")

Final dataset size after filtering: 74999


In [15]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Create stratification bins based on price distribution
train['price_bin'] = pd.qcut(train['price'], q=10, labels=False, duplicates='drop')

train_df, test_df = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train['price_bin']
)

# Drop the helper column
train_df = train_df.drop(columns='price_bin')
test_df = test_df.drop(columns='price_bin')

print(f"📊 Stratified split complete:")
print(f"   Train: {len(train_df)} samples")
print(f"   Validation: {len(test_df)} samples")


📊 Stratified split complete:
   Train: 59999 samples
   Validation: 15000 samples


# Extracting Embeddings

In [16]:
# Run once
import os, sys, math, time, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

BASE = Path("/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION")
BERT_DIR = BASE / "finetuned_bert_price"
CLIP_CHECKPOINT = BASE / "models" / "fine_tuned_clip_final.pth"
IMAGE_DIR = Path("/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/68e8d1d70b66d_student_resource/student_resource/dataset/product_images")
EMB_DIR = BASE / "embeddings"
EMB_DIR.mkdir(exist_ok=True)

Device: cpu


In [17]:

# create image filename column (if not already)
from urllib.parse import urlparse
def fname_from_url(u):
    try:
        return Path(urlparse(u).path).name
    except:
        return ""
train_df['image_fname'] = train_df['image_link'].fillna("").apply(fname_from_url)
test_df['image_fname']  = test_df['image_link'].fillna("").apply(fname_from_url)

# quick checks
print("Train:", len(train_df), "Test:", len(test_df))
print("Missing train images:", (~train_df['image_fname'].apply(lambda n: (IMAGE_DIR/n).exists() if n else False)).sum())
print("Missing test images:", (~test_df['image_fname'].apply(lambda n: (IMAGE_DIR/n).exists() if n else False)).sum())


Train: 59999 Test: 15000
Missing train images: 14346
Missing test images: 3511


In [18]:
# Use robust check and filter
import os

def safe_image_exists(filename):
    if not filename:
        return False
    try:
        return os.path.exists(os.path.join(str(IMAGE_DIR), filename))
    except OSError:
        return False

# Filter datasets
train_df_clean = train_df[train_df['image_fname'].apply(safe_image_exists)].reset_index(drop=True)
test_df_clean = test_df[test_df['image_fname'].apply(safe_image_exists)].reset_index(drop=True)

print(f"🎯 Clean datasets:")
print(f"   Train: {len(train_df_clean)} samples with images")
print(f"   Test:  {len(test_df_clean)} samples with images")

# Continue with these clean datasets for your modeling
train_df_clean.shape

🎯 Clean datasets:
   Train: 45653 samples with images
   Test:  11489 samples with images


(45653, 11)

# TEXT EMBED

In [22]:
'''
# install transformers if needed
!pip install -q transformers

from transformers import AutoTokenizer, AutoModel
import torch
tokenizer = AutoTokenizer.from_pretrained(str(BERT_DIR))
bert = AutoModel.from_pretrained(str(BERT_DIR)).to(device)
bert.eval()

def get_text_embeddings(texts, batch_size=128, pooling="mean"):
    embs = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            enc = tokenizer(batch, padding=True, truncation=True, max_length=160, return_tensors="pt")
            enc = {k:v.to(device) for k,v in enc.items()}
            out = bert(**enc)
            last = out.last_hidden_state  # (B, L, H)
            if pooling=="cls":
                v = last[:,0,:]
            else:
                attn_mask = enc['attention_mask'].unsqueeze(-1)
                v = (last * attn_mask).sum(dim=1) / (attn_mask.sum(dim=1).clamp(min=1e-9))
            embs.append(v.cpu())
    return torch.cat(embs).numpy()

# Extract & save
train_texts = train_df_clean['catalog_content'].fillna("").tolist()
test_texts  = test_df_clean['catalog_content'].fillna("").tolist()

train_text_emb = get_text_embeddings(train_texts, batch_size=128, pooling="mean")
np.save(EMB_DIR/"train_text_emb.npy", train_text_emb)
test_text_emb = get_text_embeddings(test_texts, batch_size=128, pooling="mean")
np.save(EMB_DIR/"test_text_emb.npy", test_text_emb)

print("Text embedding shapes:", train_text_emb.shape, test_text_emb.shape)

Text embedding shapes: (45653, 768) (11489, 768)


In [34]:
'''
# ==========================================================
# ⚙️ SETUP
# ==========================================================
!pip install -q open_clip_torch pillow torchvision tqdm

import torch, os, numpy as np, shutil
from pathlib import Path
from tqdm import tqdm
from PIL import Image, ImageOps
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast
import open_clip

# ==========================================================
# 🧩 PATHS
# ==========================================================
DRIVE_BASE = Path("/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION")
IMAGE_DIR = DRIVE_BASE / "68e8d1d70b66d_student_resource/student_resource/dataset/product_images"
DRIVE_EMB_DIR = DRIVE_BASE / "embeddings"
LOCAL_EMB_DIR = Path("/content/embeddings")

DRIVE_EMB_DIR.mkdir(parents=True, exist_ok=True)
LOCAL_EMB_DIR.mkdir(parents=True, exist_ok=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"✅ Using device: {device}")

# ==========================================================
# 🧠 LOAD FINE-TUNED CLIP MODEL
# ==========================================================
CLIP_CHECKPOINT = DRIVE_BASE / "models/fine_tuned_clip_final.pth"

clip_model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
state = torch.load(CLIP_CHECKPOINT, map_location=device)

if isinstance(state, dict) and 'model_state_dict' in state:
    clip_model.load_state_dict(state['model_state_dict'])
else:
    clip_model.load_state_dict(state)

clip_model = clip_model.to(device)
clip_model.eval()
torch.backends.cudnn.benchmark = True
print("✅ CLIP model loaded and ready for inference")

# ==========================================================
# 🧱 DATASET CLASS (loads directly from Drive)
# ==========================================================
class DriveImageDataset(Dataset):
    def __init__(self, fnames, preprocess):
        self.fnames = fnames
        self.preprocess = preprocess

    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, i):
        p = IMAGE_DIR / self.fnames[i]
        if not p.exists():
            img = Image.new("RGB", (224,224), color="white")
        else:
            img = Image.open(p).convert("RGB")
            img = ImageOps.exif_transpose(img)
        return self.preprocess(img)

# ==========================================================
# 💾 CHUNKED EXTRACTION FUNCTION (resumable)
# ==========================================================
def save_embeddings_in_chunks(fnames, prefix, model, preprocess, batch_size=192, chunk_size=10000):
    """
    Extract embeddings in chunks to avoid OOM.
    Automatically resumes if chunk already exists (locally or on Drive).
    """
    total = len(fnames)
    n_chunks = int(np.ceil(total / chunk_size))
    print(f"Total {total} images → {n_chunks} chunks (size {chunk_size})")

    for chunk_idx in range(n_chunks):
        chunk_name = f"{prefix}_chunk{chunk_idx:03d}.npy"
        local_path = LOCAL_EMB_DIR / chunk_name
        drive_path = DRIVE_EMB_DIR / chunk_name

        # ✅ Skip if already exists locally or on Drive
        if local_path.exists() or drive_path.exists():
            print(f"✔ Skipping chunk {chunk_idx+1}/{n_chunks}")
            continue

        start, end = chunk_idx * chunk_size, min((chunk_idx+1) * chunk_size, total)
        sub_fnames = fnames[start:end]
        print(f"\n▶ Processing chunk {chunk_idx+1}/{n_chunks}: {start}–{end}")

        ds = DriveImageDataset(sub_fnames, preprocess)
        dl = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

        chunk_embs = []
        with torch.no_grad():
            for imgs in tqdm(dl):
                imgs = imgs.to(device, non_blocking=True)
                with autocast():
                    feats = model.encode_image(imgs)
                feats = feats / feats.norm(dim=-1, keepdim=True)
                chunk_embs.append(feats.cpu())
                del imgs, feats
                torch.cuda.empty_cache()

        chunk_embs = torch.cat(chunk_embs).numpy()
        np.save(local_path, chunk_embs)
        print(f"💾 Saved {chunk_embs.shape} → {local_path}")

        # ✅ Copy safely to Drive
        shutil.copy(local_path, drive_path)
        print(f"☁️ Copied to Drive → {drive_path}")


# ==========================================================
# 🔄 MERGE CHUNKS INTO ONE FILE
# ==========================================================
def merge_chunks(prefix, output_name):
    chunk_files = sorted([f for f in os.listdir(DRIVE_EMB_DIR) if f.startswith(prefix) and f.endswith(".npy")])
    print(f"Merging {len(chunk_files)} chunks…")

    all_parts = [np.load(DRIVE_EMB_DIR / f) for f in chunk_files]
    full = np.concatenate(all_parts, axis=0)

    out_path = DRIVE_EMB_DIR / output_name
    np.save(out_path, full)
    print(f"✅ Saved merged embeddings → {out_path}, shape={full.shape}")
    return full


# ==========================================================
# 🧩 STATUS CHECKER
# ==========================================================
def check_progress(prefix):
    all_files = sorted([f for f in os.listdir(DRIVE_EMB_DIR) if f.startswith(prefix) and f.endswith(".npy")])
    print(f"✅ {len(all_files)} chunks completed for prefix: {prefix}")
    for f in all_files:
        size_mb = os.path.getsize(DRIVE_EMB_DIR / f) / 1e6
        print(f"  - {f} ({size_mb:.1f} MB)")


# ==========================================================
# 🧠 RUN EXTRACTION (train/test)
# ==========================================================
# Example: assume train_df_clean & test_df_clean are already defined
train_fnames = train_df_clean['image_fname'].tolist()
test_fnames = test_df_clean['image_fname'].tolist()

# 🔹 TRAIN
save_embeddings_in_chunks(train_fnames, "train_img_emb", clip_model, preprocess, batch_size=192, chunk_size=10000)
check_progress("train_img_emb")

# 🔹 TEST
save_embeddings_in_chunks(test_fnames, "test_img_emb", clip_model, preprocess, batch_size=192, chunk_size=5000)
check_progress("test_img_emb")

#🔹 Merge (run at the very end)
train_img_emb = merge_chunks("train_img_emb", "train_img_emb.npy")
test_img_emb = merge_chunks("test_img_emb", "test_img_emb.npy")


✅ Using device: cuda
✅ CLIP model loaded and ready for inference
Total 45653 images → 5 chunks (size 10000)
✔ Skipping chunk 1/5
✔ Skipping chunk 2/5

▶ Processing chunk 3/5: 20000–30000


  with autocast():
100%|██████████| 53/53 [21:07<00:00, 23.92s/it]


💾 Saved (10000, 512) → /content/embeddings/train_img_emb_chunk002.npy
☁️ Copied to Drive → /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/embeddings/train_img_emb_chunk002.npy

▶ Processing chunk 4/5: 30000–40000


100%|██████████| 53/53 [25:46<00:00, 29.19s/it]


💾 Saved (10000, 512) → /content/embeddings/train_img_emb_chunk003.npy
☁️ Copied to Drive → /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/embeddings/train_img_emb_chunk003.npy

▶ Processing chunk 5/5: 40000–45653


100%|██████████| 30/30 [15:04<00:00, 30.16s/it]


💾 Saved (5653, 512) → /content/embeddings/train_img_emb_chunk004.npy
☁️ Copied to Drive → /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/embeddings/train_img_emb_chunk004.npy
✅ 5 chunks completed for prefix: train_img_emb
  - train_img_emb_chunk000.npy (10.2 MB)
  - train_img_emb_chunk001.npy (10.2 MB)
  - train_img_emb_chunk002.npy (10.2 MB)
  - train_img_emb_chunk003.npy (10.2 MB)
  - train_img_emb_chunk004.npy (5.8 MB)
Total 11489 images → 3 chunks (size 5000)

▶ Processing chunk 1/3: 0–5000


100%|██████████| 27/27 [12:57<00:00, 28.81s/it]


💾 Saved (5000, 512) → /content/embeddings/test_img_emb_chunk000.npy
☁️ Copied to Drive → /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/embeddings/test_img_emb_chunk000.npy

▶ Processing chunk 2/3: 5000–10000


100%|██████████| 27/27 [12:51<00:00, 28.57s/it]


💾 Saved (5000, 512) → /content/embeddings/test_img_emb_chunk001.npy
☁️ Copied to Drive → /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/embeddings/test_img_emb_chunk001.npy

▶ Processing chunk 3/3: 10000–11489


100%|██████████| 8/8 [03:56<00:00, 29.54s/it]


💾 Saved (1489, 512) → /content/embeddings/test_img_emb_chunk002.npy
☁️ Copied to Drive → /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/embeddings/test_img_emb_chunk002.npy
✅ 3 chunks completed for prefix: test_img_emb
  - test_img_emb_chunk000.npy (5.1 MB)
  - test_img_emb_chunk001.npy (5.1 MB)
  - test_img_emb_chunk002.npy (1.5 MB)
Merging 5 chunks…
✅ Saved merged embeddings → /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/embeddings/train_img_emb.npy, shape=(45653, 512)
Merging 3 chunks…
✅ Saved merged embeddings → /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/embeddings/test_img_emb.npy, shape=(11489, 512)


# Multimodal Fusion model

In [19]:
# Normalizing Numerical Features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_num_train = scaler.fit_transform(train_df_clean[['total_weight_g','pack_qty','pieces','percent_value']])
X_num_test = scaler.fit_transform(test_df_clean[['total_weight_g','pack_qty','pieces','percent_value']])

In [21]:
import numpy as np
import os

# Path to your embeddings folder
embeddings_path = "/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/embeddings"

# Load the embeddings
train_img_emb = np.load(os.path.join(embeddings_path, "train_img_emb.npy"))
test_img_emb = np.load(os.path.join(embeddings_path, "test_img_emb.npy"))
train_text_emb = np.load(os.path.join(embeddings_path, "train_text_emb.npy"))
test_text_emb = np.load(os.path.join(embeddings_path, "test_text_emb.npy"))

# Now check the shapes
print("Image embedding shapes:", train_img_emb.shape, test_img_emb.shape)
print("Text embedding shapes:", train_text_emb.shape, test_text_emb.shape)

Image embedding shapes: (45653, 512) (11489, 512)
Text embedding shapes: (45653, 768) (11489, 768)


In [23]:
X_num_train.shape

(45653, 4)

# Weighted Fusion of features

In [24]:
# Apply different weights to each modality
text_weight = 0.6    # Text is usually most important
image_weight = 0.32   # Images provide visual context
numerical_weight = 0.08 # Numerical features add specifics

# Scale each modality separately
text_scaler = StandardScaler()
image_scaler = StandardScaler()
num_scaler = StandardScaler()

train_text_scaled = text_scaler.fit_transform(train_text_emb)
train_img_scaled = image_scaler.fit_transform(train_img_emb)
train_num_scaled = num_scaler.fit_transform(X_num_train)

test_text_scaled = text_scaler.transform(test_text_emb)
test_img_scaled = image_scaler.transform(test_img_emb)
test_num_scaled = num_scaler.transform(X_num_test)

# Weighted combination
train_multimodal = np.hstack([
    text_weight * train_text_scaled,
    image_weight * train_img_scaled,
    numerical_weight * train_num_scaled
])

test_multimodal = np.hstack([
    text_weight * test_text_scaled,
    image_weight * test_img_scaled,
    numerical_weight * test_num_scaled
])

print(f"Weighted multimodal features - Train: {train_multimodal.shape}, Test: {test_multimodal.shape}")

Weighted multimodal features - Train: (45653, 1284), Test: (11489, 1284)


In [32]:
y_train = train_df_clean['price'].values
# we'll train on log1p(y)
y_train_log = np.log1p(y_train)
y_train_log.shape

(45653,)

In [35]:
print(type(train_multimodal), train_multimodal.shape)
print(type(test_multimodal), test_multimodal.shape)
print(type(y_train_log), y_train_log.shape)
# Optional but safe pre-check
print("🔍 Checking for NaNs...")
print("Train NaNs:", np.isnan(train_multimodal).sum())
print("Test NaNs:", np.isnan(test_multimodal).sum())



<class 'numpy.ndarray'> (45653, 1284)
<class 'numpy.ndarray'> (11489, 1284)
<class 'numpy.ndarray'> (45653,)
🔍 Checking for NaNs...
Train NaNs: 0
Test NaNs: 0


In [37]:
!pip install -q lightgbm scikit-learn

import pandas as pd
import numpy as np
import lightgbm as lgb
import joblib
import os
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# ==================== SMAPE METRIC ====================
def smape(y_true, y_pred):
    """Competition SMAPE metric"""
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    return np.mean(np.abs(y_true - y_pred) / (denominator + 1e-9)) * 100.0

# ==================== DATA VERIFICATION ====================
print("📊 Data Verification:")
print(f"train_multimodal shape: {train_multimodal.shape}")
print(f"test_multimodal shape: {test_multimodal.shape}")
print(f"y_train_log shape: {y_train_log.shape}")
print(f"y_train shape: {y_train.shape}")

# Set your feature matrices
X_train = train_multimodal
X_test = test_multimodal

# ==================== LIGHTGBM PARAMETERS ====================
params = {
    'objective': 'regression',
    'metric': 'mae',
    'learning_rate': 0.05,
    'num_leaves': 127,
    'max_depth': -1,
    'min_data_in_leaf': 50,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'verbosity': -1,
    'seed': 42
}

# ==================== 5-FOLD CROSS VALIDATION ====================
print("\n🚀 Starting 5-fold cross-validation...")

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_log_preds = np.zeros(len(X_train))
test_log_preds = np.zeros(len(X_test))
models = []  # Store all trained models

fold_scores = []

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"\n🎯 Fold {fold + 1}/5")

    # Split data
    X_tr, X_val = X_train[tr_idx], X_train[val_idx]
    y_tr, y_val = y_train_log[tr_idx], y_train_log[val_idx]

    # Create LightGBM datasets
    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    # Train model
    model = lgb.train(
        params,
        dtrain,
        num_boost_round=10000,
        valid_sets=[dval],
        callbacks=[
            lgb.early_stopping(200),
            lgb.log_evaluation(200)
        ]
    )

    # Store model
    models.append(model)

    # Store OOF predictions (in log space)
    oof_log_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)

    # Accumulate test predictions (in log space)
    test_log_preds += model.predict(X_test, num_iteration=model.best_iteration) / kf.n_splits

    # Calculate fold SMAPE
    fold_actual_prices = np.expm1(y_val)
    fold_pred_prices = np.expm1(oof_log_preds[val_idx])
    fold_smape = smape(fold_actual_prices, fold_pred_prices)

    fold_scores.append(fold_smape)
    print(f"📊 Fold {fold+1} SMAPE: {fold_smape:.2f}%")

# ==================== FINAL EVALUATION ====================
print("\n" + "="*50)
print("🏆 FINAL RESULTS")
print("="*50)

# Convert all predictions to actual prices
oof_actual_preds = np.expm1(oof_log_preds)
test_actual_preds = np.expm1(test_log_preds)

# Calculate final metrics
final_smape = smape(y_train, oof_actual_preds)
final_mae = np.mean(np.abs(y_train - oof_actual_preds))

print(f"📈 Cross-Validation SMAPE: {final_smape:.2f}%")
print(f"📊 Cross-Validation MAE: ${final_mae:.2f}")
print(f"🔄 Fold SMAPEs: {[f'{score:.2f}%' for score in fold_scores]}")
print(f"📏 Fold STD: {np.std(fold_scores):.2f}%")

# Performance interpretation
def interpret_performance(smape_score):
    if smape_score < 15: return "🎯 EXCELLENT - Top tier!"
    elif smape_score < 25: return "✅ VERY GOOD - Competitive"
    elif smape_score < 35: return "⚠️ GOOD - Room for improvement"
    else: return "🔧 NEEDS WORK - Significant improvement needed"

print(f"💡 Performance: {interpret_performance(final_smape)}")

# ==================== CREATE ENSEMBLE MODEL ====================
print("\n🤖 Creating ensemble model...")

# Train a final model on all data for production use
print("Training final ensemble model on full dataset...")
final_dtrain = lgb.Dataset(X_train, label=y_train_log)
final_model = lgb.train(
    params,
    final_dtrain,
    num_boost_round=10000,
    callbacks=[lgb.log_evaluation(500)]
)

# Add the final model to our models list
models.append(final_model)

# ==================== SAVE COMPLETE MODEL PACKAGE ====================
print("\n💾 Saving complete model package...")

# Define save path
save_path = "/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/final_fusion_model"
os.makedirs(save_path, exist_ok=True)

# 1. Save the final ensemble model
final_model.save_model(os.path.join(save_path, 'final_ensemble_model.txt'))

# 2. Save all CV models
for i, model in enumerate(models):
    model.save_model(os.path.join(save_path, f'cv_model_fold_{i+1}.txt'))

# 3. Save the scalers
joblib.dump(text_scaler, os.path.join(save_path, 'text_scaler.joblib'))
joblib.dump(image_scaler, os.path.join(save_path, 'image_scaler.joblib'))
joblib.dump(num_scaler, os.path.join(save_path, 'numerical_scaler.joblib'))

# 4. Save model configuration
model_config = {
    'feature_weights': {
        'text_weight': 0.6,
        'image_weight': 0.32,
        'numerical_weight': 0.08
    },
    'input_dimensions': {
        'text_features': train_text_emb.shape[1],
        'image_features': train_img_emb.shape[1],
        'numerical_features': X_num_train.shape[1],
        'total_features': X_train.shape[1]
    },
    'performance': {
        'cv_smape': final_smape,
        'cv_mae': final_mae,
        'fold_scores': fold_scores
    },
    'training_info': {
        'n_samples': len(X_train),
        'target_type': 'log1p(price)',
        'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    }
}

import json
with open(os.path.join(save_path, 'model_config.json'), 'w') as f:
    json.dump(model_config, f, indent=2)

# 5. Save test predictions for later use
np.save(os.path.join(save_path, 'test_predictions.npy'), test_actual_preds)
np.save(os.path.join(save_path, 'test_log_predictions.npy'), test_log_preds)

# ==================== VERIFICATION ====================
print("\n🔍 Model Package Verification:")

saved_files = os.listdir(save_path)
print("Saved files:")
for file in sorted(saved_files):
    file_path = os.path.join(save_path, file)
    size = os.path.getsize(file_path) / 1024  # Size in KB
    print(f"  - {file} ({size:.1f} KB)")

print(f"\n✅ Complete model package saved to: {save_path}")

# ==================== LOADING EXAMPLE ====================
print("\n📖 Example loading code:")

load_example = """
# Load the saved model
import lightgbm as lgb
import joblib
import numpy as np

# Load model and scalers
model = lgb.Booster(model_file='/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/final_fusion_model/final_ensemble_model.txt')
text_scaler = joblib.load('/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/final_fusion_model/text_scaler.joblib')
image_scaler = joblib.load('/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/final_fusion_model/image_scaler.joblib')
num_scaler = joblib.load('/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/final_fusion_model/numerical_scaler.joblib')

# Preprocess new data
new_text_scaled = text_scaler.transform(new_text_emb)
new_image_scaled = image_scaler.transform(new_image_emb)
new_num_scaled = num_scaler.transform(new_numerical)

# Apply weights and combine
new_features = np.hstack([
    0.6 * new_text_scaled,
    0.32 * new_image_scaled,
    0.08 * new_num_scaled
])

# Predict
log_prediction = model.predict(new_features)
price_prediction = np.expm1(log_prediction)  # Convert to actual price
"""

print(load_example)

print(f"\n🎉 TRAINING COMPLETE!")
print(f"📦 Model package saved with {len(models)} models")
print(f"📊 Final CV Score: {final_smape:.2f}% SMAPE")

📊 Data Verification:
train_multimodal shape: (45653, 1284)
test_multimodal shape: (11489, 1284)
y_train_log shape: (45653,)
y_train shape: (45653,)

🚀 Starting 5-fold cross-validation...

🎯 Fold 1/5
Training until validation scores don't improve for 200 rounds
[200]	valid_0's l1: 0.287579
[400]	valid_0's l1: 0.286831
[600]	valid_0's l1: 0.286398
[800]	valid_0's l1: 0.286341
Early stopping, best iteration is:
[755]	valid_0's l1: 0.286264
📊 Fold 1 SMAPE: 30.73%

🎯 Fold 2/5
Training until validation scores don't improve for 200 rounds
[200]	valid_0's l1: 0.288621
[400]	valid_0's l1: 0.287447
[600]	valid_0's l1: 0.287335
[800]	valid_0's l1: 0.287144
[1000]	valid_0's l1: 0.287069
[1200]	valid_0's l1: 0.287084
Early stopping, best iteration is:
[1049]	valid_0's l1: 0.287055
📊 Fold 2 SMAPE: 30.79%

🎯 Fold 3/5
Training until validation scores don't improve for 200 rounds
[200]	valid_0's l1: 0.288755
[400]	valid_0's l1: 0.28802
[600]	valid_0's l1: 0.288096
Early stopping, best iteration is:
[44

KeyboardInterrupt: 

# SAVING EVERYTHING

In [38]:
len(models)  # should be 5
X_train.shape, X_test.shape
train_multimodal.shape, test_multimodal.shape
text_scaler, image_scaler, num_scaler
oof_log_preds, test_log_preds
y_train_log


array([2.67345876, 1.16002092, 2.22408286, ..., 1.88631179, 3.58324112,
       3.29916466])

In [39]:
import os
save_path = "/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/final_fusion_model"
os.makedirs(save_path, exist_ok=True)

In [40]:
import lightgbm as lgb

for i, model in enumerate(models):
    model_file = os.path.join(save_path, f'cv_model_fold_{i+1}.txt')
    model.save_model(model_file)
    print(f"Saved CV model fold {i+1} -> {model_file}")


Saved CV model fold 1 -> /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/final_fusion_model/cv_model_fold_1.txt
Saved CV model fold 2 -> /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/final_fusion_model/cv_model_fold_2.txt
Saved CV model fold 3 -> /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/final_fusion_model/cv_model_fold_3.txt
Saved CV model fold 4 -> /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/final_fusion_model/cv_model_fold_4.txt
Saved CV model fold 5 -> /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/final_fusion_model/cv_model_fold_5.txt


In [41]:
import joblib

joblib.dump(text_scaler, os.path.join(save_path, 'text_scaler.joblib'))
joblib.dump(image_scaler, os.path.join(save_path, 'image_scaler.joblib'))
joblib.dump(num_scaler, os.path.join(save_path, 'numerical_scaler.joblib'))
print("Scalers saved!")


Scalers saved!


In [42]:
import numpy as np

np.save(os.path.join(save_path, 'oof_log_preds.npy'), oof_log_preds)
np.save(os.path.join(save_path, 'test_log_preds.npy'), test_log_preds)
print("OOF and test predictions saved!")


OOF and test predictions saved!


In [43]:
import json
import pandas as pd

model_config = {
    'feature_weights': {
        'text_weight': 0.6,
        'image_weight': 0.32,
        'numerical_weight': 0.08
    },
    'input_dimensions': {
        'text_features': train_text_emb.shape[1],
        'image_features': train_img_emb.shape[1],
        'numerical_features': X_num_train.shape[1],
        'total_features': X_train.shape[1]
    },
    'performance': {
        'cv_smape': smape(y_train, np.expm1(oof_log_preds)),
        'fold_scores': [30.73, 30.79, 30.87, 31.04, 30.63]
    },
    'training_info': {
        'n_samples': len(X_train),
        'target_type': 'log1p(price)',
        'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    }
}

with open(os.path.join(save_path, 'model_config.json'), 'w') as f:
    json.dump(model_config, f, indent=2)

print("Model config saved!")


Model config saved!


In [45]:
# 5. Save test predictions for later use
np.save(os.path.join(save_path, 'test_predictions.npy'), test_actual_preds)
np.save(os.path.join(save_path, 'test_log_predictions.npy'), test_log_preds)

In [46]:
saved_files = os.listdir(save_path)
for file in sorted(saved_files):
    file_path = os.path.join(save_path, file)
    size = os.path.getsize(file_path)/1024
    print(f"{file} - {size:.1f} KB")


cv_model_fold_1.txt - 8889.6 KB
cv_model_fold_2.txt - 12344.4 KB
cv_model_fold_3.txt - 5214.0 KB
cv_model_fold_4.txt - 9090.9 KB
cv_model_fold_5.txt - 8630.1 KB
image_scaler.joblib - 12.6 KB
model_config.json - 0.5 KB
numerical_scaler.joblib - 0.7 KB
oof_log_preds.npy - 356.8 KB
test_log_predictions.npy - 89.9 KB
test_log_preds.npy - 89.9 KB
test_predictions.npy - 89.9 KB
text_scaler.joblib - 18.6 KB


# TESTING

In [47]:
import numpy as np

def smape(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error (SMAPE)
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    smape_value = np.mean(numerator / (denominator + 1e-9)) * 100
    return smape_value


In [49]:
# Load your saved OOF predictions
import numpy as np
oof_log_preds = np.load("/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/final_fusion_model/oof_log_preds.npy")

# Convert from log1p
oof_preds_actual = np.expm1(oof_log_preds)

# Actual train prices
y_train = train_df_clean['price'].values

# Compute SMAPE
oof_smape = smape(y_train, oof_preds_actual)
print(f"CV OOF SMAPE: {oof_smape:.2f}%")


CV OOF SMAPE: 30.81%
