In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

data_folder_path = "/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/68e8d1d70b66d_student_resource/student_resource/dataset"
output_folder_path = f"{data_folder_path}/output_csv_folder"  # Removed .csv

# Check if Data is present
if not os.path.exists(data_folder_path):
    raise FileNotFoundError(f"The path {data_folder_path} does not exist.")
else:
    print("Data is present")

# Create the folder if it doesn't exist
os.makedirs(output_folder_path, exist_ok=True)
print(f"Folder created at: {output_folder_path}")

# You can then save files like this:
#csv_file_path = f"{output_folder_path}/your_data.csv"
# df.to_csv(csv_file_path)  # Example for pandas

Data is present
Folder created at: /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/68e8d1d70b66d_student_resource/student_resource/dataset/output_csv_folder


In [3]:
from PIL import Image, ImageOps
import requests
from io import BytesIO
import pandas as pd

train = pd.read_csv(f"{data_folder_path}/train.csv")
test = pd.read_csv(f"{data_folder_path}/test.csv")
sample_test = pd.read_csv(f"{data_folder_path}/sample_test.csv")
sample_test_out = pd.read_csv(f"{data_folder_path}/sample_test_out.csv")

test.head()

Unnamed: 0,sample_id,catalog_content,image_link
0,100179,Item Name: Rani 14-Spice Eshamaya's Mango Chut...,https://m.media-amazon.com/images/I/71hoAn78AW...
1,245611,Item Name: Natural MILK TEA Flavoring extract ...,https://m.media-amazon.com/images/I/61ex8NHCIj...
2,146263,Item Name: Honey Filled Hard Candy - Bulk Pack...,https://m.media-amazon.com/images/I/61KCM61J8e...
3,95658,Item Name: Vlasic Snack'mm's Kosher Dill 16 Oz...,https://m.media-amazon.com/images/I/51Ex6uOH7y...
4,36806,"Item Name: McCormick Culinary Vanilla Extract,...",https://m.media-amazon.com/images/I/71QYlrOMoS...


In [4]:
import re

def clean_item_name(text):
    # remove "Item Name:" (case insensitive, optional leading/trailing spaces)
    text = re.sub(r'(?i)^item name:\s*', '', text.strip())
    # normalize spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

test['catalog_content'] = test['catalog_content'].apply(clean_item_name)
test.head()

Unnamed: 0,sample_id,catalog_content,image_link
0,100179,Rani 14-Spice Eshamaya's Mango Chutney (Indian...,https://m.media-amazon.com/images/I/71hoAn78AW...
1,245611,Natural MILK TEA Flavoring extract by HALO PAN...,https://m.media-amazon.com/images/I/61ex8NHCIj...
2,146263,Honey Filled Hard Candy - Bulk Pack 2 Pounds -...,https://m.media-amazon.com/images/I/61KCM61J8e...
3,95658,Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2...,https://m.media-amazon.com/images/I/51Ex6uOH7y...
4,36806,"McCormick Culinary Vanilla Extract, 32 fl oz -...",https://m.media-amazon.com/images/I/71QYlrOMoS...


In [5]:
import re
import pandas as pd

UNIT_CONVERSIONS = {
    'kg': 1000, 'kilogram': 1000, 'kilograms': 1000,
    'g': 1, 'gram': 1, 'grams': 1,
    'mg': 0.001, 'milligram': 0.001, 'milligrams': 0.001,
    'lb': 453.6, 'lbs': 453.6, 'pound': 453.6, 'pounds': 453.6,
    'oz': 28.35, 'ounce': 28.35, 'ounces': 28.35,

    'l': 1000, 'liter': 1000, 'liters': 1000, 'litre': 1000, 'litres': 1000,
    'ml': 1, 'milliliter': 1, 'milliliters': 1,
}

def safe_float(x):
    try:
        return float(x)
    except:
        return None

def extract_numeric_features_universal(text):
    if not isinstance(text, str):
        return pd.Series({'total_weight_g': 0, 'pack_qty': 1, 'pieces': 0, 'percent_value': 0})

    text = text.lower()
    total_weight_g = 0
    pack_qty = 1
    pieces = 0
    percent_value = 0

    # (a) Handle (2 x 14.1 oz) or similar
    match_multi = re.findall(r'(\d+(?:\.\d+)?)\s*[x×]\s*([\d\.]+)\s*([a-z%]+)', text)
    for n, qty, unit in match_multi:
        n, qty = safe_float(n), safe_float(qty)
        if n is None or qty is None:
            continue
        unit = unit.strip().lower()
        if unit in UNIT_CONVERSIONS:
            total_weight_g += n * qty * UNIT_CONVERSIONS[unit]

    # (b) Individual numeric-unit pairs
    matches = re.findall(r'(\d+(?:\.\d+)?)\s*([a-z%]+)', text)
    for val, unit in matches:
        val = safe_float(val)
        if val is None:
            continue
        unit = unit.strip().lower()
        if unit in UNIT_CONVERSIONS:
            total_weight_g = max(total_weight_g, val * UNIT_CONVERSIONS[unit])
        elif unit == '%':
            percent_value = max(percent_value, val)

    # (c) Pack quantities
    match_pack = re.search(r'(pack\s*of\s*(\d+))|(\b(\d+)\s*per\s*case)', text)
    if match_pack:
        nums = re.findall(r'\d+', match_pack.group())
        pack_qty = int(nums[0]) if nums else 1

    # (d) Pieces or counts
    match_pieces = re.search(r'(\d+)\s*(pieces|pcs|count|tabs|capsules|tablets)', text)
    if match_pieces:
        pieces = int(match_pieces.group(1))

    return pd.Series({
        'total_weight_g': total_weight_g,
        'pack_qty': pack_qty,
        'pieces': pieces,
        'percent_value': percent_value
    })


In [6]:
train_feats = test['catalog_content'].apply(extract_numeric_features_universal)
test = pd.concat([test, train_feats], axis=1)
test.head()

Unnamed: 0,sample_id,catalog_content,image_link,total_weight_g,pack_qty,pieces,percent_value
0,100179,Rani 14-Spice Eshamaya's Mango Chutney (Indian...,https://m.media-amazon.com/images/I/71hoAn78AW...,300.0,1.0,0.0,0.0
1,245611,Natural MILK TEA Flavoring extract by HALO PAN...,https://m.media-amazon.com/images/I/61ex8NHCIj...,56.7,1.0,0.0,0.0
2,146263,Honey Filled Hard Candy - Bulk Pack 2 Pounds -...,https://m.media-amazon.com/images/I/61KCM61J8e...,907.2,1.0,180.0,0.0
3,95658,Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2...,https://m.media-amazon.com/images/I/51Ex6uOH7y...,453.6,2.0,0.0,0.0
4,36806,"McCormick Culinary Vanilla Extract, 32 fl oz -...",https://m.media-amazon.com/images/I/71QYlrOMoS...,0.0,1.0,0.0,0.0


In [7]:
# Check missing
print('catalog_content nulls', test['catalog_content'].isna().sum())
print('image_link nulls', test['image_link'].isna().sum())


catalog_content nulls 0
image_link nulls 0


# DOWNLOADING IMAGES

In [8]:
import re
import os
import pandas as pd
import multiprocessing
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib
# SPECIFY YOUR CUSTOM SAVE LOCATION
CUSTOM_SAVE_PATH = "/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/sample_images"

def download_image(image_link, savefolder):
    if(isinstance(image_link, str)):
        filename = Path(image_link).name
        image_save_path = os.path.join(savefolder, filename)
        if(not os.path.exists(image_save_path)):
            try:
                urllib.request.urlretrieve(image_link, image_save_path)
            except Exception as ex:
                print('Warning: Not able to download - {}\n{}'.format(image_link, ex))
        else:
            return
    return

def download_images(image_links, download_folder=CUSTOM_SAVE_PATH):
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    results = []
    download_image_partial = partial(download_image, savefolder=download_folder)
    with multiprocessing.Pool(100) as pool:
        for result in tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)):
            results.append(result)
        pool.close()
        pool.join()

In [10]:
'''
image_urls = test['image_link'].tolist()
download_images(image_urls)

 56%|█████▌    | 41965/75000 [00:20<00:12, 2632.37it/s]

HTTP Error 404: Not Found


100%|██████████| 75000/75000 [00:37<00:00, 2011.88it/s]


# EXTRACTING EMBEDDINGS

In [8]:
# Run once
import os, sys, math, time, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

BASE = Path("/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION")
BERT_DIR = BASE / "finetuned_bert_price"
CLIP_CHECKPOINT = BASE / "models" / "fine_tuned_clip_final.pth"
IMAGE_DIR = Path("/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/sample_images")
EMB_DIR = BASE / "embeddings"
EMB_DIR.mkdir(exist_ok=True)

Device: cuda


In [9]:
print(EMB_DIR)


/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/embeddings


# text embed

In [12]:

# install transformers if needed
!pip install -q transformers

from transformers import AutoTokenizer, AutoModel
import torch
tokenizer = AutoTokenizer.from_pretrained(str(BERT_DIR))
bert = AutoModel.from_pretrained(str(BERT_DIR)).to(device)
bert.eval()

def get_text_embeddings(texts, batch_size=128, pooling="mean"):
    embs = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            enc = tokenizer(batch, padding=True, truncation=True, max_length=160, return_tensors="pt")
            enc = {k:v.to(device) for k,v in enc.items()}
            out = bert(**enc)
            last = out.last_hidden_state  # (B, L, H)
            if pooling=="cls":
                v = last[:,0,:]
            else:
                attn_mask = enc['attention_mask'].unsqueeze(-1)
                v = (last * attn_mask).sum(dim=1) / (attn_mask.sum(dim=1).clamp(min=1e-9))
            embs.append(v.cpu())
    return torch.cat(embs).numpy()


In [13]:
# Assume `sample_test` is a DataFrame with the same text column as training
texts = test['catalog_content'].fillna("").tolist()  # replace with actual column

# Reuse your function
test_text_emb = get_text_embeddings(texts, batch_size=128, pooling="mean")
np.save(EMB_DIR/"test_text_emb.npy", test_text_emb)
print("Test text embeddings shape:", test_text_emb.shape)


Test text embeddings shape: (75000, 768)


# IMAGE EMBED

In [10]:
# create image filename column (if not already)
from urllib.parse import urlparse
def fname_from_url(u):
    try:
        return Path(urlparse(u).path).name
    except:
        return ""
test['image_fname']  = test['image_link'].fillna("").apply(fname_from_url)
test.head()

Unnamed: 0,sample_id,catalog_content,image_link,total_weight_g,pack_qty,pieces,percent_value,image_fname
0,100179,Rani 14-Spice Eshamaya's Mango Chutney (Indian...,https://m.media-amazon.com/images/I/71hoAn78AW...,300.0,1.0,0.0,0.0,71hoAn78AWL.jpg
1,245611,Natural MILK TEA Flavoring extract by HALO PAN...,https://m.media-amazon.com/images/I/61ex8NHCIj...,56.7,1.0,0.0,0.0,61ex8NHCIjL.jpg
2,146263,Honey Filled Hard Candy - Bulk Pack 2 Pounds -...,https://m.media-amazon.com/images/I/61KCM61J8e...,907.2,1.0,180.0,0.0,61KCM61J8eL.jpg
3,95658,Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2...,https://m.media-amazon.com/images/I/51Ex6uOH7y...,453.6,2.0,0.0,0.0,51Ex6uOH7yL.jpg
4,36806,"McCormick Culinary Vanilla Extract, 32 fl oz -...",https://m.media-amazon.com/images/I/71QYlrOMoS...,0.0,1.0,0.0,0.0,71QYlrOMoSL.jpg


In [23]:

# ==========================================================
# ⚙️ SETUP
# ==========================================================
!pip install -q open_clip_torch pillow torchvision tqdm

import torch, os, numpy as np, shutil
from pathlib import Path
from tqdm import tqdm
from PIL import Image, ImageOps
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast
import open_clip

# ==========================================================
# 🧩 PATHS
# ==========================================================
DRIVE_BASE = Path("/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION")
IMAGE_DIR = DRIVE_BASE / "sample_images"
DRIVE_EMB_DIR = DRIVE_BASE / "embeddings"
LOCAL_EMB_DIR = Path("/content/embeddings")

DRIVE_EMB_DIR.mkdir(parents=True, exist_ok=True)
LOCAL_EMB_DIR.mkdir(parents=True, exist_ok=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"✅ Using device: {device}")

# ==========================================================
# 🧠 LOAD FINE-TUNED CLIP MODEL
# ==========================================================
CLIP_CHECKPOINT = DRIVE_BASE / "models/fine_tuned_clip_final.pth"

clip_model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
state = torch.load(CLIP_CHECKPOINT, map_location=device)

if isinstance(state, dict) and 'model_state_dict' in state:
    clip_model.load_state_dict(state['model_state_dict'])
else:
    clip_model.load_state_dict(state)

clip_model = clip_model.to(device)
clip_model.eval()
torch.backends.cudnn.benchmark = True
print("✅ CLIP model loaded and ready for inference")

# ==========================================================
#  DATASET CLASS (loads directly from Drive)
# ==========================================================
class DriveImageDataset(Dataset):
    def __init__(self, fnames, preprocess):
        self.fnames = fnames
        self.preprocess = preprocess

    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, i):
        p = IMAGE_DIR / self.fnames[i]
        try:
            if not p.exists():
                raise FileNotFoundError
            img = Image.open(p).convert("RGB")
            img = ImageOps.exif_transpose(img)
        except Exception as e:
            # If file missing or corrupted, use blank placeholder image
            img = Image.new("RGB", (224, 224), color="white")
        return self.preprocess(img)

# ==========================================================
# 💾 CHUNKED EXTRACTION FUNCTION (resumable)
# ==========================================================
def save_embeddings_in_chunks(fnames, prefix, model, preprocess, batch_size=192, chunk_size=10000):
    """
    Extract embeddings in chunks to avoid OOM.
    Automatically resumes if chunk already exists (locally or on Drive).
    """
    total = len(fnames)
    n_chunks = int(np.ceil(total / chunk_size))
    print(f"Total {total} images → {n_chunks} chunks (size {chunk_size})")

    for chunk_idx in range(n_chunks):
        chunk_name = f"{prefix}_chunk{chunk_idx:03d}.npy"
        local_path = LOCAL_EMB_DIR / chunk_name
        drive_path = DRIVE_EMB_DIR / chunk_name

        # ✅ Skip if already exists locally or on Drive
        if local_path.exists() or drive_path.exists():
            print(f"✔ Skipping chunk {chunk_idx+1}/{n_chunks}")
            continue

        start, end = chunk_idx * chunk_size, min((chunk_idx+1) * chunk_size, total)
        sub_fnames = fnames[start:end]
        print(f"\n▶ Processing chunk {chunk_idx+1}/{n_chunks}: {start}–{end}")

        ds = DriveImageDataset(sub_fnames, preprocess)
        dl = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

        chunk_embs = []
        with torch.no_grad():
            for imgs in tqdm(dl):
                imgs = imgs.to(device, non_blocking=True)
                with autocast():
                    feats = model.encode_image(imgs)
                feats = feats / feats.norm(dim=-1, keepdim=True)
                chunk_embs.append(feats.cpu())
                del imgs, feats
                torch.cuda.empty_cache()

        chunk_embs = torch.cat(chunk_embs).numpy()
        np.save(local_path, chunk_embs)
        print(f"💾 Saved {chunk_embs.shape} → {local_path}")

        # ✅ Copy safely to Drive
        shutil.copy(local_path, drive_path)
        print(f"☁️ Copied to Drive → {drive_path}")


# ==========================================================
# 🔄 MERGE CHUNKS INTO ONE FILE
# ==========================================================
def merge_chunks(prefix, output_name):
    chunk_files = sorted([f for f in os.listdir(DRIVE_EMB_DIR) if f.startswith(prefix) and f.endswith(".npy")])
    print(f"Merging {len(chunk_files)} chunks…")

    all_parts = [np.load(DRIVE_EMB_DIR / f) for f in chunk_files]
    full = np.concatenate(all_parts, axis=0)

    out_path = DRIVE_EMB_DIR / output_name
    np.save(out_path, full)
    print(f"✅ Saved merged embeddings → {out_path}, shape={full.shape}")
    return full


# ==========================================================
# 🧩 STATUS CHECKER
# ==========================================================
def check_progress(prefix):
    all_files = sorted([f for f in os.listdir(DRIVE_EMB_DIR) if f.startswith(prefix) and f.endswith(".npy")])
    print(f"✅ {len(all_files)} chunks completed for prefix: {prefix}")
    for f in all_files:
        size_mb = os.path.getsize(DRIVE_EMB_DIR / f) / 1e6
        print(f"  - {f} ({size_mb:.1f} MB)")


# ==========================================================
# 🧠 RUN EXTRACTION (train/test)
# ==========================================================
# Example: assume train_df_clean & test_df_clean are already defined
test_fnames = test['image_fname'].tolist()


# 🔹 TEST
save_embeddings_in_chunks(test_fnames, "test_img_emb", clip_model, preprocess, batch_size=192, chunk_size=10000)
check_progress("test_img_emb")

#🔹 Merge (run at the very end)
sample_test_img_emb = merge_chunks("test_img_emb", "test_img_emb.npy")


✅ Using device: cuda
✅ CLIP model loaded and ready for inference
Total 75000 images → 8 chunks (size 10000)
✔ Skipping chunk 1/8
✔ Skipping chunk 2/8
✔ Skipping chunk 3/8
✔ Skipping chunk 4/8
✔ Skipping chunk 5/8
✔ Skipping chunk 6/8
✔ Skipping chunk 7/8
✔ Skipping chunk 8/8
✅ 8 chunks completed for prefix: test_img_emb
  - test_img_emb_chunk000.npy (10.2 MB)
  - test_img_emb_chunk001.npy (10.2 MB)
  - test_img_emb_chunk002.npy (10.2 MB)
  - test_img_emb_chunk003.npy (10.2 MB)
  - test_img_emb_chunk004.npy (10.2 MB)
  - test_img_emb_chunk005.npy (10.2 MB)
  - test_img_emb_chunk006.npy (10.2 MB)
  - test_img_emb_chunk007.npy (5.1 MB)
Merging 8 chunks…
✅ Saved merged embeddings → /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/embeddings/test_img_emb.npy, shape=(75000, 512)


In [14]:
from joblib import load
num_scaler = load(DRIVE_BASE / "final_fusion_model/numerical_scaler.joblib")  # adjust path if needed

X_num_test = test[['total_weight_g', 'pack_qty', 'pieces', 'percent_value']].copy()
test_num_scaled = num_scaler.transform(X_num_test)
print("✅ Scaled sample_test numerical features:", test_num_scaled.shape)



✅ Scaled sample_test numerical features: (75000, 4)




In [24]:
test_img_emb = np.load(DRIVE_EMB_DIR / "test_img_emb.npy")


# Fused EMBEDDINGS

In [25]:
text_scaler = load(DRIVE_BASE / "final_fusion_model/text_scaler.joblib")
image_scaler = load(DRIVE_BASE / "final_fusion_model/image_scaler.joblib")
text_weight = 0.6
image_weight = 0.32
numerical_weight = 0.08

test_features = np.hstack([
    text_weight * text_scaler.transform(test_text_emb),
    image_weight * image_scaler.transform(test_img_emb),
    numerical_weight * test_num_scaled
])

print("Test features shape:", test_features.shape)


Test features shape: (75000, 1284)


In [26]:
print("Text embeddings shape:", test_text_emb.shape)
print("Image embeddings shape:", test_img_emb.shape)
print("Test DataFrame length:", len(test))


Text embeddings shape: (75000, 768)
Image embeddings shape: (75000, 512)
Test DataFrame length: 75000


# PREDICTIONS

In [27]:
import lightgbm as lgb
import os

save_path = "/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/final_fusion_model"

# Load all 5 models
cv_models = [lgb.Booster(model_file=os.path.join(save_path, f"cv_model_fold_{i}.txt")) for i in range(1, 6)]

print(f"✅ Loaded {len(cv_models)} CV models")


✅ Loaded 5 CV models


In [28]:
preds_log = np.mean([model.predict(test_features) for model in cv_models], axis=0)
preds_actual = np.expm1(preds_log)
preds_actual = np.clip(preds_actual, 0, None)  # Ensure no negatives

In [30]:
# === Paths ===
save_path = "/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION"
output_path = os.path.join(save_path, "test_out.csv")
# === 4️⃣ Create output DataFrame ===
pred_df = pd.DataFrame({
    "sample_id": test["sample_id"],  # from original test CSV
    "price": preds_actual
})

# List of corrupted or missing image URLs / filenames
corrupted_images = ["813CjSgHj0S.jpg"]

# Match sample_ids in test that correspond to these images
corrupted_ids = test.loc[
    test['image_link'].str.contains('|'.join(corrupted_images), na=False),
    'sample_id'
].tolist()

print(f"⚠️ Found {len(corrupted_ids)} corrupted entries → {corrupted_ids}")

# Set price = 0 for those
pred_df.loc[pred_df['sample_id'].isin(corrupted_ids), 'price'] = 0.0


# === 5️⃣ Save to CSV ===
pred_df.to_csv(output_path, index=False)
print(f"✅ Saved predictions → {output_path}")
print(pred_df.head())

⚠️ Found 1 corrupted entries → [286800]
✅ Saved predictions → /content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/test_out.csv
   sample_id      price
0     100179  12.335172
1     245611   9.181778
2     146263  23.185008
3      95658   6.744482
4      36806  34.413048


In [34]:
test_output = pd.read_csv(f"/content/drive/MyDrive/Colab Notebooks/AMAZON ML PRICE PREDICTION/test_out.csv")
test_output.shape

(75000, 2)

In [36]:
# Check missing
print('catalog_content nulls', test_output['sample_id'].isna().sum())
print('image_link nulls', test_output['price'].isna().sum())
zero_price_rows = test_output[test_output['price'] == 0]
print(f"🧾 Found {len(zero_price_rows)} rows with price = 0")
display(zero_price_rows)


catalog_content nulls 0
image_link nulls 0
🧾 Found 1 rows with price = 0


Unnamed: 0,sample_id,price
42045,286800,0.0
