In [2]:
# --- Core Libraries ---
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm
import os

# --- Machine Learning ---
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# --- Deep Learning (for Embeddings) ---
import torch
import timm
from sentence_transformers import SentenceTransformer
from PIL import Image
from torchvision import transforms

# --- Setup ---
# Set up the device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tqdm.pandas()

print(f"Setup complete. Using device: {device}")

Setup complete. Using device: cuda


In [3]:
# Load the datasets
try:
    train_df = pd.read_csv('dataset/train.csv')
    test_df = pd.read_csv('dataset/test.csv')
except FileNotFoundError:
    print("Please make sure train.csv and test.csv are in a 'dataset' folder.")
    # Create dummy dataframes to allow the rest of the notebook to run for demonstration
    train_df = pd.DataFrame() 
    test_df = pd.DataFrame()

print("--- Training Data ---")
print(f"Shape: {train_df.shape}")
print(train_df.head())

print("\n--- Test Data ---")
print(f"Shape: {test_df.shape}")
print(test_df.head())

--- Training Data ---
Shape: (75000, 4)
   sample_id                                    catalog_content  \
0      33127  Item Name: La Victoria Green Taco Sauce Mild, ...   
1     198967  Item Name: Salerno Cookies, The Original Butte...   
2     261251  Item Name: Bear Creek Hearty Soup Bowl, Creamy...   
3      55858  Item Name: Judee’s Blue Cheese Powder 11.25 oz...   
4     292686  Item Name: kedem Sherry Cooking Wine, 12.7 Oun...   

                                          image_link  price  
0  https://m.media-amazon.com/images/I/51mo8htwTH...   4.89  
1  https://m.media-amazon.com/images/I/71YtriIHAA...  13.12  
2  https://m.media-amazon.com/images/I/51+PFEe-w-...   1.97  
3  https://m.media-amazon.com/images/I/41mu0HAToD...  30.34  
4  https://m.media-amazon.com/images/I/41sA037+Qv...  66.49  

--- Test Data ---
Shape: (75000, 3)
   sample_id                                    catalog_content  \
0     100179  Item Name: Rani 14-Spice Eshamaya's Mango Chut...   
1     245611  

In [4]:
def parse_catalog_content(text):
    pack_size_match = re.search(r'\(Pack of (\d+)\)', text, re.IGNORECASE)
    pack_size = int(pack_size_match.group(1)) if pack_size_match else 1
    
    value_match = re.search(r'Value: ([\d.]+)', text)
    value = float(value_match.group(1)) if value_match else np.nan
    
    unit_match = re.search(r'Unit: ([a-zA-Z]+)', text)
    unit = unit_match.group(1) if unit_match else 'Unknown'
    
    return pack_size, value, unit

# Apply parsing
train_df[['Pack_Size', 'Value', 'Unit']] = train_df['catalog_content'].apply(lambda x: pd.Series(parse_catalog_content(x)))
test_df[['Pack_Size', 'Value', 'Unit']] = test_df['catalog_content'].apply(lambda x: pd.Series(parse_catalog_content(x)))

# Handle missing 'Value' by filling with a neutral value (like the mean)
mean_value = train_df['Value'].mean()
train_df['Value'].fillna(mean_value, inplace=True)
test_df['Value'].fillna(mean_value, inplace=True)

# Encode 'Unit' feature
all_units = pd.concat([train_df['Unit'], test_df['Unit']]).astype(str).unique()
unit_encoder = LabelEncoder().fit(all_units)
train_df['Unit_Encoded'] = unit_encoder.transform(train_df['Unit'].astype(str))
test_df['Unit_Encoded'] = unit_encoder.transform(test_df['Unit'].astype(str))

print("--- Training Data After Parsing ---")
print(train_df[['sample_id', 'Pack_Size', 'Value', 'Unit']].head())

--- Training Data After Parsing ---
   sample_id  Pack_Size  Value   Unit
0      33127          6  72.00     Fl
1     198967          4  32.00  Ounce
2     261251          6  11.40  Ounce
3      55858          1  11.25  Ounce
4     292686          1  12.00  Count


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Value'].fillna(mean_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Value'].fillna(mean_value, inplace=True)


In [5]:
# Cell 3b: Create Log-Transformed Numerical Features

# Apply log transform to skewed numerical features to help the model
# We use log1p which handles zeros safely (log(1+x))
for col in ['Pack_Size', 'Value']:
    train_df[f'{col}_log'] = np.log1p(train_df[col])
    test_df[f'{col}_log'] = np.log1p(test_df[col])

print("--- Created Log-Transformed Features ---")
print(train_df[['Pack_Size', 'Pack_Size_log', 'Value', 'Value_log']].head())

--- Created Log-Transformed Features ---
   Pack_Size  Pack_Size_log  Value  Value_log
0          6       1.945910  72.00   4.290459
1          4       1.609438  32.00   3.496508
2          6       1.945910  11.40   2.517696
3          1       0.693147  11.25   2.505526
4          1       0.693147  12.00   2.564949


In [6]:
# Cell 3c: Create a Clean Text Column for Embeddings

def clean_text(text):
    # Remove boilerplate patterns
    text = re.sub(r'Item Name:', '', text)
    text = re.sub(r'Bullet Point \d+:', '', text)
    text = re.sub(r'Value: [\d.]+', '', text)
    text = re.sub(r'Unit: \w+', '', text)
    text = re.sub(r'Product Description:', '', text)
    # Remove extra whitespace and newlines
    text = ' '.join(text.split())
    return text

print("Creating clean text column...")
train_df['clean_catalog_content'] = train_df['catalog_content'].apply(clean_text)
test_df['clean_catalog_content'] = test_df['catalog_content'].apply(clean_text)

print("--- Sample Cleaned Text ---")
print(train_df['clean_catalog_content'].iloc[0])

Creating clean text column...
--- Sample Cleaned Text ---
La Victoria Green Taco Sauce Mild, 12 Ounce (Pack of 6) Oz


In [7]:
# Load the sentence transformer model
# This model is lightweight and effective for creating semantic embeddings.
text_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# --- Generate embeddings using the CLEANED text column ---
print("Generating text embeddings for training data using 'clean_catalog_content'...")
train_text_embeddings = text_model.encode(train_df['clean_catalog_content'].tolist(), show_progress_bar=True)

print("Generating text embeddings for test data using 'clean_catalog_content'...")
test_text_embeddings = text_model.encode(test_df['clean_catalog_content'].tolist(), show_progress_bar=True)

# --- Convert the embeddings into DataFrames ---
# Each dimension of the embedding becomes a column.
train_text_embed_df = pd.DataFrame(train_text_embeddings, columns=[f'txt_{i}' for i in range(train_text_embeddings.shape[1])])
test_text_embed_df = pd.DataFrame(test_text_embeddings, columns=[f'txt_{i}' for i in range(test_text_embeddings.shape[1])])

print(f"\nText embedding shape for training data: {train_text_embed_df.shape}")
print("Sample of the new text embedding DataFrame:")
print(train_text_embed_df.head())

Generating text embeddings for training data using 'clean_catalog_content'...


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

Generating text embeddings for test data using 'clean_catalog_content'...


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]


Text embedding shape for training data: (75000, 384)
Sample of the new text embedding DataFrame:
      txt_0     txt_1     txt_2     txt_3     txt_4     txt_5     txt_6  \
0  0.039429 -0.025435 -0.018353  0.068541 -0.007958  0.020131  0.056719   
1 -0.028763  0.003258 -0.023879  0.048128 -0.021228 -0.093302  0.040878   
2 -0.009176  0.009118  0.074105  0.103930 -0.019267  0.022982  0.015732   
3 -0.103300 -0.086185  0.044509 -0.066582  0.064387  0.011055  0.015681   
4  0.077502  0.063593 -0.032874  0.036160 -0.119086 -0.026762  0.020524   

      txt_7     txt_8     txt_9  ...   txt_374   txt_375   txt_376   txt_377  \
0  0.015985  0.053348 -0.041363  ...  0.017251 -0.010899  0.058022 -0.027491   
1  0.044752 -0.044969 -0.033900  ...  0.026277 -0.064846  0.004683  0.031427   
2 -0.107359 -0.033161 -0.030437  ...  0.035247  0.034958 -0.009895 -0.059075   
3  0.093202 -0.007221 -0.067258  ... -0.005042 -0.039194 -0.012712  0.007271   
4 -0.000910  0.028441 -0.076278  ...  0.015324  0.0

In [8]:
# Cell 4b: Download Images from URLs

import requests
import os
from tqdm.notebook import tqdm

IMAGE_FOLDER = 'images'
os.makedirs(IMAGE_FOLDER, exist_ok=True) # Create the folder if it doesn't exist

def download_image(sample_id, url, folder=IMAGE_FOLDER):
    """Downloads an image from a URL and saves it locally."""
    filepath = os.path.join(folder, f"{sample_id}.jpg")
    
    # Skip download if the file already exists to save time
    if os.path.exists(filepath):
        return

    try:
        response = requests.get(url, stream=True, timeout=10)
        # Check if the request was successful
        if response.status_code == 200:
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
        else:
            # Uncomment the line below to see which specific images fail
            # print(f"Warning: Failed to download {url} with status code {response.status_code}")
            pass
    except Exception as e:
        # Uncomment the line below to see download errors (e.g., timeouts)
        # print(f"Warning: Error downloading {url}: {e}")
        pass

# --- Download all training images ---
print(f"Downloading {len(train_df)} training images to '{IMAGE_FOLDER}' folder...")
for index, row in tqdm(train_df.iterrows(), total=len(train_df)):
    download_image(row['sample_id'], row['image_link'])

# --- Download all test images ---
print(f"\nDownloading {len(test_df)} test images to '{IMAGE_FOLDER}' folder...")
for index, row in tqdm(test_df.iterrows(), total=len(test_df)):
    download_image(row['sample_id'], row['image_link'])
    
print("\nImage downloading complete.")

Downloading 75000 training images to 'images' folder...


  0%|          | 0/75000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:
# Cell 4b: High-Speed, Parallel Image Download

import requests
import os
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

IMAGE_FOLDER = 'images'
os.makedirs(IMAGE_FOLDER, exist_ok=True) 

# The function to download a single image (this stays the same)
def download_image(args):
    """Downloads a single image, but only if it doesn't already exist."""
    sample_id, url, folder = args
    filepath = os.path.join(folder, f"{sample_id}.jpg")
    
    # Skip if the file already exists
    if os.path.exists(filepath):
        return (sample_id, "skipped")

    try:
        response = requests.get(url, stream=True, timeout=15)
        if response.status_code == 200:
            with open(filepath, 'wb') as f:
                f.write(response.content)
            return (sample_id, "success")
        else:
            return (sample_id, f"failed_status_{response.status_code}")
    except Exception:
        return (sample_id, "failed_exception")

# --- Create a list of all images to download ---
all_images_to_download = []
for _, row in train_df.iterrows():
    all_images_to_download.append((row['sample_id'], row['image_link'], IMAGE_FOLDER))
for _, row in test_df.iterrows():
    all_images_to_download.append((row['sample_id'], row['image_link'], IMAGE_FOLDER))

# --- Use ThreadPoolExecutor for parallel downloading ---
# Adjust MAX_WORKERS based on your connection. 8-16 is a good start.
MAX_WORKERS = 16 
print(f"Starting parallel download with {MAX_WORKERS} workers...")

# Use a context manager to ensure threads are cleaned up properly
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit all download tasks to the executor
    futures = [executor.submit(download_image, args) for args in all_images_to_download]
    
    # Use tqdm to create a progress bar that updates as tasks are completed
    for future in tqdm(as_completed(futures), total=len(all_images_to_download)):
        # You can optionally check results here if needed
        # result = future.result() 
        pass

print("\nHigh-speed image downloading complete.")

Starting parallel download with 16 workers...


  0%|          | 0/150000 [00:00<?, ?it/s]


High-speed image downloading complete.


In [10]:
# Cell 5: Generate Image Embeddings from Local Files

# --- Check if the image folder exists ---
IMAGE_FOLDER = 'images' 
if not os.path.exists(IMAGE_FOLDER):
    print(f" ERROR: The '{IMAGE_FOLDER}' directory was not found.")
    print("Please run the image download cell first.")
else:
    print(f" Image folder '{IMAGE_FOLDER}' found. Proceeding with embedding.")

# --- Image Model Setup (remains the same) ---
img_model = timm.create_model('efficientnet_b0', pretrained=True, num_classes=0, global_pool='avg').to(device)
img_model.eval()
config = img_model.default_cfg
transform = transforms.Compose([
    transforms.Resize(config['input_size'][1:]),
    transforms.CenterCrop(config['input_size'][1:]),
    transforms.ToTensor(),
    transforms.Normalize(mean=config['mean'], std=config['std']),
])

# --- Image Feature Extraction Function ---
def get_image_embedding(sample_id, model, device, transform, image_folder=IMAGE_FOLDER):
    image_path = os.path.join(image_folder, f"{sample_id}.jpg")
    
    if not os.path.exists(image_path):
        return np.zeros(1280)
        
    try:
        img = Image.open(image_path).convert('RGB')
        batch_img = transform(img).unsqueeze(0).to(device)
        with torch.no_grad():
            embedding = model(batch_img)
        return embedding.cpu().numpy().flatten()
    except Exception as e:
        return np.zeros(1280)

# --- Generate Image Embeddings ---
print("\nGenerating image embeddings for training data...")
train_image_embeddings = train_df['sample_id'].progress_apply(
    lambda x: get_image_embedding(x, img_model, device, transform)
)

print("Generating image embeddings for test data...")
test_image_embeddings = test_df['sample_id'].progress_apply(
    lambda x: get_image_embedding(x, img_model, device, transform)
)

# --- Convert to DataFrames ---
train_img_embed_df = pd.DataFrame(train_image_embeddings.to_list(), columns=[f'img_{i}' for i in range(1280)])
test_img_embed_df = pd.DataFrame(test_image_embeddings.to_list(), columns=[f'img_{i}' for i in range(1280)])

print(f"\nImage embedding shape: {train_img_embed_df.shape}")
print("Sample of new image embedding DataFrame (should NOT be all zeros):")
print(train_img_embed_df.head())

 Image folder 'images' found. Proceeding with embedding.

Generating image embeddings for training data...


  0%|          | 0/75000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [8]:
# Cell 6: Combine All Features (Using Log Features)

# Use our new log-transformed numerical features
numerical_features = ['Pack_Size_log', 'Value_log', 'Unit_Encoded']
X_train_base = train_df[numerical_features]
X_test_base = test_df[numerical_features]

# Combine all features (assuming embed dfs are created)
X_train = pd.concat([X_train_base, train_text_embed_df, train_img_embed_df], axis=1)
X_test = pd.concat([X_test_base, test_text_embed_df, test_img_embed_df], axis=1)

# Go back to predicting the log of the original price
y_train_log = np.log1p(train_df['price'])

print(f"X_train shape: {X_train.shape}")
print(f"Using features: {numerical_features}")

X_train shape: (75000, 1667)
Using features: ['Pack_Size_log', 'Value_log', 'Unit_Encoded']


In [16]:
# Cell 6b: Robust Model Validation with K-Fold Cross-Validation

from sklearn.model_selection import KFold

def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / denominator) * 100

# We will store out-of-fold (OOF) predictions here
oof_preds = np.zeros(len(X_train))
oof_true = np.zeros(len(X_train))

# Set up K-Fold cross-validation
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Use the balanced hyperparameters from our last attempt
params = {
    'objective': 'regression_l1', 'metric': 'mae', 'n_estimators': 4000,
    'learning_rate': 0.01, 'seed': 42, 'n_jobs': -1, 'verbose': -1,
    'num_leaves': 51, 'feature_fraction': 0.8, 'bagging_fraction': 0.8,
    'lambda_l1': 0.5, 'lambda_l2': 0.5, 'min_child_samples': 20
}

print(f"Starting {N_SPLITS}-Fold Cross-Validation...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train_log)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")

    # Get the data for this fold
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_log_fold, y_val_log_fold = y_train_log.iloc[train_idx], y_train_log.iloc[val_idx]

    # Initialize and train a fresh model for this fold
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train_fold, y_train_log_fold,
              eval_set=[(X_val_fold, y_val_log_fold)],
              callbacks=[lgb.early_stopping(100, verbose=False)])

    # Make predictions on the validation set for this fold
    val_preds_log = model.predict(X_val_fold)
    
    # Store the predictions and true values
    oof_preds[val_idx] = np.expm1(val_preds_log)
    oof_true[val_idx] = np.expm1(y_val_log_fold)

# Calculate the overall SMAPE score across all folds
overall_smape = smape(oof_true, oof_preds)
print("\n-------------------------------------------")
print(f"✅ Overall Cross-Validation SMAPE Score: {overall_smape:.4f}%")
print("-------------------------------------------")

Starting 5-Fold Cross-Validation...
--- Fold 1/5 ---
--- Fold 2/5 ---
--- Fold 3/5 ---
--- Fold 4/5 ---
--- Fold 5/5 ---

-------------------------------------------
✅ Overall Cross-Validation SMAPE Score: 55.6683%
-------------------------------------------


In [17]:
# The model 'model_for_validation' is already trained from the previous cell.
# The validation SMAPE is also already calculated.

# 1. Make predictions on the TRAINING data (the data it just learned from)
print("Making predictions on the training split data...")
train_preds_log = model_for_validation.predict(X_train_split)

# 2. Convert predictions and true values back from log scale
train_preds = np.expm1(train_preds_log)
y_train_true = np.expm1(y_train_log_split)

# 3. Calculate the SMAPE score on the TRAINING data
training_smape = smape(y_train_true, train_preds)

print("\n--- Overfitting Diagnosis ---")
print(f" SMAPE on Training Data (80%): {training_smape:.4f}%")
print(f" SMAPE on Validation Data (20%): {validation_smape:.4f}%")
print("---------------------------------")

# 4. Interpret the results
gap = validation_smape - training_smape
if gap > 15: # A gap of >15% is a strong sign of overfitting
    print(f" Result: High Overfitting Detected! The gap between scores is {gap:.2f}%.")
    print("   The model has memorized the training data and is not generalizing well.")
elif gap > 5:
    print(f" Result: Moderate Overfitting. The gap is {gap:.2f}%. Try to reduce it.")
else:
    print(" Result: Good Fit. The model is generalizing well.")

Making predictions on the training split data...

--- Overfitting Diagnosis ---
 SMAPE on Training Data (80%): 40.3308%
 SMAPE on Validation Data (20%): 56.3582%
---------------------------------
 Result: High Overfitting Detected! The gap between scores is 16.03%.
   The model has memorized the training data and is not generalizing well.
