<a href="https://colab.research.google.com/github/munnurumahesh03-coder/Amazon-ML-Hackathon-2025/blob/main/Model_Gauntlet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Imports**

---



In [None]:
# =============================================================================
# SECTION 1: PROJECT SETUP
# =============================================================================

# -- 1.1: Import Libraries --
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack
import lightgbm as lgb

print("Section 1.1: Libraries imported successfully.")


Section 1.1: Libraries imported successfully.


In [None]:
# -- 1.2: Define File Paths --
# Locations of the two CSV files that the next cell reads with pandas.
# The defaults are relative paths, so they are resolved against the notebook's
# current working directory; edit them if your files live somewhere else.
TRAIN_FILE_PATH = "train.csv"  # <--- EDIT THIS PATH
TEST_FILE_PATH = "test.csv"    # <--- EDIT THIS PATH

# Echo the configured paths so a wrong location is visible before loading.
print("Section 1.2: File paths defined.")
print(f"Train file is set to: {TRAIN_FILE_PATH}")
print(f"Test file is set to: {TEST_FILE_PATH}")


Section 1.2: File paths defined.
Train file is set to: train.csv
Test file is set to: test.csv


In [None]:
# -- 1.3: Load Datasets --
# Every later section reads `train_df` / `test_df`, so a failed load has to stop
# the run here instead of resurfacing further down as a confusing NameError.
try:
    train_df = pd.read_csv(TRAIN_FILE_PATH)
    test_df = pd.read_csv(TEST_FILE_PATH)
except FileNotFoundError:
    print("\n---")
    print("ERROR in Section 1.3: Data files not found.")
    print("Please check if the paths defined in the previous cell are correct.")
    print("---")
    # Re-raise so "Run All" halts at the real cause of the failure.
    raise
# Any other exception (parse errors, permissions, ...) is deliberately left to
# propagate with its full traceback rather than being reduced to one line.

print("Section 1.3: Data loaded successfully.")
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# Display the first few rows to verify
print("\nFirst 3 rows of training data:")
pd.set_option('display.max_colwidth', 100)
print(train_df.head(3))


Section 1.3: Data loaded successfully.
Training data shape: (75000, 4)
Test data shape: (75000, 3)

First 3 rows of training data:
   sample_id  \
0      33127   
1     198967   
2     261251   

                                                                                       catalog_content  \
0       Item Name: La Victoria Green Taco Sauce Mild, 12 Ounce (Pack of 6)\nValue: 72.0\nUnit: Fl Oz\n   
1  Item Name: Salerno Cookies, The Original Butter Cookies, 8 Ounce (Pack of 4)\nBullet Point 1: Or...   
2  Item Name: Bear Creek Hearty Soup Bowl, Creamy Chicken with Rice, 1.9 Ounce (Pack of 6)\nBullet ...   

                                            image_link  price  
0  https://m.media-amazon.com/images/I/51mo8htwTHL.jpg   4.89  
1  https://m.media-amazon.com/images/I/71YtriIHAAL.jpg  13.12  
2  https://m.media-amazon.com/images/I/51+PFEe-w-L.jpg   1.97  


In [None]:
# -- 1.4: Define Evaluation Metric --
def smape(y_true, y_pred):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (SMAPE), in percent.

    SMAPE = mean( |pred - true| / ((|true| + |pred|) / 2) ) * 100

    Args:
        y_true: Array-like of actual values (list, NumPy array, pandas Series).
        y_pred: Array-like of predictions with the same length as `y_true`.

    Returns:
        The SMAPE as a float in the range [0, 200].

    Both inputs are converted to plain float arrays first, so the comparison is
    always positional. Without this, two pandas Series with different indexes
    would be aligned by label and silently produce NaN, and plain Python lists
    would fail on the subtraction.

    A small epsilon is added to the denominator to avoid division by zero when
    both the actual and the predicted value are 0.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / (denominator + 1e-8)) * 100

print("Section 1.4: SMAPE function defined successfully.")
print("\n--- Section 1 Complete ---")


Section 1.4: SMAPE function defined successfully.

--- Section 1 Complete ---


In [None]:
train_df

Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, 12 Ounce (Pack of 6)\nValue: 72.0\nUnit: Fl Oz\n",https://m.media-amazon.com/images/I/51mo8htwTHL.jpg,4.890
1,198967,"Item Name: Salerno Cookies, The Original Butter Cookies, 8 Ounce (Pack of 4)\nBullet Point 1: Or...",https://m.media-amazon.com/images/I/71YtriIHAAL.jpg,13.120
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy Chicken with Rice, 1.9 Ounce (Pack of 6)\nBullet ...",https://m.media-amazon.com/images/I/51+PFEe-w-L.jpg,1.970
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz - Gluten-Free and Nut-Free - Use in Seasonings an...,https://m.media-amazon.com/images/I/41mu0HAToDL.jpg,30.340
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Ounce - 12 per case.\nBullet Point: kedem Sherry Cook...",https://m.media-amazon.com/images/I/41sA037+QvL.jpg,66.490
...,...,...,...,...
74995,41424,"Item Name: ICE BREAKERS Spearmint Sugar Free Mints Tins, 1.5 oz (8 Count)\nBullet Point 1: Conta...",https://m.media-amazon.com/images/I/81p9PcPsffL.jpg,10.395
74996,35537,"Item Name: Davidson's Organics, Vanilla Essence, 100-count Individually Wrapped Tea Bags\nBullet...",https://m.media-amazon.com/images/I/51DDKoa+mbL.jpg,35.920
74997,249971,Item Name: Jolly Rancher Hard Candy - Blue Raspberry - 5 Pound Resealable Bag\nProduct Descripti...,https://m.media-amazon.com/images/I/91R2XCcpUfL.jpg,50.330
74998,188322,"Item Name: Nescafe Dolce Gusto Capsules - CARAMEL MACCHIATO, 16 Pods\nBullet Point 1: Nescafe Do...",https://m.media-amazon.com/images/I/51W40YU98+L.jpg,15.275


# **EDA**

---



In [None]:
# =============================================================================
# SECTION 2: EXPLORATORY DATA ANALYSIS (EDA) FOR FEATURE ENGINEERING
# =============================================================================

# -- 2.1: Objective and Setup --

print("Starting Section 2: Targeted EDA on 'catalog_content'...")
print("Objective: To discover patterns for brand, weight, volume, and other keywords before extraction.")

# Combine train and test for a complete overview of text patterns.
# This ensures we capture patterns present in either dataset.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index. The test
# rows have no 'price' value, so that column is NaN for them in combined_df.
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# For this EDA, it's helpful to see the full text content.
# This is a global pandas display option; it replaces the width of 100 that
# was set when the data was loaded and stays in effect for later cells.
pd.set_option('display.max_colwidth', 300)

print(f"\nCombined DataFrame created for EDA with shape: {combined_df.shape}")


Starting Section 2: Targeted EDA on 'catalog_content'...
Objective: To discover patterns for brand, weight, volume, and other keywords before extraction.

Combined DataFrame created for EDA with shape: (150000, 4)


In [None]:
# -- 2.2: Discovering All Potential Units (Weight & Volume) --

print("Searching for all number-unit patterns in the dataset...")

# Group 1 is a number (integer or decimal); group 2 is the run of ASCII letters
# that follows it, i.e. the candidate unit. The trailing \b stops the match
# from ending in the middle of a word.
unit_pattern = re.compile(r'(\d+\.?\d*)\s*([a-zA-Z]{1,})\b')

# One list of (number, unit) tuples per row; rows with null text are dropped.
all_matches = combined_df['catalog_content'].str.findall(unit_pattern).dropna()

# Collect every distinct candidate unit, lower-cased so 'G' and 'g' collapse
# into a single entry. The isnumeric() guard is kept as a safety net.
discovered_units = {
    unit_text.lower()
    for row_matches in all_matches
    for _number_text, unit_text in row_matches
    if not unit_text.isnumeric()
}

# Alphabetical order makes the long list easier to scan by eye.
sorted_units = sorted(discovered_units)

print(f"\nDiscovered {len(sorted_units)} unique potential units.")
print("Here is the complete list:")
print(sorted_units)

print("\n---")
print("Analysis: Please review this list carefully. We will use it to identify all variations of weight (g, oz, kg, lb) and volume (ml, l, floz).")


Searching for all number-unit patterns in the dataset...

Discovered 5230 unique potential units.
Here is the complete list:

---
Analysis: Please review this list carefully. We will use it to identify all variations of weight (g, oz, kg, lb) and volume (ml, l, floz).


In [None]:
# -- 2.3: Analyze and Define Unit Groups --

print("Analyzing the discovered units and grouping them into categories...")

# From our manual review of the 'sorted_units' list, we define our whitelists.
# We are creating these lists based on the evidence from the data.

# -- WEIGHT UNITS --
# We include singular, plural, and common abbreviations.
WEIGHT_UNITS = [
    'g', 'gr', 'gram', 'grams', 'gramos',
    'kg', 'kgs', 'kilo', 'kilogram', 'kilograms', 'quilogramas',
    'oz', 'ounce', 'ounces', 'onza', 'onzas',
    'lb', 'lbs', 'libra', 'libras', 'pound', 'pounds',
    'mg', 'milligram', 'milligrams',
    'mcg', 'microgram', 'micrograms'
]

# -- VOLUME UNITS --
VOLUME_UNITS = [
    'ml', 'milliliter', 'milliliters', 'mililiters',
    'l', 'liter', 'liters', 'litre', 'litres', 'litros',
    'floz', 'fl', 'fluid', # 'fluid' is often followed by 'oz' but can be standalone
    'gal', 'gallon', 'gallons', 'galon',
    'qt', 'qts', 'quart', 'quarts',
    'pt', 'pint', 'pints',
    'cup', 'cups'
]

# -- DIMENSION UNITS --
DIMENSION_UNITS = [
    'cm', 'centimeter', 'centimeters',
    'mm', 'millimeter', 'millimeters',
    'in', 'inch', 'inches',
    'ft', 'feet'
]

# -- COUNT/PACK UNITS --
# These are units that imply a quantity of items.
COUNT_UNITS = [
    'pack', 'packs', 'pk', 'pkg', 'pck',
    'count', 'ct', 'cnt',
    'ea', 'each',
    'pc', 'pcs', 'piece', 'pieces',
    'dozen', 'doz', 'dz',
    'roll', 'rolls',
    'can', 'cans',
    'bottle', 'bottles', 'btl',
    'bag', 'bags',
    'box', 'boxes', 'bx',
    'sachet', 'sachets',
    'tablet', 'tablets', 'tabs',
    'capsule', 'capsules', 'caps'
]

# Keep only the whitelisted units that were actually discovered in the data.
# Filtering `sorted_units` (rather than the whitelists) preserves its
# alphabetical order in each result list.
found_weight_units = list(filter(frozenset(WEIGHT_UNITS).__contains__, sorted_units))
found_volume_units = list(filter(frozenset(VOLUME_UNITS).__contains__, sorted_units))
found_dimension_units = list(filter(frozenset(DIMENSION_UNITS).__contains__, sorted_units))
found_count_units = list(filter(frozenset(COUNT_UNITS).__contains__, sorted_units))

# Report each group as a header line followed by the list on its own line.
print("\n--- Verification of Unit Groups ---")
print(f"\nWEIGHT units found in data ({len(found_weight_units)}):", found_weight_units, sep="\n")

print(f"\nVOLUME units found in data ({len(found_volume_units)}):", found_volume_units, sep="\n")

print(f"\nDIMENSION units found in data ({len(found_dimension_units)}):", found_dimension_units, sep="\n")

print(f"\nCOUNT units found in data ({len(found_count_units)}):", found_count_units, sep="\n")

print("\n\nAnalysis: These clean, verified lists will be the foundation for our precise extraction rules.")

Analyzing the discovered units and grouping them into categories...

--- Verification of Unit Groups ---

WEIGHT units found in data (24):
['g', 'gr', 'gram', 'gramos', 'grams', 'kg', 'kgs', 'kilo', 'kilogram', 'kilograms', 'lb', 'lbs', 'libras', 'mcg', 'mg', 'milligram', 'milligrams', 'onzas', 'ounce', 'ounces', 'oz', 'pound', 'pounds', 'quilogramas']

VOLUME units found in data (26):
['cup', 'cups', 'fl', 'floz', 'fluid', 'gal', 'gallon', 'gallons', 'galon', 'l', 'liter', 'liters', 'litre', 'litres', 'litros', 'mililiters', 'milliliter', 'milliliters', 'ml', 'pint', 'pints', 'pt', 'qt', 'qts', 'quart', 'quarts']

DIMENSION units found in data (11):
['centimeter', 'centimeters', 'cm', 'feet', 'ft', 'in', 'inch', 'inches', 'millimeter', 'millimeters', 'mm']

COUNT units found in data (37):
['bag', 'bags', 'bottle', 'bottles', 'box', 'boxes', 'btl', 'bx', 'can', 'cans', 'caps', 'capsule', 'capsules', 'cnt', 'count', 'ct', 'doz', 'dozen', 'dz', 'ea', 'each', 'pack', 'packs', 'pc', 'pck',

In [None]:
# -- 2.4: Inspecting Keyword Context (V5 - Final Corrected Version) --

print("Inspecting the context of our verified keywords to finalize extraction strategy...")

def inspect_content(df, keywords, title, num_samples=5):
    """
    Finds and prints random samples of 'catalog_content' containing specific keywords.

    Matching is case-insensitive and whole-word. A word boundary is only
    required on a side of the keyword that starts/ends with a word character,
    so a keyword such as 'brand:' matches both "Brand:Acme" and "Brand: Acme"
    (a plain trailing \\b after ':' would reject the second form).

    Args:
        df: DataFrame with 'catalog_content' and 'sample_id' columns.
        keywords: Iterable of literal strings to look for (they are
            regex-escaped here, so no regex syntax is interpreted).
        title: Label used in the printed section header.
        num_samples: Maximum number of matching rows to print.

    Returns:
        None. Everything is reported through print().
    """
    print(f"\n{'='*25}\n--- Inspecting for: {title} ---\n{'='*25}")

    def bounded(keyword):
        # \b only exists between a word and a non-word character, so it is
        # added only next to a keyword edge that is itself a word character.
        prefix = r'\b' if re.match(r'\w', keyword[0]) else ''
        suffix = r'\b' if re.match(r'\w', keyword[-1]) else ''
        return prefix + re.escape(keyword) + suffix

    terms = [keyword for keyword in keywords if keyword]
    if not terms:
        # An empty alternation would match every row; report "nothing" instead.
        print(f"No samples found for keywords: {keywords}")
        return

    # Non-capturing group: str.contains() warns when a pattern has capture groups.
    pattern = '(?:' + '|'.join(bounded(term) for term in terms) + ')'

    relevant_rows = df[df['catalog_content'].str.contains(pattern, case=False, na=False)]

    if relevant_rows.empty:
        print(f"No samples found for keywords: {keywords}")
        return

    print(f"Found {len(relevant_rows)} rows containing at least one of the '{title}' keywords.")

    sample_size = min(num_samples, len(relevant_rows))
    if sample_size > 0:
        print(f"Showing {sample_size} random samples:\n")

        # Fixed seed so repeated runs show the same examples.
        sampled_rows = relevant_rows.sample(n=sample_size, random_state=101)

        # Iterate the sampled rows directly instead of looking each label up
        # again in `df`, which would misbehave if the index had duplicates.
        for sample_id, content in zip(sampled_rows['sample_id'], sampled_rows['catalog_content']):
            print(f"--- [Sample Record ID: {sample_id}] ---")
            print(content)
            print("-" * 40)

# --- Run Inspections ---
# 1. For Brand
inspect_content(combined_df, ['brand:'], "Brand Keyword")

# 2. For Weight
inspect_content(combined_df, ['g', 'oz', 'kg', 'lb'], "Weight Keywords")

# 3. For Volume
inspect_content(combined_df, ['ml', 'l', 'floz', 'gallon'], "Volume Keywords")

# 4. For Pack Size / Count
inspect_content(combined_df, ['pack', 'count', 'piece', 'bottle'], "Pack/Count Keywords")

print("\n\n--- Section 2 Complete ---")
print("Analysis: Review the samples above to confirm regex patterns. This completes our EDA.")


Inspecting the context of our verified keywords to finalize extraction strategy...

--- Inspecting for: Brand Keyword ---
Found 5 rows containing at least one of the 'Brand Keyword' keywords.
Showing 5 random samples:

--- [Sample Record ID: 188616] ---
Item Name: 50 Jamaican Sorrel Seeds, Florida Cranberry, Indian Roselle, Hibiscus Sabdariffa
Bullet Point 1: Country/Region Of Manufacture:Jamaica, Mpn:Florida Cranberry Indian ,Roselle,Hibisc
Bullet Point 2: Brand:Hibiscus Sabdariffa
Bullet Point 3: Model:Sorrel
Value: 50.0
Unit: Count

----------------------------------------
--- [Sample Record ID: 42177] ---
Item Name: Eden Foods Organic Garbanzo Beans - Case of 12 - 29 oz.
Product Description: Specification<br>Brand:EDEN FOODS<br>Dairy Free:Yes<br>Fair Trade:No<br>Gluten Free:Yes<br>GMO Free?:GMO Free<br>Ingredients:ORGANIC GARBANZO BEANS;WATER;KOMBU SEAWEED<br>Kosher:Yes<br>Organic:Yes<br>Selling Units:case<br>Size:29 OZ<br>Vegan:No<br>Wheat Free:Yes<br>Yeast Free:Yes<br>Details<br>

In [None]:
# =============================================================================
# SECTION 2.5: EDA - INSPECTING DUPLICATE IMAGE LINKS (Your Excellent Suggestion)
# =============================================================================

print("--- Inspecting for duplicate image links as per your suggestion ---")

# Work on combined_df so that links shared between train and test are seen too.

# --- 1. How often does each link occur? (value_counts sorts most frequent first)
image_link_counts = combined_df['image_link'].value_counts()

# --- 2. Keep the links that occur at least twice ---
duplicate_links = image_link_counts.loc[image_link_counts.gt(1)]

print(f"\nFound {len(combined_df)} total rows.")
print(f"Found {len(image_link_counts)} unique image links.")
print(f"Found {len(duplicate_links)} image links that are used more than once.")

if duplicate_links.empty:
    print("\nNo duplicate image links were found in the dataset.")
else:
    # --- 3. The ten links shared by the most rows ---
    print("\n--- Top 10 Most Duplicated Image Links ---")
    print(duplicate_links.head(10))

    # --- 4. Drill into the single most reused link ---
    # The counts are sorted in descending order, so the first label is the top one.
    most_duplicated_link = duplicate_links.index[0]
    print(f"\n--- Investigating the most duplicated link: {most_duplicated_link} ---")

    # Show text and price side by side to see whether rows sharing this image
    # are really the same product.
    shares_top_link = combined_df['image_link'].eq(most_duplicated_link)
    display(combined_df.loc[shares_top_link, ['catalog_content', 'price']])

print("\n--- Duplicate Image Link Analysis Complete ---")


--- Inspecting for duplicate image links as per your suggestion ---

Found 150000 total rows.
Found 140587 unique image links.
Found 6991 image links that are used more than once.

--- Top 10 Most Duplicated Image Links ---
image_link
https://m.media-amazon.com/images/I/51m1gdQJW2L.jpg    97
https://m.media-amazon.com/images/I/71LRdXdqc0L.jpg    31
https://m.media-amazon.com/images/I/71brV+lqbRL.jpg    25
https://m.media-amazon.com/images/I/21mMXLWiDOL.jpg    21
https://m.media-amazon.com/images/I/61md5v6UPNL.jpg    21
https://m.media-amazon.com/images/I/71FMi9tO3HL.jpg    19
https://m.media-amazon.com/images/I/21l1ELDzJAL.jpg    16
https://m.media-amazon.com/images/I/51DDKoa+mbL.jpg    15
https://m.media-amazon.com/images/I/91RB11r3xSL.jpg    14
https://m.media-amazon.com/images/I/71gtFKX66JL.jpg    14
Name: count, dtype: int64

--- Investigating the most duplicated link: https://m.media-amazon.com/images/I/51m1gdQJW2L.jpg ---


Unnamed: 0,catalog_content,price
1203,"Item Name: Manischewitz Soup Matzo Ball Jars, 1.5 lb\nBullet Point: Kosher for Passover and all year round\nValue: 24.0\nUnit: Ounce\n",7.410
2121,"Item Name: Sweet Leaf Tea Pet Peach, 16 fl oz\nBullet Point 1: Naturally caffeine free and All natural botanicals\nBullet Point 2: Item Package Weight: 1.19 lb\nBullet Point 3: Country Of Origin: United States\nBullet Point 4: Item Package Dimension: 7.2399999926152"" L x 2.599999997348"" W x 2.59...",1.320
2203,"Item Name: Blue Diamond Nut Thins Almonds Honey Cinnamon Cracker Snacks, 4.25 Ounce - 12 per case.\nBullet Point: 4.25 Ounces\nProduct Description: Nut thins are a unique, crispy and crunchy cracker loaded with delicious and nutritious pecans. They are the only crackers made with nuts and baked ...",39.995
2773,"Item Name: Organic Cranberry Pomegrante Juice 32 Ounces (Case of 12)\nBullet Point: 32 Ounces\nProduct Description: An Organic Apple, Cranberry and Pomegranate Juice Blend from Concentrates with Other Ingredients.\nValue: 12.0\nUnit: Count\n",4.680
4677,"Item Name: Peace Clusters and Flakes Cereal, Maple Pecan, 11 Ounce (Pack of 6)\nBullet Point 1: Whole grain oat clusters, crispy corn flakes, pecans with real maple syrup\nBullet Point 2: We add only premium all natural grains, nuts and fruits providing you with essential whole grains, fiber and...",33.240
...,...,...
144557,"Item Name: Eden Foods Organic Apple Sauce, 25 Ounce - 12 per case.\nBullet Point 1: Gluten Free\nBullet Point 2: Kosher\nValue: 12.0\nUnit: Count\n",
144712,"Item Name: R W Knudsen Organic Grapefruit Juice, 32 Ounce - 12 per case.\nValue: 32.0\nUnit: Ounce\n",
144721,Item Name: Taste Of Thai Noodle Qck Meal Peanut\nValue: nan\nUnit: None\n,
145196,Item Name: Organic White Jasmine Rice 25 lbs. bag\nBullet Point: Description | Ingredients | Cooking Instructions | No Additives |Oil-Free| Allergen Free ||| |||| |\nProduct Description: Save On Lotus FoodsÂ 25lb Oz Jasmine White Rice An Aromatic Long-Grain 100% Rice Grown By The Raun Family At ...,



--- Duplicate Image Link Analysis Complete ---


# **Advanced Feature Extraction**

---



In [None]:
# =============================================================================
# SECTION 3: ADVANCED FEATURE EXTRACTION
# =============================================================================

# -- 3.1: Define All Project Constants --
# To avoid all import issues, we are defining our constants directly in this cell.
# This makes the notebook self-contained and robust against runtime restarts.

print("Starting Section 3: Defining all constants directly in the notebook.")

# --- 1. Column Names ---
SAMPLE_ID_COL = 'sample_id'
TEXT_COL = 'catalog_content'
TARGET_COL = 'price'
BRAND_COL = 'brand'
WEIGHT_G_COL = 'weight_grams'
VOLUME_ML_COL = 'volume_ml'
PACK_COUNT_COL = 'pack_count'
BRAND_ENCODED_COL = 'brand_encoded'


# --- 2. Feature Extraction: Unit Lists (from EDA) ---
# Every unit listed here must be lower-case: matched text is lower-cased
# before it is looked up in the conversion maps below.
WEIGHT_UNITS_G = ['g', 'gr', 'gram', 'grams', 'gramos']
WEIGHT_UNITS_KG = ['kg', 'kgs', 'kilo', 'kilogram', 'kilograms', 'quilogramas']
WEIGHT_UNITS_OZ = ['oz', 'ounce', 'ounces', 'onza', 'onzas']
WEIGHT_UNITS_LB = ['lb', 'lbs', 'libra', 'libras', 'pound', 'pounds']
WEIGHT_UNITS_MG = ['mg', 'milligram', 'milligrams']
# Micrograms were whitelisted as weight units in the EDA but had no conversion.
WEIGHT_UNITS_MCG = ['mcg', 'microgram', 'micrograms']

VOLUME_UNITS_ML = ['ml', 'milliliter', 'milliliters', 'mililiters']
VOLUME_UNITS_L = ['l', 'liter', 'liters', 'litre', 'litres', 'litros']
# Plural and abbreviated spellings are listed explicitly because the
# extraction regex requires a word boundary right after the unit.
VOLUME_UNITS_FLOZ = [
    'floz', 'fl oz', 'fl. oz', 'fl ounce', 'fl ounces',
    'fluid ounce', 'fluid ounces',
]
VOLUME_UNITS_GAL = ['gal', 'gallon', 'gallons', 'galon']
# Quarts and pints were found in the data during the EDA but had no conversion.
VOLUME_UNITS_QT = ['qt', 'qts', 'quart', 'quarts']
VOLUME_UNITS_PT = ['pt', 'pint', 'pints']
VOLUME_UNITS_CUP = ['cup', 'cups']

# NOTE: this rebinds the COUNT_UNITS name used in the EDA section
# (same entries plus 'pods').
COUNT_UNITS = [
    'pack', 'packs', 'pk', 'pkg', 'pck', 'count', 'ct', 'cnt', 'ea', 'each',
    'pc', 'pcs', 'piece', 'pieces', 'dozen', 'doz', 'dz', 'roll', 'rolls',
    'can', 'cans', 'bottle', 'bottles', 'btl', 'bag', 'bags', 'box', 'boxes', 'bx',
    'sachet', 'sachets', 'tablet', 'tablets', 'tabs', 'capsule', 'capsules', 'caps',
    'pods'
]

# --- 3. Feature Extraction: Conversion Maps ---
# unit -> multiplier that converts a quantity in that unit to grams.
CONVERSION_TO_G = {
    **{unit: 1.0 for unit in WEIGHT_UNITS_G},
    **{unit: 1000.0 for unit in WEIGHT_UNITS_KG},
    **{unit: 28.35 for unit in WEIGHT_UNITS_OZ},
    **{unit: 453.59 for unit in WEIGHT_UNITS_LB},
    **{unit: 0.001 for unit in WEIGHT_UNITS_MG},
    **{unit: 1e-6 for unit in WEIGHT_UNITS_MCG},
}

# unit -> multiplier that converts a quantity in that unit to millilitres
# (US customary values for fl oz, gallon, quart, pint and cup).
CONVERSION_TO_ML = {
    **{unit: 1.0 for unit in VOLUME_UNITS_ML},
    **{unit: 1000.0 for unit in VOLUME_UNITS_L},
    **{unit: 29.57 for unit in VOLUME_UNITS_FLOZ},
    **{unit: 3785.41 for unit in VOLUME_UNITS_GAL},
    **{unit: 946.35 for unit in VOLUME_UNITS_QT},
    **{unit: 473.18 for unit in VOLUME_UNITS_PT},
    **{unit: 236.59 for unit in VOLUME_UNITS_CUP},
}

# --- 4. Model Training Constants ---
RANDOM_STATE = 42
N_SPLITS = 5
LGB_PARAMS = {
    'objective': 'regression_l1', 'metric': 'mae', 'n_estimators': 2000,
    'learning_rate': 0.01, 'feature_fraction': 0.8, 'bagging_fraction': 0.8,
    'bagging_freq': 1, 'lambda_l1': 0.1, 'lambda_l2': 0.1, 'num_leaves': 31,
    'verbose': -1, 'n_jobs': -1, 'seed': RANDOM_STATE, 'boosting_type': 'gbdt',
}
TFIDF_MAX_FEATURES = 5000
TFIDF_NGRAM_RANGE = (1, 2)

print("All constants have been defined and loaded into the notebook's memory.")


Starting Section 3: Defining all constants directly in the notebook.
All constants have been defined and loaded into the notebook's memory.


In [None]:
# -- 3.2: Setup Main Extraction Function --

print("Setting up the main feature extraction function...")

def extract_features(df):
    """
    Scaffold for feature extraction: returns a copy of `df` that carries the
    new feature columns, each filled with its default value.

    No text is parsed here. This is only the starting skeleton; a later cell
    in this notebook defines `extract_features` again with the real logic,
    and that later definition replaces this one once it has been run.
    """
    # Column name -> default value, using the constants from the previous cell.
    placeholder_columns = {
        BRAND_COL: 'unknown',
        WEIGHT_G_COL: np.nan,
        VOLUME_ML_COL: np.nan,
        PACK_COUNT_COL: 1,  # a listing is a single item unless stated otherwise
    }

    # assign() returns a new DataFrame, so the caller's frame is left untouched.
    featured = df.assign(**placeholder_columns)

    print(f"Initialized new columns: {BRAND_COL}, {WEIGHT_G_COL}, {VOLUME_ML_COL}, {PACK_COUNT_COL}")

    return featured

# --- Apply the initial function to our dataframes ---
train_featured_df = extract_features(train_df)
test_featured_df = extract_features(test_df)

print("\nInitial feature dataframes created.")
print("First 5 rows of the new training dataframe with empty features:")
print(train_featured_df[[SAMPLE_ID_COL, BRAND_COL, WEIGHT_G_COL, VOLUME_ML_COL, PACK_COUNT_COL]].head())


Setting up the main feature extraction function...
Initialized new columns: brand, weight_grams, volume_ml, pack_count
Initialized new columns: brand, weight_grams, volume_ml, pack_count

Initial feature dataframes created.
First 5 rows of the new training dataframe with empty features:
   sample_id    brand  weight_grams  volume_ml  pack_count
0      33127  unknown           NaN        NaN           1
1     198967  unknown           NaN        NaN           1
2     261251  unknown           NaN        NaN           1
3      55858  unknown           NaN        NaN           1
4     292686  unknown           NaN        NaN           1


In [None]:
# -- 3.3: All Feature Extraction (V8 - Fixing Volume Extraction) --

print("Correcting the feature extraction logic to properly capture Volume...")

# Using the same NON_BRAND_WORDS list from our last successful attempt
NON_BRAND_WORDS = [
    'The', 'A', 'An', 'Organic', 'Gluten-Free', 'Natural', 'Pure', 'Food', 'Gourmet', 'Simply',
    'And', 'For', 'With', 'Pack', 'To', 'Of', 'In', 'From', 'By', 'On', 'At', 'Is', 'It',
    'Red', 'Blue', 'Green', 'Black', 'White', 'Spice', 'Spices', 'And'
]

def extract_features(df):
    """
    Derive brand, weight, volume and pack-count features from the item name.

    Only the 'Item Name:' line of TEXT_COL is parsed. The input frame is not
    modified; a copy with four additional columns is returned:

      * BRAND_COL      - first word of the item name that is not listed in
                         NON_BRAND_WORDS, or 'unknown' when there is none.
      * WEIGHT_G_COL   - first "<number> <weight unit>" in the name, converted
                         to grams via CONVERSION_TO_G (NaN when absent).
      * VOLUME_ML_COL  - first "<number> <volume unit>" in the name, converted
                         to millilitres via CONVERSION_TO_ML (NaN when absent).
      * PACK_COUNT_COL - number of units in the listing as a float, default 1.

    Args:
        df: DataFrame that has a TEXT_COL column.

    Returns:
        A new DataFrame: `df` plus the four feature columns.
    """
    df_copy = df.copy()

    # --- Initialize Columns ---
    df_copy[BRAND_COL] = 'unknown'
    df_copy[WEIGHT_G_COL] = np.nan
    df_copy[VOLUME_ML_COL] = np.nan
    df_copy[PACK_COUNT_COL] = 1

    # --- Isolate the 'Item Name' line ---
    # [ \t]* rather than \s*: \s also matches the newline, so an empty item
    # name would otherwise swallow the following line (e.g. a bullet point).
    item_name_line = (
        df_copy[TEXT_COL]
        .str.extract(r'Item Name:[ \t]*(.*)', flags=re.IGNORECASE)
        .iloc[:, 0]
        .fillna('')
    )

    # --- 1. BRAND EXTRACTION ---
    # Compared case-insensitively so the check does not depend on how the
    # candidate word happens to be capitalised.
    non_brand_lookup = {word.lower() for word in NON_BRAND_WORDS}

    # A run of letters, optionally with a possessive 's. str.title() treats the
    # apostrophe as a word break and turns "Bob's" into "Bob'S"; capitalising
    # whole runs keeps the possessive lower-case while "O'Brien" still gets
    # both parts capitalised.
    letter_run = re.compile(r"[^\W\d_]+(?:['’][sS]\b)?")

    def tidy_case(word):
        without_commas = word.replace(',', '')
        return letter_run.sub(lambda m: m.group(0).capitalize(), without_commas)

    def get_brand(name):
        if not isinstance(name, str):
            return 'unknown'
        # Skip *every* leading filler word, not only the first one; otherwise
        # "Food to Live ..." yields the brand "To".
        for word in name.split():
            candidate = tidy_case(word)
            if candidate and candidate.lower() not in non_brand_lookup:
                return candidate
        return 'unknown'

    df_copy[BRAND_COL] = item_name_line.apply(get_brand)

    # --- 2. PACK COUNT ---
    # Three spellings, exactly one capture group takes part in any match:
    #   "Pack of 6" / "Case of 12" / "Count of 4"   (number after the phrase)
    #   "(8 Count)" / "(2 Pack)" / "(6)"            (parenthesised count)
    #   "12 per case"                               (number before the phrase)
    # A parenthesised number must be followed by ')' or a count word so that
    # sizes such as "(16 oz)" are not mistaken for a pack count.
    pack_pattern = re.compile(
        r'(?:pack of|case of|count of|per case)\s*(\d+)'
        r'|\(\s*(\d+)\s*(?:\)|(?:count|ct|packs?|pk|pcs?|pieces?)\b)'
        r'|(\d+)\s*per case',
        flags=re.IGNORECASE,
    )

    def get_pack_count(name):
        match = pack_pattern.search(name)
        if match is None:
            return np.nan
        # lastindex is the number of the one group that matched.
        return float(match.group(match.lastindex))

    extracted_packs = item_name_line.apply(get_pack_count).astype(float)
    df_copy[PACK_COUNT_COL] = extracted_packs.fillna(1.0)

    # --- 3. WEIGHT & VOLUME ---
    def first_measure(conversion_map):
        """First '<number> <unit>' in each name, scaled by conversion_map."""
        # Units are escaped so characters such as '.' are matched literally.
        # Longest first only makes the alternation easier to reason about;
        # the trailing \b already prevents partial-unit matches.
        units = sorted(conversion_map, key=len, reverse=True)
        pattern = re.compile(
            r'(\d+\.?\d*)\s*(' + '|'.join(map(re.escape, units)) + r')\b',
            flags=re.IGNORECASE,
        )
        matches = item_name_line.str.extract(pattern)
        value = pd.to_numeric(matches[0], errors='coerce')
        factor = matches[1].str.lower().map(conversion_map)
        # NaN wherever no "<number> <unit>" pair was found.
        return value * factor

    df_copy[WEIGHT_G_COL] = first_measure(CONVERSION_TO_G)
    df_copy[VOLUME_ML_COL] = first_measure(CONVERSION_TO_ML)

    return df_copy

# --- Re-run the extraction ---
train_featured_df = extract_features(train_df)
test_featured_df = extract_features(test_df)

print("\nFeature extraction re-run with CORRECTED Volume logic.")

# --- Verification ---
print("\n--- FINAL VERIFICATION ---")
print("Top 15 most common brands found:")
print(train_featured_df[BRAND_COL].value_counts().head(15))

print("\nExample of all new features extracted:")
print(train_featured_df[[SAMPLE_ID_COL, BRAND_COL, WEIGHT_G_COL, VOLUME_ML_COL, PACK_COUNT_COL]].head(10))

# --- NEW: Check the status of the volume column ---
print(f"\nNumber of non-null values in '{VOLUME_ML_COL}': {train_featured_df[VOLUME_ML_COL].notna().sum()}")

Correcting the feature extraction logic to properly capture Volume...

Feature extraction re-run with CORRECTED Volume logic.

--- FINAL VERIFICATION ---
Top 15 most common brands found:
brand
To            947
Mccormick     637
Goya          445
Rani          425
Frontier      370
La            344
Betty         328
Starbucks     308
Badia         286
Bob'S         277
Amoretti      277
Crystal       256
Campbell'S    253
Fresh         251
Kraft         244
Name: count, dtype: int64

Example of all new features extracted:
   sample_id     brand  weight_grams  volume_ml  pack_count
0      33127        La      340.2000        NaN         6.0
1     198967   Salerno      226.8000        NaN         4.0
2     261251      Bear       53.8650        NaN         6.0
3      55858   Judee’S      318.9375        NaN         1.0
4     292686     Kedem      360.0450        NaN         1.0
5       9259  Member'S      177.1875        NaN         1.0
6     191846      Goya      850.5000        NaN    

# **Data Preparation**

---



In [None]:
# =============================================================================
# SECTION 4: DATA PREPARATION WITH A PREPROCESSING PIPELINE
# =============================================================================

# -- 4.1: Define Feature Groups --
# This cell cleans up rare brand labels and names the column groups that the
# preprocessing step will use. No pipeline object is built in this cell.
# NOTE(review): the next cell repeats the rare-brand cleanup and the group
# definitions below — confirm whether both cells are still needed.

print("Starting Section 4 with a robust Preprocessing Pipeline strategy.")

# Collapse rare brands into the 'unknown' bucket.
# Frequencies are counted on the training data only; brands that occur fewer
# than 3 times there are relabelled in BOTH the train and the test frame.
# The assignments modify train_featured_df / test_featured_df in place.
brand_counts = train_featured_df[BRAND_COL].value_counts()
rare_brands = brand_counts[brand_counts < 3].index
train_featured_df.loc[train_featured_df[BRAND_COL].isin(rare_brands), BRAND_COL] = 'unknown'
test_featured_df.loc[test_featured_df[BRAND_COL].isin(rare_brands), BRAND_COL] = 'unknown'
print(f"Cleaned rare brands. Number of unique brands is now: {train_featured_df[BRAND_COL].nunique()}")


# Define the groups of columns for our ColumnTransformer.
# The groups are identified by column name only; which DataFrame supplies the
# columns is decided where the transformer is fitted, not in this cell.

# 1. Text feature (a single column name, not a list)
text_feature = TEXT_COL

# 2. Numerical features
numeric_features = [WEIGHT_G_COL, VOLUME_ML_COL, PACK_COUNT_COL]

# 3. Categorical feature (a single column name, not a list)
categorical_feature = BRAND_COL

print("\nDefined feature groups for the pipeline:")
print(f"Text Feature: {text_feature}")
print(f"Numeric Features: {numeric_features}")
print(f"Categorical Feature: {categorical_feature}")

Starting Section 4 with a robust Preprocessing Pipeline strategy.
Cleaned rare brands. Number of unique brands is now: 3905

Defined feature groups for the pipeline:
Text Feature: catalog_content
Numeric Features: ['weight_grams', 'volume_ml', 'pack_count']
Categorical Feature: brand


In [None]:
# =============================================================================
# SECTION 4: DATA PREPARATION WITH A PREPROCESSING PIPELINE
# =============================================================================

# --- 4.1: Import Libraries and Define Feature Groups ---
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

print("Starting Section 4: Building the preprocessing pipeline.")

# Column groups for the ColumnTransformer: one text column, three numeric
# columns and the categorical brand column.
numeric_features = [WEIGHT_G_COL, VOLUME_ML_COL, PACK_COUNT_COL]
text_feature = TEXT_COL
categorical_feature = BRAND_COL

# Data cleaning ahead of the pipeline: brands with fewer than 3 training
# occurrences are folded into a single 'unknown' label. Both masks are built
# from the same train-derived rare set before either frame is modified.
brand_counts = train_featured_df[BRAND_COL].value_counts()
rare_brands = brand_counts.loc[brand_counts.lt(3)].index
train_has_rare_brand = train_featured_df[BRAND_COL].isin(rare_brands)
test_has_rare_brand = test_featured_df[BRAND_COL].isin(rare_brands)
train_featured_df.loc[train_has_rare_brand, BRAND_COL] = 'unknown'
test_featured_df.loc[test_has_rare_brand, BRAND_COL] = 'unknown'
print(f"Cleaned rare brands. Number of unique brands is now: {train_featured_df[BRAND_COL].nunique()}")

print("\nDefined feature groups for the pipeline:")
print(f"Text Feature: {text_feature}")
print(f"Numeric Features: {numeric_features}")
print(f"Categorical Feature: {categorical_feature}")

# --- 4.2: Build the Preprocessing Pipeline ---
print("\nBuilding the ColumnTransformer...")

# Numeric branch: fill missing values with the per-column median.
median_imputer = SimpleImputer(strategy='median')
numeric_transformer = Pipeline(steps=[('imputer', median_imputer)])

# Categorical branch: sparse one-hot encoding of the brand; categories not
# seen during fit are ignored at transform time instead of raising.
brand_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
categorical_transformer = Pipeline(steps=[('onehot', brand_encoder)])

# Text branch: TF-IDF with English stop words removed; vocabulary size and
# n-gram range come from the notebook-level constants.
text_transformer = TfidfVectorizer(
    max_features=TFIDF_MAX_FEATURES,
    stop_words='english',
    ngram_range=TFIDF_NGRAM_RANGE,
)

# Master preprocessor. The categorical column is wrapped in a list while the
# text column is passed as a bare name, exactly as each transformer is fed.
# Any column not listed in a branch is dropped.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, [categorical_feature]),
    ('text', text_transformer, text_feature),
]
preprocessor = ColumnTransformer(transformers=column_branches, remainder='drop')

print("\nPreprocessing pipeline ('preprocessor') built successfully.")
print("\n--- Section 4 Complete ---")


Starting Section 4: Building the preprocessing pipeline.
Cleaned rare brands. Number of unique brands is now: 3905

Defined feature groups for the pipeline:
Text Feature: catalog_content
Numeric Features: ['weight_grams', 'volume_ml', 'pack_count']
Categorical Feature: brand

Building the ColumnTransformer...

Preprocessing pipeline ('preprocessor') built successfully.

--- Section 4 Complete ---


# **Model Training and Evaluation**

---



In [None]:
# =============================================================================
# SECTION 4.4: DATA CLEANING (YOUR STRATEGY: REMOVE PLACEHOLDER IMAGES)
# =============================================================================

print("--- Implementing your data cleaning strategy: Removing rows with placeholder images. ---")

# --- 1. Identify the placeholder images ---
# A "placeholder" is any image link shared by more than PLACEHOLDER_MIN_REPEATS
# training rows. value_counts() skips missing links, so rows without an image
# link are never flagged.
PLACEHOLDER_MIN_REPEATS = 10
image_link_counts = train_featured_df['image_link'].value_counts()
placeholder_links = image_link_counts[image_link_counts > PLACEHOLDER_MIN_REPEATS].index.tolist()

print(f"Identified {len(placeholder_links)} images as placeholders (appearing > {PLACEHOLDER_MIN_REPEATS} times).")
print("The top 5 are:")
print(placeholder_links[:5])

# --- 2. Remove these rows from the training data ---
# Only the training data is filtered; the test set keeps every row.
# .copy() makes the result an independent frame: later cells assign into it
# and drop rows from it, which must neither touch train_featured_df nor be
# treated as a chained assignment on a slice.
original_row_count = len(train_featured_df)
uses_placeholder = train_featured_df['image_link'].isin(placeholder_links)
train_df_cleaned = train_featured_df.loc[~uses_placeholder].copy()
cleaned_row_count = len(train_df_cleaned)

print(f"\nOriginal training data had {original_row_count} rows.")
print(f"Removed {original_row_count - cleaned_row_count} rows that used placeholder images.")
print(f"Cleaned training data now has {cleaned_row_count} rows.")

print("\n--- Data Cleaning Complete ---")


--- Implementing your data cleaning strategy: Removing rows with placeholder images. ---
Identified 5 images as placeholders (appearing > 10 times).
The top 5 are:
['https://m.media-amazon.com/images/I/51m1gdQJW2L.jpg', 'https://m.media-amazon.com/images/I/71LRdXdqc0L.jpg', 'https://m.media-amazon.com/images/I/21mMXLWiDOL.jpg', 'https://m.media-amazon.com/images/I/61md5v6UPNL.jpg', 'https://m.media-amazon.com/images/I/71FMi9tO3HL.jpg']

Original training data had 75000 rows.
Removed 109 rows that used placeholder images.
Cleaned training data now has 74891 rows.

--- Data Cleaning Complete ---


# **CatBoost**

---



In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# =============================================================================
# SECTION 5: CATBOOST PIPELINE MODEL TRAINING (GPU-ENABLED)
# =============================================================================
# This section creates a definitive, GPU-enabled CatBoost model that
# integrates perfectly with the scikit-learn preprocessing pipeline.

print("--- Starting Definitive CatBoost Pipeline Run (GPU-Enabled) ---")

# --- 1. Import necessary libraries ---
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, RegressorMixin
from catboost import CatBoostRegressor
import time

# --- 2. Create a Custom CatBoost Wrapper for Pipeline Compatibility ---
# This is the key to solving the previous errors. This wrapper converts the
# sparse data from the preprocessor into a dense format that CatBoost requires.
class CatBoostWrapper(BaseEstimator, RegressorMixin):
    """scikit-learn style regressor that feeds dense arrays to ``CatBoostRegressor``.

    Any keyword argument is forwarded unchanged to ``CatBoostRegressor``. The
    keyword arguments are also kept in ``self.kwargs`` and exposed through
    ``get_params`` / ``set_params``. Without that, ``BaseEstimator`` cannot see
    parameters passed via ``**kwargs`` and ``sklearn.base.clone`` (used by
    cross-validation and grid search) would rebuild the wrapper with no
    hyperparameters at all.

    Attributes:
        kwargs: dict of constructor keyword arguments.
        model: the underlying ``CatBoostRegressor`` instance.
    """

    def __init__(self, **kwargs):
        self.kwargs = kwargs
        self.model = CatBoostRegressor(**kwargs)

    def get_params(self, deep=True):
        """Return the constructor keyword arguments as a new dict.

        ``deep`` is accepted for API compatibility and has no effect.
        """
        return dict(self.kwargs)

    def set_params(self, **params):
        """Update hyperparameters and rebuild the underlying model.

        Rebuilding discards any previously fitted state, because the new
        ``CatBoostRegressor`` is created from scratch. Returns ``self``.
        """
        if not params:
            # Nothing to change: keep the current (possibly fitted) model.
            return self
        self.kwargs.update(params)
        self.model = CatBoostRegressor(**self.kwargs)
        return self

    @staticmethod
    def _to_dense(X):
        """Return ``X.toarray()`` for objects that provide it, else ``X`` unchanged."""
        return X.toarray() if hasattr(X, "toarray") else X

    def fit(self, X, y):
        """Fit the wrapped model on ``X`` (densified if sparse) and ``y``; returns ``self``."""
        self.model.fit(self._to_dense(X), y)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X`` (densified if sparse)."""
        return self.model.predict(self._to_dense(X))

print("Custom CatBoostWrapper for scikit-learn pipeline created successfully.")

# --- 3. Use the FULL cleaned DataFrame ---
# Requires 'train_df_cleaned' from the placeholder-image cleaning cell (Section 4.4).
print(f"Using all {len(train_df_cleaned)} rows from the cleaned dataset.")

# --- 4. Define the final CatBoost model pipeline with GPU enabled ---
# The shared 'preprocessor' produces the feature matrix; the wrapper densifies
# it and hands it to CatBoost.
model_pipeline_catboost = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', CatBoostWrapper(
        task_type='GPU',      # Enable GPU
        devices='0',          # Use the first available GPU
        iterations=4000,      # Aggressive tuning
        learning_rate=0.015,
        depth=9,
        l2_leaf_reg=5,
        loss_function='RMSE',
        eval_metric='RMSE',
        random_seed=RANDOM_STATE,
        verbose=500,          # Print progress
        # No 'cat_features' needed here, as they are already one-hot encoded by the pipeline
    ))
])

# --- 5. Define the Full, Cleaned Data for Fitting ---
# Coerce the target to numeric and drop rows where that fails. The name is
# rebound to a new frame instead of mutating in place, so this step does not
# write into a slice of 'train_featured_df' and gives the same result when
# the cell is re-run.
numeric_target = pd.to_numeric(train_df_cleaned[TARGET_COL], errors='coerce')
train_df_cleaned = (
    train_df_cleaned
    .assign(**{TARGET_COL: numeric_target})
    .dropna(subset=[TARGET_COL])
)
print(f"Final training rows after ensuring target is numeric: {len(train_df_cleaned)}")

feature_columns = [text_feature] + numeric_features + [categorical_feature]
X_train_definitive = train_df_cleaned[feature_columns]
y_train_definitive = np.log1p(train_df_cleaned[TARGET_COL]) # Log-transform the target

# --- 6. Fit the CatBoost Pipeline on the Full, Cleaned Dataset ---
print("\n--- Fitting the final CatBoost pipeline on all available clean data (GPU)... ---")
start_time = time.time()

model_pipeline_catboost.fit(X_train_definitive, y_train_definitive)

end_time = time.time()
# The elapsed time covers the whole pipeline fit (preprocessing included).
print(f"\n--- FITTING COMPLETE ---")
print(f"Total CatBoost GPU training time: {(end_time - start_time)/60:.2f} minutes.")
print("\n--- Section 5 Complete: Definitive CatBoost model successfully trained. ---")


--- Starting Definitive CatBoost Pipeline Run (GPU-Enabled) ---
Custom CatBoostWrapper for scikit-learn pipeline created successfully.
Using all 74891 rows from the cleaned dataset.
Final training rows after ensuring target is numeric: 74891

--- Fitting the final CatBoost pipeline on all available clean data (GPU)... ---
0:	learn: 0.9389526	total: 402ms	remaining: 26m 47s
500:	learn: 0.7385476	total: 1m 36s	remaining: 11m 10s
1000:	learn: 0.7097317	total: 3m 7s	remaining: 9m 21s
1500:	learn: 0.6918770	total: 4m 37s	remaining: 7m 41s
2000:	learn: 0.6797215	total: 6m 6s	remaining: 6m 6s
2500:	learn: 0.6698335	total: 7m 34s	remaining: 4m 32s
3000:	learn: 0.6616345	total: 9m 2s	remaining: 3m
3500:	learn: 0.6542530	total: 10m 30s	remaining: 1m 29s
3999:	learn: 0.6476591	total: 11m 58s	remaining: 0us

--- FITTING COMPLETE ---
Total CatBoost GPU training time: 18.17 minutes.

--- Section 5 Complete: Definitive CatBoost model successfully trained. ---


In [None]:
#  =============================================================================
# SECTION 6: GENERATE FINAL CATBOOST SUBMISSION
# =============================================================================
# This cell uses the 'model_pipeline_catboost' object created in Section 5.

print("\n--- Generating Final CatBoost Submission File ---")

# Same column selection, in the same order, as was used for fitting.
feature_columns = [text_feature, *numeric_features, categorical_feature]
X_test_full = test_featured_df[feature_columns]

# The pipeline predicts log1p(price); expm1 maps it back to the price scale.
print("Making predictions on the test set...")
test_predictions_log = model_pipeline_catboost.predict(X_test_full)
test_predictions = np.expm1(test_predictions_log) # Inverse the log-transform
print("Predictions generated successfully.")

# Pair each prediction with its sample id, then floor negative prices at 0.
submission_df_catboost = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': test_predictions
}).assign(price=lambda frame: frame['price'].clip(lower=0))

# Write the submission file without the DataFrame index.
submission_df_catboost.to_csv("submission_catboost_pipeline_gpu.csv", index=False)

print("\nSubmission file 'submission_catboost_pipeline_gpu.csv' created successfully.")
print("This is your definitive CatBoost submission.")


--- Generating Final CatBoost Submission File ---
Making predictions on the test set...
Predictions generated successfully.

Submission file 'submission_catboost_pipeline_gpu.csv' created successfully.
This is your definitive CatBoost submission.


# **MLP**

---



In [None]:
# =============================================================================
# SECTION 5: MEMORY-EFFICIENT PYTORCH MLP (CRASH FIX)
# =============================================================================
# This version uses a custom PyTorch Dataset to handle the large, sparse
# feature matrix efficiently, preventing the RAM overflow that causes crashes.

print("--- Starting Memory-Efficient PyTorch MLP Run ---")

# --- 1. Import libraries ---
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import time
import numpy as np

print("Libraries imported successfully.")

# --- 2. Verify GPU ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 3. Preprocess data (but keep it sparse!) ---
print("Preparing data with the scikit-learn pipeline...")
# with_mean=False: scale only, no centering, so the matrix stays sparse.
transform_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False))
])

# Coerce the target to numeric and drop rows where that fails. The name is
# rebound to a new frame instead of mutating in place, so this step does not
# write into a slice of 'train_featured_df' and gives the same result when
# the cell is re-run.
numeric_target = pd.to_numeric(train_df_cleaned[TARGET_COL], errors='coerce')
train_df_cleaned = (
    train_df_cleaned
    .assign(**{TARGET_COL: numeric_target})
    .dropna(subset=[TARGET_COL])
)
print(f"Final training rows: {len(train_df_cleaned)}")

feature_columns = [text_feature] + numeric_features + [categorical_feature]
X_train_data = train_df_cleaned[feature_columns]
y_train_data = np.log1p(train_df_cleaned[TARGET_COL])  # the model is trained on log1p(price)

# IMPORTANT: We fit the pipeline and transform, but DO NOT call .toarray()
X_train_sparse = transform_pipeline.fit_transform(X_train_data)
y_train_numpy = y_train_data.values

# The test set is only transformed with the statistics learned on the training data.
X_test_data = test_featured_df[feature_columns]
X_test_sparse = transform_pipeline.transform(X_test_data)

print(f"Data transformed into a sparse matrix of shape: {X_train_sparse.shape}")

# --- 4. Create a Custom Memory-Efficient PyTorch Dataset ---
class SparseDataset(Dataset):
    """PyTorch dataset over a SciPy sparse feature matrix and an indexable target.

    Only the requested row is densified in ``__getitem__``, so the full dense
    matrix is never held in memory.

    Args:
        X: 2-D sparse matrix with one sample per row. Any SciPy sparse format
            is accepted; it is converted to CSR once here because some formats
            (e.g. COO) do not support row indexing at all.
        y: indexable sequence of targets, aligned with the rows of ``X``.
    """

    def __init__(self, X, y):
        # tocsr() is cheap when X is already CSR; objects without tocsr()
        # are stored unchanged.
        self.X = X.tocsr() if hasattr(X, "tocsr") else X
        self.y = y

    def __len__(self):
        """Number of samples (rows of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, target)`` for row ``idx`` as float32 tensors.

        ``features`` is the densified 1-D row; ``target`` is a 0-D tensor.
        """
        # This is the key: .toarray() is only called for ONE row at a time!
        dense_row = self.X[idx].toarray().ravel()
        features = torch.tensor(dense_row, dtype=torch.float32)
        target = torch.tensor(self.y[idx], dtype=torch.float32)
        return features, target

# Wrap the sparse training matrix and its targets, then serve them in
# shuffled mini-batches of 256 rows.
train_dataset = SparseDataset(X=X_train_sparse, y=y_train_numpy)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=256)

# --- 5. Define MLP Model and SMAPE Loss (same as before) ---
class MLP(nn.Module):
    """Feed-forward regressor: input -> 256 -> 128 -> 1.

    Each hidden layer is followed by ReLU and dropout (p=0.3). The network is
    stored as a single ``nn.Sequential`` under ``self.layers``.

    Args:
        input_size: number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        wide_units, narrow_units, drop_prob = 256, 128, 0.3
        self.layers = nn.Sequential(
            nn.Linear(input_size, wide_units),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(wide_units, narrow_units),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(narrow_units, 1),
        )

    def forward(self, x):
        """Apply the sequential stack to ``x`` and return its output."""
        return self.layers(x)

class SmapeLoss(nn.Module):
    """SMAPE (in percent) between predictions and targets given in log1p space.

    Both inputs are mapped back with ``expm1`` before the metric is computed:

        100 * mean( |pred - true| / ((|true| + |pred|) / 2 + epsilon) )

    Args:
        epsilon: small constant added to the denominator to avoid division by zero.
        max_log: upper bound applied to the predicted log value before
            ``expm1``. Without it a large float32 logit overflows to ``inf``
            and the ratio becomes ``inf / inf = NaN``, which poisons the
            gradients. For a clamped prediction the per-sample term is close
            to its limiting value of 200.
    """

    def __init__(self, epsilon=1e-8, max_log=80.0):
        super().__init__()
        self.epsilon = epsilon
        self.max_log = max_log

    def forward(self, y_pred_log, y_true_log):
        """Return the scalar SMAPE loss (percent) for a batch of log1p values."""
        # Only the prediction is clamped; targets are used as given.
        y_pred = torch.expm1(torch.clamp(y_pred_log, max=self.max_log))
        y_true = torch.expm1(y_true_log)
        numerator = torch.abs(y_pred - y_true)
        denominator = (torch.abs(y_true) + torch.abs(y_pred)) / 2
        return torch.mean(numerator / (denominator + self.epsilon)) * 100

# --- 6. Initialize and Train ---
# Seed PyTorch before the model is built so that weight initialisation,
# DataLoader shuffling and dropout masks are repeatable across runs.
torch.manual_seed(RANDOM_STATE)

input_size = X_train_sparse.shape[1]
model = MLP(input_size).to(device)
criterion = SmapeLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print("\n--- Fitting the PyTorch MLP model (Memory-Efficient)... ---")
start_time = time.time()
num_epochs = 60
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for features, labels in train_loader:
        features = features.to(device)
        # Targets arrive as a 1-D batch; reshape to (batch, 1) to match the model output.
        labels = labels.to(device).view(-1, 1)

        outputs = model(features)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Reported value is the mean of the per-batch losses for this epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], Average SMAPE Loss: {epoch_loss/len(train_loader):.4f}')
end_time = time.time()
print(f"\n--- FITTING COMPLETE ---")
print(f"Total training time: {(end_time - start_time)/60:.2f} minutes.")


--- Starting Memory-Efficient PyTorch MLP Run ---
Libraries imported successfully.
Using device: cuda
Preparing data with the scikit-learn pipeline...
Final training rows: 74891
Data transformed into a sparse matrix of shape: (74891, 8908)

--- Fitting the PyTorch MLP model (Memory-Efficient)... ---
Epoch [1/60], Average SMAPE Loss: 71.7077
Epoch [2/60], Average SMAPE Loss: 60.7538
Epoch [3/60], Average SMAPE Loss: 57.0746
Epoch [4/60], Average SMAPE Loss: 54.5282
Epoch [5/60], Average SMAPE Loss: 52.0985
Epoch [6/60], Average SMAPE Loss: 49.6944
Epoch [7/60], Average SMAPE Loss: 48.1245
Epoch [8/60], Average SMAPE Loss: 46.3193
Epoch [9/60], Average SMAPE Loss: 45.2769
Epoch [10/60], Average SMAPE Loss: 43.7166
Epoch [11/60], Average SMAPE Loss: 42.1652
Epoch [12/60], Average SMAPE Loss: 41.5176
Epoch [13/60], Average SMAPE Loss: 40.5393
Epoch [14/60], Average SMAPE Loss: 39.4096
Epoch [15/60], Average SMAPE Loss: 38.5153
Epoch [16/60], Average SMAPE Loss: 37.9257
Epoch [17/60], Avera

# **Fine Tuning**

---



In [None]:
# =============================================================================
# SECTION 6: FINE-TUNING THE MLP MODEL FOR A BETTER SCORE
# =============================================================================
# We take our already-trained model and fine-tune it with a lower learning
# rate to try and squeeze out a bit more performance.

print("--- Starting Fine-Tuning of the Trained MLP Model ---")

# --- 1. Lower the Learning Rate ---
# Every parameter group of the existing optimizer gets the new, smaller rate;
# the optimizer object itself (and its state) is reused.
new_lr = 0.0001
for param_group in optimizer.param_groups:
    param_group.update(lr=new_lr)

print(f"Optimizer's learning rate has been reduced to: {new_lr}")

# --- 2. Fine-Tune for a Few More Epochs ---
print("\n--- Fine-tuning for 30 more epochs... ---")
fine_tune_epochs = 30
start_time = time.time()

for epoch in range(fine_tune_epochs):
    model.train()
    batch_losses = []
    for features, labels in train_loader:
        # Move the batch to the training device; targets become a (batch, 1) column.
        features, labels = features.to(device), labels.to(device).view(-1, 1)

        outputs = model(features)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Sum of per-batch losses; the print below divides by the number of batches.
    epoch_loss = sum(batch_losses, 0.0)
    print(f'Fine-Tuning Epoch [{epoch+1}/{fine_tune_epochs}], Average SMAPE Loss: {epoch_loss/len(train_loader):.4f}')

end_time = time.time()
print(f"\n--- FINE-TUNING COMPLETE ---")
print(f"Total fine-tuning time: {(end_time - start_time)/60:.2f} minutes.")

# --- 3. Generate the New Submission File ---
print("\n--- Generating Final Fine-Tuned Submission File ---")

# Predict in eval mode (dropout off) and without gradient tracking. The test
# matrix is densified one batch at a time to keep memory bounded.
model.eval()
final_predictions = []
batch_size = 512
with torch.no_grad():
    for i in range(0, X_test_sparse.shape[0], batch_size):
        X_batch_sparse = X_test_sparse[i:i + batch_size]
        X_batch_dense = torch.tensor(X_batch_sparse.toarray(), dtype=torch.float32).to(device)
        outputs = model(X_batch_dense)
        final_predictions.append(outputs.cpu())

# The model outputs log1p(price); expm1 maps back to the price scale.
test_predictions_log = torch.cat(final_predictions).numpy()
final_predictions = np.expm1(test_predictions_log)

# Replace NaN / +-inf with the median of the *finite* predictions only.
# np.nanmedian skips NaN but still counts +-inf, so overflowed predictions
# would otherwise skew the replacement value (or make it infinite).
invalid_mask = ~np.isfinite(final_predictions)
if invalid_mask.any():
    finite_predictions = final_predictions[~invalid_mask]
    # Fall back to 0.0 if not a single prediction is finite.
    median_pred = float(np.median(finite_predictions)) if finite_predictions.size else 0.0
    final_predictions[invalid_mask] = median_pred

# Prices cannot be negative.
final_predictions = final_predictions.clip(min=0)

submission_df_mlp = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_predictions.flatten()
})

submission_df_mlp.to_csv("submission_mlp_finetuned.csv", index=False)
print("\nSubmission file 'submission_mlp_finetuned.csv' created successfully.")


--- Starting Fine-Tuning of the Trained MLP Model ---
Optimizer's learning rate has been reduced to: 0.0001

--- Fine-tuning for 30 more epochs... ---
Fine-Tuning Epoch [1/30], Average SMAPE Loss: 23.4028
Fine-Tuning Epoch [2/30], Average SMAPE Loss: 22.8269
Fine-Tuning Epoch [3/30], Average SMAPE Loss: 22.3797
Fine-Tuning Epoch [4/30], Average SMAPE Loss: 22.0501
Fine-Tuning Epoch [5/30], Average SMAPE Loss: 21.8376
Fine-Tuning Epoch [6/30], Average SMAPE Loss: 21.5299
Fine-Tuning Epoch [7/30], Average SMAPE Loss: 21.3155
Fine-Tuning Epoch [8/30], Average SMAPE Loss: 21.1088
Fine-Tuning Epoch [9/30], Average SMAPE Loss: 21.0551
Fine-Tuning Epoch [10/30], Average SMAPE Loss: 20.9033
Fine-Tuning Epoch [11/30], Average SMAPE Loss: 20.6626
Fine-Tuning Epoch [12/30], Average SMAPE Loss: 20.5472
Fine-Tuning Epoch [13/30], Average SMAPE Loss: 20.4289
Fine-Tuning Epoch [14/30], Average SMAPE Loss: 20.3042
Fine-Tuning Epoch [15/30], Average SMAPE Loss: 20.1544
Fine-Tuning Epoch [16/30], Averag

# **Submission**

---



In [None]:
# =============================================================================
# SECTION 7: GENERATE SUBMISSION (DEFINITIVE CRASH & ERROR FIX v2)
# =============================================================================
# This version fixes the .clip() typo and correctly handles NaN/inf values.

print("\n--- Generating Final Submission File (Definitive Fix v2) ---")

# The 'model' is already trained and in memory. Prediction and cleaning are
# re-run here from scratch.

# Predict in eval mode (dropout off) and without gradient tracking. The test
# matrix is densified one batch at a time to keep memory bounded.
model.eval()
final_predictions = []
batch_size = 512

with torch.no_grad():
    for i in range(0, X_test_sparse.shape[0], batch_size):
        X_batch_sparse = X_test_sparse[i:i + batch_size]
        X_batch_dense = torch.tensor(X_batch_sparse.toarray(), dtype=torch.float32).to(device)
        outputs = model(X_batch_dense)
        final_predictions.append(outputs.cpu())

# The model outputs log1p(price); expm1 maps back to the price scale.
test_predictions_log = torch.cat(final_predictions).numpy()
final_predictions = np.expm1(test_predictions_log)

print("Predictions generated successfully.")

# --- SAFETY CHECKS ---
# Replace NaN / +-inf with the median of the *finite* predictions only.
# np.nanmedian skips NaN but still counts +-inf, so overflowed predictions
# would otherwise skew the replacement value (or make it infinite).
invalid_mask = ~np.isfinite(final_predictions)
if invalid_mask.any():
    print("WARNING: Invalid values (NaN or infinity) found in predictions.")
    finite_predictions = final_predictions[~invalid_mask]
    # Fall back to 0.0 if not a single prediction is finite.
    median_pred = float(np.median(finite_predictions)) if finite_predictions.size else 0.0
    final_predictions[invalid_mask] = median_pred
    print(f"Replaced invalid values with the median prediction: {median_pred:.4f}")

# ndarray.clip takes min=/max= (pandas uses lower=/upper=).
final_predictions = final_predictions.clip(min=0)
print("Clipped all predictions to be non-negative.")

# --- CREATE AND SAVE THE SUBMISSION FILE ---
submission_df_mlp = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_predictions.flatten()
})

print(f"\nSubmission file shape: {submission_df_mlp.shape}")
print(f"Number of rows in original test set: {len(test_df)}")
if len(submission_df_mlp) != len(test_df):
    print("ERROR: Row count mismatch!")
else:
    print("Row count check passed.")

submission_df_mlp.to_csv("submission_mlp_final_v3.csv", index=False)
print("\nSubmission file 'submission_mlp_final_v3.csv' created successfully.")
print("This version is robust and should evaluate correctly.")




--- Generating Final Submission File (Definitive Fix v2) ---
Predictions generated successfully.
Replaced invalid values with the median prediction: 13.3140
Clipped all predictions to be non-negative.

Submission file shape: (75000, 2)
Number of rows in original test set: 75000
Row count check passed.

Submission file 'submission_mlp_final_v3.csv' created successfully.
This version is robust and should evaluate correctly.
