In [None]:
!pip install pandas numpy torch torchvision transformers scikit-learn tqdm requests



In [None]:
import pandas as pd
import numpy as np
import os
import re
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
from functools import partial

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used in this notebook live
# under /content/drive/MyDrive. This import is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import re
# --- CONFIGURATION (UPDATE PATHS AS NEEDED) ---
# Absolute locations of the train/test CSV files on the mounted Google Drive.
TRAIN_FILE = '/content/drive/MyDrive/student_resource/dataset/train.csv'
TEST_FILE = '/content/drive/MyDrive/student_resource/dataset/test.csv'
# NOTE(review): despite the "top N" name, this value is later compared against per-brand
# row counts (`brand_counts > TOP_N_BRANDS`), i.e. it acts as a minimum-frequency
# threshold rather than a number of brands to keep. Confirm which meaning is intended.
TOP_N_BRANDS = 100

# --- CRUCIAL ADDITION: Define Feature List Variables ---
# Numeric columns created by the feature-extraction function; they are median-imputed
# and standard-scaled in the preprocessing stage.
NUM_FEATURES = ['unit_vol', 'pack_count', 'total_qty']
# Raw categorical columns created by the feature-extraction function; they are dropped
# from the processed frame once their one-hot encodings have been added.
CAT_FEATURES = ['brand', 'unit_measure']
# Define the function that will NOT drop rows
def extract_structured_features_ROBUST(df):
    """
    Derive structured features (and the log target) from ``catalog_content``.

    No row is ever dropped: values that cannot be parsed become NaN or a
    default instead, so the result always has exactly as many rows as the input.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``catalog_content`` column. If a ``price`` column is
        present, ``log_price`` is added as well.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input frame is left untouched) with these columns added:

        * ``unit_vol``     - first "<number> <unit>" quantity found, NaN if none
        * ``unit_measure`` - the matched unit in lower case, ``'none'`` if none
        * ``pack_count``   - integer pack size, 1 if none found
        * ``total_qty``    - ``unit_vol`` (NaN treated as 1.0) times ``pack_count``
        * ``brand``        - first three words of the item name, ``'UNKNOWN'``
          when the content is not a string
        * ``log_price``    - ``log1p(price)``, only when ``price`` exists
    """
    # Work on a copy so that re-running a cell never sees a half-processed frame and
    # the caller's "raw" data stays raw.
    df = df.copy()
    content = df['catalog_content']

    # 1. IPQ/Unit Extraction (robust to errors)
    # One regex pass yields both the number (group 0) and the unit (group 1).
    # NOTE: the pattern requires whitespace between number and unit, so "12oz" is not matched.
    unit_pattern = r'(\d*\.?\d+)\s+(Ounce|oz|Lb|lb|Pound|ct|Count|Liter|LT|Fl Oz)'
    unit_match = content.str.extract(unit_pattern, flags=re.IGNORECASE)
    df['unit_vol'] = pd.to_numeric(unit_match[0], errors='coerce')
    df['unit_measure'] = unit_match[1].fillna('NONE').str.lower()

    # Pack count: missing or unparsable values fall back to a single pack.
    pack_match = content.str.extract(r'(Pack of|PK-| case| count)\s*(\d+)', flags=re.IGNORECASE)
    df['pack_count'] = pd.to_numeric(pack_match[1], errors='coerce').fillna(1).astype(int)

    # 2. Total Quantity Feature
    # unit_vol keeps its NaNs (imputed later); only the product treats them as 1.0.
    df['total_qty'] = df['unit_vol'].fillna(1.0) * df['pack_count']

    # 3. Brand Extraction
    def extract_brand(text):
        # Missing content arrives as None/NaN; anything that is not text has no brand.
        if not isinstance(text, str):
            return 'UNKNOWN'
        name_part = text.split(',')[0].split('Item Name: ')[-1]
        return ' '.join(name_part.split(' ')[:3])

    df['brand'] = content.apply(extract_brand)

    # 4. Target Transformation
    if 'price' in df.columns:
        df['log_price'] = np.log1p(df['price'])

    # catalog_content / image_link are intentionally kept; later stages drop them.
    return df


# --- Execution Flow with Verification ---
# TRAIN_FILE / TEST_FILE are defined once, in the configuration section at the top of
# this cell; they are not repeated here.

df_train = pd.read_csv(TRAIN_FILE)
df_test = pd.read_csv(TEST_FILE)

# Retain original raw text before processing
df_train_raw = df_train.copy()
df_test_raw = df_test.copy()

# Apply the fixed function
df_train = extract_structured_features_ROBUST(df_train)
df_test = extract_structured_features_ROBUST(df_test)

# --- CRITICAL VERIFICATION CHECK ---
# Number of test rows this pipeline is expected to produce predictions for.
EXPECTED_TEST_ROWS = 75000

if len(df_test) != EXPECTED_TEST_ROWS:
    print(f"\nFATAL ERROR: Row count mismatch after fixed extraction. Count is {len(df_test)}")
elif len(df_train) != len(df_train_raw):
    # The training frame must survive feature extraction intact as well.
    print(f"\nFATAL ERROR: TRAIN row count changed during extraction ({len(df_train_raw)} -> {len(df_train)})")
else:
    print(f"Stage 1 Complete. All {EXPECTED_TEST_ROWS:,} TEST rows retained. Proceed to Stage 2.")

Stage 1 Complete. All 75,000 TEST rows retained. Proceed to Stage 2.


In [None]:
# --- 1.1 & 1.2: Load Data and Extract Features ---
# Both CSVs are re-read with pandas' pure-Python parser, which is more tolerant of
# irregular quoting. With on_bad_lines='warn' a malformed line is skipped and reported
# as a warning instead of aborting the whole read.
_csv_read_options = {'engine': 'python', 'on_bad_lines': 'warn'}

df_train = pd.read_csv(TRAIN_FILE, **_csv_read_options)
df_test = pd.read_csv(TEST_FILE, **_csv_read_options)

# Snapshot the untouched frames; Stage 3 takes its TF-IDF text from these.
df_train_raw, df_test_raw = df_train.copy(), df_test.copy()

df_train = extract_structured_features_ROBUST(df_train)
df_test = extract_structured_features_ROBUST(df_test)

print("Stage 1 Complete. Data loaded successfully with Python engine.")

Stage 1 Complete. Data loaded successfully with Python engine.


In [None]:
# --- Configuration (Fit ONLY on Training Data) ---

# 1. Median-impute the numeric training columns (in place on df_train), then fit the
#    scaler on the imputed values. Only training statistics are ever used.
df_train[NUM_FEATURES] = df_train[NUM_FEATURES].fillna(df_train[NUM_FEATURES].median())
scaler = StandardScaler().fit(df_train[NUM_FEATURES])

# 2. Brands that are frequent enough in the training data to get their own indicator.
# NOTE(review): TOP_N_BRANDS is applied as a row-count threshold (brands occurring more
# than TOP_N_BRANDS times), not as "keep the N most frequent brands" - confirm intent.
brand_counts = df_train['brand'].value_counts()
top_brands = brand_counts.index[(brand_counts > TOP_N_BRANDS).to_numpy()].tolist()

def preprocess_structured_features(df, scaler, top_brands, num_fill_values=None):
    """
    Turn an extracted-feature frame into a model-ready numeric frame.

    Steps: impute NaNs in ``NUM_FEATURES``, scale them with the already-fitted
    ``scaler``, bucket every brand outside ``top_brands`` into ``'OTHER'``,
    one-hot encode brand and unit, and drop the raw text/categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of the feature-extraction step; must contain ``NUM_FEATURES``,
        ``CAT_FEATURES``, ``catalog_content`` and ``image_link``.
    scaler : fitted scaler exposing ``transform``
        Fitted on the training data only.
    top_brands : iterable of str
        Brands that keep their own indicator column.
    num_fill_values : mapping or pandas.Series, optional
        Per-column fill values for ``NUM_FEATURES``. When omitted, the medians
        of the module-level ``df_train`` are used (the original behaviour), so
        existing callers keep working unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    df_proc = df.copy()

    # Impute with TRAINING statistics so that train and test are treated identically.
    if num_fill_values is None:
        # Backward-compatible fallback; relies on the global df_train being the
        # feature-extracted training frame.
        num_fill_values = df_train[NUM_FEATURES].median().to_dict()
    df_proc[NUM_FEATURES] = df_proc[NUM_FEATURES].fillna(num_fill_values)

    # 2.2: Apply numerical scaling
    df_proc[NUM_FEATURES] = scaler.transform(df_proc[NUM_FEATURES])

    # Categorical Encoding: collapse the long tail of brands into 'OTHER'.
    # Vectorised set membership replaces a per-row scan of the top_brands list.
    brand = df_proc['brand']
    df_proc['brand_encoded'] = brand.where(brand.isin(set(top_brands)), 'OTHER')

    # One-Hot Encoding for the reduced set of categories
    ohe_features = ['brand_encoded', 'unit_measure']
    dummies = pd.get_dummies(df_proc[ohe_features], prefix=['brand', 'unit'])

    # 'unit_measure' appears in both lists, so de-duplicate while keeping order.
    drop_cols = list(dict.fromkeys(CAT_FEATURES + ohe_features + ['catalog_content', 'image_link']))
    df_proc = pd.concat([df_proc, dummies], axis=1).drop(columns=drop_cols)

    return df_proc

# --- 2.2: Apply Preprocessing ---
# preprocess_structured_features copies its input itself, so no extra .copy() is needed.
df_train_proc = preprocess_structured_features(df_train, scaler, top_brands)
df_test_proc = preprocess_structured_features(df_test, scaler, top_brands)

# --- 2.3: CRITICAL: Align columns for Test Set ---
# The model's feature set is defined by the training frame alone.
train_cols = [c for c in df_train_proc.columns if c not in ['sample_id', 'price', 'log_price']]

# A single reindex adds every indicator column the test set lacks (filled with 0), drops
# any that exist only in test, and puts the columns in training order. This avoids
# inserting columns one at a time in a loop.
df_test_proc = df_test_proc.reindex(columns=['sample_id'] + train_cols, fill_value=0)
df_train_proc = df_train_proc[['sample_id', 'log_price'] + train_cols]

print("Stage 2 Complete. Structured features scaled and encoded.")

Stage 2 Complete. Structured features scaled and encoded.


In [None]:
# --- 3.1 & 3.2: TF-IDF Vectorization ---
import pandas as pd
import numpy as np
import re
import os
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix # <-- ADD THIS LINE
import lightgbm as lgb
# Text comes from the untouched raw frames; missing descriptions become empty strings.
train_text = df_train_raw['catalog_content'].fillna('')
test_text = df_test_raw['catalog_content'].fillna('')

vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words='english',
    analyzer='word'
)

# The vocabulary and IDF weights are learned from TRAINING text only...
X_train_text = vectorizer.fit_transform(train_text)
# ...and merely applied to the TEST text.
X_test_text = vectorizer.transform(test_text)

# --- 3.3: Combine Text and Structured Features ---

# Select feature columns from processed DFs
FEATURE_COLS = [col for col in df_train_proc.columns if col not in ['sample_id', 'price', 'log_price']]

X_train_struct_df = df_train_proc[FEATURE_COLS]
X_test_struct_df = df_test_proc[FEATURE_COLS]

# Cast explicitly to float64 (the one-hot columns are not float) before going sparse.
X_train_struct = X_train_struct_df.to_numpy(dtype=np.float64)
X_test_struct = X_test_struct_df.to_numpy(dtype=np.float64)

# Combine the sparse text matrix with the dense structured matrix. Rows are matched
# purely by position, which holds because no stage drops or reorders rows.
# format='csr' is requested explicitly: left to its default, hstack typically returns a
# COO matrix, which cannot be row-sliced (e.g. for cross-validation splits).
X_train_final = hstack([X_train_text, csr_matrix(X_train_struct)], format='csr')
X_test_final = hstack([X_test_text, csr_matrix(X_test_struct)], format='csr')

y_train = df_train_proc['log_price']

print("Stage 3 Complete. Feature matrices stacked.")
print(f"Final Combined Feature Shape (Train): {X_train_final.shape}")

Stage 3 Complete. Feature matrices stacked.
Final Combined Feature Shape (Train): (75000, 10021)


In [None]:
# --- 4.1: Train LightGBM Model ---

# The regressor learns log1p(price); predictions are mapped back to prices below.
lgb_model = lgb.LGBMRegressor(
    n_estimators=1500,
    learning_rate=0.03,
    num_leaves=63,
    max_depth=-1,
    objective='regression',
    metric='mse',
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)

print("\nStarting LightGBM Training...")
lgb_model.fit(X_train_final, y_train)
print("Training complete.")

# --- 4.2 & 4.3: Prediction and Inverse Transformation ---

# Test-set predictions, still on the log scale.
y_pred_log = lgb_model.predict(X_test_final)

# Undo log1p with expm1, then floor at 0.01 so every price is strictly positive.
y_pred_price = np.clip(np.expm1(y_pred_log), 0.01, None)

# --- 4.4: Format Submission File ---

sample_ids = df_test_raw['sample_id']
output_df = sample_ids.to_frame('sample_id').assign(price=y_pred_price)

# Two-decimal prices and no index column, matching the sample output format.
OUTPUT_FILE = 'submission_baseline_lgbm_tfidf.csv'
output_df.to_csv(OUTPUT_FILE, index=False, float_format='%.2f')

print(f"\n✅ Submission file created: {OUTPUT_FILE}")
print("Your Text-Only LightGBM baseline is complete and ready for evaluation!")


Starting LightGBM Training...


In [None]:
import os
import pandas as pd
from pathlib import Path
import time
import os
import sys
import pandas as pd
from pathlib import Path
# Assuming drive.mount('/content/drive') was run successfully.

# --- 1. Define the Project Root and File Paths (defined once) ---
PROJECT_NAME = 'student_resource'
BASE_DIR = os.path.join('/content/drive/MyDrive', PROJECT_NAME)
DATASET_FOLDER = os.path.join(BASE_DIR, 'dataset')
IMAGE_DOWNLOAD_FOLDER = os.path.join(DATASET_FOLDER, 'product_images')

TRAIN_FILE = os.path.join(DATASET_FOLDER, 'train.csv')
TEST_FILE = os.path.join(DATASET_FOLDER, 'test.csv')

# --- 2. Make utils.py importable ---

# 2a. Add 'src' directory to system path (only once, so re-running the cell is harmless)
SRC_DIR = os.path.join(BASE_DIR, 'src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# 2b. Import the utility function after updating sys.path
try:
    # download_images is used by the image download step below
    from utils import download_images
    print("✅ utils.py imported successfully. Proceeding with loading.")
except ImportError:
    # Reached when utils.py is missing or fails while being imported
    print(f"FATAL ERROR: Could not import utils. Check that utils.py is in the {SRC_DIR} folder and has no errors inside.")
    raise  # Stop execution as subsequent steps will fail

# --- 3. Robust Data Loading ---
# Only the RAW frames are (re)loaded here - they supply the image links and the TF-IDF
# text. df_train / df_test are deliberately NOT reassigned: they hold the
# feature-extracted frames from Stage 1, and overwriting them with raw data would break
# any re-run of the preprocessing cells. Each CSV is parsed once.
df_train_raw = pd.read_csv(TRAIN_FILE, engine='python', on_bad_lines='warn')
df_test_raw = pd.read_csv(TEST_FILE, engine='python', on_bad_lines='warn')

print("Initial data loading complete.")


# --- 1. Cleanup and Deletion of Corrupt Files (Space Recovery) ---
def clean_and_verify_files(image_dir):
    """
    Delete zero-byte files from ``image_dir``.

    Zero-byte files are left-overs of failed downloads; removing them lets a
    later "is this file already present?" check treat them as missing.

    Parameters
    ----------
    image_dir : str
        Folder to clean. It is created (including parents) if it does not exist.

    Returns
    -------
    int
        0 if the folder had to be created, otherwise the number of entries
        left in the folder after the cleanup.
    """
    # Local import keeps this function self-contained without touching the import cell.
    import subprocess

    if not os.path.exists(image_dir):
        os.makedirs(image_dir, exist_ok=True)
        return 0

    # Walk the tree once before listing it; per the original author's note this nudges
    # the Colab/Drive mount into refreshing its file listing. The path is passed as an
    # argument list (no shell), so unusual characters in it cannot be misinterpreted.
    try:
        subprocess.run(['ls', '-R', image_dir], stdout=subprocess.DEVNULL, check=False)
    except OSError:
        # `ls` is not available on this platform; the refresh is best-effort only.
        pass

    zero_byte_count = 0
    for filename in os.listdir(image_dir):
        file_path = os.path.join(image_dir, filename)
        try:
            if os.path.getsize(file_path) == 0:
                os.remove(file_path)
                zero_byte_count += 1
        except OSError:
            # The entry vanished, is not removable, or is a directory: skip it and
            # continue with the rest (cleanup is best-effort).
            continue

    print(f"Cleanup finished. Deleted {zero_byte_count} zero-byte files.")
    return len(os.listdir(image_dir))

# Execute Cleanup: remove zero-byte files from the image folder (the folder is created
# if it does not exist yet) before working out which images still need downloading.
clean_and_verify_files(IMAGE_DOWNLOAD_FOLDER)


# --- 2. Optimized Download (Get ONLY the Missing Images) ---
def ensure_missing_images_downloaded():
    """
    Download every product image that is not yet in IMAGE_DOWNLOAD_FOLDER.

    The required set is the union of the non-null ``image_link`` values of
    df_train_raw and df_test_raw. A link counts as present when a file with
    the link's final path component already exists in the folder. Only the
    missing links are handed to ``download_images``.
    """
    # 2.1 Unique links needed by either split
    train_links = df_train_raw['image_link'].dropna().tolist()
    test_links = df_test_raw['image_link'].dropna().tolist()
    required_links = list(set(train_links + test_links))

    # 2.2 Keep only the links whose file name is not in the folder yet
    present_names = set(os.listdir(IMAGE_DOWNLOAD_FOLDER))
    pending_links = [url for url in required_links if Path(url).name not in present_names]

    if len(pending_links) == 0:
        print(f"✅ All {len(required_links)} required images are present and clean. Proceeding...")
        return

    print(f"Total required unique images: {len(required_links)}")
    print(f"⚡ Starting download for {len(pending_links)} missing images...")

    # 2.3 Fetch just the missing ones
    download_images(pending_links, IMAGE_DOWNLOAD_FOLDER)

    print("Image download process finished.")
    print(f"Total files in folder: {len(os.listdir(IMAGE_DOWNLOAD_FOLDER))}")

# Execute the final download step: fetch only the images whose file names are not yet
# present in IMAGE_DOWNLOAD_FOLDER.
ensure_missing_images_downloaded()

In [None]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from tqdm import tqdm
import numpy as np

# --- 1. Image Preprocessing Pipeline ---
# Every image is resized to 224x224, converted to a float tensor and normalised per
# channel with the standard ImageNet statistics used by torchvision's pretrained models.
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

_pipeline_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
]
IMAGE_TRANSFORM = transforms.Compose(_pipeline_steps)

def preprocess_image(image_path):
    """
    Load one image file and return it as a normalised ``(3, 224, 224)`` tensor.

    Parameters
    ----------
    image_path : str
        Path of the image file on disk.

    Returns
    -------
    torch.Tensor
        ``IMAGE_TRANSFORM`` applied to the RGB-converted image, or an all-zero
        ``(3, 224, 224)`` tensor when the file is missing, unreadable or corrupt.
    """
    try:
        # The context manager closes the underlying file handle deterministically.
        # Without it, handles can stay open until garbage collection, which adds up
        # over tens of thousands of images.
        with Image.open(image_path) as image:
            return IMAGE_TRANSFORM(image.convert('RGB'))
    except Exception:
        # Deliberately broad: any load/decode failure falls back to a blank image so a
        # single bad file never aborts feature extraction.
        return torch.zeros(3, 224, 224)

# --- 2. CNN Feature Extraction ---
def extract_cnn_features(df_raw, image_dir, batch_size=64, model_name='resnet18'):
    """
    Embed every row's product image with an ImageNet-pretrained CNN.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Must contain an ``image_link`` column; the file is looked up in
        ``image_dir`` under the link's final path component.
    image_dir : str
        Folder holding the downloaded images.
    batch_size : int, default 64
        Number of images pushed through the network at once.
    model_name : str, default 'resnet18'
        Name of a ``torchvision.models`` builder whose network exposes its
        classifier as ``.fc`` (the ResNet family).

    Returns
    -------
    numpy.ndarray
        One feature row per input row, in input order. Rows whose image is
        missing or unreadable (or whose link is not a string) are the
        embedding of an all-zero image.

    Raises
    ------
    ValueError
        If the chosen architecture has no ``fc`` layer to strip.
    """
    # Resolve the weights by name on the selected builder instead of hard-coding
    # ResNet18_Weights, so that `model_name` really is interchangeable. For the default
    # 'resnet18' this loads exactly the same IMAGENET1K_V1 weights as before.
    model = getattr(models, model_name)(weights='IMAGENET1K_V1')

    if not hasattr(model, 'fc'):
        raise ValueError(
            f"Model '{model_name}' has no 'fc' layer; only architectures with an "
            f"'fc' classifier head (e.g. ResNets) are supported."
        )
    feature_dim = model.fc.in_features

    # Remove the final classification layer (for feature extraction)
    model.fc = nn.Identity()
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # run on GPU when one is available

    links = df_raw['image_link'].tolist()

    def load_tensor(link):
        # A missing link (NaN) cannot be turned into a path; use the blank fallback.
        if not isinstance(link, str):
            return torch.zeros(3, 224, 224)
        return preprocess_image(os.path.join(image_dir, Path(link).name))

    features = []

    with torch.no_grad():
        for start in tqdm(range(0, len(links), batch_size), desc="Extracting Image Features"):
            # Images are decoded one batch at a time: holding every 3x224x224 float
            # tensor in memory at once would need roughly 0.6 MB per image.
            batch = torch.stack([load_tensor(link) for link in links[start:start + batch_size]])
            output = model(batch.to(device))
            features.append(output.cpu().numpy())

    if not features:
        # Empty input: np.vstack([]) would raise, so return a correctly shaped empty array.
        return np.empty((0, feature_dim), dtype=np.float32)
    return np.vstack(features)

# --- Execution ---
# Read the images from the very folder they were downloaded into. A relative path such
# as 'dataset/product_images' is resolved against the current working directory and does
# not point at the Drive folder; because unreadable images silently fall back to blank
# tensors, a wrong folder would yield meaningless features without any error.
IMAGE_DIR = IMAGE_DOWNLOAD_FOLDER
if not os.path.isdir(IMAGE_DIR):
    raise FileNotFoundError(f"Image folder not found: {IMAGE_DIR}")

X_train_img_features = extract_cnn_features(df_train_raw, IMAGE_DIR)
X_test_img_features = extract_cnn_features(df_test_raw, IMAGE_DIR)

print("Phase 1 Complete. Image features extracted using Transfer Learning.")

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 72.1MB/s]


In [None]:
# --- 3. Model 2: Image-Only Price Prediction ---
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_predict

# Target on the log1p scale, as produced by the structured preprocessing stage.
y_train = df_train_proc['log_price']

lgb_image_model = lgb.LGBMRegressor(
    objective='regression', n_estimators=1000, learning_rate=0.05, n_jobs=-1, random_state=42, verbose=-1
)

print("\nStarting Image-Only LightGBM Training...")
lgb_image_model.fit(X_train_img_features, y_train)

# Test-set predictions (log scale) come from the models fitted on ALL training rows.
# lgb_model / X_test_final are the Text+Struct model and matrix from the earlier cells.
preds_test_img_log = lgb_image_model.predict(X_test_img_features)
preds_test_text_log = lgb_model.predict(X_test_final)

# --- 4. Ensemble (Blending) ---

# 1. Out-of-fold training predictions for the blender.
#    Predicting the training rows with models that were fitted on those same rows gives
#    over-optimistic inputs, and the blender would simply favour whichever base model
#    memorises the training set best. cross_val_predict instead predicts every row with
#    a clone that never saw it; the fitted lgb_model / lgb_image_model stay untouched.
#    Cost: each base model is refitted BLEND_CV_FOLDS times.
BLEND_CV_FOLDS = 5
blend_cv = KFold(n_splits=BLEND_CV_FOLDS, shuffle=True, random_state=42)

preds_train_text_log = cross_val_predict(lgb_model, X_train_final.tocsr(), y_train, cv=blend_cv)
preds_train_img_log = cross_val_predict(lgb_image_model, X_train_img_features, y_train, cv=blend_cv)

# 2. Prepare Blending Data (column 0: text+struct model, column 1: image model)
X_blend_train = np.column_stack([preds_train_text_log, preds_train_img_log])
X_blend_test = np.column_stack([preds_test_text_log, preds_test_img_log])

# 3. Train the Blending Model (Simple and fast Ridge Regression)
blender = Ridge(alpha=1.0)  # A small regularization
blender.fit(X_blend_train, y_train)

# 4. Generate Final Log Predictions
final_preds_test_log = blender.predict(X_blend_test)

# --- 5. Final Submission ---

# Undo log1p, then floor at 0.01 so every predicted price is strictly positive.
final_preds_price = np.maximum(0.01, np.expm1(final_preds_test_log))

# One row per test sample, in the original test order.
output_df_ensemble = pd.DataFrame({
    'sample_id': df_test_raw['sample_id'],
    'price': final_preds_price
})

OUTPUT_FILE = 'submission_multimodal_ensemble.csv'
output_df_ensemble.to_csv(
    OUTPUT_FILE,
    index=False,
    header=True,
    float_format='%.2f'
)

print(f"\n✅ Multimodal Ensemble Submission file created: {OUTPUT_FILE}")
print("This blended model is optimized for high accuracy and speed.")

In [None]:
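# NOTE: The first submission was generated with the code below. It is kept for reference only and is disabled by being wrapped in a triple-quoted string.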
'''
# --- Final Submission Code (Stage 4) ---

# Get sample_ids from the test set
sample_ids = df_test_raw['sample_id']

output_df = pd.DataFrame({
    'sample_id': sample_ids,
    'price': y_pred_price
})

# Save to CSV (CRITICAL: Ensure HEADER is included by default, or explicitly set to True)
OUTPUT_FILE = 'submission_final_with_header.csv'
output_df.to_csv(
    OUTPUT_FILE,
    index=False,
    header=True,             # <-- Ensure header is explicitly set to True
    float_format='%.2f'
)

print(f"\n✅ Submission file created: {OUTPUT_FILE}")'''


✅ Submission file created: submission_final_with_header.csv
