In [1]:
# ---------------------------------------------------------------------------
# ML Challenge 2025: Smart Product Pricing
# Full End-to-End Pipeline with Validation
#
# >> PyTorch Version <<
#
# This single script handles the entire workflow:
# 1. Text Feature Engineering
# 2. Parallel Image Downloading & Feature Extraction
# 3. Combined Model Training & SMAPE Validation
# ---------------------------------------------------------------------------

import pandas as pd
import numpy as np
import re
import os
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import concurrent.futures

# ML/DL Libraries
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split ## NEW ## Import for validation split
from scipy.sparse import hstack, csr_matrix
import torch
import timm
from torchvision import transforms

# Startup banner: report the library versions in use so that a run log can be
# matched to the environment that produced it.
print("--- ML Challenge 2025: Full Pipeline Initializing (PyTorch Version) ---")
print(f"PyTorch Version: {torch.__version__}")
print(f"Timm Version: {timm.__version__}")

# === GPU Verification ===
# Pick the compute device once at import time; `device` and `is_gpu_available`
# are module-level names used by the rest of the script.
is_gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if is_gpu_available else "cpu")

if not is_gpu_available:
    # CPU fallback still works, but warn because feature extraction is slow.
    print("⚠️ Warning: No GPU found.")
    print("   PyTorch will run on the CPU, which will be very slow for image processing.")
else:
    print(f"✅ Success! Found GPU: {torch.cuda.get_device_name(0)}")
    print("   PyTorch will use the GPU for image processing.")
print("-" * 60)

# === Main Configuration ===
# All inputs and outputs live under DATA_PATH: raw product images are cached in
# IMAGE_DIR and extracted embeddings in FEATURES_DIR.
DATA_PATH = 'dataset/'
IMAGE_DIR = os.path.join(DATA_PATH, 'product_images/')
FEATURES_DIR = os.path.join(DATA_PATH, 'features/')

# Create the working directories up front so later steps can write freely.
for _required_dir in (IMAGE_DIR, FEATURES_DIR):
    os.makedirs(_required_dir, exist_ok=True)


## NEW ## Function to calculate SMAPE score
def calculate_smape(y_true, y_pred):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (SMAPE), in percent.

    SMAPE = mean(|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)) * 100

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, broadcastable against ``y_true``.

    Returns:
        The SMAPE as a float. Pairs where both the true and the predicted
        value are zero contribute 0 to the mean. Empty input yields NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0 or y_pred.size == 0:
        # Nothing to average; return NaN explicitly instead of letting
        # np.mean emit a "mean of empty slice" warning.
        return float("nan")

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Divide only where the denominator is non-zero. np.where() would still
    # evaluate the full division first and raise a RuntimeWarning on 0/0.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100


# === Strategy 1: Text Feature Engineering Functions ===
def extract_ipq(text):
    """Return the Item Pack Quantity (IPQ) found in ``text``, or 1 if none is found.

    The input is stringified and lower-cased, then the patterns below are
    tried in order; the first one that matches determines the result.
    """
    lowered = str(text).lower()
    # Order matters: earlier patterns take priority over later ones.
    quantity_patterns = (
        r'pack of (\d+)',
        r'(\d+)\s*x\s',
        r'(\d+)\s*count',
        r'(\d+)\s*ct',
        r'(\d+)\s*pack',
        r'(\d+)\s*pcs',
        r'(\d+)\s*pk',
        r'(\d+)\s*ea',
    )
    candidates = (re.search(regex, lowered) for regex in quantity_patterns)
    first_hit = next((hit for hit in candidates if hit), None)
    if first_hit is None:
        return 1
    return int(first_hit.group(1))


# === Strategy 2: Image Processing Functions ===
def download_single_image(args):
    """Download one image and store it as a JPEG; designed for multithreading.

    Args:
        args: A ``(image_url, filename)`` tuple. Packed into a single argument
            so the function can be used directly with ``executor.map``.

    Returns:
        "Skipped" if ``filename`` already exists, "Downloaded" on success, or
        "Failed" if the download, decode, or save raised an exception.
    """
    image_url, filename = args
    if os.path.exists(filename):
        return "Skipped"

    # Write to a sibling temp file and rename at the end. Saving straight to
    # `filename` could leave a truncated JPEG behind if the write is
    # interrupted, and later runs would then treat that file as "Skipped".
    partial_path = f"{filename}.part"
    try:
        response = requests.get(image_url, timeout=20)
        response.raise_for_status()
        with Image.open(BytesIO(response.content)) as img:
            img.convert('RGB').save(partial_path, "JPEG")
        os.replace(partial_path, filename)
        return "Downloaded"
    except Exception:
        # Deliberately broad: this is a best-effort bulk download, and one bad
        # URL or undecodable payload must not abort the whole thread pool.
        try:
            os.remove(partial_path)
        except OSError:
            # No partial file was created, or it is already gone.
            pass
        return "Failed"

def download_images(df, save_dir, max_workers=16):
    """Download the images referenced by a dataframe in parallel using threads.

    Args:
        df: DataFrame with a 'sample_id' column (used for the file name) and
            an 'image_link' column (the URL to fetch).
        save_dir: Directory to write ``<sample_id>.jpg`` files into; created
            if it does not exist.
        max_workers: Number of download threads. Defaults to 16.

    Returns:
        None. A summary of downloaded / skipped / failed counts is printed.
    """
    os.makedirs(save_dir, exist_ok=True)
    print(f"Checking and downloading images to {save_dir} using multiple threads...")
    # Build tasks straight from the columns. iterrows() is slow and does not
    # preserve dtypes, so its file names could differ from the
    # f"{sample_id}.jpg" names that extract_image_features looks up.
    tasks = [
        (image_link, os.path.join(save_dir, f"{sample_id}.jpg"))
        for sample_id, image_link in zip(df['sample_id'], df['image_link'])
    ]
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(executor.map(download_single_image, tasks), total=len(tasks), desc="Downloading images"))
    print(f"Download process complete. Downloaded: {results.count('Downloaded')}, Skipped: {results.count('Skipped')}, Failed: {results.count('Failed')}")

def extract_image_features(image_dir, df, model, device):
    """Extract one image embedding per row of ``df`` with a pre-trained PyTorch CNN.

    Args:
        image_dir: Directory containing ``<sample_id>.jpg`` files.
        df: DataFrame with a 'sample_id' column; output rows follow its order.
        model: Feature extractor exposing ``num_features`` and returning one
            embedding per input image when called.
        device: torch device the input tensors are moved to.

    Returns:
        A float32 array of shape ``(len(df), model.num_features)``. Rows whose
        image is missing or cannot be decoded are left as zeros.
    """
    IMG_SIZE = 224
    transform = transforms.Compose([
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        # NOTE(review): these look like the usual ImageNet channel statistics;
        # confirm they match the pretrained backbone's expected preprocessing.
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    sample_ids = list(df['sample_id'])
    # Pre-allocate in float32. Mixing float64 zero placeholders with float32
    # embeddings upcasts the whole matrix, and an empty frame would come back
    # with shape (0,) instead of (0, num_features).
    features = np.zeros((len(sample_ids), model.num_features), dtype=np.float32)
    missing_count = 0
    unreadable_count = 0

    model.eval()
    print("Extracting image features with PyTorch...")
    with torch.no_grad():
        for row_idx, sample_id in enumerate(tqdm(sample_ids, desc="Extracting features")):
            img_path = os.path.join(image_dir, f"{sample_id}.jpg")
            if not os.path.exists(img_path):
                missing_count += 1
                continue
            try:
                # The context manager closes the file handle promptly.
                with Image.open(img_path) as img:
                    img_tensor = transform(img.convert('RGB')).unsqueeze(0)
            except Exception:
                # Best-effort: a corrupt image keeps its all-zero row.
                unreadable_count += 1
                continue
            # Inference is kept outside the try block so that genuine model or
            # device errors surface instead of silently becoming zero vectors.
            feature = model(img_tensor.to(device))
            features[row_idx] = feature.cpu().numpy().ravel()

    if missing_count or unreadable_count:
        print(f"Zero-filled features for {missing_count} missing and {unreadable_count} unreadable images.")
    return features


# === Main Execution Block ===
if __name__ == "__main__":

    # --- Step 1: Load original data ---
    print("\n--- Step 1: Loading original CSV data ---")
    train_df = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
    test_df = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
    print("CSV data loaded successfully.")

    # --- Step 2: Process Text Features ---
    print("\n--- Step 2: Processing text features ---")
    train_df['ipq'] = train_df['catalog_content'].apply(extract_ipq)
    test_df['ipq'] = test_df['catalog_content'].apply(extract_ipq)
    print("IPQ feature created.")

    # TfidfVectorizer cannot handle NaN entries, so missing descriptions are
    # turned into empty documents (extract_ipq already tolerates them).
    train_text = train_df['catalog_content'].fillna('').astype(str)
    test_text = test_df['catalog_content'].fillna('').astype(str)

    # NOTE(review): the vocabulary/IDF is fit on the full training frame, i.e.
    # before the validation split in Step 4, so validation rows influence it.
    vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
    X_train_text = vectorizer.fit_transform(train_text)
    X_test_text = vectorizer.transform(test_text)
    print("Text vectorization complete.")

    # --- Step 3: Process Image Features (Download & Extract) ---
    print("\n--- Step 3: Processing image features ---")
    train_features_path = os.path.join(FEATURES_DIR, 'train_image_features.npy')
    test_features_path = os.path.join(FEATURES_DIR, 'test_image_features.npy')
    train_image_dir = os.path.join(IMAGE_DIR, 'train/')
    test_image_dir = os.path.join(IMAGE_DIR, 'test/')

    use_cached_features = os.path.exists(train_features_path) and os.path.exists(test_features_path)
    if use_cached_features:
        print("Found pre-computed image features. Loading from disk.")
        train_image_features = np.load(train_features_path)
        test_image_features = np.load(test_features_path)
        # A cache produced from different CSVs would be misaligned with the
        # current rows (and break hstack below), so verify the row counts.
        if (train_image_features.shape[0] != len(train_df)
                or test_image_features.shape[0] != len(test_df)):
            print("Cached image features do not match the current CSV row counts. Recomputing.")
            use_cached_features = False

    if not use_cached_features:
        print("Pre-computed features not found. Running the full image pipeline...")
        # 3a. Download images
        download_images(train_df, train_image_dir)
        download_images(test_df, test_image_dir)
        print("Image download complete.")

        # 3b. Extract features
        print(f"Loading EfficientNetB0 model onto device: {device.type}")
        cnn_model = timm.create_model(
            'efficientnet_b0', pretrained=True, num_classes=0
        ).to(device)
        train_image_features = extract_image_features(train_image_dir, train_df, cnn_model, device)
        test_image_features = extract_image_features(test_image_dir, test_df, cnn_model, device)

        # 3c. Save features
        np.save(train_features_path, train_image_features)
        np.save(test_features_path, test_image_features)
        print("Image features extracted and saved to disk for future runs.")

    print(f"Shape of training image features: {train_image_features.shape}")

    # --- Step 4: Combine All Features and Create Validation Set ---
    print("\n--- Step 4: Combining features and creating validation set ---")
    train_ipq_sparse = csr_matrix(train_df['ipq'].values.reshape(-1, 1))
    test_ipq_sparse = csr_matrix(test_df['ipq'].values.reshape(-1, 1))
    train_image_sparse = csr_matrix(train_image_features)
    test_image_sparse = csr_matrix(test_image_features)

    # Column layout: [TF-IDF terms | ipq | image embedding]. Request CSR
    # explicitly: hstack defaults to COO, which does not support the row
    # indexing that the train/validation split performs.
    X_train_combined = hstack([X_train_text, train_ipq_sparse, train_image_sparse], format='csr')
    X_test_combined = hstack([X_test_text, test_ipq_sparse, test_image_sparse], format='csr')
    y_train_full = train_df['price']

    # Split data into training and validation sets (80/20 split)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_combined, y_train_full, test_size=0.2, random_state=42
    )
    print(f"Original training data shape: {X_train_combined.shape}")
    print(f"New training set shape: {X_train.shape}")
    print(f"Validation set shape: {X_val.shape}")

    # --- Step 5: Train Model and Validate ---
    print("\n--- Step 5: Training the final LightGBM model ---")
    lgb_params = {
        'objective': 'regression_l1', 'metric': 'mae', 'n_estimators': 3000,
        'learning_rate': 0.02, 'feature_fraction': 0.8, 'bagging_fraction': 0.8,
        'bagging_freq': 1, 'lambda_l1': 0.1, 'lambda_l2': 0.1,
        'num_leaves': 40, 'verbose': -1, 'n_jobs': -1, 'seed': 42
    }
    model = lgb.LGBMRegressor(**lgb_params)

    # Train on the 80% split only; the held-out 20% is used for SMAPE below.
    # NOTE(review): this same model (not one refit on all rows) also produces
    # the submission in Step 6.
    model.fit(X_train, y_train)
    print("Model training complete!")

    # Evaluate model performance on the validation set
    print("\n--- Evaluating model performance on the validation set ---")
    val_predictions = model.predict(X_val)
    smape_score = calculate_smape(y_val, val_predictions)
    print(f"✅ Validation SMAPE Score: {smape_score:.4f}%")

    # --- Step 6: Generate Submission File ---
    print("\n--- Step 6: Generating final submission file ---")
    predictions = model.predict(X_test_combined)
    # Clamp negative predictions to zero before rounding to two decimals.
    predictions[predictions < 0] = 0
    predictions = np.round(predictions, 2)

    submission_df = pd.DataFrame({'sample_id': test_df['sample_id'], 'price': predictions})
    submission_path = os.path.join(DATA_PATH, 'submission_final_local_pytorch.csv')
    submission_df.to_csv(submission_path, index=False)
    print(f"Submission file created successfully at: {submission_path}")
    print(submission_df.head())

  from .autonotebook import tqdm as notebook_tqdm


--- ML Challenge 2025: Full Pipeline Initializing (PyTorch Version) ---
PyTorch Version: 2.5.1
Timm Version: 1.0.20
✅ Success! Found GPU: NVIDIA GeForce RTX 3050 Laptop GPU
   PyTorch will use the GPU for image processing.
------------------------------------------------------------

--- Step 1: Loading original CSV data ---
CSV data loaded successfully.

--- Step 2: Processing text features ---
IPQ feature created.
Text vectorization complete.

--- Step 3: Processing image features ---
Found pre-computed image features. Loading from disk.
Shape of training image features: (75000, 1280)

--- Step 4: Combining features and creating validation set ---
Original training data shape: (75000, 11281)
New training set shape: (60000, 11281)
Validation set shape: (15000, 11281)

--- Step 5: Training the final LightGBM model ---
Model training complete!

--- Evaluating model performance on the validation set ---




✅ Validation SMAPE Score: 53.8563%

--- Step 6: Generating final submission file ---
Submission file created successfully at: dataset/submission_final_local_pytorch.csv
   sample_id  price
0     100179  14.48
1     245611  21.33
2     146263  17.84
3      95658   6.63
4      36806  16.01
