# CatBoost Model - Kaggle GPU Runner

This notebook trains the CatBoost model on Kaggle's GPU infrastructure, including the complete preprocessing pipeline.

## Before Running:
1. **Enable GPU**: Go to Settings → Accelerator → Select "GPU" (P100 or T4)
2. **Add Competition Data**: Click "Add Input" → Search for "house-prices-advanced-regression-techniques" → Add it
3. **Run all cells** in order

## What This Notebook Does:
- Fixes numpy/scipy compatibility issues
- Clones your GitHub repository
- Sets up Kaggle environment
- Runs complete preprocessing pipeline (stages 1-8)
- Trains CatBoost model with GPU acceleration
- Generates submission file


In [None]:
# Step 1: Clone repository and install dependencies
# CRITICAL: Fix numpy/scipy FIRST before any imports
#
# Every pip invocation goes through _pip(), which reports a non-zero exit
# status instead of discarding it, so a failed install is no longer followed
# by an unconditional "[OK]" message. Install failures are reported but do not
# abort the cell (the original was best-effort too); a failed clone still raises.

import importlib
import importlib.metadata
import shutil
import subprocess
import os
import sys

NUMPY_VERSION = "1.26.4"
SCIPY_VERSION = "1.16.3"

# Human-readable descriptions of every pip command that exited non-zero.
pip_failures = []


def _pip(*args):
    """Run ``python -m pip <args>`` with captured output.

    Returns True when pip exits with status 0. On failure the command is
    recorded in ``pip_failures``, a warning plus the last 500 characters of
    pip's output is printed, and False is returned (no exception is raised).
    """
    proc = subprocess.run(
        [sys.executable, "-m", "pip", *args],
        capture_output=True,
        text=True,
    )
    if proc.returncode == 0:
        return True
    command = " ".join(args)
    pip_failures.append(command)
    print(f"[WARNING] pip {command} failed (exit code {proc.returncode})")
    tail = (proc.stderr or proc.stdout or "").strip()[-500:]
    if tail:
        print(tail)
    return False


# ======================================================================
# STEP 1: Fix numpy/scipy compatibility FIRST
# ======================================================================
print("=" * 70)
print("FIXING NUMPY/SCIPY COMPATIBILITY")
print("=" * 70)

# Uninstall numpy 2.x completely
print("Uninstalling numpy 2.x...")
_pip("uninstall", "-y", "numpy")

# Install the pinned numpy explicitly
print(f"Installing numpy {NUMPY_VERSION}...")
_pip("install", "--no-cache-dir", f"numpy=={NUMPY_VERSION}")

# Install the pinned scipy
print(f"Installing scipy {SCIPY_VERSION}...")
_pip("install", "--no-cache-dir", f"scipy=={SCIPY_VERSION}")

# Verify versions as seen by the running kernel
import numpy as np
import scipy
print(f"\n✓ Numpy: {np.__version__}")
print(f"✓ Scipy: {scipy.__version__}")
if np.__version__ != NUMPY_VERSION:
    # A module that was already imported keeps its old version until restart.
    print(f"[WARNING] Expected numpy {NUMPY_VERSION} but the kernel loaded "
          f"{np.__version__}; if numpy was imported before this cell ran, "
          "restart the kernel and re-run.")
print("=" * 70)

# ======================================================================
# STEP 2: Clone repository
# ======================================================================
REPO_URL = "https://github.com/FranckNgN/kaggle-house-prices.git"
PROJECT_DIR = "/kaggle/working/project"

# Ensure we're in a valid directory first: on a re-run the current directory
# may be inside PROJECT_DIR, which is about to be deleted.
os.chdir("/kaggle/working")
print(f"\nCurrent directory: {os.getcwd()}")

# Remove existing directory if it exists (for re-runs)
if os.path.exists(PROJECT_DIR):
    shutil.rmtree(PROJECT_DIR)
    print(f"[INFO] Removed existing project directory")

# Clone repository
print(f"\nCloning repository from {REPO_URL}...")
clone_proc = subprocess.run(
    ["git", "clone", REPO_URL, PROJECT_DIR],
    cwd="/kaggle/working",
    capture_output=True,
    text=True,
)

if clone_proc.returncode != 0:
    print(f"ERROR: Failed to clone repository")
    print(f"stdout: {clone_proc.stdout}")
    print(f"stderr: {clone_proc.stderr}")
    raise RuntimeError("Repository clone failed")

print(f"[OK] Repository cloned to {PROJECT_DIR}")

# Change to project directory so relative paths (requirements.txt) resolve
os.chdir(PROJECT_DIR)
print(f"Current directory: {os.getcwd()}")

# ======================================================================
# STEP 3: Install dependencies (skip numpy/scipy since we already fixed them)
# ======================================================================
print("\nInstalling dependencies (excluding numpy/scipy)...")
# --no-deps stops pip from pulling in transitive dependencies.
# NOTE(review): it does NOT skip numpy/scipy if requirements.txt lists them
# directly; the pin is re-checked at the end of this cell for that reason.
_pip("install", "-r", "requirements.txt", "--no-deps")

# Install core dependencies individually
print("Installing core ML libraries...")
core_deps = ["pandas==2.3.3", "scikit-learn==1.7.2", "optuna>=3.0.0",
             "xgboost>=2.0.0", "lightgbm>=4.0.0", "catboost>=1.2.0"]
for dep in core_deps:
    _pip("install", dep)

# Re-check the numpy pin on disk: the installs above run after the earlier
# version printout and could have replaced the pinned build.
importlib.invalidate_caches()
try:
    installed_numpy = importlib.metadata.version("numpy")
except importlib.metadata.PackageNotFoundError:
    installed_numpy = None
if installed_numpy != NUMPY_VERSION:
    print(f"[WARNING] Installed numpy is now {installed_numpy}, "
          f"expected {NUMPY_VERSION}")

if pip_failures:
    print(f"[WARNING] {len(pip_failures)} pip command(s) failed:")
    for command in pip_failures:
        print(f"  pip {command}")
else:
    print("[OK] Dependencies installed")

# Add project to Python path (front of the list so project packages win)
sys.path.insert(0, PROJECT_DIR)
print(f"\n[OK] Project root added to Python path")
print("=" * 70)
if pip_failures:
    print("[WARNING] Cell 1 finished with install problems - "
          "review the warnings above before running Cell 2")
else:
    print("[SUCCESS] Cell 1 complete - ready for Cell 2!")
print("=" * 70)


In [None]:
# Step 2: Setup Kaggle environment (symlinks, GPU verification)
# Requires Cell 1 to have run first: the import below resolves against the
# cloned project, which Cell 1 inserts at the front of sys.path.
# NOTE(review): the project's top-level package is named `kaggle`, the same
# name as the Kaggle API client on PyPI - presumably the project copy wins
# because of the sys.path ordering; confirm if this import ever fails.
from kaggle.remote.setup_kaggle import setup_kaggle_environment

# The behaviour of this call is defined in the project, not in this notebook.
setup_kaggle_environment()


In [None]:
# Step 3: Run preprocessing pipeline (stages 1-8)
# Runs the project's preprocessing script in a subprocess, then verifies that
# train_process8.csv and test_process8.csv (required for model training) exist.
# The script's output is captured, so its tail is printed on success AND on
# failure; otherwise there would be nothing "above" to inspect.

import subprocess
import os
import sys
from pathlib import Path

PROJECT_DIR = "/kaggle/working/project"
os.chdir(PROJECT_DIR)
sys.path.insert(0, PROJECT_DIR)

RULE = "=" * 70
STAGES = [
    "Cleaning",
    "Data Engineering",
    "Skew/Kurtosis",
    "Feature Engineering",
    "Scaling",
    "Categorical Encoding",
    "Feature Selection",
    "Target Encoding",
]


def _print_tail(text, max_lines=20):
    """Print the non-blank lines among the last *max_lines* lines of *text*."""
    for line in text.split('\n')[-max_lines:]:
        if line.strip():
            print(f"  {line}")


print(RULE)
print("RUNNING PREPROCESSING PIPELINE")
print(RULE)
print("This will run all 8 preprocessing stages:")
for number, stage in enumerate(STAGES, start=1):
    print(f"  {number}. {stage}")
print(RULE)

# Run preprocessing
print("\nStarting preprocessing pipeline...")
result = subprocess.run(
    [sys.executable, "notebooks/preprocessing/run_preprocessing.py"],
    cwd=PROJECT_DIR,
    capture_output=True,
    text=True
)

print("\n" + RULE)
if result.returncode == 0:
    print("[OK] Preprocessing completed successfully!")
else:
    print(f"[WARNING] Preprocessing had errors "
          f"(exit code {result.returncode}, details below)")

# Show the last part of the captured output in both cases.
if result.stdout:
    print("\nLast 20 lines of output:")
    _print_tail(result.stdout)
if result.returncode != 0 and result.stderr:
    print("\nError output (last 500 chars):")
    print(result.stderr[-500:])

# Verify required files exist
print("\n" + RULE)
print("VERIFYING PREPROCESSED FILES")
print(RULE)

interim_dir = Path(PROJECT_DIR) / "data" / "interim"
train_file = interim_dir / "train" / "train_process8.csv"
test_file = interim_dir / "test" / "test_process8.csv"

missing_files = []
for position, (label, path) in enumerate([("Train", train_file), ("Test", test_file)]):
    separator = "\n" if position else ""
    print(f"{separator}{label} file: {path.name}")
    if path.exists():
        print("  Exists: ✓ YES")
        print(f"  Size: {path.stat().st_size / 1024:.1f} KB")
    else:
        print("  Exists: ✗ NO")
        missing_files.append(path)

print("\n" + RULE)
if not missing_files:
    print("[SUCCESS] Preprocessing complete - ready for model training!")
    print(RULE)
else:
    print("[ERROR] Required files missing!")
    print(RULE)
    print("\nPlease check the preprocessing output above for errors.")
    raise FileNotFoundError(
        "Preprocessed files not found: "
        + ", ".join(str(path) for path in missing_files)
    )


In [None]:
# Step 4: Verify GPU availability
# First shows the raw driver view via the nvidia-smi shell command, then runs
# the project's own GPU checks.
print("Checking GPU with nvidia-smi...")
print("-" * 70)
# IPython shell escape: only valid inside a notebook/IPython session.
!nvidia-smi

print("\n" + "-" * 70)
# Project-local helpers; importable because the project root is on sys.path.
from kaggle.remote.gpu_runner import verify_gpu_setup, print_gpu_usage_info

# The result is stored but not used by any later cell shown in this notebook.
gpu_available = verify_gpu_setup()
print()
print_gpu_usage_info()


In [None]:
# Step 5: Run CatBoost model training (GPU-accelerated)
# This will:
# - Load preprocessed data
# - Run Optuna hyperparameter optimization
# - Train final model with best parameters
# - Generate submission file
#
# The path is relative, so the working directory must be the project root
# (earlier cells chdir to /kaggle/working/project).
# %run is an IPython magic: only valid inside a notebook/IPython session.

%run notebooks/Models/9catBoostModel.py


In [None]:
# Step 6: Verify outputs and check results
# Read-only summary: lists the generated submission CSVs and prints the model
# performance log. Missing directories, empty directories, empty CSV files and
# missing log columns are reported instead of aborting the summary.
import pandas as pd
from pathlib import Path

RULE = "=" * 70
DASHES = "-" * 70

print(RULE)
print("TRAINING RESULTS SUMMARY")
print(RULE)

# Check submission files
submissions_dir = Path("/kaggle/working/project/data/submissions")
print("\nSubmission files:")
print(DASHES)

if submissions_dir.exists():
    # The sample submission is not a model output, so it is excluded.
    submission_files = sorted(
        path for path in submissions_dir.rglob("*.csv")
        if path.name != "sample_submission.csv"
    )
    if not submission_files:
        print("  No submission files found")
    for file in submission_files:
        size_kb = file.stat().st_size / 1024
        print(f"  {file.name}: {size_kb:.1f} KB")
        # Show first few rows
        try:
            submission = pd.read_csv(file)
        except pd.errors.EmptyDataError:
            print("    [WARNING] File is empty")
            continue
        print(f"    Shape: {submission.shape}")
        print(f"    First predictions: {submission.head(3).to_string()}")
else:
    print("  No submissions directory found")

# Check model performance log
runs_dir = Path("/kaggle/working/project/runs")
perf_file = runs_dir / "model_performance.csv"
print("\nModel Performance:")
print(DASHES)

if perf_file.exists():
    try:
        perf_log = pd.read_csv(perf_file)
    except pd.errors.EmptyDataError:
        # A zero-byte log is treated as "no runs recorded".
        perf_log = pd.DataFrame()
    print(f"Total model runs: {len(perf_log)}")
    if len(perf_log) > 0:
        latest = perf_log.iloc[-1]
        # Format the RMSE only when it is numeric; fall back to N/A when the
        # column is absent or holds a non-numeric value.
        try:
            rmse_text = f"{float(latest.get('rmse')):.6f}"
        except (TypeError, ValueError):
            rmse_text = "N/A"
        print(f"\nLatest Run:")
        print(f"  Model: {latest.get('model', 'N/A')}")
        print(f"  CV RMSE: {rmse_text}")
        print(f"  Runtime: {latest.get('runtime', 'N/A')}")
        print(f"\nFull performance log:")
        print(perf_log.to_string())
else:
    print("  No performance log found")

print("\n" + RULE)
print("[INFO] To download submission file:")
print("  1. Go to Kaggle notebook 'Data' output panel (right side)")
print("  2. Navigate to /kaggle/working/project/data/submissions/")
print("  3. Download the latest submission CSV file")
print(RULE)
