# CatBoost Model - Kaggle GPU Runner

This notebook runs the CatBoost model on Kaggle's GPU infrastructure.

## Before Running:
1. **Enable GPU**: Go to Settings → Accelerator → Select "GPU" (P100 or T4)
2. **Add Competition Data**: Click "Add Input" → Search for "house-prices-advanced-regression-techniques" → Add it
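3. **Enable Internet**: Turn on Internet access in the notebook settings (Step 1 clones the repository from GitHub and installs packages with pip)
4. **Run all cells** in order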


In [None]:
# Step 1: Clone repository and install dependencies
#
# Fetches a fresh copy of the project into the Kaggle working directory,
# installs its requirements into this kernel's interpreter, and puts the
# project root at the front of sys.path for the cells that follow.
import os
import shutil
import subprocess
import sys

REPO_URL = "https://github.com/FranckNgN/kaggle-house-prices.git"
PROJECT_DIR = "/kaggle/working/project"

# Remove existing directory if it exists (for re-runs)
if os.path.exists(PROJECT_DIR):
    # A previous run of this cell chdir'ed into PROJECT_DIR. Step out to the
    # parent first so the process is never left sitting in a deleted
    # directory (os.getcwd() raises there, and child processes such as git
    # inherit the broken working directory).
    os.chdir(os.path.dirname(PROJECT_DIR))
    shutil.rmtree(PROJECT_DIR)

# Clone repository
print(f"Cloning repository from {REPO_URL}...")
result = subprocess.run(
    ["git", "clone", REPO_URL, PROJECT_DIR],
    capture_output=True,
    text=True,
)

# Nothing below can work without the sources, so a failed clone is fatal.
if result.returncode != 0:
    print("ERROR: Failed to clone repository")
    print(result.stderr)
    raise RuntimeError("Repository clone failed")

print(f"[OK] Repository cloned to {PROJECT_DIR}")

# Change to project directory (requirements.txt and later relative paths
# are resolved against it)
os.chdir(PROJECT_DIR)
print(f"Current directory: {os.getcwd()}")

# Install dependencies
# Invoke pip through the running interpreter so packages land in the same
# environment the kernel imports from, rather than whichever `pip` happens
# to be first on PATH.
print("\nInstalling dependencies...")
result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"],
    capture_output=True,
    text=True,
)

# A failed install is reported but deliberately not fatal (best effort).
if result.returncode == 0:
    print("[OK] Dependencies installed")
else:
    print("[WARNING] Some dependencies may have failed")
    print(result.stderr)

# Add project to Python path
# Drop any entry left by an earlier run so re-runs do not stack duplicates,
# then insert at position 0 so the project's packages take precedence.
if PROJECT_DIR in sys.path:
    sys.path.remove(PROJECT_DIR)
sys.path.insert(0, PROJECT_DIR)
print("\n[OK] Project root added to Python path")


In [None]:
# Step 2: Setup Kaggle environment (symlinks, GPU verification)
#
# Requires Step 1 to have run: the import below only resolves once the
# project root is on sys.path.
# NOTE(review): presumably `kaggle.remote` is a package inside the cloned
# repository; because Step 1 inserts the project root at the front of
# sys.path it would take precedence over any installed package that is also
# named `kaggle` - confirm this is the intended module.
from kaggle.remote.setup_kaggle import setup_kaggle_environment

# Called for its side effects only; any return value is discarded.
setup_kaggle_environment()


In [None]:
# Step 3: Verify GPU availability
print("Checking GPU with nvidia-smi...")
print("-" * 70)
!nvidia-smi

print("\n" + "-" * 70)
from kaggle.remote.gpu_runner import verify_gpu_setup, print_gpu_usage_info

gpu_available = verify_gpu_setup()
print()
print_gpu_usage_info()


In [None]:
# Step 4: Run CatBoost model training (GPU-accelerated)
# This will:
# - Load preprocessed data
# - Run Optuna hyperparameter optimization
# - Train final model with best parameters
# - Generate submission file
#
# The script path is relative, so it is resolved against the current working
# directory; Step 1 changes that to the project root, which is why Step 1
# must have been run first in this session.

%run notebooks/Models/9catBoostModel.py


In [None]:
# Step 5: Verify outputs and check results
#
# Prints a summary of the submission CSVs and of the model performance log
# found under the project directory. Every check is best effort: a missing
# directory, an empty directory or an unreadable CSV is reported and the
# summary carries on instead of aborting.
import pandas as pd
from pathlib import Path


def report_submissions(submissions_dir):
    """Print size, shape and first rows of every submission CSV.

    Args:
        submissions_dir: Directory searched recursively for ``*.csv`` files.
            Files named ``sample_submission.csv`` are skipped.

    Returns:
        The sorted list of submission paths that were found. Empty when the
        directory is missing or holds no submission CSVs.
    """
    submissions_dir = Path(submissions_dir)
    if not submissions_dir.exists():
        print("  No submissions directory found")
        return []

    found = sorted(
        path
        for path in submissions_dir.rglob("*.csv")
        if path.name != "sample_submission.csv"
    )
    if not found:
        # Previously this case printed nothing at all, which looked like a
        # truncated report.
        print("  No submission files found")
        return found

    for path in found:
        size_kb = path.stat().st_size / 1024
        print(f"  {path.name}: {size_kb:.1f} KB")
        # One broken file must not hide the remaining ones.
        try:
            frame = pd.read_csv(path)
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
            print(f"    [WARNING] Could not read file: {exc}")
            continue
        # Show first few rows
        print(f"    Shape: {frame.shape}")
        print(f"    First predictions: {frame.head(3).to_string()}")
    return found


def report_performance(perf_file):
    """Print the latest entry and the full content of the performance log.

    Args:
        perf_file: Path of the performance CSV. The ``model``, ``rmse`` and
            ``runtime`` columns are shown for the last row when present;
            absent or non-numeric values are printed as ``N/A``.

    Returns:
        The loaded DataFrame, or ``None`` when the file is missing or cannot
        be parsed.
    """
    perf_file = Path(perf_file)
    if not perf_file.exists():
        print("  No performance log found")
        return None

    try:
        log = pd.read_csv(perf_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        print(f"  [WARNING] Could not read performance log: {exc}")
        return None

    print(f"Total model runs: {len(log)}")
    if log.empty:
        return log

    # The most recent run is taken to be the last row of the log.
    latest = log.iloc[-1]
    try:
        rmse_text = f"{float(latest.get('rmse')):.6f}"
    except (TypeError, ValueError):
        # Column absent (None) or value not numeric.
        rmse_text = "N/A"

    print("\nLatest Run:")
    print(f"  Model: {latest.get('model', 'N/A')}")
    print(f"  CV RMSE: {rmse_text}")
    print(f"  Runtime: {latest.get('runtime', 'N/A')}")
    print("\nFull performance log:")
    print(log.to_string())
    return log


print("=" * 70)
print("TRAINING RESULTS SUMMARY")
print("=" * 70)

# Check submission files
submissions_dir = Path("/kaggle/working/project/data/submissions")
print("\nSubmission files:")
print("-" * 70)
report_submissions(submissions_dir)

# Check model performance log
runs_dir = Path("/kaggle/working/project/runs")
perf_file = runs_dir / "model_performance.csv"
print("\nModel Performance:")
print("-" * 70)
report_performance(perf_file)

print("\n" + "=" * 70)
print("[INFO] To download submission file:")
print("  1. Go to Kaggle notebook 'Data' output panel (right side)")
print("  2. Navigate to /kaggle/working/project/data/submissions/")
print("  3. Download the latest submission CSV file")
print("=" * 70)
