#### Import Libraries and Setup

In [1]:
# Import essential libraries for pipeline modeling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Machine learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
import xgboost as xgb

# Set up plotting
plt.style.use('default')
sns.set_palette("viridis")
plt.rcParams['figure.figsize'] = (12, 8)

print("All libraries imported successfully!")
print(f"Working directory: {Path.cwd()}")

All libraries imported successfully!
Working directory: C:\Users\Carlos\Documents\Data Science program\000 my_models\001_portfolio_house_prices\house-prices-advanced-regression-techniques\notebooks


#### Load Data and Preprocessing Pipeline

In [2]:
# Load the training data
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

print("Data loaded:")
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# Import FeatureEngineer from our custom module
import sys
sys.path.append('../src')
from feature_engineering import FeatureEngineer

print("FeatureEngineer imported from module")

# Now load the preprocessing pipeline
pipeline_path = Path('../models/preprocessing_pipeline.pkl')
preprocessing_pipeline = joblib.load(pipeline_path)

print(f"Preprocessing pipeline loaded successfully!")
print("Pipeline steps:")
for i, (name, step) in enumerate(preprocessing_pipeline.steps):
    print(f"  {i+1}. {name}: {type(step).__name__}")

# Prepare features and target
X = train_df.drop(['Id', 'SalePrice'], axis=1)
y = train_df['SalePrice']

print(f"\nData preparation:")
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"Target statistics: Mean=${y.mean():,.0f}, Std=${y.std():,.0f}")

Data loaded:
Training data shape: (1460, 81)
Test data shape: (1459, 80)
FeatureEngineer imported from module
Preprocessing pipeline loaded successfully!
Pipeline steps:
  1. feature_engineering: FeatureEngineer
  2. preprocessing: ColumnTransformer

Data preparation:
Features (X) shape: (1460, 79)
Target (y) shape: (1460,)
Target statistics: Mean=$180,921, Std=$79,443


#### Create Complete ML Pipelines

In [3]:
# Create complete pipelines (preprocessing + model) for different algorithms
print("Creating complete ML pipelines...")

# 1. Linear Regression Pipeline
linear_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('model', LinearRegression())
])

# 2. Ridge Regression Pipeline
ridge_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('model', Ridge(random_state=42))
])

# 3. Lasso Regression Pipeline
lasso_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('model', Lasso(random_state=42))
])

# 4. Random Forest Pipeline
rf_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('model', RandomForestRegressor(random_state=42))
])

# 5. XGBoost Pipeline
xgb_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('model', xgb.XGBRegressor(random_state=42, verbosity=0))
])

# Store all pipelines
pipelines = {
    'Linear Regression': linear_pipeline,
    'Ridge': ridge_pipeline,
    'Lasso': lasso_pipeline,
    'Random Forest': rf_pipeline,
    'XGBoost': xgb_pipeline
}

print(f"Created {len(pipelines)} complete ML pipelines:")
for name in pipelines.keys():
    print(f"  - {name}")

print(f"\nEach pipeline includes:")
print(f"  1. Feature engineering (TotalSF, TotalBath, etc.)")
print(f"  2. Preprocessing (imputation, scaling, encoding)")
print(f"  3. Model fitting and prediction")

Creating complete ML pipelines...
Created 5 complete ML pipelines:
  - Linear Regression
  - Ridge
  - Lasso
  - Random Forest
  - XGBoost

Each pipeline includes:
  1. Feature engineering (TotalSF, TotalBath, etc.)
  2. Preprocessing (imputation, scaling, encoding)
  3. Model fitting and prediction


#### Train-Test Split and Basic Pipeline Evaluation

Split the data and test these pipelines with basic evaluation before we implement GridSearchCV.

In [4]:
# Split the data for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data split:")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# Evaluate each pipeline with basic parameters
results = {}
print(f"\nEvaluating pipelines with default parameters...")
print("=" * 50)

for name, pipeline in pipelines.items():
    print(f"\nTraining {name}...")
    
    # Fit the complete pipeline
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)
    
    # Calculate metrics
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    # Store results
    results[name] = {
        'Train RMSE': train_rmse,
        'Test RMSE': test_rmse,
        'Train R²': train_r2,
        'Test R²': test_r2
    }
    
    print(f"  Train RMSE: {train_rmse:,.0f}")
    print(f"  Test RMSE: {test_rmse:,.0f}")
    print(f"  Test R²: {test_r2:.4f}")

print(f"\nBasic evaluation complete!")

Data split:
Training set: 1168 samples
Test set: 292 samples

Evaluating pipelines with default parameters...

Training Linear Regression...
  Train RMSE: 20,458
  Test RMSE: 99,290
  Test R²: -0.2853

Training Ridge...
  Train RMSE: 23,566
  Test RMSE: 31,147
  Test R²: 0.8735

Training Lasso...
  Train RMSE: 20,760
  Test RMSE: 36,411
  Test R²: 0.8272

Training Random Forest...
  Train RMSE: 11,089
  Test RMSE: 28,752
  Test R²: 0.8922

Training XGBoost...
  Train RMSE: 1,232
  Test RMSE: 28,479
  Test R²: 0.8943

Basic evaluation complete!


Performance Analysis

    Top Performers:
    XGBoost: Test R² = 0.8943 (89.4% variance explained!)
    Random Forest: Test R² = 0.8922 (89.2% variance explained)
    Ridge: Test R² = 0.8735 (87.4% variance explained)

Key Insights:

    XGBoost overfitting: Train RMSE = 1,232 vs Test RMSE = 28,479 (needs regularization)
    Ridge performing well: Good balance between train/test performance
    Linear Regression struggling: Negative R² indicates poor fit
    Random Forest: Good performance with reasonable overfitting

#### GridSearchCV Setup and Hyperparameter Tuning

In [5]:
# Define hyperparameter grids for each model
param_grids = {
    'Ridge': {
        'model__alpha': [0.1, 1.0, 10.0, 100.0]
    },
    'Lasso': {
        'model__alpha': [0.001, 0.01, 0.1, 1.0]
    },
    'Random Forest': {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [10, 15, 20],
        'model__min_samples_split': [2, 5]
    },
    'XGBoost': {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [3, 6, 9],
        'model__learning_rate': [0.01, 0.1, 0.2]
    }
}

print("Hyperparameter grids defined:")
for model_name, params in param_grids.items():
    param_combinations = 1
    for param_values in params.values():
        param_combinations *= len(param_values)
    print(f"  {model_name}: {param_combinations} combinations")

print(f"\nStarting GridSearchCV optimization...")

Hyperparameter grids defined:
  Ridge: 4 combinations
  Lasso: 4 combinations
  Random Forest: 18 combinations
  XGBoost: 27 combinations

Starting GridSearchCV optimization...


#### Execute GridSearchCV Optimization

In [6]:
import time

# Store optimized results
optimized_results = {}
best_models = {}

print("GridSearchCV Optimization Results:")
print("=" * 50)

for model_name in param_grids.keys():
    print(f"\nOptimizing {model_name}...")
    start_time = time.time()
    
    # Get the pipeline for this model
    pipeline = pipelines[model_name]
    param_grid = param_grids[model_name]
    
    # Setup GridSearchCV
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=5,
        scoring='r2',
        n_jobs=-1,
        verbose=0
    )
    
    # Fit GridSearchCV
    grid_search.fit(X_train, y_train)
    
    # Get best model
    best_model = grid_search.best_estimator_
    best_models[model_name] = best_model
    
    # Make predictions with best model
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    
    # Calculate metrics
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    # Store results
    optimized_results[model_name] = {
        'Train RMSE': train_rmse,
        'Test RMSE': test_rmse,
        'Train R²': train_r2,
        'Test R²': test_r2,
        'Best Params': grid_search.best_params_,
        'CV Score': grid_search.best_score_
    }
    
    elapsed_time = time.time() - start_time
    print(f"  Best CV R²: {grid_search.best_score_:.4f}")
    print(f"  Test R²: {test_r2:.4f}")
    print(f"  Test RMSE: {test_rmse:,.0f}")
    print(f"  Time: {elapsed_time:.1f}s")
    print(f"  Best params: {grid_search.best_params_}")

print(f"\nGridSearchCV optimization complete!")

GridSearchCV Optimization Results:

Optimizing Ridge...
  Best CV R²: 0.8025
  Test R²: 0.8731
  Test RMSE: 31,198
  Time: 7.0s
  Best params: {'model__alpha': 10.0}

Optimizing Lasso...
  Best CV R²: 0.7365
  Test R²: 0.8272
  Test RMSE: 36,411
  Time: 2.9s
  Best params: {'model__alpha': 1.0}

Optimizing Random Forest...
  Best CV R²: 0.8406
  Test R²: 0.8926
  Test RMSE: 28,703
  Time: 85.9s
  Best params: {'model__max_depth': 15, 'model__min_samples_split': 5, 'model__n_estimators': 100}

Optimizing XGBoost...
  Best CV R²: 0.8603
  Test R²: 0.9148
  Test RMSE: 25,559
  Time: 44.1s
  Best params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200}

GridSearchCV optimization complete!


Optimization Results Analysis

    New Champion: XGBoost
    Test R²: 0.9148 (91.5% variance explained!)
    Test RMSE: 25,559 (significant improvement from 28,479)
    Best params: learning_rate=0.1, max_depth=3, n_estimators=200

Performance Improvements:

    XGBoost: 0.8943 → 0.9148 (+2.0 percentage points)
    Random Forest: 0.8922 → 0.8926 (slight improvement)
    Ridge: 0.8735 → 0.8731 (minimal change)

#### Final Model Comparison and Results Summary

In [7]:
# Create comprehensive comparison
print("FINAL MODEL COMPARISON")
print("=" * 60)

# Combine basic and optimized results for comparison
comparison_data = []
for model_name in ['Ridge', 'Lasso', 'Random Forest', 'XGBoost']:
    if model_name in results:
        basic_r2 = results[model_name]['Test R²']
        basic_rmse = results[model_name]['Test RMSE']
    else:
        basic_r2 = basic_rmse = 0
    
    if model_name in optimized_results:
        opt_r2 = optimized_results[model_name]['Test R²']
        opt_rmse = optimized_results[model_name]['Test RMSE']
        improvement = opt_r2 - basic_r2
    else:
        opt_r2 = opt_rmse = improvement = 0
    
    comparison_data.append({
        'Model': model_name,
        'Basic R²': basic_r2,
        'Optimized R²': opt_r2,
        'Improvement': improvement,
        'Final RMSE': opt_rmse
    })

# Sort by optimized R²
comparison_data.sort(key=lambda x: x['Optimized R²'], reverse=True)

print(f"{'Model':<15} {'Basic R²':<10} {'Optimized R²':<12} {'Improvement':<12} {'RMSE':<10}")
print("-" * 60)
for data in comparison_data:
    print(f"{data['Model']:<15} {data['Basic R²']:<10.4f} {data['Optimized R²']:<12.4f} "
          f"{data['Improvement']:<12.4f} {data['Final RMSE']:<10,.0f}")

# Identify the champion
champion = comparison_data[0]
print(f"\nCHAMPION MODEL: {champion['Model']}")
print(f"Final Performance: R² = {champion['Optimized R²']:.4f}, RMSE = ${champion['Final RMSE']:,.0f}")

# Show best hyperparameters for champion
if champion['Model'] in optimized_results:
    print(f"Best Parameters: {optimized_results[champion['Model']]['Best Params']}")

FINAL MODEL COMPARISON
Model           Basic R²   Optimized R² Improvement  RMSE      
------------------------------------------------------------
XGBoost         0.8943     0.9148       0.0206       25,559    
Random Forest   0.8922     0.8926       0.0004       28,703    
Ridge           0.8735     0.8731       -0.0004      31,198    
Lasso           0.8272     0.8272       0.0000       36,411    

CHAMPION MODEL: XGBoost
Final Performance: R² = 0.9148, RMSE = $25,559
Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200}


#### Save Champion Model

In [10]:
# Save the champion model for production deployment
print("SAVING CHAMPION MODEL FOR PRODUCTION")
print("=" * 40)

# Get the champion model (XGBoost from our results)
champion_model = best_models['XGBoost']
champion_name = 'XGBoost'

# Create models directory path
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

# Save the complete champion pipeline
champion_model_path = models_dir / 'champion_xgboost_model.pkl'
joblib.dump(champion_model, champion_model_path)

print(f"Champion model saved: {champion_model_path}")
print(f"Model: {champion_name}")
print(f"Performance: R² = {optimized_results[champion_name]['Test R²']:.4f}")
print(f"RMSE: ${optimized_results[champion_name]['Test RMSE']:,.0f}")
print(f"Best Parameters: {optimized_results[champion_name]['Best Params']}")

# Verify the saved model can be loaded
try:
    loaded_model = joblib.load(champion_model_path)
    print(f"\nModel verification: Successfully loaded from disk")
    print(f"Model type: {type(loaded_model)}")
    print(f"\nReady for production deployment!")
except Exception as e:
    print(f"Error loading model: {e}")

SAVING CHAMPION MODEL FOR PRODUCTION
Champion model saved: ..\models\champion_xgboost_model.pkl
Model: XGBoost
Performance: R² = 0.9148
RMSE: $25,559
Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200}

Model verification: Successfully loaded from disk
Model type: <class 'sklearn.pipeline.Pipeline'>

Ready for production deployment!
