#### Import Libraries and Setup

In [2]:
# Import essential libraries for final predictions
# Standard library
import sys
import warnings
from pathlib import Path

# Third-party
import joblib
import numpy as np
import pandas as pd

# Silence only deprecation-style noise. A blanket filterwarnings('ignore')
# would also hide warnings raised while unpickling the saved model (for example
# a library version mismatch), which matter for trusting the predictions.
for _noisy_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)

# Make the project's src/ folder importable. The path is resolved to an
# absolute location and added only once, so re-running this cell does not keep
# growing sys.path and a later change of working directory cannot break it.
SRC_DIR = Path('../src').resolve()
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

# Import custom module
# NOTE(review): FeatureEngineer is not referenced in the cells visible here;
# presumably it must be importable so the pickled pipeline can be restored - confirm.
from feature_engineering import FeatureEngineer

# Machine learning libraries
from sklearn.metrics import mean_squared_error, r2_score

print("Libraries imported successfully")
print("Ready to generate final predictions with champion XGBoost model")
print(f"Working directory: {Path.cwd()}")

Libraries imported successfully
Ready to generate final predictions with champion XGBoost model
Working directory: C:\Users\Carlos\Documents\Data Science program\000 my_models\001_portfolio_house_prices\house-prices-advanced-regression-techniques\notebooks


#### Load Data and Champion Model

In [4]:
# Read the training and test splits from the shared data folder
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

print(
    "Data loaded:",
    f"Training data shape: {train_df.shape}",
    f"Test data shape: {test_df.shape}",
    sep="\n",
)

# Keep the identifiers aside and pass every remaining column to the model
test_ids = test_df['Id'].copy()
X_test = test_df.drop(columns=['Id'])

print(
    "\nTest data preparation:",
    f"Test features: {X_test.shape}",
    f"Test IDs: {len(test_ids)}",
    sep="\n",
)

# Restore the champion model saved from Notebook 04.
# joblib.load unpickles the file, so only point it at artifacts you trust.
champion_model_path = Path('../models/champion_xgboost_model.pkl')
champion_model = joblib.load(champion_model_path)

print(
    "\nChampion model loaded successfully!",
    f"Model path: {champion_model_path}",
    f"Model type: {type(champion_model)}",
    "Ready for final predictions!",
    sep="\n",
)

Data loaded:
Training data shape: (1460, 81)
Test data shape: (1459, 80)

Test data preparation:
Test features: (1459, 79)
Test IDs: 1459

Champion model loaded successfully!
Model path: ..\models\champion_xgboost_model.pkl
Model type: <class 'sklearn.pipeline.Pipeline'>
Ready for final predictions!


#### Generate Final Predictions

In [5]:
# Generate predictions on test dataset
print("GENERATING FINAL PREDICTIONS")
print("=" * 35)

print("Applying champion XGBoost pipeline to test data...")
print("Pipeline includes:")
print("  1. Feature Engineering (TotalSF, TotalBath, age features)")
print("  2. Preprocessing (imputation, scaling, encoding)")
print("  3. Optimized XGBoost model")

# Generate predictions
test_predictions = champion_model.predict(X_test)

# Check the invariants the later cells rely on before reporting anything:
# one prediction per test Id, and no NaN/inf values that would make the
# summary statistics meaningless.
assert len(test_predictions) == len(test_ids), (
    f"Expected {len(test_ids)} predictions, got {len(test_predictions)}"
)
assert np.isfinite(test_predictions).all(), "Predictions contain NaN or infinite values"

print("\nPredictions generated successfully!")
print(f"Number of predictions: {len(test_predictions)}")
print("Prediction statistics:")
print(f"  Mean: ${test_predictions.mean():,.0f}")
print(f"  Median: ${np.median(test_predictions):,.0f}")
print(f"  Min: ${test_predictions.min():,.0f}")
print(f"  Max: ${test_predictions.max():,.0f}")
print(f"  Std: ${test_predictions.std():,.0f}")

# Show first few predictions. The preview is capped at the number of rows
# actually available, so a test set shorter than 10 rows no longer raises
# IndexError.
preview_count = min(10, len(test_predictions))
print(f"\nFirst {preview_count} predictions:")
for house_id, prediction in zip(test_ids.iloc[:preview_count], test_predictions[:preview_count]):
    print(f"  House ID {house_id}: ${prediction:,.0f}")

print("\nPredictions ready for submission/analysis!")

GENERATING FINAL PREDICTIONS
Applying champion XGBoost pipeline to test data...
Pipeline includes:
  1. Feature Engineering (TotalSF, TotalBath, age features)
  2. Preprocessing (imputation, scaling, encoding)
  3. Optimized XGBoost model

Predictions generated successfully!
Number of predictions: 1459
Prediction statistics:
  Mean: $178,762
  Median: $158,707
  Min: $49,033
  Max: $501,272
  Std: $75,832

First 10 predictions:
  House ID 1461: $124,003
  House ID 1462: $162,829
  House ID 1463: $176,590
  House ID 1464: $186,512
  House ID 1465: $189,005
  House ID 1466: $176,071
  House ID 1467: $169,968
  House ID 1468: $162,555
  House ID 1469: $179,631
  House ID 1470: $122,000

Predictions ready for submission/analysis!


#### Save Predictions and Create Output Files

In [6]:
import json

# Assemble the two-column table (Id, SalePrice) that gets written to disk
predictions_df = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions})

print("SAVING PREDICTION RESULTS")
print("=" * 30)

# Write the table as CSV without the index column (Kaggle submission format)
submission_path = Path('../data/house_price_predictions.csv')
predictions_df.to_csv(submission_path, index=False)

print(
    f"Predictions saved to: {submission_path}",
    "Format: Kaggle submission ready (Id, SalePrice)",
    f"Records: {len(predictions_df)}",
    sep="\n",
)

# Show the first rows of what was just written
print("\nSubmission file preview:")
print(predictions_df.head(10))

# Descriptive statistics of the predictions, converted to plain Python floats
# so they can be serialised to JSON
prediction_statistics = {'count': len(test_predictions)}
for stat_name, reducer in (
    ('mean', np.mean),
    ('median', np.median),
    ('min', np.min),
    ('max', np.max),
    ('std', np.std),
):
    prediction_statistics[stat_name] = float(reducer(test_predictions))

# NOTE(review): the performance figures and hyperparameters below are literals,
# presumably copied from the model-selection notebook - confirm they match the
# saved model.
results_summary = {
    'model_name': 'XGBoost',
    'model_performance': {'r2_score': 0.9148, 'rmse': 25559},
    'prediction_statistics': prediction_statistics,
    'model_parameters': {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200},
}

# Persist the summary next to the model artifacts
results_path = Path('../models/prediction_results_summary.json')
results_path.write_text(json.dumps(results_summary, indent=2))

print(f"\nDetailed results saved to: {results_path}")
print("\nFINAL PREDICTIONS COMPLETE!")
print("Ready for deployment or Kaggle submission")

SAVING PREDICTION RESULTS
Predictions saved to: ..\data\house_price_predictions.csv
Format: Kaggle submission ready (Id, SalePrice)
Records: 1459

Submission file preview:
     Id      SalePrice
0  1461  124002.695312
1  1462  162829.312500
2  1463  176590.015625
3  1464  186512.031250
4  1465  189004.984375
5  1466  176071.312500
6  1467  169967.531250
7  1468  162555.281250
8  1469  179630.828125
9  1470  121999.773438

Detailed results saved to: ..\models\prediction_results_summary.json

FINAL PREDICTIONS COMPLETE!
Ready for deployment or Kaggle submission


#### Project Completion Summary

In [7]:
# PROJECT COMPLETION SUMMARY
# The headline figures are read from results_summary and test_predictions
# (both built in the earlier cells) instead of being typed in a second time,
# so this report cannot drift from what was actually saved.
champion_name = results_summary['model_name']
r2_value = results_summary['model_performance']['r2_score']
rmse_value = results_summary['model_performance']['rmse']
hyperparameter_text = ", ".join(
    f"{name}={value}" for name, value in results_summary['model_parameters'].items()
)

print("🎉 HOUSE PRICE PREDICTION PROJECT COMPLETED! 🎉")
print("=" * 55)

print("PROJECT ACHIEVEMENTS:")
print("✅ Built professional preprocessing pipeline with ColumnTransformer")
print("✅ Implemented modular code architecture (src/feature_engineering.py)")
print("✅ Optimized 5 ML models using GridSearchCV")
print(f"✅ Achieved {r2_value:.1%} R² performance with {champion_name}")
print(f"✅ Generated {len(test_predictions):,} test predictions")
print("✅ Created production-ready model artifacts")

print("\nFINAL MODEL PERFORMANCE:")
print(f"  Champion: {champion_name}")
print(f"  R² Score: {r2_value:.4f} ({r2_value:.1%} variance explained)")
print(f"  RMSE: ${rmse_value:,.0f}")
print(f"  Hyperparameters: {hyperparameter_text}")

print("\nPROJECT DELIVERABLES:")
print("  📊 5 Jupyter notebooks (exploration → predictions)")
print("  🔧 Preprocessing pipeline: models/preprocessing_pipeline.pkl")
print("  🏆 Champion model: models/champion_xgboost_model.pkl")
print("  📈 Predictions: data/house_price_predictions.csv")
print("  📋 Results summary: models/prediction_results_summary.json")
print("  🏗️ Modular code: src/feature_engineering.py")

print("\nTECHNICAL HIGHLIGHTS:")
print("  • Production-safe preprocessing (no data leakage)")
print("  • Professional pipeline architecture")
print("  • Comprehensive hyperparameter optimization")
print("  • End-to-end reproducible workflow")
print("  • Portfolio-ready documentation")

print("\n🚀 PROJECT READY FOR PORTFOLIO AND PRODUCTION DEPLOYMENT! 🚀")

🎉 HOUSE PRICE PREDICTION PROJECT COMPLETED! 🎉
PROJECT ACHIEVEMENTS:
✅ Built professional preprocessing pipeline with ColumnTransformer
✅ Implemented modular code architecture (src/feature_engineering.py)
✅ Optimized 5 ML models using GridSearchCV
✅ Achieved 91.5% R² performance with XGBoost
✅ Generated 1,459 test predictions
✅ Created production-ready model artifacts

FINAL MODEL PERFORMANCE:
  Champion: XGBoost
  R² Score: 0.9148 (91.5% variance explained)
  RMSE: $25,559
  Hyperparameters: learning_rate=0.1, max_depth=3, n_estimators=200

PROJECT DELIVERABLES:
  📊 5 Jupyter notebooks (exploration → predictions)
  🔧 Preprocessing pipeline: models/preprocessing_pipeline.pkl
  🏆 Champion model: models/champion_xgboost_model.pkl
  📈 Predictions: data/house_price_predictions.csv
  📋 Results summary: models/prediction_results_summary.json
  🏗️ Modular code: src/feature_engineering.py

TECHNICAL HIGHLIGHTS:
  • Production-safe preprocessing (no data leakage)
  • Professional pipeline archit