# 06 Final Pipeline And Deployment Preparation 

## 1. Objectives

This notebook consolidates the final predictive pipeline using the selected model. The goals are:

- Load training artefacts and model components
- Reconstruct the preprocessing and prediction pipeline
- Validate pipeline on test data
- Prepare outputs for deployment and dashboards
- Serialize final components for production use

## Change Working Directory
- Since it is expected that you would keep the notebooks in a subfolder, you will need to switch the working directory when you run the notebook in the editor.
- The working directory must be changed from its current folder to its parent folder.
- We wish to change the current directory's parent to the new current directory.
- Verify the updated current directory.

In [8]:
# Smart Working Directory Setup
import os
project_root = '/workspaces/heritage_housing'
if os.getcwd() != project_root:
    try:
        os.chdir(project_root)
        print(f"[INFO] Changed working directory to project root: {os.getcwd()}")
    except FileNotFoundError:
        raise FileNotFoundError(f"[ERROR] Project root '{project_root}' not found!")

### Requirements (Import Libraries + Verify + Load Artifacts)

In [13]:
# Import Libraries

import pandas as pd
import numpy as np
import os
import json
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

import datetime  # For any timestamping/logging

# Verify Dependencies

required_dependencies = {
    "pandas": "1.4.2",
    "numpy": "1.24.4",
    "matplotlib": "3.4.3",
    "seaborn": "0.11.2",
    "joblib": "1.4.2"
}

installed_dependencies = {}
for lib, expected_version in required_dependencies.items():
    try:
        lib_version = __import__(lib).__version__
        installed_dependencies[lib] = lib_version
        if lib_version != expected_version:
            print(f"{lib} version mismatch: Expected {expected_version}, found {lib_version}")
        else:
            print(f"{lib} is correctly installed (version {lib_version})")
    except ImportError:
        print(f"{lib} is not installed!")

print("\nInstalled Dependencies:")
print(json.dumps(installed_dependencies, indent=4))

# Load Saved Artifacts

# Define artifact paths
artifacts_paths = {
    "random_forest_model": "../outputs/models/random_forest_model.pkl",
    "xgboost_model": "../outputs/models/xgboost_model.pkl",
    "evaluation_metrics": "../outputs/metrics/model_evaluation_summmary.csv",
    "feature_importance_rf": "../outputs/ft_importance/random_forest_feature_importance.csv",
    "feature_importance_xgb": "../outputs/ft_importance/xgboost_feature_importance.csv",
    "test_features": "../data/processed/final/X_test.csv",
    "test_target": "../data/processed/final/y_test.csv",
}

# Load models
try:
    rf_model = joblib.load(artifacts_paths["best_random_forest"])
    xgb_model = joblib.load(artifacts_paths["best_gradient_boosting"])
    print("Models loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading models: {e}")

# Load evaluation metrics
try:
    evaluation_metrics = pd.read_csv(artifacts_paths["model_evaluation_summmary"])
    print("Evaluation metrics loaded.")
except FileNotFoundError as e:
    print(f"Error loading evaluation metrics: {e}")

# Load feature importance
try:
    feature_importance_rf = pd.read_csv(artifacts_paths["feature_importance_rf"])
    feature_importance_xgb = pd.read_csv(artifacts_paths["feature_importance_xgb"])
    print("Feature importance data loaded.")
except FileNotFoundError as e:
    print(f"Error loading feature importance data: {e}")

# Load test features and target
try:
    test_features = pd.read_csv(artifacts_paths["test_features"])
    test_target = pd.read_csv(artifacts_paths["test_target"])
    print(" Test data loaded.")
except FileNotFoundError as e:
    print(f"Error loading test data: {e}")

# Quick display
print("\nEvaluation Metrics:")
display(model_evaluation_summary.head())

print("\nFeature Importance (Random Forest):")
display(feature_importance_rf.head())

print("\nFeature Importance (XGBoost):")
display(feature_importance_xgb.head())

print("\nTest Features (5 rows):")
display(test_features.head())

print("\nTest Target (5 rows):")
display(test_target.head())

pandas version mismatch: Expected 1.4.2, found 2.1.1
numpy version mismatch: Expected 1.24.4, found 1.26.1
matplotlib version mismatch: Expected 3.4.3, found 3.8.0
seaborn version mismatch: Expected 0.11.2, found 0.13.2
joblib is correctly installed (version 1.4.2)

Installed Dependencies:
{
    "pandas": "2.1.1",
    "numpy": "1.26.1",
    "matplotlib": "3.8.0",
    "seaborn": "0.13.2",
    "joblib": "1.4.2"
}


KeyError: 'best_random_forest'