# Model and Evaluation


## Objectives


- Implement the ML pipeline.
- Implement a regression model to predict the sale price of properties.


## Inputs 


- outputs/datasets/datacollection/HousePrices.csv


## Outputs


- Train Set and Test Set. 
- Machine Learning pipeline. 


## CRISP-DM


- "Modelling and Evaluation".

In [None]:
import os
# Capture the current working directory of the running kernel.
current_dir = os.getcwd()
# Bare expression: its value is the cell's displayed output.
current_dir

In [None]:
# Switch the working directory to the parent of the path held in `current_dir`,
# so the relative "outputs/..." paths used later resolve from there.
# NOTE(review): each execution climbs one level from whatever `current_dir`
# currently holds; re-running after `current_dir` is refreshed climbs again.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

In [None]:
# Re-read the working directory so the change made by os.chdir can be checked.
current_dir = os.getcwd()
current_dir

In [None]:
import numpy as np
import pandas as pd 
# Load the collected dataset; the path is relative to the working directory.
df = pd.read_csv("outputs/datasets/datacollection/HousePrices.csv")

# Report the (rows, columns) shape and preview the first five rows.
print(df.shape)
df.head(5)

In [None]:
from sklearn.pipeline import Pipeline

### Data Cleaning
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import MeanMedianImputer

### Feature Engineering
from feature_engine import transformation as vt
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import OrdinalEncoder
from feature_engine.selection import SmartCorrelatedSelection

### Feature Scaling
from sklearn.preprocessing import StandardScaler

### Feature Selection 
from sklearn.feature_selection import SelectFromModel

### ML algorithms
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor



def PipelineOptimization(model):
    """Assemble the cleaning, feature-engineering and modelling pipeline.

    Step order: three imputers, ordinal encoding, log and power
    transformations, correlated-feature removal, standard scaling,
    model-based feature selection, and finally the estimator itself.

    Args:
        model: Estimator instance. It is handed to ``SelectFromModel`` and
            also used as the final ``"model"`` step.

    Returns:
        An unfitted ``Pipeline`` containing the ten steps listed above.
    """
    # --- Data cleaning ---
    zero_imputer = ArbitraryNumberImputer(
        arbitrary_number=0,
        variables=['2ndFlrSF', 'EnclosedPorch', 'MasVnrArea', 'WoodDeckSF'],
    )
    unfinished_imputer = CategoricalImputer(
        imputation_method='missing',
        fill_value='Unf',
        variables=['BsmtFinType1', 'GarageFinish'],
    )
    median_imputer = MeanMedianImputer(
        imputation_method='median',
        variables=['BedroomAbvGr', 'GarageYrBlt', 'LotFrontage'],
    )

    # --- Feature engineering ---
    ordinal_encoder = OrdinalEncoder(
        encoding_method='arbitrary',
        variables=["GarageFinish", "KitchenQual", "BsmtExposure", "BsmtFinType1"],
    )
    log_transformer = vt.LogTransformer(
        variables=['1stFlrSF', 'GrLivArea', 'LotArea', 'LotFrontage'],
    )
    power_transformer = vt.PowerTransformer(
        variables=['BsmtFinSF1', 'BsmtUnfSF', 'GarageArea', 'GrLivArea',
                   'MasVnrArea', 'OpenPorchSF'],
    )
    correlated_selector = SmartCorrelatedSelection(
        variables=None,
        method="spearman",
        threshold=0.8,
        selection_method="variance",
    )

    # Step names are kept exactly: grid keys such as "model__..." and the
    # lookups pipeline["feat_selection"] / pipeline["model"] depend on them.
    steps = [
        ("ArbitraryNumberImputer", zero_imputer),
        ("CategoricalEncoder", unfinished_imputer),
        ("MeanMedianImputer", median_imputer),
        ("Ordinalencoder", ordinal_encoder),
        ("LogTransformer", log_transformer),
        ("PowerTransformer", power_transformer),
        ("SmartCorrelatedSelection", correlated_selector),
        ("feat_scaling", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ]
    return Pipeline(steps)

In [6]:
from sklearn.model_selection import GridSearchCV


class HyperparameterOptimizationSearch:
    """Run one GridSearchCV per candidate model and tabulate the CV scores.

    Each model is wrapped with ``PipelineOptimization`` before being searched,
    so parameter grids must address pipeline steps (e.g. ``model__max_depth``).
    """

    def __init__(self, models, params):
        # models: mapping of model name -> estimator instance.
        self.models = models
        # params: mapping of model name -> parameter grid for that model.
        self.params = params
        self.keys = models.keys()
        # Fitted GridSearchCV objects keyed by model name; filled by fit().
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """Fit a GridSearchCV for every model in ``self.models``.

        Args:
            X: Training features.
            y: Training target.
            cv: Cross-validation setting forwarded to GridSearchCV.
            n_jobs: Parallel job count forwarded to GridSearchCV.
            verbose: Verbosity forwarded to GridSearchCV.
            scoring: Scoring setting forwarded to GridSearchCV.
            refit: Accepted for backward compatibility only. It is NOT
                forwarded, so GridSearchCV uses its own default for refit.
        """
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")

            pipeline = PipelineOptimization(self.models[key])
            gs = GridSearchCV(pipeline, self.params[key], cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Summarise the per-split test scores of every fitted search.

        Args:
            sort_by: Column used to order the summary, highest value first.

        Returns:
            A tuple ``(summary, grid_searches)``: ``summary`` is a DataFrame
            with one row per (model, parameter combination), led by the
            estimator name and min/mean/max/std score columns and followed by
            the parameter columns; ``grid_searches`` is the dict of fitted
            searches.

        Raises:
            ValueError: If no search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError(
                "No fitted grid searches to summarise; call fit() first.")

        def row(key, scores, params):
            stats = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **stats})

        rows = []
        for name, gs in self.grid_searches.items():
            results = gs.cv_results_
            candidates = results['params']
            # n_splits_ is the split count recorded by the fitted search, so
            # this works whether `cv` was given as an int or as a splitter
            # object (range(gs.cv) only worked for ints).
            per_split = [
                np.asarray(results[f"split{i}_test_score"]).reshape(
                    len(candidates), 1)
                for i in range(gs.n_splits_)
            ]
            # Shape (n_candidates, n_splits): one row of scores per candidate.
            all_scores = np.hstack(per_split)
            for candidate, scores in zip(candidates, all_scores):
                rows.append(row(name, scores, candidate))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        columns = ['estimator', 'min_score',
                   'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        return df[columns], self.grid_searches

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable. 'SalePrice' is the target, every other column is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['SalePrice']),
    df['SalePrice'],
    test_size=0.2,
    random_state=0,
)

print("* Train set:", X_train.shape, y_train.shape, "\n* Test set:",  X_test.shape, y_test.shape)

In [8]:
# Candidate regressors for the quick search, keyed by the name used in the
# score summary. random_state is fixed wherever the estimator accepts it so
# repeated runs give the same result.
models_quick_search = {
    # Plain linear baseline.
    "LinearRegression": LinearRegression(),
    # Single decision tree.
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    # Ensemble of decision trees.
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    # Tree ensemble similar to Random Forest, with additional randomness
    # in how splits are chosen.
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=0),
    # Boosting: combines multiple weak learners into a stronger model.
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0),
    # Boosting: trees are built sequentially to correct previous errors.
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
    # Gradient boosting as implemented by the XGBoost library.
    "XGBRegressor": XGBRegressor(random_state=0),
}

# One empty parameter grid per candidate: the quick search compares the
# models with their default hyperparameters only.
params_quick_search = {name: {} for name in models_quick_search}

In [None]:
# Quick search: cross-validate every candidate (cv=5, R2 scoring, all cores)
# using the empty grids, i.e. default hyperparameters.
search = HyperparameterOptimizationSearch(models=models_quick_search, params=params_quick_search)
search.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=5)

In [None]:
# Tabulate the CV scores (highest mean score first) and keep the fitted
# searches for later lookup by model name.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by="mean_score")
grid_search_summary 

In [11]:
# Model carried into the extensive search, keyed by name.
models_search = {"ExtraTreesRegressor": ExtraTreesRegressor(random_state=0)}

# Parameter grid per model name. The "model__" prefix addresses the final
# "model" step of the pipeline built by PipelineOptimization.
params_search = {
    "ExtraTreesRegressor": dict(
        model__n_estimators=[50, 100, 150],
        model__max_depth=[None, 3, 15],
        model__min_samples_split=[2, 50],
        model__min_samples_leaf=[1, 50],
    ),
}

In [None]:
# Extensive search: grid-search the ExtraTreesRegressor hyperparameters
# (cv=5, R2 scoring, all cores).
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train, scoring = 'r2', n_jobs=-1, cv=5)

In [None]:
# One row per parameter combination, ordered by mean CV score (best first).
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by="mean_score")
grid_search_summary

In [None]:
# Row 0 is the top-ranked candidate and column 0 is 'estimator', so this is
# the name of the best-scoring model.
best_model = grid_search_summary.iloc[0,0]
best_model

In [None]:
# Hyperparameter combination selected by the fitted search for that model.
best_parameters = grid_search_pipelines[best_model].best_params_
best_parameters

In [None]:
# Best pipeline found by the search, as exposed by its best_estimator_.
regressor_pipeline = grid_search_pipelines[best_model].best_estimator_
regressor_pipeline

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# The first seven pipeline steps are the cleaning / feature-engineering
# steps; everything after them is scaling, feature selection and the model.
data_cleaning_feat_eng_steps = 7

# Run only those first steps on the train set to recover the column names
# that reach the scaler.
columns_after_cleaning = Pipeline(
    regressor_pipeline.steps[:data_cleaning_feat_eng_steps]
).transform(X_train).columns

# Boolean mask of the columns retained by the "feat_selection" step.
selected_features_mask = regressor_pipeline["feat_selection"].get_support()
best_features = columns_after_cleaning[selected_features_mask].tolist()

# Pair each retained feature with the fitted model's importance for it,
# most important first.
df_feature_importance = pd.DataFrame(
    {
        "Feature": columns_after_cleaning[selected_features_mask],
        "Importance": regressor_pipeline["model"].feature_importances_,
    }
).sort_values(by="Importance", ascending=False)

print(
    f"* These are the {len(best_features)} most important features in descending order. "
    f"The model was trained on them: \n{df_feature_importance['Feature'].tolist()}"
)

# Bar chart of the importances.
df_feature_importance.plot(kind="bar", x="Feature", y="Importance")
plt.show()

In [18]:
from sklearn.metrics import r2_score, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_regression_model(X_train, y_train, X_test, y_test, regressor):
    """
    Evaluate the performance of a regression model on both training and test sets.

    Args:
        X_train: Training features.
        y_train: Training target.
        X_test: Test features.
        y_test: Test target.
        regressor: Fitted estimator exposing ``predict``.
    """
    print("Model Evaluation\n")
    # evaluate_on_dataset scores whatever is passed in its X_test / y_test
    # positions, so the train data goes there for the train-set report.
    # (The previous calls passed too few positional arguments and raised
    # TypeError.)
    evaluate_on_dataset(X_train, y_train, X_train, y_train, regressor,
                        dataset_name="Train Set")
    evaluate_on_dataset(X_train, y_train, X_test, y_test, regressor,
                        dataset_name="Test Set")

def evaluate_on_dataset(X_train, y_train, X_test, y_test, regressor, dataset_name="Test Set"):
    """
    Evaluate the regression performance on a single dataset.

    Args:
        X_train: Unused; kept so existing call sites keep working.
        y_train: Unused; kept so existing call sites keep working.
        X_test: Features of the dataset to score.
        y_test: True target values of the dataset to score.
        regressor: Fitted estimator exposing ``predict``.
        dataset_name: Label printed above the metrics.
    """
    predictions = regressor.predict(X_test)
    # Built-in round() works whether the metric returns a NumPy scalar or a
    # plain Python float; the ``.round`` method exists only on the former.
    r2 = round(r2_score(y_test, predictions), 3)
    mae = round(mean_absolute_error(y_test, predictions), 3)
    print(f"* {dataset_name}")
    print(f'  R2 Score: {r2}')
    print(f'  Mean Absolute Error: {mae}\n')

def plot_regression_results(X_train, y_train, X_test, y_test, model, alpha=0.5):
    """
    Draw actual-vs-predicted scatter plots for the train and test sets.

    The two panels sit side by side; each has a red reference line where
    predicted equals actual.

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Test features and target.
        model: Fitted estimator exposing ``predict``.
        alpha: Transparency of the scatter points.
    """
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))

    # (panel title, actual values, predicted values), left to right.
    panels = (
        ("Train Set", y_train, pred_train),
        ("Test Set", y_test, pred_test),
    )
    for ax, (title, actual, predicted) in zip(axes, panels):
        sns.scatterplot(x=actual, y=predicted, alpha=alpha, ax=ax)
        sns.lineplot(x=actual, y=actual, color='red', ax=ax)
        ax.set_title(title)
        ax.set_xlabel("Actual")
        ax.set_ylabel("Predicted")

    plt.tight_layout()
    plt.show()


In [None]:
def evaluate_regression_model(X_train, y_train, X_test, y_test, regressor):
    """Print train and test metrics, then plot actual vs predicted values.

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Test features and target.
        regressor: Fitted estimator exposing ``predict``.
    """
    # Evaluate regression performance on both sets so the two can be compared.
    # evaluate_on_dataset scores whatever sits in its X_test / y_test
    # positions, so the train data is passed there for the train-set report.
    evaluate_on_dataset(X_train, y_train, X_train, y_train, regressor,
                        dataset_name="Train Set")
    evaluate_on_dataset(X_train, y_train, X_test, y_test, regressor)

    # Generate evaluation plots
    plot_regression_results(X_train, y_train, X_test, y_test, regressor)

# Call the combined function
evaluate_regression_model(X_train, y_train, X_test, y_test, regressor_pipeline)

In [20]:
from sklearn.decomposition import PCA

def PipelineOptimization(model):
    """Assemble the cleaning, feature-engineering and modelling pipeline.

    Step order: three imputers, ordinal encoding, log and power
    transformations, correlated-feature removal, standard scaling,
    model-based feature selection, and finally the estimator itself.

    Args:
        model: Estimator instance. It is handed to ``SelectFromModel`` and
            also used as the final ``"model"`` step.

    Returns:
        An unfitted ``Pipeline`` containing the ten steps listed above.
    """
    # --- Data cleaning ---
    zero_imputer = ArbitraryNumberImputer(
        arbitrary_number=0,
        variables=['2ndFlrSF', 'EnclosedPorch', 'MasVnrArea', 'WoodDeckSF'],
    )
    unfinished_imputer = CategoricalImputer(
        imputation_method='missing',
        fill_value='Unf',
        variables=['BsmtFinType1', 'GarageFinish'],
    )
    median_imputer = MeanMedianImputer(
        imputation_method='median',
        variables=['BedroomAbvGr', 'GarageYrBlt', 'LotFrontage'],
    )

    # --- Feature engineering ---
    ordinal_encoder = OrdinalEncoder(
        encoding_method='arbitrary',
        variables=["GarageFinish", "KitchenQual", "BsmtExposure", "BsmtFinType1"],
    )
    log_transformer = vt.LogTransformer(
        variables=['1stFlrSF', 'GrLivArea', 'LotArea', 'LotFrontage'],
    )
    power_transformer = vt.PowerTransformer(
        variables=['BsmtFinSF1', 'BsmtUnfSF', 'GarageArea', 'GrLivArea',
                   'MasVnrArea', 'OpenPorchSF'],
    )
    correlated_selector = SmartCorrelatedSelection(
        variables=None,
        method="spearman",
        threshold=0.8,
        selection_method="variance",
    )

    # Step names are kept exactly: grid keys such as "model__..." and the
    # lookups pipeline["feat_selection"] / pipeline["model"] depend on them.
    steps = [
        ("ArbitraryNumberImputer", zero_imputer),
        ("CategoricalEncoder", unfinished_imputer),
        ("MeanMedianImputer", median_imputer),
        ("Ordinalencoder", ordinal_encoder),
        ("LogTransformer", log_transformer),
        ("PowerTransformer", power_transformer),
        ("SmartCorrelatedSelection", correlated_selector),
        ("feat_scaling", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ]
    return Pipeline(steps)

In [None]:
# The train/test split created earlier in the notebook is reused here;
# print its shapes as a reminder.

print("* Train set:", X_train.shape, y_train.shape, "\n* Test set:",  X_test.shape, y_test.shape)

In [None]:
# Display the model dictionary that the next search will use.
models_search

In [None]:
# Display the best hyperparameters found by the previous search.
best_parameters

In [24]:
# Parameter grid per model name. The "model__" prefix addresses the final
# "model" step of the pipeline built by PipelineOptimization.
params_search = {
    "ExtraTreesRegressor": dict(
        model__n_estimators=[50, 100, 150],
        model__max_depth=[None, 3, 15],
        model__min_samples_split=[2, 50],
        model__min_samples_leaf=[1, 50],
    ),
}

In [None]:
# Run the grid search again with the PipelineOptimization definition that is
# current at this point in the notebook (cv=5, R2 scoring, all cores).
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train, scoring = 'r2', n_jobs=-1, cv=5)

In [None]:
# One row per parameter combination, ordered by mean CV score (best first).
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by="mean_score")
grid_search_summary

In [None]:
# Row 0 is the top-ranked candidate and column 0 is 'estimator', so this is
# the name of the best-scoring model.
best_model = grid_search_summary.iloc[0,0]
best_model

In [None]:
# Best pipeline from this search; this is the object written to disk later.
best_pipeline_regressor = grid_search_pipelines[best_model].best_estimator_
best_pipeline_regressor

In [None]:
import joblib
import os

# Versioned output folder for the train/test sets and the fitted pipeline.
version = "v1"
file_path = f"outputs/ml_pipeline/predictsale_price/{version}"

# exist_ok=True makes re-running this cell a no-op when the folder already
# exists. Any other OS-level failure is still reported without stopping the
# notebook, as before, but only OSError is caught so that unrelated bugs are
# no longer swallowed.
try:
    os.makedirs(name=file_path, exist_ok=True)
except OSError as e:
    print(e)

In [None]:
# Preview the training features before saving them.
X_train.head()

In [31]:
# Write the training features to the versioned folder, without the index.
X_train.to_csv(f"{file_path}/X_train.csv", index=False)

In [None]:
# Preview the training target before saving it.
y_train.head()

In [33]:
# Write the training target to the versioned folder, without the index.
y_train.to_csv(f"{file_path}/y_train.csv", index=False)

In [None]:
# Preview the test features before saving them.
X_test.head()

In [35]:
# Write the test features to the versioned folder, without the index.
X_test.to_csv(f"{file_path}/X_test.csv", index=False)

In [None]:
# Preview the test target before saving it.
y_test.head()

In [37]:
# Write the test target to the versioned folder, without the index.
y_test.to_csv(f"{file_path}/y_test.csv", index=False)

In [None]:
# NOTE(review): this displays `regressor_pipeline` (from the earlier search),
# while the object saved with joblib is `best_pipeline_regressor` — confirm
# which one was meant to be inspected here.
regressor_pipeline

In [None]:
import joblib

# Persist the best pipeline to the versioned folder.
joblib.dump(value=best_pipeline_regressor, filename=f"{file_path}/regression_pipeline.pkl")

In [None]:
# Sort the features by importance (highest first) for better visualization.
df_feature_importance = df_feature_importance.sort_values(by="Importance", ascending=False)

# Bar plot with one bar per feature; the x labels are rotated so that the
# feature names stay readable.
plt.bar(df_feature_importance["Feature"], df_feature_importance["Importance"])
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("Feature Importance")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()


In [None]:
# Redraw the importance chart and save it in the versioned folder; savefig
# is called right after the plot so it captures that figure.
df_feature_importance.plot(kind="bar", x="Feature", y="Importance")
plt.savefig(f"{file_path}/features_importance.png", bbox_inches="tight")