# 05 MODEL EVALUATION

In [1]:
import os
import pandas as pd

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

Get in the correct working directory

In [3]:
%pwd

'/Users/chrissunderland/Desktop/starbucks_ratings_predictions/notebooks'

In [4]:
os.chdir("../")

In [5]:
%pwd

'/Users/chrissunderland/Desktop/starbucks_ratings_predictions'

### Load Model

In [6]:
from src.StarbucksProject.constants import *
from src.StarbucksProject.utils.common import read_yaml, create_directories
import joblib

In [7]:
model_path = read_yaml(CONFIG_FILE_PATH).model_evaluation.model_path

[ 2024-07-17 12:17:39,066: INFO: common: yaml file: config/config.yaml loaded successfully ]


In [8]:
model = joblib.load(model_path)

#### Evaluate performance on test set

In [9]:
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [10]:
config = read_yaml(CONFIG_FILE_PATH)

[ 2024-07-17 12:17:39,637: INFO: common: yaml file: config/config.yaml loaded successfully ]


In [11]:
X_test = pd.read_csv(config.model_evaluation.x_test_path)
y_test = pd.read_csv(config.model_evaluation.y_test_path)

In [12]:
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)

In [13]:
y_test_pred = model.predict(X_test_scaled)
test_mae = mean_absolute_error(y_test, y_test_pred)

In [14]:
print(f"knr test set MAE = {test_mae}")

knr test set MAE = 0.3518098456747338


#### Hyperparameter Tuning

In [15]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [16]:
X_train = pd.read_csv(config.model_evaluation.x_train_path)
y_train = pd.read_csv(config.model_evaluation.y_train_path)

In [17]:
params = read_yaml(PARAMS_FILE_PATH)

[ 2024-07-17 12:17:39,688: INFO: common: yaml file: params.yaml loaded successfully ]


In [18]:
# pipeline = Pipeline([('scaler', StandardScaler()),
#                      ('knr', model)])

In [19]:
pipeline = Pipeline([('scaler', StandardScaler()),
                     ('abr', model)])

In [20]:
param_grid = { "abr__" + key: list(value) for key, value in dict(params.AdaBoostRegressor).items()}

In [21]:
gs = GridSearchCV(estimator=pipeline, 
                  param_grid=param_grid, 
                  cv=3,
                  scoring='neg_mean_absolute_error', 
                  verbose=1,
                  n_jobs=-1)

In [22]:
# gs.fit(X_train, y_train)
gs.fit(X_train, y_train.values.ravel())

Fitting 3 folds for each of 18 candidates, totalling 54 fits


In [23]:
gs_y_test_pred = gs.predict(X_test)
gs_mae = mean_absolute_error(y_test, gs_y_test_pred)

In [24]:
print("Test Mean Absolute Error (MAE):", gs_mae)

Test Mean Absolute Error (MAE): 0.3523445979171443


In [25]:
create_directories([config.model_evaluation.root_dir])

[ 2024-07-17 12:17:43,672: INFO: common: created directory at: artifacts/model_evaluation ]


In [26]:
joblib.dump(gs.best_estimator_, os.path.join(config.model_evaluation.root_dir, 
                                             config.model_evaluation.tuned_model))

['artifacts/model_evaluation/tuned_model.joblib']

### Create the entity

In [27]:
from dataclasses import dataclass
from pathlib import Path

In [28]:
@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    model_path: Path
    x_train_path: Path
    y_train_path: Path
    x_test_path: Path
    y_test_path: Path
    metric_file_name: Path
    tuned_model: str
    mlflow_uri: str
    all_params: dict  # see params.yaml
    target_column: str # see schema.yaml

### Create configuration manager

In [29]:
from src.StarbucksProject.constants import *
from src.StarbucksProject.utils.common import read_yaml, create_directories, save_json

In [30]:
class ConfigurationManager:

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        
        config = self.config.model_evaluation
        # params = self.params.KNeighborsRegressor
        params = self.params.AdaBoostRegressor
        schema = self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_evaluation_config = ModelEvaluationConfig(root_dir=config.root_dir,
                                                        model_path=config.model_path,
                                                        x_train_path=config.x_train_path,
                                                        y_train_path=config.y_train_path,
                                                        x_test_path=config.x_test_path,
                                                        y_test_path=config.y_test_path,
                                                        metric_file_name=config.metric_file_name,
                                                        tuned_model=config.tuned_model,
                                                        mlflow_uri=config.mlflow_uri,
                                                        all_params=params,
                                                        target_column=schema.name)
            
        return model_evaluation_config

In [31]:
c = ConfigurationManager()

[ 2024-07-17 12:17:43,744: INFO: common: yaml file: config/config.yaml loaded successfully ]
[ 2024-07-17 12:17:43,748: INFO: common: yaml file: params.yaml loaded successfully ]
[ 2024-07-17 12:17:43,751: INFO: common: yaml file: schema.yaml loaded successfully ]
[ 2024-07-17 12:17:43,752: INFO: common: created directory at: artifacts ]


In [32]:
test_config = c.get_model_evaluation_config()

[ 2024-07-17 12:17:43,758: INFO: common: created directory at: artifacts/model_evaluation ]


### Create the component

In [33]:
from src.StarbucksProject import logger
from sklearn.linear_model import ElasticNet
import joblib
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
import numpy as np

In [34]:
class ModelEvaluation:
    
    def __init__(self, config: ModelEvaluationConfig):
        
        self.config = config

    
    def get_mae(self, actual, pred):
        
        mae = mean_absolute_error(actual, pred)
        
        return mae
        
    def tune_with_mlflow(self):
        
        model = joblib.load(self.config.model_path)

        X_train = pd.read_csv(self.config.x_train_path)
        y_train = pd.read_csv(self.config.y_train_path)
        X_test = pd.read_csv(self.config.x_test_path)
        y_test = pd.read_csv(self.config.y_test_path)
        
#         pipe = Pipeline([('scaler', StandardScaler()),
#                          ('knr', model)])
                
        pipe = Pipeline([('scaler', StandardScaler()),
                     ('abr', model)])
        
        param_grid = { "abr__" + k: list(v) for k,v in dict(self.config.all_params).items()}
        # param_grid = { "abr__" + key: list(value) for key, value in dict(params.AdaBoostRegressor).items()}

        gs = GridSearchCV(estimator=pipe, 
                          param_grid=param_grid, 
                          cv=3,
                          scoring='neg_mean_absolute_error', 
                          verbose=1,
                          n_jobs=-1)
        
        # gs.fit(X_train, y_train)
        gs.fit(X_train, y_train.values.ravel())
        tuned_model = gs.best_estimator_
        joblib.dump(tuned_model, os.path.join(self.config.root_dir, self.config.tuned_model))
                
                
        # ML FLOW
        
        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

                
        with mlflow.start_run():
            
            predicted_vals = tuned_model.predict(X_test)

            test_mae = self.get_mae(y_test, predicted_vals)

            scores = {"mae": test_mae}
            save_json(path=Path(self.config.metric_file_name), data=scores) # save metrics locally

            mlflow.log_params(self.config.all_params)
            mlflow.log_metric("mae", test_mae)

            if tracking_url_type_store != "file":
                mlflow.sklearn.log_model(model, "model", registered_model_name="AdaBoost Tuned")
            else:
                mlflow.sklearn.log_model(model, "model")

### Create the pipeline

In [35]:
from dotenv import load_dotenv

In [36]:
try:

    load_dotenv()
    
    config = ConfigurationManager()

    model_evaluation_config = config.get_model_evaluation_config()
    
    model_evaluator = ModelEvaluation(config=model_evaluation_config)

    model_evaluator.tune_with_mlflow()
    
except Exception as e:

    raise e

[ 2024-07-17 12:17:44,564: INFO: common: yaml file: config/config.yaml loaded successfully ]
[ 2024-07-17 12:17:44,566: INFO: common: yaml file: params.yaml loaded successfully ]
[ 2024-07-17 12:17:44,569: INFO: common: yaml file: schema.yaml loaded successfully ]
[ 2024-07-17 12:17:44,570: INFO: common: created directory at: artifacts ]
[ 2024-07-17 12:17:44,570: INFO: common: created directory at: artifacts/model_evaluation ]
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[ 2024-07-17 12:17:47,069: INFO: common: json file saved at: artifacts/model_evaluation/metrics.json ]


Registered model 'AdaBoost Tuned' already exists. Creating a new version of this model...
2024/07/17 12:17:54 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: AdaBoost Tuned, version 4
Created version '4' of model 'AdaBoost Tuned'.


### Modularize the code (once working)

- Return to project folder