# Modelling and Evaluation

### Inputs
- Engineered train set CSV: `outputs/datasets/cleaned/TrainSetEngineered.csv`

### Outputs

### Objectives

- Select best hyperparameters and algorithm
- Predict the `csMPa` target (concrete strength) via regression

In [1]:
import os
# Capture the kernel's current working directory and display it.
current_dir = os.getcwd()
current_dir


'/workspace/concrete-strength/jupyter_notebooks'

In [2]:
# Move the working directory to the parent of `current_dir`, so the relative
# data path used when loading the CSV later in the notebook resolves from there.
# NOTE(review): not idempotent if the cell that sets `current_dir` is re-run
# after this one — the next run of this cell would move up a further level.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory


In [3]:
# Re-read the working directory and display it to confirm the change took effect.
current_dir = os.getcwd()
current_dir

'/workspace/concrete-strength'

Load data

In [4]:
import numpy as np
import pandas as pd
# Read the engineered training set (path is relative to the working directory).
df = pd.read_csv("outputs/datasets/cleaned/TrainSetEngineered.csv")

# Print the (rows, columns) shape, then display the first rows as the cell output.
print(df.shape)
df.head()

(824, 8)


Unnamed: 0,cement,slag,flyash,water,coarseaggregate,fineaggregate,age,csMPa
0,10.083308,0.698912,1.0,119.30805,1859.591102,74065.950376,3.461989,43.89
1,9.483185,0.698912,1.0,115.999523,2086.779165,80994.570422,3.461989,38.21
2,8.987288,5.260016,1.0,131.071191,1770.160564,74341.507912,3.461989,33.42
3,8.557683,0.698912,0.008382,117.379509,2065.282127,80937.293504,1.40217,13.12
4,10.184724,0.698912,1.0,123.701476,2282.625099,55427.133259,4.681869,47.22


Split dataframe into features and target variable

In [5]:
# The 'csMPa' column is the regression target; every other column is a feature.
target = df['csMPa']
features = df.drop(columns=['csMPa'])
features.head()

Unnamed: 0,cement,slag,flyash,water,coarseaggregate,fineaggregate,age
0,10.083308,0.698912,1.0,119.30805,1859.591102,74065.950376,3.461989
1,9.483185,0.698912,1.0,115.999523,2086.779165,80994.570422,3.461989
2,8.987288,5.260016,1.0,131.071191,1770.160564,74341.507912,3.461989
3,8.557683,0.698912,0.008382,117.379509,2065.282127,80937.293504,1.40217
4,10.184724,0.698912,1.0,123.701476,2282.625099,55427.133259,4.681869


Create pipeline

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor

def PipelineOptimization(model):
    """Wrap ``model`` in a scaling -> feature-selection -> model pipeline.

    Steps, in order:
      * ``feat_scaling``   - ``StandardScaler()``
      * ``feat_selection`` - ``SelectFromModel(model)``
      * ``model``          - ``model`` itself

    The same ``model`` object is handed to both the selection step and the
    final step.

    Returns the assembled (unfitted) ``Pipeline``.
    """
    steps = [
        ("feat_scaling", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ]
    return Pipeline(steps)

Hyperparameter optimisation search

In [7]:
from sklearn.model_selection import GridSearchCV

class HyperparameterOptimizationSearch:
    """Run one GridSearchCV per candidate model and tabulate the CV scores.

    ``models`` maps a label to an estimator; ``params`` maps the same label to
    the parameter grid searched for that estimator. Each estimator is wrapped
    with ``PipelineOptimization`` before it is searched.
    """

    def __init__(self, models, params):
        """Store the candidate models and their parameter grids."""
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Fitted GridSearchCV objects keyed by model label; populated by fit().
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """Fit a GridSearchCV for every model label and keep the fitted searches.

        Parameters
        ----------
        X, y : training features and target, passed to ``GridSearchCV.fit``.
        cv, n_jobs, verbose, scoring : forwarded unchanged to ``GridSearchCV``.
        refit : accepted so existing calls that pass it keep working, but it is
            NOT forwarded to ``GridSearchCV``; each search therefore runs with
            GridSearchCV's own default refit setting.

        Raises
        ------
        KeyError
            If a label in ``models`` has no entry in ``params``.
        """
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")
            pipeline = PipelineOptimization(self.models[key])
            param_grid = self.params[key]

            gs = GridSearchCV(pipeline, param_grid, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Summarise per-split test scores for every searched candidate.

        Builds one row per (model label, parameter combination) holding the
        min / max / mean / std of that candidate's per-split test scores plus
        the parameter values, sorted by ``sort_by`` in descending order.

        Returns
        -------
        (DataFrame, dict)
            The summary table (score columns first, parameter columns after)
            and the dict of fitted searches keyed by model label.

        Raises
        ------
        ValueError
            If no search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError(
                "No grid searches to summarise; call fit() before score_summary().")

        def row(key, scores, params):
            stats = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **stats})

        rows = []
        for key, gs in self.grid_searches.items():
            results = gs.cv_results_
            candidates = results['params']

            # Use the fitted fold count rather than the raw `cv` argument:
            # `cv` may be None, a splitter object or an iterable, none of
            # which can be passed to range(). Fall back to `cv` only when the
            # fitted attribute is absent.
            n_splits = getattr(gs, 'n_splits_', gs.cv)

            # One row per candidate, one column per CV split.
            all_scores = np.column_stack(
                [results["split{}_test_score".format(i)] for i in range(n_splits)])

            rows.extend(row(key, s, p) for p, s in zip(candidates, all_scores))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        # Score columns first, then whatever parameter columns were searched.
        columns = ['estimator', 'min_score',
                   'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns], self.grid_searches

In [8]:
# Candidate regressors for the quick comparison. Each uses its default
# hyperparameters; random_state=0 is set wherever it is passed below.
models_quick_search = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=0),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
    "XGBRegressor": XGBRegressor(random_state=0),
}

# An empty grid per model: the search evaluates exactly one candidate each.
params_quick_search = {name: {} for name in models_quick_search}

In [9]:
# Quick comparison of all candidate models: 5-fold CV scored by R^2,
# using every available core.
search = HyperparameterOptimizationSearch(
    models=models_quick_search,
    params=params_quick_search,
)
search.fit(X=features, y=target, cv=5, n_jobs=-1, scoring='r2')


Running GridSearchCV for LinearRegression 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for DecisionTreeRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for RandomForestRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for ExtraTreesRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for AdaBoostRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for GradientBoostingRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for XGBRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [17]:
# Tabulate the per-candidate CV scores (best mean score first) and keep the
# fitted GridSearchCV objects keyed by model name.
# NOTE(review): this cell's execution count (In [17]) comes after the extensive
# search was run, and its saved output lists only XGBRegressor candidates rather
# than the seven-model comparison. It looks like an out-of-order run; confirm
# with Restart Kernel -> Run All.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,model__learning_rate,model__n_estimators
5,XGBRegressor,0.806251,0.891416,0.92914,0.045315,0.05,1000
7,XGBRegressor,0.812234,0.891128,0.930359,0.042984,0.1,500
8,XGBRegressor,0.811054,0.890798,0.929841,0.043384,0.1,1000
4,XGBRegressor,0.804569,0.890581,0.927162,0.045144,0.05,500
6,XGBRegressor,0.797791,0.884057,0.920911,0.045306,0.1,100
2,XGBRegressor,0.791391,0.884002,0.920423,0.048008,0.01,1000
3,XGBRegressor,0.766132,0.868882,0.911165,0.053185,0.05,100
1,XGBRegressor,0.76554,0.868712,0.910495,0.053385,0.01,500
0,XGBRegressor,-0.114944,0.019915,0.16747,0.115819,0.01,100


Extensive search on the most suitable model (XGBRegressor). Hyperparameter tuning reference: https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook

In [15]:
# Only XGBRegressor is carried forward to the extensive search.
models_search = {
    "XGBRegressor": XGBRegressor(random_state=0)
}

# Parameter grid for the extensive search. Active keys carry the `model__`
# prefix so they are routed to the pipeline step named "model".
params_search = {
    "XGBRegressor": {
        # Further candidate parameters, currently disabled.
        # NOTE(review): these keys lack the `model__` prefix used by the active
        # entries below; presumably they need it before being enabled — confirm.
        #'min_child_weight': [1, 5, 10],
        #'gamma': [0.5, 1, 1.5, 2, 5],
        #'subsample': [0.6, 0.8, 1.0],
        #'colsample_bytree': [0.6, 0.8, 1.0],
        #'max_depth': [3, 4, 5],
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__n_estimators': [100, 500, 1000]
    }
}

In [16]:
# Extensive grid search over the XGBRegressor grid: 5-fold CV scored by R^2,
# using every available core. Rebinds `search`, replacing the earlier object.
search = HyperparameterOptimizationSearch(
    models=models_search,
    params=params_search,
)
search.fit(X=features, y=target, cv=5, n_jobs=-1, scoring='r2')


Running GridSearchCV for XGBRegressor 

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [18]:
# Summarise the extensive search: one row per parameter combination, ranked by
# mean cross-validated score (descending). Also returns the fitted searches.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,model__learning_rate,model__n_estimators
5,XGBRegressor,0.806251,0.891416,0.92914,0.045315,0.05,1000
7,XGBRegressor,0.812234,0.891128,0.930359,0.042984,0.1,500
8,XGBRegressor,0.811054,0.890798,0.929841,0.043384,0.1,1000
4,XGBRegressor,0.804569,0.890581,0.927162,0.045144,0.05,500
6,XGBRegressor,0.797791,0.884057,0.920911,0.045306,0.1,100
2,XGBRegressor,0.791391,0.884002,0.920423,0.048008,0.01,1000
3,XGBRegressor,0.766132,0.868882,0.911165,0.053185,0.05,100
1,XGBRegressor,0.76554,0.868712,0.910495,0.053385,0.01,500
0,XGBRegressor,-0.114944,0.019915,0.16747,0.115819,0.01,100
