# Models: validation, selection, inspection

In [None]:
# this cell's metadata contains
# "nbsphinx": "hidden" so it is hidden by nbsphinx
PATH_YIELD_ENGINE = 'src'


def set_paths() -> None:
    """Prepare the working directory and the import path for this notebook.

    On the first call only (detected by the absence of a module-level ``cwd``
    name) the working directory is moved three levels up and the target path
    is remembered in the global ``cwd``. On every call, ``PATH_YIELD_ENGINE``
    is put at the front of ``sys.path`` unless it is already listed somewhere
    in it. Both the resulting working directory and the first ``sys.path``
    entry are printed.
    """
    import os
    import sys

    module_namespace = globals()
    if 'cwd' not in module_namespace:
        # Remember the target so that re-running the cell does not climb again.
        three_levels_up = os.path.join(os.getcwd(), *([os.pardir] * 3))
        module_namespace['cwd'] = three_levels_up
        os.chdir(three_levels_up)
    print(f"working dir is '{os.getcwd()}'")

    already_listed = PATH_YIELD_ENGINE in sys.path
    if not already_listed:
        sys.path.insert(0, PATH_YIELD_ENGINE)
    print(f"added `{sys.path[0]}` to python paths")
# Apply the working-directory and sys.path setup before the import cell below runs.
set_paths()

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from pandas.api.types import is_numeric_dtype

from gamma.model.prediction import PredictorFitCV
from gamma.model.inspection import ModelInspector
from gamma.model.selection import  ModelGrid
from gamma.model.validation import CircularCrossValidator
from gamma import Sample
from gamma.model.selection import ModelGrid, ModelRanker, summary_report
from gamma.sklearndf.pipeline import PipelineDF, ModelPipelineDF 
from gamma.sklearndf.transformation import SimpleImputerDF, OneHotEncoderDF, ColumnTransformerDF
from gamma.sklearndf.transformation.extra import BorutaDF
from gamma.sklearndf.regression import RandomForestRegressorDF, LGBMRegressorDF
from gamma.sklearndf.classification import RandomForestClassifierDF

In [None]:
# Name of the column to predict.
TARGET = "SalePrice"

# Load the training data and discard the columns not used in this notebook.
df = pd.read_csv('data/ames-housing-dataset/train.csv').drop(
    columns=['Id', 'YrSold', 'MoSold']
)

# Wrap the observations together with the name of the target column.
sample = Sample(observations=df, target_name=TARGET)

In [None]:
# Split the feature columns by dtype: object-typed columns are handled as
# categorical features, numeric-typed columns as numerical features.
categorical_features = sample.features.select_dtypes([object]).columns

# `Series.iteritems()` was deprecated in pandas 1.5 and removed in pandas 2.0;
# `Series.items()` yields the same (column name, dtype) pairs on old and new
# pandas versions alike.
numerical_features = [
    col
    for col, dtype in sample.features.dtypes.items()
    if is_numeric_dtype(dtype)
]

## ModelPipeline

In [None]:
# Categorical branch: fill missing values with the constant string 'nan',
# then one-hot encode (the encoder is configured to ignore unknown categories).
ohe_df = PipelineDF(
    steps=[
        ('imputer', SimpleImputerDF(strategy='constant', fill_value='nan')),
        ('ohe', OneHotEncoderDF(sparse=False, handle_unknown='ignore')),
    ]
)

# Numerical branch: median imputation.
imputer_df = SimpleImputerDF(strategy="median")

# Send each group of columns through its own branch.
preprocessing_df = ColumnTransformerDF(
    transformers=[
        ('categorical', ohe_df, categorical_features),
        ('numerical', imputer_df, numerical_features),
    ]
)

# pipeline_df = PipelineDF(steps=
#   [('preprocessing', preprocessing_df), ('rf_model', RandomForestRegressorDF(n_estimators=10))])

In [None]:
# Random forest regressor constructed without arguments; it becomes the
# predictor of the model pipeline built in the next cell.
predictor = RandomForestRegressorDF()

In [None]:
# Combine the preprocessing step and the predictor into one model pipeline.
model_pipeline_df = ModelPipelineDF(predictor=predictor, preprocessing=preprocessing_df)

In [None]:
# Fit the pipeline on all features and the target of the sample.
# The trailing semicolon keeps the notebook from displaying the call's return value.
model_pipeline_df.fit(sample.features, sample.target);

In [None]:
# Score on the very same data the pipeline was fitted on, i.e. an in-sample
# score rather than a validation result.
model_pipeline_df.score(sample.features, sample.target)

## Predictor
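TODO: document this section. The cells below use `PredictorFitCV` to combine the model pipeline, a `CircularCrossValidator` and the sample, then retrieve per-split predictions and fitted models.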

In [None]:
# Cross-validator configured with a test ratio of 0.2 and 5 splits.
cv = CircularCrossValidator(test_ratio=0.2, num_splits=5)

In [None]:
# Tie the model pipeline, the cross-validator and the sample together; the
# cells below query this object for per-split predictions and fitted models.
# NOTE(review): whether fitting happens here or lazily on first access is not
# visible from this notebook - confirm against PredictorFitCV.
predictor_fit = PredictorFitCV(model=model_pipeline_df, cv=cv, sample=sample)

In [None]:
# Predictions for the split with id 0.
predictions0 = predictor_fit.predictions_for_split(split_id=0)

In [None]:
# Show which type predictions_for_split() returned.
type(predictions0)

In [None]:
# Preview the head of the split-0 predictions.
predictions0.head()

In [None]:
# The model fitted for the split with id 0.
model0 = predictor_fit.fitted_model(split_id=0)

In [None]:
# All fitted models - presumably one per cross-validation split; confirm
# against PredictorFitCV.fitted_models.
models = predictor_fit.fitted_models()

In [None]:
# Materialise the fitted models into a list and display the first one.
list(models)[0]

## Model and hyperparameter optimization
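TODO: document this section. The cells below define parameter grids for a random forest pipeline and an LGBM pipeline, rank them with `ModelRanker` under circular cross-validation, and print a summary report.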

In [None]:
# NOTE(review): both names are rebound in the next cell, which uses
# `preprocessing_df` instead of a bare SimpleImputerDF. The pipelines built
# here are therefore never used by the grids defined further down.
rf_pipeline = ModelPipelineDF(predictor=RandomForestRegressorDF(random_state=0), preprocessing=SimpleImputerDF())
lgbm_pipeline = ModelPipelineDF(predictor=LGBMRegressorDF(random_state=0), preprocessing=SimpleImputerDF())

In [None]:
# Two candidate pipelines sharing the same preprocessing step; both predictors
# are seeded with random_state=0.
rf_pipeline = ModelPipelineDF(
    preprocessing=preprocessing_df,
    predictor=RandomForestRegressorDF(random_state=0),
)
lgbm_pipeline = ModelPipelineDF(
    preprocessing=preprocessing_df,
    predictor=LGBMRegressorDF(random_state=0),
)

In [None]:
# One parameter grid per candidate pipeline: the number of trees for the
# random forest, the learning rate for the LGBM regressor.
grids = [
    ModelGrid(
        pipeline=rf_pipeline,
        predictor_parameters={"n_estimators": [10, 20]},
    ),
    ModelGrid(
        pipeline=lgbm_pipeline,
        predictor_parameters={"learning_rate": [0.1, 0.2]},
    ),
]

In [None]:
# Cross-validation scheme used for ranking: 3 splits with a test ratio of 1/3.
circular_cv = CircularCrossValidator(test_ratio=1/3, num_splits=3)

# Evaluate every grid on the sample under that scheme.
ranker = ModelRanker(grids=grids, cv=circular_cv)
# NOTE(review): under the joblib convention n_jobs=-3 means "all CPUs but two";
# confirm that ModelRanker.run forwards the value to joblib unchanged.
ranking = ranker.run(sample, n_jobs=-3)

# Print the textual summary of the ranking.
report = summary_report(ranking)
print(report)

In [None]:
# First entry of the ranking - presumably the best-scoring evaluation; confirm
# the ordering returned by ModelRanker.run.
top_model = ranking[0]

## Model inspection
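TODO: document this section. The cells below use `ModelInspector` on the cross-validated fit to look at feature importances and the SHAP matrix.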

In [None]:
# Inspector built on `predictor_fit`, i.e. on the cross-validated fit of
# `model_pipeline_df` created earlier - not on the ranked models above.
inspector = ModelInspector(predictor_fit)

In [None]:
# Preview the head of the feature importances reported by the inspector.
inspector.feature_importances().head()

In [None]:
# Preview the head of the result of shap_matrix() - presumably SHAP values per
# observation and feature; confirm against ModelInspector.
inspector.shap_matrix().head()