# Quick Start

In [None]:
# this cell's metadata contains
# "nbsphinx": "hidden" so it is hidden by nbsphinx
PATH_YIELD_ENGINE = 'src'
def set_paths() -> None:
    """Switch to the directory three levels up and make the sources importable.

    The directory change happens only while no module-level ``cwd`` exists;
    the chosen directory is then stored in that global, so calling the function
    again does not climb further. Afterwards ``PATH_YIELD_ENGINE`` is placed at
    the front of ``sys.path`` unless it is already listed there.
    """
    import os
    import sys

    global cwd

    first_call = 'cwd' not in globals()
    if first_call:
        # climb three directory levels from the current working directory
        parents = [os.pardir] * 3
        cwd = os.path.join(os.getcwd(), *parents)
        os.chdir(cwd)
    print(f"working dir is '{os.getcwd()}'")

    already_listed = PATH_YIELD_ENGINE in sys.path
    if not already_listed:
        sys.path.insert(0, PATH_YIELD_ENGINE)
    print(f"added `{sys.path[0]}` to python paths")
set_paths()



## TL;DR

We first make our imports:

In [None]:
import pandas as pd
from sklearn import datasets
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

from gamma import Sample
from gamma.viz.dendrogram import DendrogramDrawer, DendrogramFeatMapStyle, DendrogramLineStyle, DendrogramReportStyle
from gamma.sklearndf.pipeline import RegressionPipelineDF
from gamma.sklearndf.transformation import SimpleImputerDF, MissingIndicatorDF, StandardScalerDF
from gamma.sklearndf.transformation.extra import BorutaDF
from gamma.sklearndf.regression import RandomForestRegressorDF, LGBMRegressorDF
from gamma.model.inspection import RegressionModelInspector
from gamma.model.fitcv import RegressorFitCV
from gamma.model.selection import ModelParameterGrid, ModelRanker, summary_report
from gamma.model.validation import CircularCrossValidator
from gamma.yieldengine.simulation import UnivariateUpliftSimulator
from gamma.yieldengine.partition import ContinuousRangePartitioning
from gamma.yieldengine.viz import SimulationDrawer, SimulationPlotStyle

In [None]:
import warnings
# Ignore every DeprecationWarning raised from here on
warnings.filterwarnings("ignore", category=DeprecationWarning)

We load the Boston dataset, a regression problem on house prices in the suburbs of Boston:

In [None]:
# Load the Boston housing data bundled with scikit-learn
boston = datasets.load_boston()
# Name of the target column; reused below and by later cells
TARGET = 'MEDIAN_HOUSE_PRICE'
# Build one DataFrame holding the features plus the target. The target column
# is named through TARGET (not a repeated literal) so the two cannot drift apart.
df = pd.DataFrame(data=boston.data, columns=boston.feature_names).assign(
    **{TARGET: boston.target}
)
df.head()

In [None]:
# Wrap the observations in a Sample, declaring TARGET as the target column
sample = Sample(observations=df, target_name=TARGET)

In [None]:
# Preview the sample's features (presumably a DataFrame, given the .head() call)
sample.features.head()

In [None]:
# Preview the sample's target values
sample.target.head()

## DataFrames instead of arrays

In [None]:
# Plain scikit-learn imputer with default settings, shown for comparison with
# the DF variant used in the next cell.
# NOTE(review): by default scikit-learn returns a NumPy array here, so column
# names are not carried over — confirm for the installed version.
imputed_array = SimpleImputer().fit_transform(sample.features)
imputed_array

In [None]:
# Same imputation through the gamma wrapper SimpleImputerDF; the result
# supports .head(), i.e. it is presumably a DataFrame
imputed_df = SimpleImputerDF().fit_transform(sample.features)
imputed_df.head()

In [None]:
# Apply the imputer to the Sample itself; the result is later passed on as a
# sample (ranker.run, RegressorFitCV), so presumably a new Sample is returned
sample_imputed = SimpleImputerDF().fit_transform_sample(sample=sample)

## Cross-validation and model selection

In [None]:
# PipelineDF is used below but is not among the names imported in the
# notebook's import cell, so it is imported here to keep this cell runnable.
# NOTE(review): assumes PipelineDF lives in gamma.sklearndf.pipeline, next to
# RegressionPipelineDF — confirm against the installed gamma package.
from gamma.sklearndf.pipeline import PipelineDF

# Preprocessing shared by all candidate pipelines: an imputation step followed
# by a standard-scaling step
preprocessing = PipelineDF(steps=[('impute', SimpleImputerDF()), ('normalize', StandardScalerDF())])

In [None]:
# Two candidate pipelines that share the same preprocessing and differ only in
# the regressor; both regressors are seeded with random_state=0
rf_pipeline = RegressionPipelineDF(
    regressor=RandomForestRegressorDF(random_state=0), preprocessing=preprocessing)
lgbm_pipeline = RegressionPipelineDF(
    regressor=LGBMRegressorDF(random_state=0), preprocessing=preprocessing)

In [None]:
# One parameter grid per candidate pipeline: two values of n_estimators for the
# random forest, two values of learning_rate for LightGBM
grids = [
    ModelParameterGrid(pipeline=rf_pipeline, estimator_parameters={"n_estimators": [10, 50]}),
    ModelParameterGrid(pipeline=lgbm_pipeline, estimator_parameters={"learning_rate": [0.1, 0.2]})
]

In [None]:
# Cross-validator reused by the ranker here and by the RegressorFitCV fits in later cells
cv = CircularCrossValidator(test_ratio=1/3, num_splits=3)
# The ranker receives all grids and the cross-validator
ranker = ModelRanker(grids=grids, cv=cv)
# NOTE(review): n_jobs=-3 presumably follows the joblib convention
# (all CPUs but two) — confirm against ModelRanker.run
ranking = ranker.run(sample_imputed, n_jobs=-3)
print(summary_report(ranking))

## Model inspection

In [None]:
# Model of the first ranking entry (presumably the best-scoring one —
# confirm the ordering returned by ModelRanker.run)
top_model = ranking[0].model

In [None]:
# Combine the selected model, the cross-validator and the imputed sample
# (presumably fitting the model on each cross-validation split)
regressor = RegressorFitCV(pipeline=top_model, cv=cv, sample=sample_imputed)

We can retrieve the predictions for any split, which allows a full inspection of the model:

In [None]:
# Preview the predictions for the split with id 0
regressor.predictions_for_split(split_id=0).head()

In [None]:
# Build an inspector around `regressor` and preview its SHAP matrix
inspector = RegressionModelInspector(regressor)
inspector.shap_matrix().head()

In [None]:
# Preview the feature importances reported by the inspector
inspector.feature_importances().head()

In [None]:
# Linkage tree returned by the inspector's cluster_dependent_features()
linkage_tree = inspector.cluster_dependent_features()
# New 10x10 inch matplotlib figure with a single subplot to draw into
ax = plt.figure(figsize=(10, 10)).add_subplot(111)
style_matplotlib = DendrogramFeatMapStyle(ax)
# Draw the linkage tree with the style bound to `ax`, titled with the target name
DendrogramDrawer(title=TARGET, linkage_tree=linkage_tree, style=style_matplotlib).draw()

In [None]:
# Same linkage tree drawn with the report style, which is created without a
# matplotlib axes (presumably a text rendering — confirm)
style_report = DendrogramReportStyle()
DendrogramDrawer(title=TARGET, linkage_tree=linkage_tree, style=style_report).draw()

## Feature independence engineering

In [None]:
# Select four features (RM, LSTAT, AGE, CRIM) from the imputed sample; the
# result is used as a sample below, so presumably a new Sample is returned
sample_new = sample_imputed.select_features(['RM', 'LSTAT','AGE', 'CRIM'])

And we can rerun our pipeline to select, fit and inspect our model with this new data:

In [None]:
# Repeat ranking, fitting and inspection on the reduced sample. This
# overwrites `ranking`, `top_model`, `inspector`, `linkage_tree` and `ax` from
# the earlier cells; the new fit is stored under the separate name `predictor`.
ranking = ranker.run(sample_new, n_jobs=-3)
top_model = ranking[0].model
predictor = RegressorFitCV(pipeline=top_model, cv=cv, sample=sample_new)
inspector = RegressionModelInspector(predictor)
linkage_tree = inspector.cluster_dependent_features()
# Fresh 10x10 inch figure for the new dendrogram
ax = plt.figure(figsize=(10, 10)).add_subplot(111)
style = DendrogramFeatMapStyle(ax)
DendrogramDrawer(title=TARGET, linkage_tree=linkage_tree, style=style).draw()

## Simulation

In [None]:
# Simulate with `predictor`, the fit on the reduced feature set (sample_new).
# The partitioning created in the following cell is built from
# `predictor.sample`, so the simulated model and the partition refer to the
# same fit. The earlier `regressor` was fitted on all features of sample_imputed.
simulator = UnivariateUpliftSimulator(models=predictor)

In [None]:
# Partitioning built from the LSTAT values of the sample held by `predictor`
partition = ContinuousRangePartitioning(values=predictor.sample.features["LSTAT"])

In [None]:
# Run the simulation for feature LSTAT using the partitioning defined above
univariate_simulation = simulator.simulate_feature(feature_name="LSTAT", partitioning=partition)

In [None]:
# New 10x10 inch figure with a single subplot for the simulation plot
ax = plt.figure(figsize=(10,10)).add_subplot(111)
# Note: `style` and `ax` replace the dendrogram objects of the same name from earlier cells
style = SimulationPlotStyle(ax)
drawer = SimulationDrawer(simulation=univariate_simulation, style=style)
drawer.draw()