# Quick Start

In [None]:
# this cell's metadata contains
# "nbsphinx": "hidden" so it is hidden by nbsphinx

# Entry that set_paths() puts at the front of sys.path. It is a relative
# path, so it is resolved against the current working directory.
PATH_YIELD_ENGINE = 'src'
def set_paths() -> None:
    """
    Prepare the notebook's working directory and import path.

    On the first call -- detected by the absence of a module-level ``cwd`` --
    the working directory is moved three levels up from the current one and
    the path used for that move is stored in the global ``cwd``. Later calls
    leave the working directory alone, so re-running the cell does not climb
    further up the directory tree.

    ``PATH_YIELD_ENGINE`` is then inserted at the front of ``sys.path`` unless
    it is already listed there. The final message states whether the entry
    was added or was already present.
    """
    import os
    import sys

    global cwd
    if 'cwd' not in globals():
        cwd = os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
        os.chdir(cwd)
    print(f"working dir is '{os.getcwd()}'")

    if PATH_YIELD_ENGINE not in sys.path:
        sys.path.insert(0, PATH_YIELD_ENGINE)
        print(f"added `{PATH_YIELD_ENGINE}` to python paths")
    else:
        # Nothing was inserted, so do not report sys.path[0] as "added":
        # it may be an unrelated entry.
        print(f"`{PATH_YIELD_ENGINE}` is already in python paths")
# Executed when the cell runs, i.e. before the import cell that follows.
set_paths()

The **gamma** package allows you to:

- build data science pipelines that follow the scikit-learn API and are compatible with pandas dataframes
- select relevant features
- inspect the trained models
- visualize the results of the inspection

We first import the needed packages:

In [None]:
import pandas as pd
# todo remove RandomForestRegressor when possible
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

from gamma import Sample
from gamma.viz.dendrogram import DendrogramDrawer, FeatMapStyle, LineStyle
from gamma.sklearndf.pipeline import PipelineDF
from gamma.sklearndf.transformation import SimpleImputerDF, MissingIndicatorDF
from gamma.sklearndf.transformation.extra import BorutaDF
from gamma.sklearndf.regression import RandomForestRegressorDF, LGBMRegressorDF
from gamma.model.inspection import ModelInspector
from gamma.model.prediction import PredictorFitCV
from gamma.model.selection import ModelPipelineDF, ModelGrid, ModelRanker, summary_report
from gamma.model.validation import CircularCrossValidator
from gamma.yieldengine.simulation import UnivariateSimulator
from gamma.yieldengine.partition import ContinuousRangePartitioning
from gamma.viz.simulation import SimulationDrawer, SimulationMatplotStyle

We load the Ames housing dataset: a regression problem on house prices.

In [None]:
# Relative path: resolved against the current working directory.
df = pd.read_csv('data/ames-housing-dataset/train.csv')
# Last expression of the cell, so the notebook displays the first rows.
df.head()

In [None]:
# Remove the 'Id', 'YrSold' and 'MoSold' columns from the dataframe.
df = df.drop(columns=['Id', 'YrSold', 'MoSold'])

In [None]:
# Name of the column passed to Sample as the target.
TARGET = "SalePrice"
# Keep only the numeric columns of the dataframe.
num_features = df.select_dtypes(include='number').columns
df = df[num_features.tolist()]

In [None]:
# Wrap the dataframe in a Sample, naming TARGET as the target column.
sample = Sample(observations=df, target_name=TARGET)

In [None]:
# Display the first rows of the sample's features.
sample.features.head()

In [None]:
# Display the first values of the sample's target.
sample.target.head()

## The sklearnDF API

Scikit-learn works by default with numpy arrays: if one uses a scikit-learn transformer with a dataframe as input, one gets a numpy array as output. This can be a problem since column names play a key role in the model inspection.

In [None]:
# Plain scikit-learn imputer applied to the feature dataframe.
imputed_array = SimpleImputer().fit_transform(sample.features)

In [None]:
# Display the scikit-learn result (a numpy array, per the text above).
imputed_array

In [None]:
# Same imputation, this time through the gamma wrapper SimpleImputerDF.
imputed_df = SimpleImputerDF().fit_transform(sample.features)

In [None]:
# Display the first rows of the SimpleImputerDF result.
imputed_df.head()

In [None]:
# Apply the imputer to the Sample itself; `sample` is rebound to the result,
# so the following cells work on the transformed sample.
sample = SimpleImputerDF().fit_transform_sample(sample=sample)

## Pipeline, cross validation and model selection

In [None]:
# Two candidate pipelines sharing the same preprocessing step and seed:
# a random forest regressor ...
rf_pipeline = ModelPipelineDF(
    predictor=RandomForestRegressorDF(random_state=0),
    preprocessing=SimpleImputerDF(),
)
# ... and a LightGBM regressor.
lgbm_pipeline = ModelPipelineDF(
    predictor=LGBMRegressorDF(random_state=0),
    preprocessing=SimpleImputerDF(),
)

In [None]:
# One grid per pipeline, each listing the predictor parameter values to try.
grids = [
    ModelGrid(
        pipeline=rf_pipeline,
        predictor_parameters={"n_estimators": [10, 20]},
    ),
    ModelGrid(
        pipeline=lgbm_pipeline,
        predictor_parameters={"learning_rate": [0.1, 0.2]},
    ),
]

In [None]:
# Cross-validator configured with three splits and a test ratio of one third.
circular_cv = CircularCrossValidator(test_ratio=1/3, num_splits=3)
# The ranker receives both grids and the cross-validator defined above.
ranker = ModelRanker(grids=grids, cv=circular_cv)
# NOTE(review): n_jobs=-3 presumably follows the joblib convention
# (all CPUs but two) -- confirm against ModelRanker.run.
ranking = ranker.run(sample, n_jobs=-3)
print(summary_report(ranking))

## Model inspection

The champion model can be inspected consistently with the cross-validation strategy.
We first retrieve the champion model:

In [None]:
# First entry of the ranking.
# NOTE(review): presumably the ranking is sorted best-first -- confirm.
top_model = ranking[0]

In [None]:
# Combine the selected model with the same cross-validator and sample
# that were used for the ranking.
predictor = PredictorFitCV(model=top_model.model, cv=circular_cv, sample=sample)

In [None]:
# The inspector is built on top of the cross-validated predictor.
inspector = ModelInspector(predictor)
# NOTE(review): `predictions` is not used in the cells shown here.
predictions = predictor.predictions_for_all_splits()

In [None]:
# Display the feature importances reported by the inspector.
inspector.feature_importances()

SHAP clustering groups the features, using the correlation matrix of their SHAP values as the distance between features.
Then, using hierarchical clustering and the visualization styles defined in the **gamma** package, one can easily visualize the clustering of the features.

In [None]:
# Linkage tree of the features, drawn as a dendrogram in the next cell.
linkage_tree = inspector.cluster_dependent_features()

In [None]:
# Number of feature columns; used to scale the figure height.
number_features = predictor.sample.features.shape[1]
# Fixed width of 10, height of 0.5 per feature.
ax = plt.figure(figsize=(10, number_features*.5)).add_subplot(111)
style = FeatMapStyle(ax)
# Draw the linkage tree on the axes wrapped by the style, titled with TARGET.
DendrogramDrawer(title=TARGET, linkage_tree=linkage_tree, style=style).draw()

## Simulation

In [None]:
# Fitted model exposed by the inspector; reused by the simulation cells.
model_fit = inspector.model_fit
# NOTE(review): `sim` is not used in the cells shown here; an identically
# constructed simulator is bound to `simulator` in the next cell.
sim = UnivariateSimulator(model_fit=model_fit)

In [None]:
# Simulator used by the cells below.
simulator = UnivariateSimulator(model_fit=model_fit)

In [None]:
# Feature to simulate on.
feature = "GrLivArea"
# Partitioning built from that feature's values in the fitted sample.
partition = ContinuousRangePartitioning(values=model_fit.sample.features[feature])

In [None]:
# Run the simulation for `feature` over the partition defined above.
univariate_simulation = simulator.get_simulation_from_partition(partition, feature)

In [None]:
# 10 x 10 figure with a single set of axes for the simulation plot.
ax = plt.figure(figsize=(10,10)).add_subplot(111)
style = SimulationMatplotStyle(ax)
# Draw the simulation result on the axes wrapped by the style.
drawer = SimulationDrawer(simulation=univariate_simulation, style=style)
drawer.draw()