# Quick Start

In [1]:
# this cell's metadata contains
# "nbsphinx": "hidden" so it is hidden by nbsphinx
PATH_YIELD_ENGINE = 'src'


def set_paths() -> None:
    """
    Prepare the notebook's working directory and import path.

    On the first call (detected by the absence of a module-level ``cwd``) the
    working directory is moved three levels up from the current one, and that
    path is remembered in the global ``cwd``; later calls leave the working
    directory alone. ``PATH_YIELD_ENGINE`` is then inserted at the front of
    ``sys.path`` unless it is already listed there.

    Both steps are reported on stdout. The "added" message is only printed
    when the path was actually inserted.
    """
    import os
    import sys

    global cwd

    # the global `cwd` doubles as an "already ran" marker, so re-running the
    # cell does not climb another three directories
    if 'cwd' not in globals():
        cwd = os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
        os.chdir(cwd)
    print(f"working dir is '{os.getcwd()}'")

    # report what really happened: previously "added `sys.path[0]`" was printed
    # even when nothing was inserted, naming whatever entry happened to be first
    if PATH_YIELD_ENGINE not in sys.path:
        sys.path.insert(0, PATH_YIELD_ENGINE)
        print(f"added `{PATH_YIELD_ENGINE}` to python paths")
    else:
        print(f"`{PATH_YIELD_ENGINE}` is already on the python paths")


set_paths()

working dir is 'C:\Users\martin florent\Documents\projects\yield-engine'
added `src` to python paths


In [2]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

from gamma import Sample
from gamma.viz.dendrogram import DendrogramDrawer, FeatMapStyle, LineStyle
from gamma.sklearndf.pipeline import PipelineDF
from gamma.sklearndf.transformation import SimpleImputerDF, MissingIndicatorDF
from gamma.sklearndf.transformation.extra import BorutaDF
from gamma.sklearndf.regression import RandomForestRegressorDF, LGBMRegressorDF
from gamma.model.inspection import ModelInspector
from gamma.model.prediction import PredictorFitCV
from gamma.model.selection import ModelPipelineDF, ModelGrid, ModelRanker, summary_report
from gamma.model.validation import CircularCrossValidator
from gamma.yieldengine.simulation import UnivariateSimulator
from gamma.yieldengine.partition import ContinuousRangePartitioning
from gamma.viz.simulation import SimulationDrawer, SimulationData, SimulationMatplotStyle

In [3]:
# Read the training CSV from the data folder (the relative path resolves against
# the current working directory) and preview its first rows.
df = pd.read_csv('data/ames-housing-dataset/train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Discard the `Id`, `YrSold` and `MoSold` columns.
df = df.drop(columns=['Id', 'YrSold', 'MoSold'])

In [5]:
# Name of the column to predict.
TARGET = "SalePrice"
# Keep only the numeric columns; `num_features` holds their labels
# (this includes TARGET whenever that column is numeric).
df = df.select_dtypes(include='number')
num_features = df.columns

In [6]:
# Wrap the data frame in a `Sample`, naming TARGET as its target column.
sample = Sample(observations=df, target_name=TARGET)

In [8]:
# Feature-selection pipeline: median imputation, then Boruta around a random forest.
# NOTE(review): the output saved with this cell is
# "AttributeError: 'Pipeline' object has no attribute 'base_transformer'" and the
# cells after it were never executed — confirm this runs against the installed
# gamma version before relying on the rest of the notebook.
boruta_steps = [
    ('preprocess', SimpleImputerDF(strategy="median")),
    (
        'boruta',
        BorutaDF(
            estimator=RandomForestRegressor(),
            n_estimators=10,
            verbose=2,
            max_iter=10,
            random_state=0,
        ),
    ),
]
boruta_selector = PipelineDF(steps=boruta_steps)

AttributeError: 'Pipeline' object has no attribute 'base_transformer'

In [None]:
# Fit the selection pipeline on the sample's features and target;
# the trailing `;` suppresses the cell's output.
boruta_selector.fit(sample.features, sample.target);

In [None]:
# Display the fitted pipeline's `columns_original`
# (presumably the input columns retained by the selector — confirm).
boruta_selector.columns_original

In [None]:
# Rebind `sample` to the result of selecting the columns listed in the
# pipeline's `columns_original`; the previous `sample` binding is replaced.
sample = sample.select_features(boruta_selector.columns_original)

In [None]:
# Cross-validator configured for 3 splits with a test ratio of one third.
circular_cv = CircularCrossValidator(test_ratio=1/3, num_splits=3)

In [None]:
# Two candidate model pipelines, each with its own default-configured
# SimpleImputerDF as preprocessing; both predictors use random_state=0.
rf_pipeline = ModelPipelineDF(
    predictor=RandomForestRegressorDF(random_state=0),
    preprocessing=SimpleImputerDF(),
)
lgbm_pipeline = ModelPipelineDF(
    predictor=LGBMRegressorDF(random_state=0),
    preprocessing=SimpleImputerDF(),
)

In [None]:
# One grid per pipeline, each varying a single predictor parameter.
rf_grid = ModelGrid(
    pipeline=rf_pipeline,
    predictor_parameters={"n_estimators": [10, 20]},
)
lgbm_grid = ModelGrid(
    pipeline=lgbm_pipeline,
    predictor_parameters={"learning_rate": [0.1, 0.2]},
)
grids = [rf_grid, lgbm_grid]

In [None]:
# Rank every grid candidate with the circular cross-validator and print a summary.
ranker = ModelRanker(grids=grids, cv=circular_cv)
# NOTE(review): n_jobs=-3 — under the joblib convention this means
# "all CPUs but two"; confirm ModelRanker.run follows that convention.
ranking = ranker.run(sample, n_jobs=-3)
ranking_report = summary_report(ranking)
print(ranking_report)

In [None]:
# Take the first entry of the ranking (presumably the best-scoring model — confirm)
# and print its test score followed by its parameters, one per line.
top_model = ranking[0]
print(top_model.scoring['test_score'], top_model.parameters, sep="\n")

In [None]:
# Build a PredictorFitCV from the top model, the same cross-validator and the sample.
predictor = PredictorFitCV(model=top_model.model, cv=circular_cv, sample=sample)

In [None]:
# Inspector used by the following cells for feature importances and clustering.
inspector = ModelInspector(predictor)
# NOTE(review): `predictions` is not referenced by any later cell visible here.
predictions = predictor.predictions_for_all_splits()

In [None]:
# Display the inspector's feature importances as the cell output.
inspector.feature_importances()

SHAP clustering groups the features using a distance derived from the correlation matrix of their SHAP values.
Then, using hierarchical clustering and the visualization styles defined in the yield-engine package, one can easily visualize how the features cluster.

In [None]:
# Linkage tree over the features; drawn as a dendrogram in the next cell.
linkage_tree = inspector.cluster_dependent_features()

In [None]:
# Scale the figure height with the number of feature columns (0.5 per feature,
# in matplotlib's figsize units), then draw the linkage tree as a dendrogram.
number_features = predictor.sample.features.shape[1]
fig = plt.figure(figsize=(10, 0.5 * number_features))
ax = fig.add_subplot(1, 1, 1)
style = FeatMapStyle(ax)
dendrogram = DendrogramDrawer(title=TARGET, linkage_tree=linkage_tree, style=style)
dendrogram.draw()

The simulation builds partial dependence plots, which allow one to assess the impact that the value of a given feature has on the model's predictions.

In [None]:
# Fitted model taken from the inspector; the simulation cells below build on it.
model_fit = inspector.model_fit
# NOTE(review): `sim` is not used by any visible cell, and the next code cell
# builds an identical `simulator` — one of the two looks redundant.
sim = UnivariateSimulator(model_fit=model_fit)

### Simulation

In [None]:
# Univariate simulator built on the fitted model.
simulator = UnivariateSimulator(model_fit=model_fit)

In [None]:
# Partition the observed "OverallQual" values, capped at 10 partitions.
partition = ContinuousRangePartitioning(values=model_fit.sample.features["OverallQual"], max_partitions=10)

In [None]:
# Run the simulation for "OverallQual" over the partition built above.
univariate_simulation = simulator.get_simulation_from_partition(partition, "OverallQual")

In [None]:
# Format string passed to the style for tick labels: no decimal places.
f = "{:.0f}"
# Rebinds `style`, replacing the FeatMapStyle created for the dendrogram.
style = SimulationMatplotStyle(
    xticklabels_kwargs={'rotation': 45},
    hspace=.3,
    xtickslabels_format=f,
)

In [None]:
# Drawer for the simulation result, using the style above and histogram=True.
drawer = SimulationDrawer(simulation=univariate_simulation, style=style, histogram=True)

In [None]:
# Render the simulation plot.
drawer.draw()