# Quick Start

In [1]:
# this cell's metadata contains
# "nbsphinx": "hidden" so it is hidden by nbsphinx
PATH_YIELD_ENGINE = 'src'
def set_paths() -> None:
    """
    Make the project root the working directory and expose the package sources.

    On the first call the working directory is moved three levels up from the
    current one and remembered in the module-level global ``cwd``. Later calls
    (e.g. re-running this cell) find ``cwd`` already defined and skip the move,
    so the notebook does not climb further up the tree on every execution.
    ``PATH_YIELD_ENGINE`` is then inserted at the front of ``sys.path`` unless
    it is already listed there.
    """
    import os
    import sys

    global cwd

    if 'cwd' not in globals():
        # normalise so the remembered path carries no literal ``..`` components
        cwd = os.path.abspath(
            os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
        )
        os.chdir(cwd)
    print(f"working dir is '{os.getcwd()}'")

    if PATH_YIELD_ENGINE not in sys.path:
        sys.path.insert(0, PATH_YIELD_ENGINE)
        print(f"added `{PATH_YIELD_ENGINE}` to python paths")
    else:
        # report what actually happened instead of echoing whatever entry
        # happens to sit at sys.path[0]
        print(f"`{PATH_YIELD_ENGINE}` is already in python paths")
set_paths()

working dir is 'C:\Users\martin florent\Documents\projects\yield-engine'
added `src` to python paths


In [2]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

from yieldengine import Sample
from yieldengine.dendrogram import DendrogramDrawer
from yieldengine.dendrogram.style import FeatMapStyle, LineStyle
from yieldengine.df.pipeline import PipelineDF
from yieldengine.preprocessing.impute import SimpleImputerDF, MissingIndicatorDF
from yieldengine.preprocessing.selection import BorutaDF
from yieldengine.model.inspection import ModelInspector
from yieldengine.model.prediction import PredictorFitCV
from yieldengine.model.selection import ModelPipelineDF, ModelGrid, ModelRanker, summary_report
from yieldengine.model.validation import CircularCrossValidator
from yieldengine.simulation import UnivariateSimulation
from yieldengine.partition import ContinuousRangePartitioning

In [3]:
# Load the training CSV; the path is relative to the current working directory.
df = pd.read_csv('data/ames-housing-dataset/train.csv')
# Bare last expression: the notebook renders the first rows as the cell output.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Remove the `Id`, `YrSold` and `MoSold` columns. `errors='ignore'` keeps the
# cell re-runnable: `df` is reassigned from itself, so a second execution would
# otherwise raise a KeyError because the columns are already gone.
df = df.drop(columns=['Id', 'YrSold', 'MoSold'], errors='ignore')

In [5]:
# Name of the target column handed to `Sample` further down.
TARGET = "SalePrice"
# Keep only the numeric columns of the frame.
num_features = df.select_dtypes(include='number').columns
df = df.loc[:, num_features.tolist()]

In [6]:
# Bundle the numeric frame with the name of its target column; later cells read
# the predictors and the target back through `sample.features` / `sample.target`.
sample = Sample(observations=df, target_name=TARGET)

In [7]:
# Two-step feature-selection pipeline: median imputation, then Boruta driven by
# a random forest.
# NOTE(review): no random seed is set, so the selected features may differ
# between runs — TODO confirm whether BorutaDF accepts a `random_state`.
boruta_steps = [
    ('preprocess', SimpleImputerDF(strategy="median")),
    (
        'boruta',
        BorutaDF(
            estimator=RandomForestRegressor(),
            n_estimators=10,
            verbose=2,
            max_iter=10,
        ),
    ),
]
boruta_selector = PipelineDF(steps=boruta_steps)

In [8]:
# Fit the imputation + Boruta pipeline on the sample's features and target.
# The trailing `;` suppresses the display of the call's return value; the
# iteration log is still printed while fitting.
boruta_selector.fit(sample.features, sample.target);

Iteration: 	1 / 10
Confirmed: 	0
Tentative: 	34
Rejected: 	0
Iteration: 	2 / 10
Confirmed: 	0
Tentative: 	34
Rejected: 	0
Iteration: 	3 / 10
Confirmed: 	0
Tentative: 	34
Rejected: 	0
Iteration: 	4 / 10
Confirmed: 	0
Tentative: 	34
Rejected: 	0
Iteration: 	5 / 10
Confirmed: 	0
Tentative: 	34
Rejected: 	0
Iteration: 	6 / 10
Confirmed: 	0
Tentative: 	34
Rejected: 	0
Iteration: 	7 / 10
Confirmed: 	0
Tentative: 	34
Rejected: 	0
Iteration: 	8 / 10
Confirmed: 	10
Tentative: 	8
Rejected: 	16
Iteration: 	9 / 10
Confirmed: 	10
Tentative: 	8
Rejected: 	16


BorutaPy finished running.

Iteration: 	10 / 10
Confirmed: 	10
Tentative: 	2
Rejected: 	16


In [9]:
# Display the column mapping held by the fitted pipeline: in the recorded
# output the index is named `column_out` and the values `column_in`.
boruta_selector.columns_original

column_out
LotArea            LotArea
OverallQual    OverallQual
YearBuilt        YearBuilt
BsmtFinSF1      BsmtFinSF1
TotalBsmtSF    TotalBsmtSF
1stFlrSF          1stFlrSF
2ndFlrSF          2ndFlrSF
GrLivArea        GrLivArea
GarageCars      GarageCars
GarageArea      GarageArea
Name: column_in, dtype: object

In [10]:
# Narrow the sample down to the columns reported by the fitted Boruta pipeline.
sample = sample.select_features(boruta_selector.columns_original)

In [11]:
# Cross-validator reused below for model ranking and for model inspection:
# 3 splits with a test ratio of 1/3.
circular_cv = CircularCrossValidator(test_ratio=1/3, num_splits=3)

The class **ModelPipelineDF** specifies a model as an estimator and a preprocessing pipeline.
The class **ModelGrid** specifies a **ModelPipelineDF**  and a hyperparameter grid.

In [12]:
# One model pipeline per candidate regressor, each with a SimpleImputerDF as
# its preprocessing step.
# NOTE(review): the recorded output of this cell is
#   "TypeError: arg predictor expected to be a DataFramePredictor but is a RandomForestRegressor"
# so the plain estimators passed here are rejected, and every later cell that
# needs `rf_model` / `lgbm_model` cannot run — TODO pass DataFramePredictor
# instances instead.
rf_model = ModelPipelineDF(predictor=RandomForestRegressor(), preprocessing=SimpleImputerDF())
lgbm_model = ModelPipelineDF(predictor=LGBMRegressor(), preprocessing=SimpleImputerDF())

TypeError: arg predictor expected to be a DataFramePredictor but is a RandomForestRegressor

In [None]:
# Show the documentation of ModelPipelineDF, the class used for the model
# pipelines in this notebook. The previous argument, `Model`, is never imported
# or defined in the cells of this notebook, so the call raised a NameError.
help(ModelPipelineDF)

In [None]:
# One hyperparameter grid per model pipeline; both are handed to the ranker.
rf_grid = ModelGrid(
    model=rf_model,
    estimator_parameters={"n_estimators": [10, 20]},
)
lgbm_grid = ModelGrid(
    model=lgbm_model,
    estimator_parameters={"learning_rate": [0.1, 0.2]},
)
grids = [rf_grid, lgbm_grid]

In [13]:
# Rank the grids with the circular cross-validator and print a summary report.
# NOTE(review): the recorded run of this cell failed with
#   "NameError: name 'grids' is not defined"
# — the cell that builds `grids` carries no execution count, i.e. it had not
# been run when this one was executed.
ranker = ModelRanker(grids=grids, cv=circular_cv)
# NOTE(review): n_jobs=-3 presumably follows the joblib convention
# (all CPUs but two) — confirm against ModelRanker.run.
ranking = ranker.run(sample, n_jobs=-3)
print(summary_report(ranking))

NameError: name 'grids' is not defined

In [None]:
# Take the first entry of the ranking as the model to inspect.
# NOTE(review): presumably the ranking is sorted best-first — confirm.
top_model = ranking[0]
# Print its test score and the parameters it was evaluated with.
print(top_model.scoring['test_score'])
print(top_model.parameters)

## ModelPipelineDF inspection
The **PredictorFitCV** summarizes all the information of a model: the estimator used for the model, the CV (=cross-validation) type, and the **Sample** itself.

In [None]:
# Tie the selected model to the cross-validator and the sample, then wrap the
# result in an inspector used by the following cells.
predictor = PredictorFitCV(model=top_model.model, cv=circular_cv, sample=sample)
inspector = ModelInspector(predictor)
# NOTE(review): `predictions` is not read again in the cells shown here.
predictions = predictor.predictions_for_all_splits()

## Shap values

The inspector object gives direct access to the shap values of the model. For a given sample, these shap values are computed as the average of the shap values over all the test folds containing that sample.

In [None]:
# Display the feature importances reported by the inspector.
inspector.feature_importances()

## Shap clustering
Shap clustering groups the features, using the correlation matrix of their shap values as the distance between features.
Then, using hierarchical clustering and a visualization style defined in the yield-engine package, one can easily visualize the clustering of the features.

In [None]:
# Linkage tree of the features; it is drawn as a dendrogram in the next cell.
linkage_tree = inspector.cluster_dependent_features()

In [None]:
# Let the figure height grow with the number of features (0.5 per feature).
number_features = predictor.sample.features.shape[1]
fig = plt.figure(figsize=(10, number_features * 0.5))
ax = fig.add_subplot(1, 1, 1)
style = FeatMapStyle(ax)
# Draw the linkage tree on the prepared axes, titled with the target name.
dendrogram = DendrogramDrawer(title=TARGET, linkage_tree=linkage_tree, style=style)
dendrogram.draw()

It is desirable to have a model with
- good predictivity (good R2 score for instance)
- few features
- independent features  

With the above dendrogram one can isolate **features that have low importance and are strongly related to other features. It makes sense to discard those.**

## Shap clustering iterations

## Method to run an iteration
Based on the above remarks, we are going to run a clustering iteration as follows:
1. Based on the shap dendrogram select features to discard
2. Re-run the model with the new set of features
3. Plot the shap dendrogram again to see if the features are more independent, and iterate this process if necessary

## Simulation
The simulation builds partial dependency plots, which make it possible to assess the impact that the value of a given feature has on the model predictions.

In [None]:
# Build the univariate simulation on the model fit held by the inspector.
model_fit = inspector.model_fit
sim = UnivariateSimulation(model_fit=model_fit)

In [None]:
from IPython.display import display, clear_output
import ipywidgets as widgets

# Dropdown offering every feature column of the inspected sample.
dd = widgets.Dropdown(
    options=predictor.sample.features.columns,
    description='Feature:',
    disabled=False,
    layout={"width":"550px"}
)

# Button that triggers the simulation for the feature chosen in the dropdown.
btn = widgets.Button(description='Simulate')

def plot_simulation(feature:str):
    """
    Simulate one feature and plot the result above a histogram of its values.

    The top panel shows the 10th, 50th and 90th percentile of the aggregated
    simulation results against the simulated feature values. The bottom panel
    is an inverted histogram of the feature as found in ``sample``, restricted
    to the simulated range and labelled with the count of every non-empty bin.

    Relies on the notebook globals ``df``, ``sim``, ``sample`` and ``TARGET``.

    :param feature: name of the feature (column) to simulate
    """
    median_color = 'red'
    band_color = 'silver'

    # simulate the feature over a partitioning of its values, then aggregate
    # the per-split results into percentiles
    partitions = ContinuousRangePartitioning(df.loc[:, feature]).partitions()
    results_per_split = sim.simulate_feature(
        feature_name=feature,
        feature_values=partitions,
    )
    aggregated = UnivariateSimulation.aggregate_simulation_results(
        results_per_split=results_per_split, percentiles=[10, 50, 90]
    )

    fig, (ax_lines, ax_hist) = plt.subplots(
        nrows=2, figsize=(10, 10), sharex=True
    )

    # --- top panel: one line per percentile column ---
    ax_lines.set_xlabel(f"{feature}", color='black', labelpad=10, fontsize=12)
    ax_lines.set_ylabel(
        f"Predicted mean yield uplift ({TARGET})", color='black', fontsize=12
    )
    line_styles = (
        dict(color=band_color, linewidth=1),  # column 0: 10th percentile
        dict(color=median_color),             # column 1: median
        dict(color=band_color, linewidth=1),  # column 2: 90th percentile
    )
    low_line, median_line, high_line = (
        ax_lines.plot(aggregated.index, aggregated.iloc[:, column], **style)[0]
        for column, style in enumerate(line_styles)
    )
    ax_lines.axhline(y=0, color='black', linewidth=.5)
    ax_lines.tick_params(axis='x', labelcolor='black')
    for side in ('top', 'right', 'bottom'):
        ax_lines.spines[side].set_visible(False)
    ax_lines.tick_params(axis='x', labelbottom=True, bottom=False)
    ax_lines.legend(
        (high_line, median_line, low_line),
        ('90th percentile', 'Median', '10th percentile'),
        frameon=False,
    )

    # --- bottom panel: histogram of the feature over the simulated range ---
    observed = sample.features[feature].dropna()
    hist_range = (min(aggregated.index), max(aggregated.index))
    counts, edges, _ = ax_hist.hist(
        observed, edgecolor='white', color=band_color, range=hist_range
    )
    # mean of each pair of adjacent bin edges, i.e. the centre of every bin
    centres = pd.Series(edges).rolling(window=2).mean().shift(-1).dropna()
    ax_hist.invert_yaxis()
    ax_hist.tick_params(axis='y', labelcolor='black')
    label_offset = max(counts) * 0.05
    for centre, count in zip(centres, counts):
        if count > 0:
            ax_hist.text(
                centre,
                count + label_offset,
                str(int(count)),
                color='black',
                horizontalalignment='center',
            )
    ax_hist.get_yaxis().set_visible(False)
    ax_hist.get_xaxis().set_visible(False)
    for side in ('top', 'right', 'left', 'bottom'):
        ax_hist.spines[side].set_visible(False)

    plt.subplots_adjust(hspace=.2)
    plt.show()

def on_click(btn):
    """Button callback: clear the cell output, show the controls again and plot the selected feature."""
    clear_output()
    # the controls are displayed again right after the output has been cleared
    display(widgets.HBox([dd, btn]))
    plot_simulation(feature=dd.value)
    
# Register the callback and show the dropdown and the button side by side.
btn.on_click(on_click)    
display(widgets.HBox([dd, btn]))
# Trigger a click programmatically so a plot for the initially selected
# feature is rendered without user interaction.
btn.click()