# Quick Start

In [None]:
# this cell's metadata contains
# "nbsphinx": "hidden" so it is hidden by nbsphinx
# Directory that set_paths() puts at the front of sys.path. It is a relative
# path, so it is resolved against the working directory in effect at import time.
PATH_YIELD_ENGINE = 'src'
def set_paths() -> None:
    """
    Prepare the notebook's working directory and import path.

    On the first call (recognised by the absence of a module-level ``cwd``
    variable) the working directory is moved three levels up from the current
    one, and the joined path is remembered in the global ``cwd``; later calls
    skip this step. ``PATH_YIELD_ENGINE`` is then placed at the front of
    ``sys.path`` unless it is already listed there. Both steps are reported
    with ``print``.
    """
    import os
    import sys

    global cwd

    first_run = 'cwd' not in globals()
    if first_run:
        # three levels up from the directory the notebook was started in
        cwd = os.path.join(os.getcwd(), *([os.pardir] * 3))
        os.chdir(cwd)
    print(f"working dir is '{os.getcwd()}'")

    already_listed = PATH_YIELD_ENGINE in sys.path
    if not already_listed:
        sys.path.insert(0, PATH_YIELD_ENGINE)
    # reports sys.path[0] whether or not an insertion has just happened
    print(f"added `{sys.path[0]}` to python paths")
# Executed when the cell runs: may change the working directory and extends
# sys.path (see set_paths), so the later relative paths and imports depend on it.
set_paths()

In [None]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

from gamma import Sample
from gamma.viz.dendrogram import DendrogramDrawer, FeatMapStyle, LineStyle
from gamma.sklearndf.pipeline import PipelineDF
from gamma.sklearndf.transformation import SimpleImputerDF, MissingIndicatorDF
from gamma.sklearndf.transformation.extra import BorutaDF
from gamma.sklearndf.regression import RandomForestRegressorDF, LGBMRegressorDF
from gamma.model.inspection import ModelInspector
from gamma.model.prediction import PredictorFitCV
from gamma.model.selection import ModelPipelineDF, ModelGrid, ModelRanker, summary_report
from gamma.model.validation import CircularCrossValidator
from gamma.yieldengine.simulation import UnivariateSimulation
from gamma.yieldengine.partition import ContinuousRangePartitioning

In [None]:
# Load the training data from a relative path; this relies on the working
# directory having been set beforehand (presumably by set_paths() — confirm).
df = pd.read_csv('data/ames-housing-dataset/train.csv')
# Last expression of the cell, so the notebook shows the first rows as a preview.
df.head()

In [None]:
# Remove the 'Id', 'YrSold' and 'MoSold' columns (axis=1 selects columns).
# presumably these are not wanted as model features — confirm the rationale.
df = df.drop(['Id', 'YrSold', 'MoSold'], axis=1)

In [None]:
# Name of the column passed to Sample as the prediction target.
TARGET = "SalePrice"
# Keep only the numeric columns.
# NOTE(review): despite its name, num_features also contains TARGET whenever
# that column is numeric — confirm this is intended.
num_features = df.select_dtypes('number').columns
df = df[list(num_features)]

In [None]:
# Wrap the data frame in a Sample, naming TARGET as the target column.
sample = Sample(observations=df, target_name=TARGET)

In [None]:
# Two-step feature-selection pipeline: median imputation, then Boruta wrapped
# around a scikit-learn random forest.
# NOTE(review): no random_state is set anywhere here, so the selected features
# may differ between runs — confirm whether reproducibility matters.
boruta_selector = PipelineDF(steps = [
        ('preprocess', SimpleImputerDF(strategy="median")),
        ('boruta', BorutaDF(estimator=RandomForestRegressor(),n_estimators=10, verbose=2, max_iter=10))
])

In [None]:
# Fit the selection pipeline on the sample's features and target; the trailing
# semicolon keeps the notebook from echoing the return value.
boruta_selector.fit(sample.features, sample.target);

In [None]:
# Echo the fitted pipeline's columns_original attribute as the cell output.
# presumably the original column names that survived selection — verify against sklearndf.
boruta_selector.columns_original

In [None]:
# Restrict the sample to the columns reported by the fitted selector.
sample = sample.select_features(boruta_selector.columns_original)

In [None]:
# Cross-validator reused by both the model ranking and the CV-fitted predictor.
# presumably each of the 3 splits holds out one third of the data — confirm.
circular_cv = CircularCrossValidator(test_ratio=1/3, num_splits=3)

In [None]:
# One candidate pipeline per regressor; both use a SimpleImputerDF with default
# arguments as the preprocessing step.
rf_pipeline = ModelPipelineDF(predictor=RandomForestRegressorDF(), preprocessing=SimpleImputerDF())
lgbm_pipeline = ModelPipelineDF(predictor=LGBMRegressorDF(), preprocessing=SimpleImputerDF())

In [None]:
# Hyper-parameter grids to evaluate: two n_estimators values for the random
# forest pipeline and two learning rates for the LightGBM pipeline.
grids = [
    ModelGrid(pipeline=rf_pipeline, estimator_parameters={"n_estimators": [10, 20]}),
    ModelGrid(pipeline=lgbm_pipeline, estimator_parameters={"learning_rate": [0.1, 0.2]})
]

In [None]:
# Evaluate the grids with the shared cross-validator and print a summary.
ranker = ModelRanker(grids=grids, cv=circular_cv)
# n_jobs=-3: presumably follows the joblib convention (all CPUs but two) — confirm.
ranking = ranker.run(sample, n_jobs=-3)
print(summary_report(ranking))

In [None]:
# Take the first entry of the ranking and print its test score and parameters.
# presumably the ranking is sorted best-first — confirm against ModelRanker.
top_model = ranking[0]
print(top_model.scoring['test_score'])
print(top_model.parameters)

In [None]:
# Build a PredictorFitCV from the selected model, reusing the same cross-validator and sample.
predictor = PredictorFitCV(model=top_model.model, cv=circular_cv, sample=sample)

In [None]:
# Inspector built on top of the CV predictor.
inspector = ModelInspector(predictor)
# NOTE(review): `predictions` is not referenced again in the visible cells;
# presumably the call is kept for its side effect of fitting — confirm.
predictions = predictor.predictions_for_all_splits()

In [None]:
# Sole expression of the cell, so the notebook displays the returned value.
inspector.feature_importances()

SHAP clustering groups the features, using the correlation matrix of their SHAP values as the distance between features.
Then, using hierarchical clustering and the visualization styles defined in the yield-engine package, one can easily visualize the clustering of the features.

In [None]:
# Linkage tree that is handed to DendrogramDrawer for plotting.
linkage_tree = inspector.cluster_dependent_features()

In [None]:
# Scale the figure height with the number of feature columns (0.5 per column,
# in matplotlib's figsize units) so the plot grows with the feature count.
number_features = predictor.sample.features.shape[1]
ax = plt.figure(figsize=(10, number_features*.5)).add_subplot(111)
# Draw the linkage tree on that axes using the FeatMapStyle, titled with the target name.
style = FeatMapStyle(ax)
DendrogramDrawer(title=TARGET, linkage_tree=linkage_tree, style=style).draw()

The simulation builds partial dependence plots, which make it possible to assess the impact that the value of a given feature has on the model predictions.

In [None]:
# Build the univariate simulation from the inspector's model fit; `sim` is the
# object plot_simulation() calls simulate_feature() on.
model_fit = inspector.model_fit
sim = UnivariateSimulation(model_fit=model_fit)

In [None]:
from IPython.display import display, clear_output
import ipywidgets as widgets

# Dropdown offering the feature columns of the predictor's sample.
dd = widgets.Dropdown(
    options=predictor.sample.features.columns,
    description='Feature:',
    disabled=False,
    layout={"width":"550px"}
)

# Button whose click handler (on_click) plots the simulation for the dropdown's current value.
btn = widgets.Button(description='Simulate')

def plot_simulation(feature:str):
    """
    Simulate one feature and show the outcome as a two-panel figure.

    The upper panel plots three percentile curves (requested as 10, 50 and 90)
    of the aggregated simulation results against the simulated feature values.
    The lower panel is an inverted histogram of the feature's non-missing
    values in ``sample``, limited to the simulated value range and annotated
    with the count of every non-empty bin.

    Relies on the notebook globals ``df``, ``sim``, ``sample`` and ``TARGET``.

    :param feature: name of the feature column to simulate
    """
    # --- run and aggregate the simulation ---
    partition_values = ContinuousRangePartitioning(df.loc[:, feature]).partitions()
    results_per_split = sim.simulate_feature(
        feature_name=feature, feature_values=partition_values
    )
    percentile_frame = UnivariateSimulation.aggregate_simulation_results(
        results_per_split=results_per_split, percentiles=[10, 50, 90]
    )
    simulated_x = percentile_frame.index

    median_colour = 'red'
    band_colour = 'silver'

    _, (trend_ax, hist_ax) = plt.subplots(nrows=2, figsize=(10, 10), sharex=True)

    # --- upper panel: percentile curves ---
    trend_ax.set_xlabel(f"{feature}", color='black', labelpad=10, fontsize=12)
    trend_ax.set_ylabel(
        f"Predicted mean yield uplift ({TARGET})", color='black', fontsize=12
    )
    low_line, = trend_ax.plot(
        simulated_x, percentile_frame.iloc[:, 0], color=band_colour, linewidth=1
    )
    mid_line, = trend_ax.plot(
        simulated_x, percentile_frame.iloc[:, 1], color=median_colour
    )
    high_line, = trend_ax.plot(
        simulated_x, percentile_frame.iloc[:, 2], color=band_colour, linewidth=1
    )
    trend_ax.axhline(y=0, color='black', linewidth=.5)
    trend_ax.tick_params(axis='x', labelcolor='black')
    for side in ('top', 'right', 'bottom'):
        trend_ax.spines[side].set_visible(False)
    # keep the x tick labels visible on the upper panel, but hide the tick marks
    trend_ax.tick_params(axis='x', labelbottom=True, bottom=False)
    trend_ax.legend(
        (high_line, mid_line, low_line),
        ('90th percentile', 'Median', '10th percentile'),
        frameon=False,
    )

    # --- lower panel: distribution of the observed feature values ---
    observed = sample.features[feature].dropna()
    counts, edges, _ = hist_ax.hist(
        observed,
        edgecolor='white',
        color=band_colour,
        range=(min(simulated_x), max(simulated_x)),
    )
    # centre of every bin: mean of each pair of neighbouring bin edges
    bin_centres = pd.Series(edges).rolling(window=2).mean().shift(-1).dropna()
    hist_ax.invert_yaxis()
    hist_ax.tick_params(axis='y', labelcolor='black')
    label_offset = max(counts) * 0.05
    for centre, count in zip(bin_centres, counts):
        if count > 0:
            # label non-empty bins with their count, offset from the bar end
            hist_ax.text(
                centre,
                count + label_offset,
                str(int(count)),
                color='black',
                horizontalalignment='center',
            )
    hist_ax.get_yaxis().set_visible(False)
    hist_ax.get_xaxis().set_visible(False)
    for side in ('top', 'right', 'left', 'bottom'):
        hist_ax.spines[side].set_visible(False)

    plt.subplots_adjust(hspace=.2)
    plt.show()

def on_click(btn):
    """
    Click handler: redraw the controls and plot the selected feature.

    Clears the cell output, shows the dropdown and button again, then runs
    ``plot_simulation`` for the value currently selected in the dropdown ``dd``.

    :param btn: the button object passed to the handler
    """
    clear_output()
    controls = widgets.HBox([dd, btn])
    display(controls)
    selected_feature = dd.value
    plot_simulation(feature=selected_feature)
    
# Register the handler, show the controls, then click programmatically so that
# on_click runs once for the initially selected feature.
btn.on_click(on_click)    
display(widgets.HBox([dd, btn]))
btn.click()