# Simulation

Once we trust a model, we can use it to simulate the effect of changing the value of a feature, and answer questions such as:

- What would happen if we set the feature to this value?
- What is the optimal value for this feature? 

In [None]:
# this cell's metadata contains
# "nbsphinx": "hidden" so it is hidden by nbsphinx
PATH_YIELD_ENGINE = 'src'
def set_paths() -> None:
    """
    Prepare the working directory and the import path for this notebook.

    On the first call, change the working directory to three levels above the
    current one and remember the path in the module-level global ``cwd``; on
    later calls the working directory is left untouched. Then insert
    ``PATH_YIELD_ENGINE`` at the front of ``sys.path`` unless it is already
    listed there, and report what was actually done.
    """
    import sys
    import os

    global cwd

    # the presence of the global `cwd` marks that the directory change already
    # happened, so re-running the cell does not climb another three levels
    if 'cwd' not in globals():
        cwd = os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
        os.chdir(cwd)
    print(f"working dir is '{os.getcwd()}'")
    if PATH_YIELD_ENGINE not in sys.path:
        sys.path.insert(0, PATH_YIELD_ENGINE)
        print(f"added `{PATH_YIELD_ENGINE}` to python paths")
    else:
        # nothing was inserted: do not claim that sys.path[0] was added
        print(f"`{PATH_YIELD_ENGINE}` is already in python paths")
set_paths()

import numpy as np
np.random.seed(0)

In [None]:
import pandas as pd

from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

from gamma import Sample
from gamma.viz.dendrogram import DendrogramDrawer, DendrogramFeatMapStyle, DendrogramLineStyle
from gamma.sklearndf.pipeline import PipelineDF
from gamma.sklearndf.transformation import SimpleImputerDF, MissingIndicatorDF
from gamma.sklearndf.transformation.extra import BorutaDF
from gamma.sklearndf.regression import RandomForestRegressorDF, LGBMRegressorDF
from gamma.model.inspection import ModelInspector
from gamma.model.prediction import PredictorFitCV
from gamma.model.selection import ModelPipelineDF, ModelGrid, ModelRanker, summary_report
from gamma.model.validation import CircularCrossValidator
from gamma.yieldengine.simulation import UnivariateSimulator
from gamma.yieldengine.partition import ContinuousRangePartitioning, CategoryPartitioning
from gamma.yieldengine.viz import SimulationDrawer, SimulationPlotStyle

In [None]:
# Read the training data and drop the `Id`, `YrSold` and `MoSold` columns
df = pd.read_csv('data/ames-housing-dataset/train.csv').drop(
    columns=['Id', 'YrSold', 'MoSold']
)
TARGET = "SalePrice"

# Keep only the numeric columns (the target included) for the first sample
num_features = df.select_dtypes(include='number').columns
df_numerical = df.loc[:, list(num_features)].copy()
sample = Sample(target_name=TARGET, observations=df_numerical)

In [None]:
# Cross-validator configured with 6 splits and a test ratio of one third
cv = CircularCrossValidator(
    test_ratio=1/3,
    num_splits=6,
)
# Pipeline combining a default SimpleImputerDF as preprocessing
# with a random forest regressor of 10 estimators as predictor
model = ModelPipelineDF(
    predictor=RandomForestRegressorDF(n_estimators=10),
    preprocessing=SimpleImputerDF(),
)
# Ties model, cross-validator and sample together; this is the object
# handed to the simulator further down
predictor = PredictorFitCV(
    model=model,
    cv=cv,
    sample=sample,
)

## Partitioning

In [None]:
# Feature to simulate, and a partitioning of its values in the sample
# (all partitioning options left at their defaults)
feature = "GrLivArea"
partition = ContinuousRangePartitioning(
    values=sample.features[feature],
)

In [None]:
# Show what `partitions()` returns for the default partitioning (cell output)
partition.partitions()

In [None]:
# Show what `frequencies()` returns; presumably one count per partition — TODO confirm
partition.frequencies()

In [None]:
# Length of the partitioning object; presumably the number of partitions — TODO confirm
len(partition)

In [None]:
# Flag exposed by the partitioning; presumably False for a continuous range — TODO confirm
partition.is_categorical

One can easily control the maximum number of partitions and the bounds of the partitioned range:

In [None]:
# Same feature as before, now with an explicit cap on the number of partitions
# and explicit lower/upper bounds
new_partition = ContinuousRangePartitioning(
    values=sample.features[feature],
    max_partitions=20,
    lower_bound=1000,
    upper_bound=3000,
)

In [None]:
# Show what `partitions()` returns for the bounded partitioning (cell output)
new_partition.partitions()

## Building the simulation

In [None]:
# Build the simulator on top of the `predictor` created earlier in the notebook
simulator = UnivariateSimulator(model_fit=predictor)

In [None]:
# Simulate the selected feature over the default partitioning
univariate_simulation = simulator.simulate_feature(
    feature_name=feature,
    partitioning=partition,
)

## Drawing

In [None]:
# One 10x10 inch figure holding a single axes for the simulation plot
fig, ax = plt.subplots(figsize=(10, 10))
style = SimulationPlotStyle(ax)
drawer = SimulationDrawer(
    title=f"{feature} Simulation",
    simulation=univariate_simulation,
    style=style,
)
drawer.draw()

## Styling

In [None]:
# Draw the simulation on a fresh 10x10 inch figure with a single axes
fig, ax = plt.subplots(figsize=(10, 10))
style = SimulationPlotStyle(ax)
drawer = SimulationDrawer(
    title=f"{feature} Simulation",
    simulation=univariate_simulation,
    style=style,
)
drawer.draw()

# After drawing, the axes exposed by the style object can be customised
# with the regular matplotlib API: override the title, recolour the x tick labels
style.ax.set_title("Simulation of the Ground living area")
style.ax.tick_params(axis='x', labelcolor='green')

## Categorical simulation

First, we convert the string columns into pandas categorical columns so that they can be used with LightGBM models.

In [None]:
# Boolean mask over the columns of `df`: True where the column has dtype `object`
mask_categorical = df.dtypes == object
categorical_columns = mask_categorical[mask_categorical].index
# Convert with `astype` on the whole frame, which returns a new frame carrying the
# requested dtypes and leaves `df` untouched. Assigning the converted columns back
# through `.loc[:, columns] = ...` is not reliable: on pandas >= 2.0 the values are
# written into the existing object columns in place, so their dtype stays `object`.
df_with_categorical = df.astype({column: 'category' for column in categorical_columns})

In [None]:
# Rebuild the sample from the frame that includes the categorical columns
sample = Sample(
    observations=df_with_categorical,
    target_name=TARGET,
)
# LightGBM regressor with a fixed random state, used without any preprocessing step
lgbm_model = ModelPipelineDF(
    predictor=LGBMRegressorDF(random_state=0),
    preprocessing=None,
)
# Cross-validator configured with 3 splits and a test ratio of one third
circular_cv = CircularCrossValidator(
    test_ratio=1/3,
    num_splits=3,
)
predictor = PredictorFitCV(
    model=lgbm_model,
    cv=circular_cv,
    sample=sample,
)

In [None]:
# Categorical feature to simulate, partitioned with at most 3 partitions
feature = "HouseStyle"
partition = CategoryPartitioning(
    values=sample.features[feature],
    max_partitions=3,
)

We can run the simulation as in the continuous case:

In [None]:
# New simulator on top of the LightGBM-based predictor, run on the categorical feature
simulator = UnivariateSimulator(model_fit=predictor)
univariate_simulation = simulator.simulate_feature(
    feature_name=feature,
    partitioning=partition,
)

In [None]:
# Draw the categorical simulation on a 10x10 inch figure with a single axes
fig, ax = plt.subplots(figsize=(10, 10))
style = SimulationPlotStyle(ax)
drawer = SimulationDrawer(
    title=f"{feature} Simulation",
    simulation=univariate_simulation,
    style=style,
)
drawer.draw()