In [None]:
# this cell's metadata contains
# "nbsphinx": "hidden" so it is hidden by nbsphinx
# source directory made importable by set_paths(); being relative, it is
# resolved against the working directory that set_paths() switches to
PATH_YIELD_ENGINE = 'src'
def set_paths() -> None:
    """
    Move the kernel three directory levels up and make ``PATH_YIELD_ENGINE``
    importable.

    The directory change happens only on the first call: the resulting
    directory is remembered in the module-level global ``cwd``, and later
    calls (e.g. when the cell is re-run) skip the change so the working
    directory does not keep climbing. ``PATH_YIELD_ENGINE`` is inserted at the
    front of ``sys.path`` unless it is already listed there.
    """
    import sys
    import os

    global cwd
    if 'cwd' not in globals():
        # change directory first and record the result afterwards: `cwd` then
        # holds a normalised path and is only set if the change succeeded
        os.chdir(os.path.join(os.pardir, os.pardir, os.pardir))
        cwd = os.getcwd()
    print(f"working dir is '{os.getcwd()}'")
    if PATH_YIELD_ENGINE not in sys.path:
        sys.path.insert(0, PATH_YIELD_ENGINE)
        print(f"added `{PATH_YIELD_ENGINE}` to python paths")
    else:
        # nothing was inserted on this call, so do not report an addition
        print(f"`{PATH_YIELD_ENGINE}` is already in python paths")
# run the path setup as soon as this cell executes
set_paths()

# Simulation

In [None]:
import pandas as pd
# todo remove RandomForestRegressor when possible
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

from gamma import Sample
from gamma.viz.dendrogram import DendrogramDrawer, FeatMapStyle, LineStyle
from gamma.sklearndf.pipeline import PipelineDF
from gamma.sklearndf.transformation import SimpleImputerDF, MissingIndicatorDF
from gamma.sklearndf.transformation.extra import BorutaDF
from gamma.sklearndf.regression import RandomForestRegressorDF, LGBMRegressorDF
from gamma.model.inspection import ModelInspector
from gamma.model.prediction import PredictorFitCV
from gamma.model.selection import ModelPipelineDF, ModelGrid, ModelRanker, summary_report
from gamma.model.validation import CircularCrossValidator
from gamma.yieldengine.simulation import UnivariateSimulator
from gamma.yieldengine.partition import ContinuousRangePartitioning, CategoryPartitioning
from gamma.yieldengine.viz import SimulationDrawer, SimulationPlotStyle

In [None]:
# name of the target column (handed to Sample as `target_name` further down)
TARGET = "SalePrice"

# read the Ames housing training file, discard the identifier and sale-date
# columns, then drop every column that contains at least one missing value
df = (
    pd.read_csv('data/ames-housing-dataset/train.csv')
    .drop(columns=['Id', 'YrSold', 'MoSold'])
    .dropna(axis='columns', how='any')
)

## Partitioning
**TODO:** write this section.

## Building the simulation
**TODO:** write this section.

## Drawing
**TODO:** write this section.

## Styling
**TODO:** write this section.

## Categorical simulation

In [None]:
# labels of all object-dtype columns
categorical_columns = df.select_dtypes(include=object).columns

# Convert those columns to the pandas `category` dtype.
# `astype` returns a new frame with the requested dtypes; assigning through
# `df.loc[:, cols] = ...` instead writes the values into the existing object
# columns on recent pandas versions, leaving the dtype unchanged.
df = df.astype({column: 'category' for column in categorical_columns})

In [None]:
# bundle the prepared frame with the name of its target column
sample = Sample(
    observations=df,
    target_name=TARGET,
)

In [None]:
# pipeline around a LightGBM regressor with a fixed random_state and no
# preprocessing step
lgbm_pipeline = ModelPipelineDF(
    preprocessing=None,
    predictor=LGBMRegressorDF(random_state=0),
)

# a single grid: the pipeline above, with two candidate learning rates
grids = [
    ModelGrid(
        pipeline=lgbm_pipeline,
        predictor_parameters={"learning_rate": [0.1, 0.2]},
    )
]

In [None]:
# cross-validator reused below when the top model is re-fitted
circular_cv = CircularCrossValidator(
    num_splits=3,
    test_ratio=1/3,
)

# run every grid against the sample and print the summary of the ranking
# NOTE(review): n_jobs=-3 is presumably forwarded to joblib, where a negative
# value leaves some CPUs free -- confirm against ModelRanker.run
ranker = ModelRanker(cv=circular_cv, grids=grids)
ranking = ranker.run(sample, n_jobs=-3)
print(summary_report(ranking))

In [None]:
# first entry of the ranking
# NOTE(review): assumes the ranking is ordered best-first -- confirm
top_model = ranking[0]

# wrap that entry's model with the same cross-validator and sample, then
# obtain the model fit through an inspector
predictor = PredictorFitCV(
    model=top_model.model,
    cv=circular_cv,
    sample=sample,
)
inspector = ModelInspector(predictor)
model_fit = inspector.model_fit

# NOTE(review): `sim` is not referenced by any other cell visible here; the
# simulation cell constructs its own `simulator` from the same model fit
sim = UnivariateSimulator(model_fit=model_fit)

In [None]:
# feature to simulate
feature = "HouseStyle"

# partitioning built from this feature's column in the fitted sample
partition = CategoryPartitioning(
    values=model_fit.sample.features[feature],
)

# simulate the chosen feature over that partitioning
simulator = UnivariateSimulator(model_fit=model_fit)
univariate_simulation = simulator.simulate_feature(
    feature_name=feature,
    partitioning=partition,
)

In [None]:
# one axes filling a 10x10 inch figure
ax = plt.figure(figsize=(10, 10)).add_subplot(1, 1, 1)

# plot style bound to that axes
style = SimulationPlotStyle(ax)

# draw the simulation result, titled after the simulated feature
drawer = SimulationDrawer(
    title=f"{feature} Simulation",
    simulation=univariate_simulation,
    style=style,
)
drawer.draw()