# Front matter

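*Important:* On the first run, set `CACHING` to `False` so that the model is fitted and its pickle file is written; with `CACHING = True` the notebook skips fitting and loads that pickle file instead.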

In [None]:
from gamma.sklearndf.regression import LGBMRegressorDF
from gamma.viz.simulation import SimulationDrawer, SimulationMatplotStyle
from gamma.yieldengine.partition import  ContinuousRangePartitioning
from test import read_test_config
from test.model import make_simple_transformer
# When True, the feature-selection, ranking and fitting cells only assign None
# and the fitted predictor is loaded from its pickle file instead of being
# computed; set to False on the first run so that the pickle file is created.
CACHING = True
# NOTE(review): not referenced anywhere in the visible part of this notebook
EXPERIMENTAL = False

## Set path

In [None]:
# path that set_paths() puts at the front of sys.path; being relative, it is
# resolved against the working directory in effect at import time
PATH_YIELD_ENGINE = 'src'


def set_paths() -> None:
    """
    Set the working directory and python path when started from within PyCharm.

    On the first call the working directory is changed to the parent of the
    current one and remembered in the module-level global ``cwd``; later calls
    (e.g. when the cell is re-run) leave the working directory untouched.
    ``PATH_YIELD_ENGINE`` is then inserted at the front of ``sys.path`` unless
    it is already listed there. Both outcomes are reported via ``print``.
    """
    import os
    import sys

    # the presence of the global ``cwd`` marks that the directory change has
    # already happened, so repeated calls do not climb further up the tree
    if 'cwd' not in globals():
        # noinspection PyGlobalUndefined
        global cwd
        # normalise so that ``cwd`` holds the parent path without a trailing '..'
        cwd = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
        os.chdir(cwd)

    print(f"working dir is '{os.getcwd()}'")

    if PATH_YIELD_ENGINE in sys.path:
        # nothing is inserted here, so do not report sys.path[0] as "added"
        print(f"`{PATH_YIELD_ENGINE}` is already in python paths")
    else:
        sys.path.insert(0, PATH_YIELD_ENGINE)
        print(f"added `{sys.path[0]}` to python paths")

# apply the working-directory and python-path setup implemented by set_paths
set_paths()

## Imports

In [None]:
import logging
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

from test.paths import TEST_DATA_CSV
from gamma import Sample
from gamma.sklearndf.transformation import FunctionTransformerDF
from gamma.sklearndf.transformation.extra import BorutaDF
from gamma.sklearndf.transformation.extra import OutlierRemoverDF
from gamma.sklearndf.transformation import ColumnTransformerDF
from gamma.model.prediction import RegressorFitCV
from gamma.sklearndf.pipeline import PipelineDF
from gamma.model.selection import ModelPipelineDF, ModelGrid, ModelRanker
from gamma.model.validation import CircularCrossValidator
from gamma.yieldengine.simulation import UnivariateSimulator

In [None]:
%matplotlib inline

In [None]:
# directory (relative to the working directory) that holds the pickle files
PATH_TMP = 'tmp'
# NOTE(review): MI_PKL is not used in the visible part of this notebook
MI_PKL = os.path.join(PATH_TMP, 'model_inspector.pkl')
# pickle file the fitted predictor is written to and read back from
MP_PKL = os.path.join(PATH_TMP, 'model_predictor.pkl')

In [None]:
# request DEBUG level for the root logger ...
logging.basicConfig(level=logging.DEBUG)
# ... but restrict the 'matplotlib' logger to WARNING and above
logging.getLogger('matplotlib').setLevel(logging.WARNING)

## Load the data file

In [None]:
# CSV parsing options come from the "inputfile" section of the test configuration
inputfile_config = read_test_config(section="inputfile")

# Load the test data in one chain:
#   1. read the CSV with the configured delimiter, header row and decimal mark
#   2. discard the "Date" and "Batch Id" columns
#   3. turn +/- infinity into n/a, then drop every column that is n/a throughout
batch_df = (
    pd.read_csv(
        filepath_or_buffer=TEST_DATA_CSV,
        delimiter=inputfile_config["delimiter"],
        header=inputfile_config["header"],
        decimal=inputfile_config["decimal"],
    )
    .drop(columns=["Date", "Batch Id"])
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1, how="all")
)
batch_df.head()

In [None]:
# name of the column of batch_df that is passed to Sample as the target
TARGET = "Yield"
# define a Sample based on the test batch_file
sample = Sample(observations=batch_df, target_name=TARGET)
# display the sample as the cell output
sample

In [None]:
# define the circular cross validator with 10 splits and a test ratio of 0.2
# NOTE(review): presumably each split holds out 20% of the observations for
# testing - confirm against CircularCrossValidator
circular_cv = CircularCrossValidator(test_ratio=0.2, num_splits=10)

# display the cross validator as the cell output
circular_cv

In [None]:
# Transformer specs of the form (name, transformer, columns): numerical
# features are handed to the outlier remover (IQR multiple of 3), object-typed
# features to a FunctionTransformerDF created without a function
# (presumably an identity pass-through - confirm against FunctionTransformerDF).
outlier_transformers = [
    (step_name, transformer, sample.features_by_type(dtype).columns)
    for step_name, transformer, dtype in (
        ('outlier', OutlierRemoverDF(iqr_multiple=3), Sample.DTYPE_NUMERICAL),
        ('rest', FunctionTransformerDF(validate=False), Sample.DTYPE_OBJECT),
    )
]
# column transformer that applies each spec to its own set of columns
outlier_step = ColumnTransformerDF(transformers=outlier_transformers)

In [None]:
if not CACHING:
    # feature selection pipeline: outlier removal, then median imputation and
    # one-hot encoding, then Boruta on top of a random forest regressor
    boruta_selector = PipelineDF(steps=[
        (
            'outlier_removal',
            outlier_step
        ),
        (
            'preprocess',
            make_simple_transformer(
                impute_median_columns=sample.features_by_type(Sample.DTYPE_NUMERICAL).columns,
                one_hot_encode_columns=sample.features_by_type(Sample.DTYPE_OBJECT).columns,
            )
        ),
        (
            'boruta',
            BorutaDF(
                estimator=RandomForestRegressor(n_jobs=4),
                max_iter=100,
                n_estimators='auto',
                verbose=2,
                random_state=42
            )
        )
    ])

    # fit the pipeline on the full sample; the transformed frame is kept in
    # `feature_selection`
    feature_selection = boruta_selector.fit_transform(
        sample.features,
        sample.target
    )

    # NOTE(review): presumably the names of the original (pre-encoding) columns
    # that survived the selection - confirm against PipelineDF.columns_original
    selected_features = boruta_selector.columns_original

    # restrict the sample to the selected features for the following cells
    sample_post_boruta = sample.select_features(selected_features)
else:
    # nothing is selected in caching mode
    sample_post_boruta = None

# Jupyter only renders the last top-level expression of a cell, so the display
# expression must not be nested inside the `if`; `selected_features` exists
# only when the selection was run, hence the conditional expression.
# noinspection PyStatementEffect
selected_features.to_frame() if not CACHING else None

In [None]:
# define a transformer step based on the Boruta-selected sample: numerical
# features are passed as median-imputation columns, object-typed features as
# one-hot-encoding columns
if not CACHING:
    preprocessor = make_simple_transformer(
        impute_median_columns=sample_post_boruta.features_by_type(Sample.DTYPE_NUMERICAL).columns,
        one_hot_encode_columns=sample_post_boruta.features_by_type(Sample.DTYPE_OBJECT).columns,
    )
else:
    # no preprocessing step is built in caching mode
    preprocessor = None

# Jupyter only renders the last top-level expression of a cell, so the display
# expression sits after the conditional rather than inside it.
# noinspection PyStatementEffect
preprocessor

In [None]:
if not CACHING:
    # define a ModelGrid: a ModelPipelineDF (preprocessing + LGBM regressor)
    # together with the predictor parameter values to search over
    lgbm = ModelGrid(
        pipeline=ModelPipelineDF(
            preprocessing=preprocessor, predictor=LGBMRegressorDF()
        ),
        predictor_parameters={
            "max_depth": [5, 10],
            "min_split_gain": [0.1, 0.2],
            "num_leaves": [50, 100, 200],
            "random_state": [42],
        },
    )
else:
    # no grid is built in caching mode
    lgbm = None

# Jupyter only renders the last top-level expression of a cell, so the display
# expression sits after the conditional rather than inside it.
# noinspection PyStatementEffect
lgbm

In [None]:
if CACHING:
    # no ranking is computed in caching mode
    model_ranking = None
else:
    # define a ModelRanker over the LGBM grid, scored by r2 under the
    # circular cross validator
    model_ranker: ModelRanker = ModelRanker(grids=[lgbm], cv=circular_cv, scoring="r2")

    # run the ModelRanker on the Boruta-selected sample to retrieve a ranking
    model_ranking = model_ranker.run(sample=sample_post_boruta)

# display the ranking (or nothing, when it is None) as the cell output
# noinspection PyStatementEffect
model_ranking

In [None]:
if not CACHING:
    # retrieve the best model
    # NOTE(review): takes the first entry, so this presumes the ranking is
    # ordered best-first - confirm against ModelRanker.run
    best_model = model_ranking[0]
else:
    # no model is selected in caching mode
    best_model = None

# Jupyter only renders the last top-level expression of a cell, so the display
# expression sits after the conditional rather than inside it.
# noinspection PyStatementEffect
best_model

In [None]:
if not CACHING:
    # define a RegressorFitCV from the best model, the cross validator and the sample
    # NOTE(review): this passes the full `sample`, whereas the model was ranked
    # on `sample_post_boruta` - confirm that this is intended
    predictor_fit = RegressorFitCV(
        model=best_model.model,
        cv=circular_cv,
        sample=sample
    )

    # MP_PKL lives in PATH_TMP; create that directory first, since open() does
    # not create missing parent directories
    os.makedirs(PATH_TMP, exist_ok=True)
    with open(MP_PKL, 'wb') as f:
        pickle.dump(predictor_fit, f)
else:
    # NOTE: unpickling can execute arbitrary code - only load a cache file that
    # was written by this notebook
    try:
        with open(MP_PKL, 'rb') as f:
            predictor_fit = pickle.load(f)
    except FileNotFoundError as err:
        # same exception type as before, but telling the user how to recover
        raise FileNotFoundError(
            f"no cached predictor fit found at '{MP_PKL}'; "
            f"set CACHING = False and re-run the notebook to create it"
        ) from err

# display the predictor fit as the cell output
predictor_fit

# Simulation Example starts here

In [None]:
# simulator operating on the fitted predictor
# NOTE(review): the percentile arguments presumably bound the range of feature
# values considered in a simulation - confirm against UnivariateSimulator
sim = UnivariateSimulator(model_fit=predictor_fit, min_percentile=10, max_percentile=90)

In [None]:
# name of the feature that is varied in the simulation
parameterized_feature = "Step4-6 RawMat Vendor Compound08 Purity (#)"

# simulate the feature over a partitioning built from the values this feature
# takes in the sample
yield_change = sim.simulate_feature(
    feature_name=parameterized_feature, 
    partitioning=ContinuousRangePartitioning(
        sample.features.loc[:, parameterized_feature]
    )
)

# display the median uplift of the simulation result as the cell output
yield_change.median_uplift

In [None]:
# draw the simulation result with the matplotlib style, titled with the name
# of the simulated feature
SimulationDrawer(
    title=parameterized_feature,
    simulation=yield_change,
    style=SimulationMatplotStyle()
).draw()