# Front matter

## Set path

In [None]:
def set_paths() -> None:
    """
    Prepare the notebook environment for importing the project sources.

    On the first call, change the working directory to the parent of the
    current one and remember it in the module-level global ``cwd``. Later
    calls (e.g. when the cell is re-run) leave the working directory alone.
    Afterwards make sure the relative path ``src/`` is on ``sys.path``.
    """
    import os
    import sys

    global cwd

    # the global `cwd` doubles as the "already initialised" flag, so that
    # re-running this cell does not climb one directory further up each time
    if 'cwd' not in globals():
        # normalise the path so the stored value is absolute and carries no
        # trailing ".." segment
        cwd = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
        os.chdir(cwd)

    print(f"working dir is '{os.getcwd()}'")

    PATH_YIELD_ENGINE = 'src/'
    if PATH_YIELD_ENGINE not in sys.path:
        sys.path.insert(0, PATH_YIELD_ENGINE)
        print(f"added `{PATH_YIELD_ENGINE}` to python paths")
    else:
        # nothing was inserted, so do not claim otherwise (and do not report
        # whatever unrelated entry happens to sit at sys.path[0])
        print(f"`{PATH_YIELD_ENGINE}` is already in python paths")


set_paths()

## Imports

In [None]:
import numpy as np
import pandas as pd
from lightgbm.sklearn import LGBMRegressor

import tests
from tests.model import make_simple_transformer
from tests.paths import TEST_DATA_CSV
from yieldengine import Sample
from yieldengine.model.inspection import ModelInspector
from yieldengine.model.selection import Model, ModelGrid, ModelRanker
from yieldengine.model.validation import CircularCrossValidator

## Load the data file

In [None]:
# fetch the "inputfile" section of the test configuration
inputfile_config = tests.read_test_config(section="inputfile")

# read the test data, forwarding the delimiter, header and decimal settings
# from the configuration to pandas
batch_file = pd.read_csv(
    filepath_or_buffer=TEST_DATA_CSV,
    **{
        option: inputfile_config[option]
        for option in ("delimiter", "header", "decimal")
    },
)

In [None]:
# discard the "Date" and "Batch Id" columns, turn +/- infinity into n/a and
# finally remove every column that consists of n/a values only
batch_file = (
    batch_file.drop(columns=["Date", "Batch Id"])
    .replace(to_replace=[np.inf, -np.inf], value=np.nan)
    .dropna(axis="columns", how="all")
)
batch_file.head()

In [None]:
# wrap the cleaned batch_file in a Sample, with "Yield" as the target name
sample = Sample(
    observations=batch_file,
    target_name="Yield",
)
sample

In [None]:
# set up the circular cross validator: 10 folds, test ratio of 0.2
circular_cv = CircularCrossValidator(
    test_ratio=0.2,
    num_folds=10,
)

circular_cv

In [None]:
# pick the feature columns of the sample by dtype ...
numerical_columns = sample.features_by_type(Sample.DTYPE_NUMERICAL).columns
object_columns = sample.features_by_type(Sample.DTYPE_OBJECT).columns

# ... and build the transformer step from them: numerical columns are passed
# as the median-impute columns, object columns as the one-hot-encode columns
preprocessor = make_simple_transformer(
    impute_median_columns=numerical_columns,
    one_hot_encode_columns=object_columns,
)
preprocessor

In [None]:
# combine the preprocessing step and an LGBMRegressor into a Model, and pair
# it with the estimator parameter values to be explored
lgbm = ModelGrid(
    model=Model(preprocessing=preprocessor, estimator=LGBMRegressor()),
    estimator_parameters=dict(
        max_depth=[5, 10],
        min_split_gain=[0.1, 0.2],
        num_leaves=[50, 100, 200],
        random_state=[42],
    ),
)
lgbm

In [None]:
# set up a ModelRanker for the single LGBM grid, using the circular cross
# validator and "r2" scoring
model_ranker: ModelRanker = ModelRanker(grids=[lgbm], cv=circular_cv, scoring="r2")

# run it on the sample to obtain the ranking
model_ranking = model_ranker.run(sample=sample)
model_ranking

In [None]:
# retrieve the best model
# NOTE(review): this reads index 1 of the ranking. If the ranking is a 0-based
# sequence ordered best-first, index 1 is the runner-up and the best model
# sits at index 0 -- confirm against what ModelRanker.run returns.
best_model = model_ranking[1]
best_model

In [None]:
# create a ModelInspector for the selected model, reusing the circular cross
# validator and the sample from above
mi = ModelInspector(model=best_model.model, cv=circular_cv, sample=sample)
mi

In [None]:
# retrieve the shap matrix from the inspector and preview it via .head()
mi.shap_matrix().head()

In [None]:
# retrieve the feature dependency matrix from the inspector and preview it
# via .head()
mi.feature_dependency_matrix().head()

In [None]:
# plot the feature dendrogram through the inspector
mi.plot_feature_dendrogram_scipy()