# Front matter

In [None]:
# When True, the model ranking and inspection cells below skip their computation
# and the ModelInspector is loaded from the pickle file MI_PKL instead.
CACHING = True
# When True, the experimental plotting branches further down are executed too.
EXPERIMENTAL = False

## Set path

In [None]:
# entry added to the python path; it is relative, so it is resolved against the
# working directory that set_paths() establishes
PATH_YIELD_ENGINE = 'src'


def set_paths() -> None:
    """
    Set the working directory and python path when started from within PyCharm.

    On the first call the working directory is moved to the parent of the current
    directory. The module-level global ``cwd`` records that this has happened, so
    running the cell again does not climb up a further level.

    ``PATH_YIELD_ENGINE`` is then inserted at the front of ``sys.path`` unless it
    is already listed there. The printed status reports which of the two cases
    applied, and always names ``PATH_YIELD_ENGINE`` itself rather than whatever
    entry happens to be first in ``sys.path``.
    """
    import os
    import sys

    if 'cwd' not in globals():
        # noinspection PyGlobalUndefined
        global cwd
        cwd = os.path.join(os.getcwd(), os.pardir)
        os.chdir(cwd)

    print(f"working dir is '{os.getcwd()}'")

    if PATH_YIELD_ENGINE not in sys.path:
        sys.path.insert(0, PATH_YIELD_ENGINE)
        print(f"added `{PATH_YIELD_ENGINE}` to python paths")
    else:
        # nothing was inserted, so do not claim that anything was added
        print(f"`{PATH_YIELD_ENGINE}` is already on the python path")


set_paths()

## Imports

In [None]:
import logging
import os
import pickle

import numpy as np
import pandas as pd
from lightgbm.sklearn import LGBMRegressor
from matplotlib import cm
from matplotlib.pyplot import figure

import tests
from tests.model import make_simple_transformer
from tests.paths import TEST_DATA_CSV
from yieldengine import Sample
from yieldengine.dendrogram import DendrogramDrawer
from yieldengine.dendrogram.style import LineStyle
from yieldengine.model.inspection import ModelInspector
from yieldengine.model.selection import Model, ModelGrid, ModelRanker
from yieldengine.model.validation import CircularCrossValidator

In [None]:
%matplotlib inline

In [None]:
# directory for temporary artefacts (relative path)
PATH_TMP = 'tmp'
# pickle file used further down to store and reload the ModelInspector
MI_PKL = os.path.join(PATH_TMP, 'model_inspector.pkl')

In [None]:
# configure the root logger with level DEBUG, i.e. the most verbose standard level
logging.basicConfig(level=logging.DEBUG)

## Load the data file

In [None]:
# csv parsing options are taken from the "inputfile" section of the test config
inputfile_config = tests.read_test_config(section="inputfile")

# read the test data, discard the "Date" and "Batch Id" columns, replace values
# of +/- infinite with n/a, then drop every column that holds n/a values only
batch_df = (
    pd.read_csv(
        filepath_or_buffer=TEST_DATA_CSV,
        delimiter=inputfile_config["delimiter"],
        header=inputfile_config["header"],
        decimal=inputfile_config["decimal"],
    )
    .drop(columns=["Date", "Batch Id"])
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1, how="all")
)

batch_df.head()

In [None]:
# wrap the batch data in a Sample, naming "Yield" as the target
sample = Sample(
    observations=batch_df,
    target_name="Yield",
)
sample

In [None]:
# circular cross validator: 10 folds, test ratio of 0.2
circular_cv = CircularCrossValidator(
    num_folds=10,
    test_ratio=0.2,
)
circular_cv

In [None]:
# build the transformer step from the sample's features: the numerical ones are
# passed as median-impute columns, the object-typed ones as one-hot-encode columns
preprocessor = make_simple_transformer(
    impute_median_columns=sample.features_by_type(Sample.DTYPE_NUMERICAL).columns,
    one_hot_encode_columns=sample.features_by_type(Sample.DTYPE_OBJECT).columns,
)
preprocessor

In [None]:
# parameter grid for an LGBMRegressor that sits behind the preprocessing step
lgbm = ModelGrid(
    model=Model(preprocessing=preprocessor, estimator=LGBMRegressor()),
    estimator_parameters=dict(
        max_depth=[5, 10],
        min_split_gain=[0.1, 0.2],
        num_leaves=[50, 100, 200],
        random_state=[42],
    ),
)
lgbm

In [None]:
if not CACHING:
    # define a ModelRanker over the lgbm grid, using the circular cross
    # validator and "r2" scoring
    model_ranker: ModelRanker = ModelRanker(
        grids=[lgbm], cv=circular_cv, scoring="r2"
    )

    # run the ModelRanker to retrieve a ranking
    model_ranking = model_ranker.run(sample=sample)
else:
    # the ranking is skipped when CACHING is enabled
    model_ranking = None

# The notebook only renders the last top-level expression of a cell; an
# expression nested inside the `if` above would be silently discarded.
# When the ranking was skipped the value is None, which renders nothing.
model_ranking

In [None]:
# retrieve the best model, i.e. the first entry of the ranking; there is no
# ranking to pick from when CACHING is enabled, so the result is None then
best_model = model_ranking[0] if not CACHING else None

# The notebook only renders the last top-level expression of a cell, so the
# value is shown here rather than from inside a conditional branch.
# None renders nothing.
best_model

In [None]:
if not CACHING:
    # define a ModelInspector
    mi = ModelInspector(
        model=best_model.model,
        cv=circular_cv,
        sample=sample
    )

    # Called before pickling; presumably this lets the inspector compute and
    # retain its results so that they end up in the pickle -- TODO confirm.
    mi.feature_dependency_matrix()

    # open(..., 'wb') does not create missing directories, so make sure the
    # target directory exists before writing the cache file
    os.makedirs(os.path.dirname(MI_PKL) or os.curdir, exist_ok=True)

    with open(MI_PKL, 'wb') as f:
        pickle.dump(mi, f)
else:
    # fail with an actionable message when there is no cache to load yet
    if not os.path.exists(MI_PKL):
        raise FileNotFoundError(
            f"no cached ModelInspector found at '{MI_PKL}'; "
            "set CACHING = False and re-run the notebook to create it"
        )

    # NOTE: unpickling can execute arbitrary code -- only load a cache file
    # that was written by this notebook, never one from an untrusted source.
    with open(MI_PKL, 'rb') as f:
        mi = pickle.load(f)

mi

In [None]:
# retrieve the shap matrix and preview its head
mi.shap_matrix().head()

In [None]:
# retrieve the feature dependency matrix and preview its head
mi.feature_dependency_matrix().head()

In [None]:
# only executed when the EXPERIMENTAL flag is enabled
if EXPERIMENTAL:
    mi.plot_feature_dendrogram_scipy()

In [None]:
# `cm.get_cmap` was deprecated in matplotlib 3.7 and removed in 3.9. The
# built-in colour maps are also exposed as attributes of `matplotlib.cm`, which
# works on old and new versions alike. "plasma" already has 256 entries, so the
# former `lut=256` resampling is not needed.
plasma = cm.plasma

In [None]:
# tall figure with a single axes that fills it
fig = figure(figsize=(8, 16))
ax = fig.add_subplot(1, 1, 1)

# draw the linkage of dependent features as a dendrogram onto that axes
drawer = DendrogramDrawer(
    title='Feature clusters',
    linkage=mi.cluster_dependent_features(),
    style=LineStyle(ax=ax),
)
drawer.draw()

if EXPERIMENTAL:
    # one-column array of values descending from 1 in steps of 0.01, rendered
    # with the plasma colour map in the right-hand half of the figure
    ax = fig.add_subplot(1, 2, 2)
    a = np.outer(np.arange(1, 0, -0.01), 1)
    ax.imshow(a, aspect='auto', cmap=plasma)