# Front matter

## Set path

In [None]:
def set_paths() -> None:
    """
    Prepare the notebook's working directory and import path.

    On the first call, change the working directory to the parent of the
    current one and remember it in the module-level global ``cwd``. The
    presence of that global makes later calls (e.g. re-running the cell) skip
    the directory change. Afterwards, make sure the relative path ``src/`` is
    on ``sys.path`` and print what was done.
    """
    import os
    import sys

    global cwd

    # `cwd` doubles as an "already ran" marker: without it, every re-run of
    # the cell would climb one directory further up.
    if 'cwd' not in globals():
        # normalise, so the remembered path carries no trailing `..` component
        cwd = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
        os.chdir(cwd)

    print(f"working dir is '{os.getcwd()}'")

    PATH_YIELD_ENGINE = 'src/'
    if PATH_YIELD_ENGINE not in sys.path:
        sys.path.insert(0, PATH_YIELD_ENGINE)
        print(f"added `{PATH_YIELD_ENGINE}` to python paths")
    else:
        # Report the entry itself rather than sys.path[0], which may be an
        # unrelated path when `src/` was registered earlier.
        print(f"`{PATH_YIELD_ENGINE}` already in python paths")

# Run the path setup before the imports below, so that packages under `src/`
# (presumably where `yieldengine` lives — confirm) can be imported.
set_paths()

## Imports

In [None]:
from typing import *

import numpy as np
import pandas as pd
from lightgbm.sklearn import LGBMRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor

from tests.model import make_simple_transformer
from yieldengine import Sample
from yieldengine.model.selection import (
    Model, ModelEvaluation, ModelGrid, ModelRanker, summary_report,
)
from yieldengine.model.validation import CircularCrossValidator

In [None]:
# Load the test data set as a DataFrame. The CSV dialect options (delimiter,
# header, decimal) are read from the "inputfile" section of the test config.
import tests
from tests.paths import TEST_DATA_CSV

inputfile_config = tests.read_test_config(section="inputfile")

batch_file = pd.read_csv(
    TEST_DATA_CSV,
    **{
        option: inputfile_config[option]
        for option in ("delimiter", "header", "decimal")
    },
)

In [None]:
# Clean the raw data in one pass:
#   1. remove the "Date" and "Batch Id" columns,
#   2. turn +/- infinity into NaN,
#   3. discard every column that consists of NaN values only.
batch_file = (
    batch_file.drop(["Date", "Batch Id"], axis="columns")
    .replace(to_replace=[np.inf, -np.inf], value=np.nan)
    .dropna(axis="columns", how="all")
)
# show the first rows as the cell output
batch_file.head()

In [None]:
# Define a Sample based on the test batch_file, naming "Yield" as the target.
# The bare `sample` expression displays the object as the cell output.
sample = Sample(observations=batch_file, target_name="Yield")
sample

In [None]:
# Define the circular cross validator with 10 folds and test_ratio=0.2
# (presumably 20% of the observations are held out per fold — confirm against
# the CircularCrossValidator documentation).
circular_cv = CircularCrossValidator(test_ratio=0.2, num_folds=10)

# display the validator as the cell output
circular_cv

In [None]:
# Define how features should be preprocessed.
# Variant 1: only the numerical feature columns are passed, as the columns to
# impute with the median; no other columns are configured.
impute_only_preprocessor = make_simple_transformer(
    impute_median_columns=sample.features_by_type(Sample.DTYPE_NUMERICAL).columns,
    )

# Variant 2: numerical columns as above, plus the object-typed feature columns
# passed as the columns to one-hot encode.
full_preprocessor = make_simple_transformer(
    impute_median_columns=sample.features_by_type(Sample.DTYPE_NUMERICAL).columns,
    one_hot_encode_columns=sample.features_by_type(Sample.DTYPE_OBJECT).columns,
)

# display the transformers configured in the full preprocessor
full_preprocessor.base_transformer.transformers

In [None]:
# Model grids handed to the ranker: a single LightGBM regressor, preprocessed
# with the impute-only transformer. Each parameter maps to the list of
# candidate values to try.
grids = [
    ModelGrid(
        Model(preprocessing=impute_only_preprocessor, estimator=LGBMRegressor()),
        estimator_parameters=dict(
            n_estimators=[100],
            learning_rate=[0.1],
            max_depth=[8, 10],
            min_split_gain=[0.1, 0.2],
            num_leaves=[8, 16, 32],
            feature_fraction=[0.8],
            # "early_stopping_round": [0, 4],
        ),
    ),
]

# Further model grids kept for reference. They are NOT part of `grids`, so the
# ranker does not evaluate them. All of them use the full preprocessor.
grids_unused = [
    ModelGrid(
        Model(
            estimator=RandomForestRegressor(),
            preprocessing=full_preprocessor,
        ),
        estimator_parameters={"n_estimators": (100, 400)},
    ),
    ModelGrid(
        Model(
            estimator=AdaBoostRegressor(),
            preprocessing=full_preprocessor,
        ),
        estimator_parameters={"n_estimators": (100, 200)},
    ),
    ModelGrid(
        Model(
            estimator=DecisionTreeRegressor(),
            preprocessing=full_preprocessor,
        ),
        # `max_depth` must be an integer (or None); only `max_features` accepts
        # fractions. The depths mirror the ExtraTreeRegressor grid below.
        estimator_parameters={
            "max_depth": (5, 10, 12),
            "max_features": (0.5, 1.0),
        },
    ),
    ModelGrid(
        Model(
            estimator=ExtraTreeRegressor(),
            preprocessing=full_preprocessor,
        ),
        estimator_parameters={"max_depth": (5, 10, 12)},
    ),
    ModelGrid(
        Model(
            estimator=SVR(),
            preprocessing=full_preprocessor,
        ),
        estimator_parameters={"gamma": (0.5, 1), "C": (50, 100)},
    ),
    ModelGrid(
        Model(
            estimator=LinearRegression(),
            preprocessing=full_preprocessor,
        ),
        # NOTE(review): `normalize` was removed from LinearRegression in
        # scikit-learn 1.2 — confirm the installed version still accepts it.
        estimator_parameters={"normalize": (False, True)},
    ),
]

# report how many grids will actually be ranked (the unused ones are excluded)
print(f"{len(grids)} model grids")

In [None]:
# Set up the ranker over the active model grids, using the circular cross
# validator defined above. The scorer wraps mean squared error with
# greater_is_better=False, so scikit-learn negates it and higher scores are
# better.
ranker = ModelRanker(
        grids=grids,
        cv=circular_cv,
        scoring=make_scorer(mean_squared_error, greater_is_better=False),
)

# display the ranker as the cell output
ranker

In [None]:
# Run the ranker on the sample; the result is annotated as a sequence of
# ModelEvaluation objects.
ranking: Sequence[ModelEvaluation] = ranker.run(sample)

# display the raw ranking as the cell output
ranking

In [None]:
# print the text summary that summary_report builds from the ranking
print(summary_report(ranking))