In [None]:
import os
import sys
# Resolve the project root from the current working directory. When the kernel
# starts inside the "notebooks" folder, the root is its parent; otherwise the
# working directory is taken to be the root already (e.g. when this cell is
# re-run after the chdir below), which keeps the cell idempotent.
# basename() is used instead of a "/notebooks" suffix test so the check does
# not depend on the platform's path separator.
notebook_cwd = os.path.abspath(".")
if os.path.basename(notebook_cwd) == "notebooks":
    project_root = os.path.dirname(notebook_cwd)
else:
    project_root = notebook_cwd

# register yieldengine package in path, if it is missing
# (exact membership test on the sys.path entries rather than a substring
# match against the joined path string)
src_dir = os.path.join(project_root, "src")
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# change current working path of Notebook back to project-root
if notebook_cwd != project_root:
    os.chdir(project_root)

from yieldengine.loading.sample import Sample
from yieldengine.modeling.validation import CircularCrossValidator
from yieldengine.modeling.selection import ModelRanker, Model, ModelZoo, ModelRanking
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from lightgbm.sklearn import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error

In [None]:
# Read the test data CSV into a DataFrame. The parsing options (delimiter,
# header, decimal) are looked up in the "inputfile" section of the test config.
import tests
from tests.paths import TEST_DATA_CSV

inputfile_config = tests.read_test_config(section="inputfile")

batch_file = pd.read_csv(
    filepath_or_buffer=TEST_DATA_CSV,
    **{
        option: inputfile_config[option]
        for option in ("delimiter", "header", "decimal")
    },
)

In [None]:
# Drop the "Date" and "Batch Id" columns. errors="ignore" keeps this cell
# re-runnable: `batch_file` is reassigned here, so on a second execution the
# columns are already gone and a plain drop would raise a KeyError.
batch_file = batch_file.drop(columns=["Date", "Batch Id"], errors="ignore")

# replace values of +/- infinity with NaN, then drop every column that consists
# entirely of NaN (columns with at least one valid value are kept):
batch_file = batch_file.replace([np.inf, -np.inf], np.nan).dropna(
    axis=1, how="all"
)
batch_file.head()

In [None]:
# Wrap the batch_file observations in a Sample, with "Yield" named as the target
sample = Sample(
    target_name="Yield",
    observations=batch_file,
)

In [None]:
# Cross-validation scheme: circular cross validator, 10 folds, test ratio 0.2
circular_cv = CircularCrossValidator(
    num_folds=10,
    test_ratio=0.2,
)

In [None]:
# Pre-processing as a ColumnTransformer with one branch per feature type:
#   "numerical"   -> SimpleImputer using the "mean" strategy
#   "categorical" -> dense one-hot encoding, unknown categories are ignored
preprocessor = ColumnTransformer(
    transformers=[
        (
            "numerical",
            SimpleImputer(strategy="mean"),
            sample.features_numerical,
        ),
        (
            "categorical",
            OneHotEncoder(handle_unknown="ignore", sparse=False),
            sample.features_categorical,
        ),
    ]
)

In [None]:
# Wrap the preprocessor in a single-step sklearn Pipeline (step name "prep")
pre_pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
    ]
)

In [None]:
# Candidate models together with the hyper-parameter grid to search for each.
# Every estimator used here that accepts `random_state` gets a fixed seed, so
# repeated runs are not perturbed by estimator-level randomness.
RANDOM_STATE = 42

model_zoo = ModelZoo(
    [
        Model(
            estimator=LGBMRegressor(random_state=RANDOM_STATE),
            parameter_grid={
                "max_depth": (5, 10),
                "min_split_gain": (0.1, 0.2),
                "num_leaves": (50, 100, 200),
            },
        ),
        Model(
            estimator=AdaBoostRegressor(random_state=RANDOM_STATE),
            parameter_grid={"n_estimators": (50, 80)},
        ),
        Model(
            estimator=RandomForestRegressor(random_state=RANDOM_STATE),
            parameter_grid={"n_estimators": (50, 80)},
        ),
        Model(
            estimator=DecisionTreeRegressor(random_state=RANDOM_STATE),
            # max_depth must be a positive integer; only max_features may be
            # given as a fraction of the feature count. The previous grid used
            # (0.5, 1.0) for max_depth as well, which is not a valid depth.
            parameter_grid={"max_depth": (5, 10), "max_features": (0.5, 1.0)},
        ),
        Model(
            estimator=ExtraTreeRegressor(random_state=RANDOM_STATE),
            parameter_grid={"max_depth": (5, 10, 12)},
        ),
        # SVR and LinearRegression take no random_state argument.
        Model(estimator=SVR(), parameter_grid={"gamma": (0.5, 1), "C": (50, 100)}),
        Model(
            estimator=LinearRegression(),
            parameter_grid={"normalize": (False, True)},
        ),
    ]
)

In [None]:
# Assemble the ranker from the model zoo, the pre-processing pipeline and the
# cross validator. The scorer wraps mean_squared_error with
# greater_is_better=False, since a lower error means a better model.
ranker = ModelRanker(
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    cv=circular_cv,
    preprocessing=pre_pipeline,
    zoo=model_zoo,
)

In [None]:
# Run the ranker on the sample and keep the returned ModelRanking.
# NOTE(review): presumably this fits and scores every model / parameter
# combination of the zoo under the cross validator — confirm against
# ModelRanker.run; if so, this is likely the slowest cell of the notebook.
ranking: ModelRanking = ranker.run(sample)

In [None]:
# Print the ranking's summary string. print() is used rather than a bare
# expression so that any line breaks in the string are rendered instead of
# being shown as escaped "\n" in the cell's repr output.
print(ranking.summary_string())