In [1]:
import pickle
from pathlib import Path
from tempfile import mkdtemp

import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders.target_encoder import TargetEncoder
from matplotlib import pyplot as plt
from pandas_profiling import ProfileReport
from sklearn import metrics
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor
# Required by the Bayesian tuning section (BayesSearchCV, DeadlineStopper).
from skopt import BayesSearchCV
from skopt.callbacks import DeltaYStopper, TimerCallback, DeadlineStopper

# Scratch directory passed as `memory=` to the sklearn pipelines below.
cachedir = mkdtemp()

# Display scikit-learn estimators as diagrams when a cell returns one.
set_config(display="diagram")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Import data

In [2]:
def _with_composite_index(frame):
    """Return ``frame`` re-indexed by the string key '<LAPISSUGL>-<ARUSKOD>'.

    The key is built from the two columns cast to ``str`` and stored as an
    index named 'index'; the source columns themselves are kept.
    """
    composite_key = frame['LAPISSUGL'].astype(str) + '-' + frame['ARUSKOD'].astype(str)
    return frame.assign(index=composite_key).set_index('index')


# Locations of the raw CSV inputs (relative to the working directory).
train_data_path = Path('data_train.csv')
train_label_path = Path('target_train.csv')
test_data_path = Path('data_test.csv')

data_train = pd.read_csv(train_data_path)
data_label = pd.read_csv(train_label_path)
data_test = pd.read_csv(test_data_path)

# Test targets already ship with an 'index' column; the unnamed column is dropped.
target_test = pd.read_csv("target_test.csv").drop(columns='Unnamed: 0').set_index('index')

data_test = _with_composite_index(data_test)
data_train = _with_composite_index(data_train)

# Labels carry no key of their own: they take the training index positionally,
# so the two files are assumed to list rows in the same order.
data_label = data_label.assign(index=data_train.index).set_index('index')

# Keep only columns that hold more than one distinct value (NaN counts as a
# value), then drop any column that is entirely missing.
is_varying = data_train.nunique(dropna=False) > 1
data_train = data_train.loc[:, is_varying].dropna(axis=1, how="all")

## Constants 

In [3]:
# Figure size shared by the plots in this notebook (passed to `figsize`).
FIG_WIDTH, FIG_HEIGHT = 16, 8

# Seeded NumPy generator so that random draws are reproducible.
rng = np.random.default_rng(420)

# Wrangling

In [4]:
from src.schema import TypeClass

ModuleNotFoundError: No module named 'src'

In [None]:
# Cast the training columns with the project schema, normalise missing
# markers to NaN and drop two columns that are not used as features.
# NOTE(review): TypeClass.all_dict() presumably maps column name -> dtype;
# confirm against src.schema.
# This cell overwrites `data_train`; running it twice fails because the
# dropped columns no longer exist.
schema_dtypes = TypeClass.all_dict()
columns_to_discard = ["LAPISSUGL", "KFDELDATE"]

data_train = (
    data_train.astype(schema_dtypes)
    .replace({pd.NA: np.nan})
    .drop(columns_to_discard, axis="columns")
)

In [None]:
# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split
# identical between runs. The target is the NAPI_ELADOTT_DB label column.
split_options = {"test_size": 0.2, "random_state": 420}

X_train, X_test, y_train, y_test = train_test_split(
    data_train, data_label["NAPI_ELADOTT_DB"], **split_options
)

# Pandas profiling 

In [None]:
# Profile the full wrangled training frame (before the train/test split).
# NOTE(review): infer_dtypes=False presumably makes the report rely on the
# dtypes already set on the frame; confirm in the pandas_profiling docs.
profile = ProfileReport(data_train, infer_dtypes=False)

In [None]:
# Write the profiling report as HTML. The output folder is created first so
# the export does not fail on a fresh checkout.
report_path = Path('output/pp_report.html')
report_path.parent.mkdir(parents=True, exist_ok=True)
profile.to_file(report_path)

# Encoding

A következő rész egy pipeline-t épít fel, amely keresztvalidációval megnézi, hogy melyik kategorikus encoding (one-hot, ordinal vagy target) teljesít a legjobban a validációs adatokon.

In [None]:
# Preprocessing + model pipeline used to compare categorical encoders.
#
# `encoders` chains all three encoders one after another. The parameter grid
# defined for the search replaces two of them with "passthrough", so exactly
# one encoder is active for any candidate.

# Numeric columns: fill missing values with the column median.
numeric_transformer = SimpleImputer(strategy="median")

encoders = Pipeline(
    steps=[
        # Categories that only appear in a validation fold (or in new data)
        # are encoded as all-zero rows instead of raising at transform time.
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
        (
            "oe",
            # Unseen categories map to -1, a finite code outside the fitted
            # range, so the decision tree never receives NaN.
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
        (
            "te",
            TargetEncoder(
                verbose=0,
                drop_invariant=False,
                return_df=True,
                handle_missing="value",
                handle_unknown="value",
                min_samples_leaf=1,
                smoothing=1.0,
            ),
        ),
    ]
)

# Categorical columns: replace missing values with the literal "MISSING"
# category, then apply whichever encoder is active.
categorical_transformer = Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(
                missing_values=np.nan, strategy="constant", fill_value="MISSING"
            ),
        ),
        ("encoder", encoders),
    ]
)

# Route columns by dtype: object columns go to the categorical branch and
# numeric (non-boolean) columns to the numeric branch. Columns matching
# neither selector are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, selector(dtype_include="object")),
        (
            "num",
            numeric_transformer,
            selector(dtype_include="number", dtype_exclude="boolean"),
        ),
    ],
)

# Full estimator: preprocessing followed by a seeded decision tree regressor.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", DecisionTreeRegressor(random_state=420)),
    ]
)

In [None]:
# One sub-grid per categorical encoder. Each sub-grid switches two of the
# three chained encoder steps to "passthrough", leaving one encoder active,
# and sweeps the tree depth over 1..49.
encoder_step_prefix = "preprocessor__cat__encoder__"
max_depth_candidates = range(1, 50)

# Steps disabled in each sub-grid; the active encoders are TE, OHE and OE,
# in that order.
disabled_steps_per_grid = [("ohe", "oe"), ("oe", "te"), ("te", "ohe")]

grid_params = [
    {
        **{encoder_step_prefix + step: ["passthrough"] for step in disabled_steps},
        "regressor__max_depth": max_depth_candidates,
    }
    for disabled_steps in disabled_steps_per_grid
]

In [None]:
# Exhaustive search over every encoder / max_depth combination, scored by
# negative mean squared error and run on all available cores.
gs = GridSearchCV(
    estimator=clf,
    param_grid=grid_params,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    verbose=3,
)

gs.fit(X_train, y_train)

In [None]:
def _active_encoder(candidate_params):
    """Return the label of the encoder that is NOT set to passthrough.

    Each grid candidate lists the two disabled encoder steps; the remaining
    step is the one that actually encodes the categorical columns.
    """
    step_labels = {"te": "TE", "ohe": "OHE", "oe": "OE"}
    active = [
        label
        for step, label in step_labels.items()
        if f"preprocessor__cat__encoder__{step}" not in candidate_params
    ]
    return "+".join(active)


# Tidy table of the grid-search results, one row per candidate.
# Encoder labels and depths are read from the candidates themselves, so the
# table stays correct if the grid order or the depth range changes.
# The search is scored with negative MSE; it is converted to negative RMSE
# here so that the "N-RMSE" column holds what its name says.
candidate_params = gs.cv_results_["params"]

grid_stats = pd.DataFrame(
    {
        "encoder": [_active_encoder(params) for params in candidate_params],
        "N-RMSE": -np.sqrt(-np.asarray(gs.cv_results_["mean_test_score"])),
        "max_depth": [params["regressor__max_depth"] for params in candidate_params],
    }
)

In [None]:
# Plot one score-vs-depth curve per encoder. An explicit loop replaces
# `groupby.apply`, which was used only for its plotting side effect.
fig, axes = plt.subplots(figsize=[FIG_WIDTH, FIG_HEIGHT])
for encoder_name, encoder_stats in grid_stats.groupby("encoder"):
    axes.plot(encoder_stats["max_depth"], encoder_stats["N-RMSE"], label=encoder_name)
axes.set_xlabel("max_depth")
axes.set_ylabel("Mean cross-validated score (higher is better)")
axes.set_title("Decision tree depth vs. cross-validated score per categorical encoder")
axes.legend()
plt.show()

In [None]:
# Parameter combination with the best mean cross-validated score.
gs.best_params_

In [None]:
# Pipeline refitted with the best parameter combination (shown as a diagram).
gs.best_estimator_

# Decision tree

## Simple

In [None]:
# Baseline model: target-encoded categoricals, median-imputed numerics and
# an untuned decision tree.

# Target encoder settings, spelled out explicitly.
target_encoder_options = dict(
    verbose=0,
    drop_invariant=False,
    return_df=True,
    handle_missing="value",
    handle_unknown="value",
    min_samples_leaf=1,
    smoothing=1.0,
)

numeric_transformer = SimpleImputer(strategy="median")
encoder = TargetEncoder(**target_encoder_options)

# Missing categorical values become the explicit category "MISSING" before
# they are encoded.
missing_category_imputer = SimpleImputer(
    missing_values=np.nan, strategy="constant", fill_value="MISSING"
)
categorical_transformer = Pipeline(
    steps=[("imputer", missing_category_imputer), ("encoder", encoder)]
)

# Object columns go to the categorical branch, numeric columns to the
# numeric branch.
categorical_branch = ("cat", categorical_transformer, selector(dtype_include="object"))
numeric_branch = ("num", numeric_transformer, selector(dtype_include="number"))
preprocessor = ColumnTransformer(transformers=[categorical_branch, numeric_branch])

# `memory=cachedir` lets the pipeline cache its fitted transformers on disk.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", DecisionTreeRegressor(random_state=420)),
    ],
    memory=cachedir,
)

In [None]:
# Fit the baseline pipeline on the training split; as the last expression of
# the cell, the fitted pipeline is also displayed.
clf.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the baseline pipeline.
pred = clf.predict(X_test)

In [None]:
# Hold-out error metrics for the current predictions; MSE is computed once
# and reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, pred)
mse = metrics.mean_squared_error(y_test, pred)

for label, value in (('MAE:', mae), ('MSE:', mse), ('RMSE:', np.sqrt(mse))):
    print(label, value)

In [None]:
# Diagnostic plots for the hold-out predictions.
# Left: predicted vs. actual values. Right: density of the residuals.
residuals = y_test.values - pred

fig, axes = plt.subplots(ncols=2, figsize=[FIG_WIDTH, FIG_HEIGHT])
sns.scatterplot(x=pred, y=y_test.values, alpha=0.5, ax=axes[0])
axes[0].set(xlabel="Predicted", ylabel="Actual", title="Predicted vs. actual (hold-out)")
sns.histplot(residuals, ax=axes[1], stat='density')
axes[1].set(xlabel="Residual (actual - predicted)", title="Residual distribution (hold-out)")
plt.show()

In [None]:
# Persist the fitted baseline pipeline. The output folder is created first so
# the write does not fail on a fresh checkout.
# Only unpickle this file from a trusted source: loading a pickle can execute code.
model_path = Path('output/dtree_simple.pickle')
model_path.parent.mkdir(parents=True, exist_ok=True)
model_path.write_bytes(pickle.dumps(clf))

## Tune feature selection and leaf/split sizes (Bayesian)

In [None]:
# Pipeline for the Bayesian search: the same preprocessing as the baseline
# model, plus a model-based feature-selection step in front of the tree.

# Numeric columns: fill missing values with the column median.
numeric_transformer = SimpleImputer(strategy="median")

encoder = TargetEncoder(
    verbose=0,
    drop_invariant=False,
    return_df=True,
    handle_missing="value",
    handle_unknown="value",
    min_samples_leaf=1,
    smoothing=1.0,
)

# Categorical columns: replace missing values with the literal "MISSING"
# category, then target-encode.
categorical_transformer = Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(
                missing_values=np.nan, strategy="constant", fill_value="MISSING"
            ),
        ),
        ("encoder", encoder),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, selector(dtype_include="object")),
        (
            "num",
            numeric_transformer,
            selector(dtype_include="number"),
        ),
    ],
)

clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        # The selector's forest is seeded like every other random component
        # in this notebook, so the selected features are reproducible.
        (
            "feature_selection",
            SelectFromModel(ExtraTreesRegressor(random_state=420)),
        ),
        ("regressor", DecisionTreeRegressor(random_state=420)),
    ],
    # Cache fitted transformers on disk between fits.
    memory=cachedir,
)

In [None]:
# Search space for the Bayesian optimisation.
# Tuples are (low, high[, prior]): an integer range for the number of trees
# in the feature-selection forest, and log-uniform fractions for the tree's
# leaf / split sizes.
# Other regressor dimensions (splitter, max_depth, min_weight_fraction_leaf,
# max_features, max_leaf_nodes) are currently not searched.
log_uniform_fraction = (1e-6, 5e-1, "log-uniform")

parameters = {
    "feature_selection__estimator__n_estimators": (1, 50),
    "regressor__min_samples_leaf": log_uniform_fraction,
    "regressor__min_samples_split": log_uniform_fraction,
}

In [None]:
# Bayesian search over `parameters` for the feature-selection pipeline.
# Uses `clf`, the pipeline defined for this section (`clf_1` is not defined
# anywhere in the notebook). The search is seeded so that the sequence of
# sampled candidates is reproducible.
opt = BayesSearchCV(
    clf,
    parameters,
    n_iter=1_000,
    cv=3,
    scoring="neg_mean_squared_error",
    verbose=0,
    refit=True,
    random_state=420,
)

In [None]:
# Stop the search once a one-hour time budget is used up.
# (A DeltaYStopper(0.0001) convergence criterion is currently disabled.)
search_budget_seconds = 60*60

callbacks = [DeadlineStopper(search_budget_seconds)]

In [None]:
# Run the Bayesian search on the training split; the callbacks are passed to
# the optimiser and can end the search before all iterations are done.
opt.fit(X_train, y_train, callback=callbacks)

In [None]:
# Mean CV score per search iteration, with the best score as a dashed line.
# np.asarray keeps this working whether cv_results_ stores the scores as a
# list or as an array.
mean_scores = np.asarray(opt.cv_results_["mean_test_score"])

fig, axes = plt.subplots(figsize=[FIG_WIDTH, FIG_HEIGHT])
sns.lineplot(
    x=np.arange(mean_scores.shape[0]),
    y=mean_scores,
    ax=axes,
)
axes.axhline(y=opt.best_score_, color="red", linestyle="--", label="best score")
axes.set(
    xlabel="Search iteration",
    ylabel="Mean CV score (negative MSE)",
    title="Bayesian search progress",
)
axes.legend()
plt.show()

In [None]:
# Parameter values of the best candidate found by the Bayesian search.
opt.best_params_

In [None]:
# Keep the best pipeline from the search (refitted, since refit=True) under
# a descriptive name.
dtree_bayesian_te = opt.best_estimator_

In [None]:
# Predict the held-out rows with the tuned pipeline (overwrites the earlier `pred`).
pred = dtree_bayesian_te.predict(X_test)

In [None]:
# Hold-out error metrics for the tuned pipeline; MSE is computed once and
# reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, pred)
mse = metrics.mean_squared_error(y_test, pred)

for label, value in (('MAE:', mae), ('MSE:', mse), ('RMSE:', np.sqrt(mse))):
    print(label, value)

In [None]:
# Diagnostic plots for the tuned pipeline's hold-out predictions.
# Left: predicted vs. actual values. Right: density of the residuals.
residuals = y_test.values - pred

fig, axes = plt.subplots(ncols=2, figsize=[FIG_WIDTH, FIG_HEIGHT])
sns.scatterplot(x=pred, y=y_test.values, alpha=0.5, ax=axes[0])
axes[0].set(xlabel="Predicted", ylabel="Actual", title="Predicted vs. actual (hold-out)")
sns.histplot(residuals, ax=axes[1], stat='density')
axes[1].set(xlabel="Residual (actual - predicted)", title="Residual distribution (hold-out)")
plt.show()

In [None]:
# Persist the tuned pipeline. The output folder is created first so the
# write does not fail on a fresh checkout.
# Only unpickle this file from a trusted source: loading a pickle can execute code.
model_path = Path('output/dtree_bayesian_te.pickle')
model_path.parent.mkdir(parents=True, exist_ok=True)
model_path.write_bytes(pickle.dumps(dtree_bayesian_te))

# Prediction

In [None]:
# Give the test frame the same columns, dtypes and missing-value marker as
# the wrangled training frame. The pd.NA -> np.nan replacement mirrors the
# training wrangling, so the imputers (missing_values=np.nan) see the same
# marker in both frames.
data_test_processed = (
    data_test.loc[:, data_train.columns]
    .astype(data_train.dtypes)
    .replace({pd.NA: np.nan})
)

In [None]:
# Build the submission table with columns 'index' and 'pred'.
# The predictions keep the composite row key of the test frame, so the
# 'index' column identifies each row instead of being a positional 0..n-1 counter.
pred = pd.DataFrame(
    {'pred': dtree_bayesian_te.predict(data_test_processed)},
    index=data_test_processed.index,
).reset_index()

In [None]:
# Write the predictions to CSV. The output folder is created first so the
# export does not fail on a fresh checkout.
prediction_path = Path('output/test_prediction.csv')
prediction_path.parent.mkdir(parents=True, exist_ok=True)
pred.to_csv(prediction_path, index=False)