In [None]:
import numpy as np
import pandas as pd
from math import sqrt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from xgboost import plot_importance
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold
from optuna.integration import OptunaSearchCV
from optuna.distributions import *

In [None]:
import sys
# Add a relative directory to the module search path before importing the
# project-local helpers below. The path is relative, so it resolves against the
# kernel's working directory — presumably the notebook's own folder (TODO confirm).
sys.path.append('../../../src')
# `LGBMRegressorEarlyStopping` is used as the final pipeline step and `mse` as
# the scoring argument of the hyperparameter search further down.
from pipeline_utils import LGBMRegressorEarlyStopping, mse

In [None]:
# Seed passed to the train/validation split and to the hyperparameter search.
RANDOM_STATE = 2021

# Root of the data tree; the interim inputs and the processed submission file
# are addressed relative to this directory.
data_dir = '../data'

In [None]:
# Read the interim training set and let pandas pick NA-aware dtypes.
train_path = f"{data_dir}/interim/train.parq"
df = pd.read_parquet(train_path, engine='pyarrow')
df = df.convert_dtypes()

# Show the frame's dimensions, then a two-row preview as the cell output.
display(df.shape)
df.head(2)

In [None]:
# Keep only the rows whose target is at least 4.
# NOTE(review): presumably an outlier cut-off — confirm the rationale for 4.
target_mask = df['target'] >= 4
df = df[target_mask]
df.shape

In [None]:
def _columns_with_prefix(frame, prefix):
    """Return the column names of `frame` that start with `prefix`, in order."""
    selected = []
    for name in frame.columns:
        if name.startswith(prefix):
            selected.append(name)
    return selected


# Feature groups are identified purely by column-name prefix.
numeric_features = _columns_with_prefix(df, 'cont')
categorical_features = _columns_with_prefix(df, 'cat')

In [None]:
# Cast the target column to plain float; every other column keeps its dtype.
df = df.astype({'target': 'float'})
# df[numeric_features] = df[numeric_features].astype('float')
# df[categorical_features] = df[categorical_features].apply(lambda x: x.cat.codes).astype('int')

In [None]:
# Separate the target from the features without touching `df` itself.
y = df['target'].copy()
X = df.drop(columns=['target'])

In [None]:
# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=RANDOM_STATE,
)

Categorical Pipeline

In [None]:
# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode into a dense array. Categories unseen during fit are ignored at
# transform time instead of raising.
# NOTE(review): newer scikit-learn releases renamed `sparse` to `sparse_output`;
# switch the keyword when the environment is upgraded.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

Numeric Pipeline

In [None]:
# Numeric branch: mean-impute, apply log(1 + x), then standardise.
# NOTE(review): log1p yields NaN for inputs below -1 — assumes the numeric
# features never go that low; confirm against the data.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('log', FunctionTransformer(func=np.log1p)),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

Preprocess Pipeline   
- merge categorical & numeric into one pipeline 

In [None]:
# Route each feature group through its own branch. Columns named in neither
# list are not forwarded, because `remainder` is left at its default.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

Pipeline   
- merge preprocess & model into one pipeline

In [None]:
# Final estimator: the project-local LightGBM wrapper from pipeline_utils.
# NOTE(review): presumably it carves `test_size` of the data it is fitted on
# into an internal early-stopping set — confirm in pipeline_utils.
model = LGBMRegressorEarlyStopping(
    early_stopping_rounds=300,
    test_size=0.2,
    eval_metric='rmse',
)

# Preprocessing and model chained so the search below tunes them as one unit.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

Parameters for `OptunaSearchCV`

In [None]:
# Search space handed to OptunaSearchCV. Keys use the `<step>__<param>` form so
# every value is routed to the pipeline's 'model' step. Single-choice
# categorical distributions pin a value rather than tune it.
parameters = {
    "model__objective": CategoricalDistribution(["regression"]),
    "model__metric": CategoricalDistribution(["rmse"]),
    "model__learning_rate": LogUniformDistribution(1e-3, 1.0),
    # Deliberately large tree budget; assumes the model's early stopping ends
    # training well before this limit is reached.
    'model__n_estimators': CategoricalDistribution([20000]),
    'model__reg_alpha': LogUniformDistribution(1e-3, 10.0),
    'model__reg_lambda': LogUniformDistribution(1e-3, 10.0),
    'model__colsample_bytree': CategoricalDistribution([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
    # NOTE(review): LightGBM only applies row subsampling when a bagging
    # frequency (subsample_freq / bagging_freq) greater than 0 is set; unless
    # the wrapper sets one, tuning this value may have no effect — confirm.
    'model__subsample': CategoricalDistribution([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
    'model__max_depth': CategoricalDistribution([4, 8, 16, 32, 64]),
    # LightGBM requires num_leaves > 1; a lower bound of 1 lets the sampler draw
    # an invalid value that makes the trial fail, so the range starts at 2.
    'model__num_leaves' : IntUniformDistribution(2, 1000),
    'model__min_child_samples': IntUniformDistribution(1, 300),
    # NOTE(review): cat_smooth applies to LightGBM's native categorical
    # handling; the categorical columns are one-hot encoded upstream, so this
    # is likely inert — confirm.
    'model__cat_smooth' : IntUniformDistribution(1, 100),
}

In [None]:
# Optuna-driven hyperparameter search over the full pipeline, scored with the
# project-local `mse` scorer under 5-fold cross-validation. The number of
# trials is left at the library default.
# NOTE(review): despite the variable name this is not an exhaustive grid
# search; also, running trials in parallel (n_jobs=-1) can make results differ
# between runs even with a fixed random_state — confirm this is acceptable.
grid_search = OptunaSearchCV(
    pipeline,
    param_distributions=parameters,
    scoring=mse,
    cv=5,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

In [None]:
# Run the hyperparameter search on the training split only; the validation
# split stays untouched for the hold-out check in the next cell.
grid_search.fit(X_train, y_train)

In [None]:
# Hold-out RMSE of the best pipeline found by the search.
preds = grid_search.best_estimator_.predict(X_valid)
# Take the root explicitly instead of passing `squared=False`: that keyword is
# deprecated and later removed in newer scikit-learn releases, while this form
# works on every version. `sqrt` comes from the `math` import at the top.
sqrt(mean_squared_error(y_valid, preds))

In [None]:
# Magnitude of the best cross-validated score found by the search.
# NOTE(review): sign and units depend on the `mse` scorer imported from
# pipeline_utils. If it is a (negated) mean squared error, this number is not
# directly comparable with the RMSE reported for the hold-out split — confirm.
abs(grid_search.best_score_)

In [None]:
# Hyperparameter values of the best trial, keyed by the same
# `model__<param>` names used in the search space.
grid_search.best_params_

### Submission

In [None]:
# Read the interim test set the same way as the training set: parquet via
# pyarrow, then NA-aware dtypes.
test_path = f"{data_dir}/interim/test.parq"
X_test = pd.read_parquet(test_path, engine='pyarrow')
X_test = X_test.convert_dtypes()

# Show the frame's dimensions, then a two-row preview as the cell output.
display(X_test.shape)
X_test.head(2)

In [None]:
# X_test[numeric_features] = X_test[numeric_features].astype('float')
# X_test[categorical_features] = X_test[categorical_features].apply(lambda x: x.cat.codes).astype('int')

In [None]:
# Score the test set with the best pipeline found by the search; the raw frame
# is passed in because preprocessing is part of the pipeline itself.
preds_test = grid_search.best_estimator_.predict(X_test)

In [None]:
# Assemble the submission table and write it without the pandas row index.
# NOTE(review): assumes the test frame's index holds the row identifiers and
# that the expected header is exactly 'Id' — confirm against the sample
# submission file.
submission_path = f"{data_dir}/processed/submission.csv"
submission_columns = {'Id': X_test.index, 'target': preds_test}
output = pd.DataFrame(submission_columns)
output.to_csv(submission_path, index=False)