In [None]:
import numpy as np
import pandas as pd
from math import sqrt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from xgboost import plot_importance
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Notebook-wide configuration.
# Root folder of the data tree, relative to this notebook.
data_dir = "../data"
# Single seed shared by the data split and both boosted models.
RANDOM_STATE = 2021

In [None]:
# Load the interim training table and let pandas pick its nullable extension dtypes.
train_path = f"{data_dir}/interim/train.parq"
df = pd.read_parquet(train_path, engine='pyarrow')
df = df.convert_dtypes()

# Show the frame's dimensions, then preview the first two rows.
display(df.shape)
df.head(2)

In [None]:
# Work on a 10% random subset of the rows to keep the grid search affordable.
# The sample is seeded so the subset (and every downstream score) is reproducible
# under Restart & Run All.
# NOTE: this rebinds `df`; re-running this cell alone shrinks the frame again,
# so re-run the loading cell first.
df = df.sample(frac=0.1, random_state=RANDOM_STATE)

In [None]:
# Separate the 'target' column (the label) from the remaining columns (the features).
# Both are independent copies, so `df` itself is left untouched.
y = df['target'].copy()
X = df.drop(columns='target')

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=RANDOM_STATE,
)
X_train, X_valid, y_train, y_valid = split_parts

In [None]:
# Feature preprocessing, applied in order:
#   1. fill missing values (SimpleImputer with its default settings),
#   2. apply log1p element-wise,
#   3. standardise each column.
# NOTE(review): np.log1p returns NaN for inputs below -1 and -inf at -1 —
# confirm all features are > -1 after imputation.
preprocessing_steps = [
    ('imputer', SimpleImputer()),
    ('log', FunctionTransformer(np.log1p)),
    ('scaler', StandardScaler()),
]
preprocessor = Pipeline(steps=preprocessing_steps)

In [None]:
# Full modelling pipeline: preprocessing -> low-variance filter -> estimator slot.
# The 'voting' step starts as a 'passthrough' placeholder; the parameter grid
# below swaps in the actual VotingRegressor.
# NOTE(review): the variance filter runs after StandardScaler, so presumably every
# non-constant column already has unit variance and only constant columns are
# removed here — confirm that this is the intended effect of the threshold.
variance_filter = VarianceThreshold(threshold=(0.95 * (1 - 0.95)))
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('variance_drop', variance_filter),
    ('voting', 'passthrough'),
])

# Ensemble placed in the 'voting' slot: LightGBM and XGBoost, both seeded.
ensemble = VotingRegressor([
    ('lgbm', LGBMRegressor(random_state=RANDOM_STATE)),
    ('xgb', XGBRegressor(random_state=RANDOM_STATE)),
])

# LightGBM settings. Each list holds a single value, so the grid has exactly one
# candidate; the trailing comments keep the wider ranges explored previously.
lgbm_grid = {
    'voting__lgbm__n_estimators': [2000],  # range(500, 3000, 1000)
    'voting__lgbm__max_depth': [12],  # range(4, 16, 4)
    'voting__lgbm__learning_rate': [0.01],
    'voting__lgbm__num_leaves': [256],
    'voting__lgbm__min_child_weight': [12],
    'voting__lgbm__feature_fraction': [0.4],  # np.arange(0.1, 1, 0.1)
    'voting__lgbm__bagging_fraction': [0.7],  # np.arange(0.1, 1, 0.1)
    'voting__lgbm__bagging_freq': [5],
    'voting__lgbm__min_child_samples': [32],
    'voting__lgbm__lambda_l1': [9],
    'voting__lgbm__lambda_l2': [0.13],
}

# XGBoost settings, again one value per parameter.
xgb_grid = {
    'voting__xgb__n_estimators': [2000],  # range(500, 3000, 1000)
    'voting__xgb__max_depth': [12],  # range(4, 16, 4)
    'voting__xgb__learning_rate': [0.01],
    'voting__xgb__alpha': [5],
    'voting__xgb__gamma': [3],
    'voting__xgb__lambda': [3],
    'voting__xgb__subsample': [0.8],
    'voting__xgb__colsample_bytree': [0.4],
}

# One grid entry: the ensemble itself plus the settings of both of its members.
parameters = [
    {'voting': [ensemble], **lgbm_grid, **xgb_grid},
]

In [None]:
# Number of cross-validation fits the search will run:
# candidate combinations in the grid times 3 (the fold count, cv=3, used by the grid search).
n_candidates = len(ParameterGrid(parameters))
n_candidates * 3

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Computed as sqrt(MSE) rather than `mean_squared_error(..., squared=False)`,
    because the `squared` keyword was deprecated in scikit-learn 1.4 and removed
    in 1.6. For a single target column the two forms give the same value.
    """
    return sqrt(mean_squared_error(y_true, y_pred))


# RMSE is an error, so lower is better: make_scorer negates it, which lets the
# search maximise the score as usual.
custom_scoring = make_scorer(
    score_func=rmse,
    greater_is_better=False,
)

In [None]:
# Search configuration: 3-fold CV, scored with the custom (negated) RMSE scorer,
# using every available core and printing progress messages.
search_settings = dict(
    param_grid=parameters,
    scoring=custom_scoring,
    cv=3,
    n_jobs=-1,
    verbose=True,
)
grid_search = GridSearchCV(pipeline, **search_settings)

In [None]:
# Run the cross-validated search on the training split (the expensive cell).
grid_search.fit(X=X_train, y=y_train)

In [None]:
# RMSE of the refitted best pipeline on the held-out validation split.
preds = grid_search.best_estimator_.predict(X_valid)
# sqrt(MSE) instead of `squared=False`: that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
sqrt(mean_squared_error(y_valid, preds))

In [None]:
# The scorer reports negated RMSE; take the magnitude to read the best
# cross-validated score as a plain error value.
cv_rmse = abs(grid_search.best_score_)
cv_rmse

In [None]:
# Parameter combination selected by the search.
best_params = grid_search.best_params_
best_params

In [None]:
# Print the input features to which the fitted LightGBM member assigns no importance.
best_pipeline = grid_search.best_estimator_

# The estimator step is registered as "voting" (there is no "clf" step), and a
# VotingRegressor has no feature_importances_ of its own; read them from its
# fitted LightGBM member instead.
lgbm_model = best_pipeline.named_steps["voting"].named_estimators_["lgbm"]
importances = lgbm_model.feature_importances_

# Rebuild the names of the columns that actually reach the model. Start from the
# training features (not `df`, which still contains 'target'), then apply the
# same column removals the pipeline performs:
#   - the imputer discards columns whose fitted statistic is NaN (all-missing columns),
#   - the variance filter keeps only the columns flagged by get_support().
feature_names = np.asarray(X_train.columns)
imputer = best_pipeline.named_steps["preprocessor"].named_steps["imputer"]
feature_names = feature_names[~np.isnan(imputer.statistics_)]
feature_names = feature_names[best_pipeline.named_steps["variance_drop"].get_support()]

# zip() would silently truncate on a mismatch, so fail loudly instead.
assert len(feature_names) == len(importances), "feature names and importances are misaligned"

for name, importance in zip(feature_names, importances):
    if importance <= 0:
        print(name)

In [None]:
# xgboost
# The estimator step is registered as "voting" (there is no "clf" step);
# plot_importance needs the fitted XGBoost member of the voting ensemble.
# NOTE(review): the model is fitted on the pipeline's transformed output, so the
# plot presumably labels features as f0, f1, ... rather than by column name.
xgb_model = grid_search.best_estimator_.named_steps["voting"].named_estimators_["xgb"]
plot_importance(xgb_model);

### Submit

In [None]:
# Load the interim test table the same way as the training table.
test_path = f"{data_dir}/interim/test.parq"
X_test = pd.read_parquet(test_path, engine='pyarrow')
X_test = X_test.convert_dtypes()

# Show the frame's dimensions, then preview the first two rows.
display(X_test.shape)
X_test.head(2)

In [None]:
# Predict the test set with the best pipeline found by the search.
final_model = grid_search.best_estimator_
preds_test = final_model.predict(X_test)

In [None]:
# Assemble the submission: one row per test row, identified by the test frame's index.
submission_columns = {'Id': X_test.index, 'target': preds_test}
output = pd.DataFrame(submission_columns)

# Write without the DataFrame's own index, so the file has just the two columns.
submission_path = f"{data_dir}/processed/submission.csv"
output.to_csv(submission_path, index=False)