In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import os
import joblib

from src.feature_engineering import *
from src.modeling import *

import warnings
# Silence every warning for the remainder of the kernel session so that cell
# outputs stay compact. No category or module filter is given, so this also
# hides deprecation and convergence warnings raised by any imported library.
warnings.filterwarnings("ignore")

In [2]:
# Configuration: file-system locations used by the cells below.
# Directory that receives the serialized pipeline blueprint (note the trailing slash).
OUTPUT_PATH = "output/blueprint/"
# CSV with the grid-search summary that is loaded and ranked by rmse below.
GRID_SEARCH_RESULT_PATH = "output/grid_search/tc2_run_1/grid_search_result.csv"

In [3]:
# Make sure the output directory exists before anything is written to it.
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of testing os.path.exists() first; it still raises if the path exists
# but is not a directory.
os.makedirs(OUTPUT_PATH, exist_ok=True)

### Read grid search result

In [4]:
# Load the grid-search summary into a DataFrame, report its (rows, columns)
# shape, and preview the first five rows as the cell output.
df_gs_result = pd.read_csv(filepath_or_buffer=GRID_SEARCH_RESULT_PATH)
print(df_gs_result.shape)
df_gs_result.head(n=5)

(94, 4)


Unnamed: 0,model_name,preprocessing,best_params,rmse
0,knn,['outlier_removal'],"{'model__n_neighbors': 15, 'model__p': 2, 'mod...",27.26252
1,knn,['polynomial'],"{'model__n_neighbors': 15, 'model__p': 2, 'mod...",27.270176
2,knn,['binning'],"{'binning__n_bins': 10, 'model__n_neighbors': ...",27.266932
3,knn,['standardization'],"{'model__n_neighbors': 15, 'model__p': 1, 'mod...",27.291581
4,knn,['pca'],"{'model__n_neighbors': 15, 'model__p': 1, 'mod...",27.267437


In [8]:
# Lift the column-width limit so the full best_params text is rendered
# instead of being truncated with "...". This is a global pandas option and
# stays in effect for the cells that follow.
pd.options.display.max_colwidth = None

# Ten configurations with the lowest rmse, best first.
df_gs_result.sort_values(by='rmse', ascending=True).head(n=10)

Unnamed: 0,model_name,preprocessing,best_params,rmse
62,lasso,['outlier_removal'],"{'model__alpha': 0.1, 'model__max_iter': 1000}",26.419948
39,ridge,"['outlier_removal', 'pca']","{'model__alpha': 0.001, 'model__solver': 'auto', 'pca__n_components': 0.85}",26.420203
70,lasso,"['outlier_removal', 'pca']","{'model__alpha': 0.001, 'model__max_iter': 1000, 'pca__n_components': 0.85}",26.420203
35,ridge,['pca'],"{'model__alpha': 100, 'model__solver': 'svd', 'pca__n_components': 0.85}",26.420442
66,lasso,['pca'],"{'model__alpha': 0.001, 'model__max_iter': 1000, 'pca__n_components': 0.85}",26.420442
79,lasso,"['outlier_removal', 'polynomial', 'pca']","{'model__alpha': 0.001, 'model__max_iter': 1000, 'pca__n_components': 0.85, 'polynomial__degree': 3}",26.420466
48,ridge,"['outlier_removal', 'polynomial', 'pca']","{'model__alpha': 0.001, 'model__solver': 'auto', 'pca__n_components': 0.85, 'polynomial__degree': 3}",26.420466
42,ridge,"['polynomial', 'pca']","{'model__alpha': 0.001, 'model__solver': 'auto', 'pca__n_components': 0.85, 'polynomial__degree': 3}",26.420687
73,lasso,"['polynomial', 'pca']","{'model__alpha': 0.001, 'model__max_iter': 1000, 'pca__n_components': 0.85, 'polynomial__degree': 3}",26.420687
86,lasso,"['binning', 'standardization', 'pca']","{'binning__n_bins': 5, 'model__alpha': 1, 'model__max_iter': 1000, 'pca__n_components': 0.85}",26.421642


### Make custom blueprint

In [10]:
# The file name spells out the steps and hyper-parameters configured below;
# keep the two in sync by hand when changing either.
BLUEPRINT_NAME = "outlier_poly_pca_lasso_alpha_0_001_max_iter_1000_n_components_0_85_degree_3.joblib"

# (name, estimator) pairs in the order the pipeline applies them.
# Commented-out entries are alternatives that are currently disabled.
steps = [
    ("outlier_removal", OutlierRemoval()),
    ("polynomial", PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)),
    # ("binning", KBinsDiscretizer(encode="ordinal", n_bins=5)),
    # ("standardization", StandardScaler()),
    ("pca", PCA(n_components=0.85)),
    ("model", Lasso(alpha=0.001, max_iter=1000)),
    # ("model", Ridge(alpha=0.001, solver="auto")),
]

pipeline = Pipeline(steps=steps)

# Serialize the pipeline to the output folder. It has not been fitted in this
# notebook, so the file holds only the configured steps. joblib.dump returns
# the list of written file names, which becomes the cell output.
joblib.dump(pipeline, os.path.join(OUTPUT_PATH, BLUEPRINT_NAME))

['output/blueprint/outlier_poly_pca_lasso_alpha_0_001_max_iter_1000_n_components_0_85_degree_3.joblib']