# Imports

In [1]:
from IPython.core.display import display
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.pipeline import Pipeline, make_pipeline
from riski_ml import PandasFeatureUnion, PandasOneHotEncoder,EstimatorSelectionHelper
from riski_ml.transformers.dataframe_transformers import *
from show_and_tell.utils import get_data
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Config

In [2]:
DATA_FOLDER_PATH = Path("test_data")
TRAIN_PATH = DATA_FOLDER_PATH/"house_prices_regression.csv"
LABEL_COL_NAME = 'SalePrice'
TEST_PROPORTION_SIZE = 0.1
RANDOM_STATE = 42
FINAL_MODEL_PATH = 'show_and_tell/finalized_model.sav'
NAN_PROPORTION_TO_REMOVE = 0.5

# Data Import 

In [3]:
from sklearn.model_selection import train_test_split
X,y = get_data(TRAIN_PATH, LABEL_COL_NAME)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_PROPORTION_SIZE, random_state=RANDOM_STATE)

# Quick data exploration

In [4]:
from IPython.display import display_html
def display_side_by_side(*args):
    html_str = ''
    for df in args:
        html_str += df.to_html()
    display_html(html_str.replace('table', 'table style="display:inline"'), raw=True)

def object_columns_unique_and_missing_values(df):
    cols = ['column', 'type', '#_unique', '% missing_values']
    cols_metadata = [[col, df[col].dtype, len(df[col].value_counts()), df[col].isna().sum()/df.shape[0]] for col in df.columns]
    return pd.DataFrame(cols_metadata, columns = cols).sort_values('% missing_values')

display_side_by_side(*[object_columns_unique_and_missing_values(X_train.select_dtypes(dtype)) for dtype in ['O','int64', 'float64']]) 

Unnamed: 0,column,type,#_unique,% missing_values
0,MSZoning,object,5,0.0
1,Street,object,2,0.0
3,LotShape,object,4,0.0
4,LandContour,object,4,0.0
5,Utilities,object,2,0.0
6,LotConfig,object,5,0.0
7,LandSlope,object,3,0.0
8,Neighborhood,object,25,0.0
9,Condition1,object,9,0.0
2,Alley,object,2,0.934551

Unnamed: 0,column,type,#_unique,% missing_values
0,Id,int64,1314,0.0
31,MiscVal,int64,20,0.0
30,PoolArea,int64,8,0.0
29,ScreenPorch,int64,74,0.0
28,3SsnPorch,int64,19,0.0
27,EnclosedPorch,int64,112,0.0
26,OpenPorchSF,int64,195,0.0
25,WoodDeckSF,int64,260,0.0
24,GarageArea,int64,416,0.0
23,GarageCars,int64,5,0.0

Unnamed: 0,column,type,#_unique,% missing_values
1,MasVnrArea,float64,306,0.006088
2,GarageYrBlt,float64,95,0.055556
0,LotFrontage,float64,108,0.180365


# 1. Basic Usage

In [5]:
from sklearn.linear_model import LinearRegression
preprocess = Pipeline([('select_columns', ColumnRemover(['Id'])),
                       ('object_to_cat_or_num', ObjectsColumnaAsType()),
                       ('remove_nan_columns', NanColumnsRemover()),
                       ('feature_union', PandasFeatureUnion(transformer_list=[
                           ("numeric_features", make_pipeline(
                               TypeSelector(np.number),
                               PandasImputer(strategy="mean"),
                               PandasStandardScaler()
                           )),
                           ("categorical_features", make_pipeline(
                               TypeSelector("category"),
                               CatToInt(),
                               PandasImputer(strategy="most_frequent"),
                               PandasOneHotEncoder()
                           )),
                           ("boolean_features", make_pipeline(
                               TypeSelector(bool),
                               PandasImputer(strategy="most_frequent"),
                               BoolToInt(),
                           )),
                           
                       ])),
                       ])

preprocess.fit(X_train)
preprocess.transform(X_train).head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,Condition1_0,Condition1_6,Condition1_1,Condition1_7,Condition1_4,Condition1_5,Condition1_8,Street,Alley,Utilities
907,-0.174883,0.733776,0.084557,0.650071,1.282993,-1.149679,0.105391,-0.588127,-0.489611,-0.279269,...,1,0,0,0,0,0,0,1,1,1
782,-0.880863,-0.1265,0.543774,0.650071,-0.520058,0.984929,0.830435,-0.588127,-0.979513,-0.279269,...,1,0,0,0,0,0,0,1,1,1


In [6]:
preprocess.transform(X_test).head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,Condition1_2,Condition1_3,Condition1_4,Condition1_5,Condition1_6,Condition1_7,Condition1_8,Street,Alley,Utilities
892,-0.880863,0.009333,-0.211606,-0.078142,2.184519,-0.262996,0.878772,-0.588127,0.477012,-0.279269,...,0,0,0,0,0,0,0,1,1,1
1105,0.060443,1.277108,0.157111,1.378284,-0.520058,0.755048,0.492081,1.462934,1.287656,-0.279269,...,0,0,0,0,0,0,0,1,1,1


In [7]:
preprocess.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'select_columns', 'object_to_cat_or_num', 'remove_nan_columns', 'feature_union', 'select_columns__cols', 'remove_nan_columns__p', 'feature_union__n_jobs', 'feature_union__transformer_list', 'feature_union__transformer_weights', 'feature_union__verbose', 'feature_union__numeric_features', 'feature_union__categorical_features', 'feature_union__boolean_features', 'feature_union__numeric_features__memory', 'feature_union__numeric_features__steps', 'feature_union__numeric_features__verbose', 'feature_union__numeric_features__typeselector', 'feature_union__numeric_features__pandasimputer', 'feature_union__numeric_features__pandasstandardscaler', 'feature_union__numeric_features__typeselector__dtype', 'feature_union__numeric_features__pandasimputer__strategy', 'feature_union__categorical_features__memory', 'feature_union__categorical_features__steps', 'feature_union__categorical_features__verbose', 'feature_union__categorical_features__typeselector', '

## Accesssing objects is easy
Also named_steps in the pipeline allows autocomple

In [8]:
print(preprocess.named_steps.remove_nan_columns.columns_to_remove)
preprocess.named_steps.feature_union.transformer_list[0][1][-1].transformer.mean_

[]


array([5.74315068e+01, 6.97938719e+01, 1.06189201e+04, 6.10730594e+00,
       5.57686454e+00, 1.97100837e+03, 1.98481963e+03, 1.03800919e+02,
       4.45867580e+02, 4.33515982e+01, 5.67424658e+02, 1.05664384e+03,
       1.16369787e+03, 3.50787671e+02, 5.45433790e+00, 1.51993988e+03,
       4.24657534e-01, 5.78386606e-02, 1.57153729e+00, 3.83561644e-01,
       2.87747336e+00, 1.04870624e+00, 6.54261796e+00, 6.16438356e-01,
       1.97857293e+03, 1.76940639e+00, 4.72396499e+02, 9.48759513e+01,
       4.78850837e+01, 2.21324201e+01, 3.77092846e+00, 1.52587519e+01,
       3.06544901e+00, 4.66773212e+01, 6.34474886e+00, 2.00781963e+03])

In [9]:
# Show autocomplete
# preprocess.named_steps

## Piping a classifer 

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso,LinearRegression

clf_pipeline = Pipeline([
    ('preprocess', preprocess),
    ('clf', Lasso())
])
params = {'preprocess__remove_nan_columns__p':[0.2],
          'clf__alpha': [1,2]} 
clf = GridSearchCV(clf_pipeline, params)
clf.fit(X_train, y_train)
pd.DataFrame(clf.cv_results_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__alpha,param_preprocess__remove_nan_columns__p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.107805,0.006443,0.047608,0.000744,1,0.2,"{'clf__alpha': 1, 'preprocess__remove_nan_colu...",0.843353,0.66133,0.529361,0.83164,0.587038,0.690544,0.127126,2
1,0.091855,0.011874,0.046934,0.00138,2,0.2,"{'clf__alpha': 2, 'preprocess__remove_nan_colu...",0.843644,0.66199,0.529841,0.832344,0.59072,0.691708,0.126606,1


In [11]:
clf.predict(X_test)

array([155150.6991462 , 265590.56276628, 123561.91399295, 233344.42131596,
       264867.70606823,  41011.05903581, 219735.20500982, 139529.34760413,
        39129.46852104, 151365.24660418, 145754.93064197, 157387.34755024,
        61287.31851214, 184102.50114609, 179793.78286339, 149492.70772431,
       194429.01191481, 146023.16723807, 104158.53463624, 213060.17827771,
       178966.00721306, 179560.78838086, 174702.68259917, 138063.08313353,
       192707.85279361, 135790.95773261, 181566.81296771,  85801.36418166,
       171126.29277503, 153964.50886944, 169686.6152043 , 237857.0105874 ,
       225512.51389609, 104435.46800127, 234818.68464954, 166381.33019247,
       157228.50641755, 205051.97754253, 272472.54704247,  99485.57351663,
       184828.69337654, 230420.9356078 , 116803.15487145, 242418.51551271,
       125078.94576989, 143387.61070646, 112504.68078661, 134035.30263316,
       310842.56620586, 147891.43049783, 126181.20443822, 254252.38557954,
        84248.32153935, 2

In [12]:
print(np.mean(np.sqrt(np.square(clf.predict(X_test) - y_test))))
np.mean(np.sqrt(np.square(clf.predict(X_train) - y_train)))

30985.145126827112


18403.789139702094

# Testing multiple algorithms is easy

Multiprocessing 

In [13]:
clf_pipeline = Pipeline([
    ('preprocess', preprocess),
    ('clf', 'passthrough')
])

param_grid = [
    {'clf': [Lasso()],
     'clf__alpha': [1,2,3]
    },
    {'clf': [LinearRegression()],
    },
]

clf = GridSearchCV(clf_pipeline, n_jobs=8, param_grid=param_grid, scoring = 'neg_mean_absolute_error')
clf.fit(X, y)
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.175849,0.013394,0.068545,0.005419,Lasso(alpha=3),1.0,"{'clf': Lasso(alpha=3), 'clf__alpha': 1}",-31112.697687,-28152.472921,-23181.246812,-20186.061809,-27257.005904,-25977.897027,3849.615183,3
1,0.159272,0.018241,0.069203,0.006779,Lasso(alpha=3),2.0,"{'clf': Lasso(alpha=3), 'clf__alpha': 2}",-30881.836475,-28108.096254,-23153.395457,-20162.898088,-27230.077166,-25907.260688,3792.661342,2
2,0.156011,0.02857,0.066403,0.004594,Lasso(alpha=3),3.0,"{'clf': Lasso(alpha=3), 'clf__alpha': 3}",-30651.332291,-28062.4991,-23124.496245,-20141.061052,-27201.883984,-25836.254534,3736.173023,1
3,0.093988,0.012056,0.056562,0.007746,LinearRegression(),,{'clf': LinearRegression()},-31261.113014,-28359.986301,-38257.739726,-23552.761986,-27426.047945,-29771.529795,4907.166807,4


### Selecting dimensionality reduction type example

https://scikit-learn.org/stable/auto_examples/compose/plot_compare_reduction.html

In [14]:
best_estimator = clf.best_estimator_
best_estimator.fit(X_train,y_train)

Pipeline(steps=[('preprocess',
                 Pipeline(steps=[('select_columns', ColumnRemover(cols=['Id'])),
                                 ('object_to_cat_or_num',
                                  ObjectsColumnaAsType()),
                                 ('remove_nan_columns', NanColumnsRemover()),
                                 ('feature_union',
                                  PandasFeatureUnion(transformer_list=[('numeric_features',
                                                                        Pipeline(steps=[('typeselector',
                                                                                         TypeSelector(dtype=<class 'numpy.number'>)),
                                                                                        ('pandasimputer',
                                                                                         P...
                                                                                         TypeSelector(dtype='ca

## Saving and loading

In [15]:
best_estimator.predict(X_test)[:10]

array([155139.83813614, 265659.13343567, 123340.81938772, 233302.34950032,
       264883.44259523,  40958.443787  , 219732.23099432, 139473.6863763 ,
        39082.42007581, 151440.70805662])

In [16]:
import joblib
joblib.dump(best_estimator, FINAL_MODEL_PATH)
loaded_model = joblib.load(FINAL_MODEL_PATH)
loaded_model.predict(X_test)[:10]

array([155139.83813614, 265659.13343567, 123340.81938772, 233302.34950032,
       264883.44259523,  40958.443787  , 219732.23099432, 139473.6863763 ,
        39082.42007581, 151440.70805662])

# 2. Stacking

In [17]:
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor

estimators = [
    ('catboost1', CatBoostRegressor(iterations = 1, subsample = 0.5)),
    ('catboost2', CatBoostRegressor(iterations = 1))]

stacking_clf = StackingRegressor(estimators=estimators, final_estimator=CatBoostRegressor(iterations = 1), passthrough = False)


clf = Pipeline([
    ('preprocess', preprocess),
    ('clf',stacking_clf)
])

clf.fit(X_train, y_train).score(X_test, y_test)

Learning rate set to 0.5
0:	learn: 57236.6190786	total: 56ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 57354.0145258	total: 2.08ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 57707.5293576	total: 2.06ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 54470.1848545	total: 2.56ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 56777.5532270	total: 1.57ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 56564.5964865	total: 1.53ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 58429.1562621	total: 1.67ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 57976.8616155	total: 1.57ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 55411.3103182	total: 1.84ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 55782.1724428	total: 2.74ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 56449.9927225	total: 1.65ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 57161.0192619	total: 1.56ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 58014.0234415	total: 1.05ms	rem

0.39323095654411455

In [18]:
clf_pipeline = Pipeline([
    ('preprocess', preprocess),
    ('clf', stacking_clf)
])
params = {'preprocess__remove_nan_columns__p':[0.5],} 
clf = GridSearchCV(clf_pipeline, params)
clf.fit(X_train, y_train)
pd.DataFrame(clf.cv_results_)

Learning rate set to 0.5
0:	learn: 57707.5293576	total: 2.52ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 57976.8616155	total: 1.68ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 54844.6645467	total: 1.6ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 57110.3632825	total: 1.54ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 58309.6755141	total: 1.64ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 56899.6442665	total: 1.45ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 60763.9479788	total: 1.48ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 57040.5110558	total: 2.28ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 57713.0395835	total: 1.44ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 55662.3993835	total: 1.45ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 56766.4293272	total: 1.41ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 58654.4423426	total: 1.44ms	remaining: 0us
Learning rate set to 0.5
0:	learn: 57911.0990377	total: 1.07ms	re

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocess__remove_nan_columns__p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.277341,0.006894,0.062531,0.017356,0.5,{'preprocess__remove_nan_columns__p': 0.5},0.429358,0.468881,0.38445,0.442573,0.497639,0.44458,0.038098,1


In [19]:
clf.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__preprocess', 'estimator__clf', 'estimator__preprocess__memory', 'estimator__preprocess__steps', 'estimator__preprocess__verbose', 'estimator__preprocess__select_columns', 'estimator__preprocess__object_to_cat_or_num', 'estimator__preprocess__remove_nan_columns', 'estimator__preprocess__feature_union', 'estimator__preprocess__select_columns__cols', 'estimator__preprocess__remove_nan_columns__p', 'estimator__preprocess__feature_union__n_jobs', 'estimator__preprocess__feature_union__transformer_list', 'estimator__preprocess__feature_union__transformer_weights', 'estimator__preprocess__feature_union__verbose', 'estimator__preprocess__feature_union__numeric_features', 'estimator__preprocess__feature_union__categorical_features', 'estimator__preprocess__feature_union__boolean_features', 'estimator__preprocess__feature_union__numeric_features__memory', 'estimator__preprocess__feature_uni