reference: 
1. https://www.kaggle.com/metadist/work-like-a-pro-with-pipelines-and-feature-unions/notebook
2. https://www.kdnuggets.com/2018/01/managing-machine-learning-workflows-scikit-learn-pipelines-part-2.html
3. https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines

In [140]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [141]:
all, test_data = pd.read_csv('train.csv', index_col=0), pd.read_csv('test.csv', index_col=0)

all_data, all_label = all.iloc[:, :-1], all.iloc[:, -1]

In [142]:
cor = all.corr()

threshold = 0.5
label_name = 'SalePrice'
cor_cols = cor.loc[(cor[label_name] >= threshold) | (cor[label_name] <= -threshold)].index.values

cor_cols = cor_cols[:-1]

In [143]:
test_data.shape, all_data.shape

((1459, 79), (1460, 79))

In [144]:
from sklearn.base import BaseEstimator, TransformerMixin

class Conver2Pd(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = pd.DataFrame(X, columns=self.columns)
        return X

class CategoryEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        for col in self.columns:
            X[col] = X[col].astype("category")
            X[col] = X[col].cat.codes            
        return X
    
class FeatureSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
#         X.corr()
        return self
    
    def transform(self, X):           
        return X[self.columns]

In [145]:
cols = all_data.dtypes[ all_data.dtypes == 'object' ].index.values

piple_dt = Pipeline([('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent'))
                     ,('Convert2Pd', Conver2Pd(all_data.columns))
                     ,('encode', CategoryEncoder(cols))
                     ,('featureSelector', FeatureSelector(cor_cols))
                     ,('cart', DecisionTreeRegressor(min_samples_leaf=10))])

piple_svr = Pipeline([('impute', SimpleImputer(missing_values=np.nan, strategy='most_frequent'))
                     ,('Convert2Pd', Conver2Pd(all_data.columns))
                     ,('encode', CategoryEncoder(cols))
#                      ,('featureSelector', FeatureSelector(cor_cols))
                     ,('svr', SVR())])

parameters_dt = [{'cart__min_samples_leaf': np.arange(1, 100, 10)}]
parameters_svr = [{'svr__C': np.logspace(-1, 2, 10)}, {'svr__epsilon': np.arange(0.1, 2, 0.2)}]

In [146]:
grid_search = GridSearchCV(piple_svr, param_grid=parameters_svr, cv=10)
best_estimators = grid_search.fit(all_data, all_label).best_estimator_
print("best estimators ", best_estimators)
print("optimal value ", -np.mean(cross_val_score(best_estimators, all_data, all_label, cv=10, scoring='neg_mean_squared_error')))



















optimal value  6632725434.963904




In [147]:
cor_cols

array(['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF',
       '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars',
       'GarageArea'], dtype=object)