In [1]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
import seaborn as sns

In [3]:
df = sns.load_dataset('tips')

In [4]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [8]:
X= df.iloc[:,1:]
y= df['total_bill']

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3, random_state =14)

In [10]:
##Pipelining
numeric_pipeline = Pipeline([
    ('Imputation mean', SimpleImputer(missing_values = np.nan, strategy = 'mean')),
    ('scaler', StandardScaler()),
])

In [12]:
categorical_preprocessing = Pipeline([
    ('imputation_constant', SimpleImputer(fill_value = "missing", strategy = "constant")),
    ("onehot", OneHotEncoder(handle_unknown= "ignore"))
])

In [13]:
preprocessor = ColumnTransformer([
    ('categorical', categorical_preprocessing, ["sex","smoker","day","time"]),
    ('numerical', numeric_pipeline, ["tip","size"])
])


In [19]:
pipe = Pipeline(
    steps = [("preproceessor", preprocessor),("regressor", RandomForestRegressor())]
)

In [20]:
pipe

In [21]:
pipe.fit(X_train,y_train)

In [22]:
pipe.predict(X_test)

array([20.4111    , 17.7827    , 21.3632    , 35.2802    , 19.01318667,
       38.8508    , 33.9454    , 44.725     , 17.431465  , 23.36515667,
       19.2509    , 41.3722    , 35.592     , 26.60445   , 16.9005    ,
       11.4451    , 28.039     , 19.4231    , 17.326745  , 18.6536    ,
       19.5399    , 44.1262    , 19.86      , 14.9885    , 37.4944    ,
       17.6539    , 12.285     , 21.0831    , 19.428     ,  9.94708571,
       19.8266    , 20.1755    , 20.8999    , 13.70481786, 14.28646988,
       29.23159095, 21.1213    , 22.3563    , 17.6868    , 13.05041786,
       20.6623    , 11.73227667, 22.19353333, 13.70481786, 14.13666667,
       17.83513667, 18.00115   , 34.17      , 14.25213333, 22.60785   ,
       40.8168    , 12.5122    , 20.53307   , 35.8828    , 15.2626    ,
       25.6283    , 20.54782   , 19.55569   , 17.9573    , 20.621     ,
       15.11      , 17.29545   , 13.0849    , 22.055     , 25.8676    ,
       17.80025   , 31.7902    , 21.6219    , 17.38814   , 26.78

In [23]:
import warnings
warnings.filterwarnings('ignore')

In [24]:
##Hyperparameter Tuning
param_grid = {
    "regressor__n_estimators": [200,500],
    "regressor__max_features": ["auto","sqrt","log2"],
    "regressor__max_depth": [4,5,6,7,8]
    }

In [25]:
grid_search = GridSearchCV(pipe, param_grid = param_grid, n_jobs =1)

In [27]:
grid_search.fit(X_train,y_train)

In [29]:
grid_search.best_params_

{'regressor__max_depth': 4,
 'regressor__max_features': 'auto',
 'regressor__n_estimators': 500}

In [30]:
pipe = Pipeline(
    steps = [("preproceessor", preprocessor),("regressor", RandomForestRegressor(max_depth=4,
                                                                                 max_features='auto',
                                                                                 n_estimators=500))]
)

In [31]:
pipe.fit(X_train,y_train)

In [32]:
pipe.predict(X_test)

array([19.83247082, 18.25563773, 22.13510785, 33.96848229, 18.71363762,
       37.94851475, 32.82331021, 42.39521764, 16.67591401, 23.97797614,
       21.31693319, 39.50917702, 33.20632616, 22.29928643, 19.41086019,
       11.08140091, 27.2171784 , 19.6705222 , 14.36959148, 19.17555512,
       19.66358957, 43.71816317, 20.80034946, 16.85806905, 37.04385011,
       19.30031163, 14.1832552 , 22.43327816, 19.96240411, 11.01062954,
       19.66070395, 21.89172889, 18.43142178, 14.64262281, 13.59711556,
       26.91594642, 20.86370551, 21.99428406, 18.36418389, 13.4959862 ,
       21.76729174, 12.83935628, 21.16960217, 14.64262281, 12.83840744,
       16.4758049 , 14.31782658, 33.24175398, 15.78128407, 18.48705776,
       39.44795228, 14.77656524, 18.65204549, 34.01563178, 15.85240185,
       25.46617105, 18.66792735, 20.91200135, 16.95866176, 21.58667078,
       19.04226457, 17.67603039, 13.35605194, 23.54266784, 26.70004918,
       19.43698555, 31.5698294 , 22.88750539, 18.73365322, 27.96