<a href="https://colab.research.google.com/github/datle2403/datle2403/blob/main/Building_pipepline_tuning_grid_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# import dataset
df=pd.read_csv('housing.csv')
housing_y=df['median_house_value']
housing=df.drop('median_house_value',axis=1)

In [3]:
# building pipeline

# adding attribute
from sklearn.base import BaseEstimator, TransformerMixin

# column index
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]


In [4]:
housing_num = housing.drop("ocean_proximity", axis=1)

In [5]:
# pipeline for numerical attribute
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [6]:
# full-pipeline for num, cat attribute
from sklearn.compose import ColumnTransformer
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
housing_prepared = full_pipeline.fit_transform(housing)

In [7]:
#finding feature_importance
#using randome_forest to find
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_y)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [8]:
grid_search.best_estimator_

RandomForestRegressor(max_features=4, n_estimators=30, random_state=42)

In [None]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [12]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([8.43135600e-02, 7.87572836e-02, 4.19832331e-02, 2.13368817e-02,
       1.99793382e-02, 2.28926908e-02, 1.95652588e-02, 2.62745343e-01,
       7.38458374e-02, 1.04520261e-01, 9.58811994e-02, 1.75096163e-02,
       1.35901503e-01, 2.02492170e-04, 8.67515221e-03, 1.18903497e-02])

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin
# feature selector function, k: number of important feature
def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]

In [14]:
k=5

In [15]:
preparation_and_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k))
])

In [16]:
housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)

In [17]:
housing_prepared_top_k_features[:5]

array([[-1.32783522,  2.34476576, -0.04959654, -1.02998783,  0.        ],
       [-1.32284391,  2.33223796, -0.09251223, -0.8888972 ,  0.        ],
       [-1.33282653,  1.7826994 , -0.02584253, -1.29168566,  0.        ],
       [-1.33781784,  0.93296751, -0.0503293 , -0.4496128 ,  0.        ],
       [-1.33781784, -0.012881  , -0.08561576, -0.63908657,  0.        ]])

In [18]:
housing_prepared_top_k_features.shape

(20640, 5)

In [None]:
grid_search.fit(housing_prepared_top_k_features, housing_y)

In [20]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

76131.32579175218 {'max_features': 2, 'n_estimators': 3}
70034.4453949059 {'max_features': 2, 'n_estimators': 10}
68306.59097617044 {'max_features': 2, 'n_estimators': 30}
78846.4218189802 {'max_features': 4, 'n_estimators': 3}
71181.34984673388 {'max_features': 4, 'n_estimators': 10}
69580.08596898144 {'max_features': 4, 'n_estimators': 30}
nan {'max_features': 6, 'n_estimators': 3}
nan {'max_features': 6, 'n_estimators': 10}
nan {'max_features': 6, 'n_estimators': 30}
nan {'max_features': 8, 'n_estimators': 3}
nan {'max_features': 8, 'n_estimators': 10}
nan {'max_features': 8, 'n_estimators': 30}
77954.85987457915 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
71009.49256964032 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
78496.39509032253 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
72906.73428115713 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
79227.37739320414 {'bootstrap': False, 'max_features': 4, 'n_estimators': 3}
74476.1

In [22]:
grid_search.best_params_

{'max_features': 2, 'n_estimators': 30}

In [25]:
from sklearn.linear_model import LinearRegression
preparation_and_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ("forest_reg", RandomForestRegressor(**grid_search.best_params_))
])
print(preparation_and_feature_selection_pipeline.fit(housing,housing_y))
preparation_and_feature_selection_pipeline.predict(housing)

Pipeline(steps=[('preparation',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('attribs_adder',
                                                                   CombinedAttributesAdder()),
                                                                  ('std_scaler',
                                                                   StandardScaler())]),
                                                  ['longitude', 'latitude',
                                                   'housing_median_age',
                                                   'total_rooms',
                                                   'total_bedrooms',
                                                   'population', 'households',
                    

array([458440.2       , 407733.6       , 359660.06666667, ...,
        93953.33333333,  94790.        ,  91076.66666667])

In [27]:
some_data = housing.iloc[:4]
some_labels = housing_y.iloc[:4]

print("Predictions:\t", preparation_and_feature_selection_pipeline.predict(some_data))
print("Labels:\t\t", list(some_labels))

Predictions:	 [458440.2        407733.6        359660.06666667 341116.66666667]
Labels:		 [452600.0, 358500.0, 352100.0, 341300.0]
