In [None]:
import pandas as pd
import numpy as np
import sklearn

# Raw CSV of the housing dataset, fetched straight from GitHub.
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)
# NOTE(review): this preview is not rendered, because it is not the last
# expression in the cell; move it to its own cell if it should be shown.
df.head()

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Separate the predictors from the target on the training split.
housing_labels = train_set["median_house_value"].copy()
housing = train_set.drop(columns="median_house_value")

# Training predictors without the `ocean_proximity` column
# (the one later routed to the one-hot encoder).
housing_num = housing.drop(columns="ocean_proximity")

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
# Positional indices of the rooms / bedrooms / population / households columns
# in the 2-D array handed to `CombinedAttributesAdder.transform`.
# NOTE(review): assumes the column order of `housing_num` — confirm if the
# input columns are ever reordered or dropped.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features derived from existing columns of a 2-D array.

    Always appends ``rooms_per_household`` and ``population_per_household``.
    When ``add_bedrooms_per_room`` is true, ``bedrooms_per_room`` is appended
    as an additional last column.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Whether to append the bedrooms-per-room ratio column.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, x, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, x):
        """Return ``x`` with the ratio columns appended on the right.

        ``x`` must support NumPy-style ``x[:, i]`` indexing (e.g. an ndarray).
        """
        rooms_per_household = x[:, rooms_ix] / x[:, households_ix]
        population_per_household = x[:, population_ix] / x[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = x[:, bedrooms_ix] / x[:, rooms_ix]
            # Fix: the ratio used to be computed but left out of the result,
            # so the flag had no effect on the output.
            return np.c_[x, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        return np.c_[x, rooms_per_household, population_per_household]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Preprocessing chain for the numeric columns, applied in order.
num_pipeline = Pipeline(
    steps=[
        # 1. Replace missing values with the per-column median.
        ("imputer", SimpleImputer(strategy="median")),
        # 2. Append the engineered ratio columns.
        ("attribs_adder", CombinedAttributesAdder(add_bedrooms_per_room=True)),
        # 3. Standardize every column.
        ("std_scaler", StandardScaler()),
    ]
)
# Fit on the numeric training features and display the transformed array.
num_pipeline.fit_transform(housing_num)

array([[ 1.27258656, -1.3728112 ,  0.34849025, ..., -0.326196  ,
        -0.17491646,  0.05137609],
       [ 0.70916212, -0.87669601,  1.61811813, ..., -0.03584338,
        -0.40283542, -0.11736222],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.14470145,
         0.08821601, -0.03227969],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ..., -0.49697313,
        -0.60675918,  0.02030568],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.96545045,
         0.40217517,  0.00707608],
       [-1.41489815,  0.99543676,  1.85617335, ..., -0.68544764,
        -0.85144571, -0.08535429]])

In [None]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the preprocessing.
cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()

# Numeric columns go through `num_pipeline`; the categorical column is
# one-hot encoded. The two results are concatenated column-wise.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)

In [None]:
# Fit the full ColumnTransformer on the training predictors and keep the
# transformed feature matrix.
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
# Peek at the first five rows (all columns) of the prepared feature matrix.
housing_prepared[0:5,:]

array([[ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
         0.76827628,  0.32290591, -0.326196  , -0.17491646,  0.05137609,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813,  0.34029326,  0.59309419,
        -0.09890135,  0.6720272 , -0.03584338, -0.40283542, -0.11736222,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, -0.34259695, -0.49522582,
        -0.44981806, -0.43046109,  0.14470145,  0.08821601, -0.03227969,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ],
       [ 1.23269811, -1.38217186,  0.58654547, -0.56148971, -0.40930582,
        -0.00743434, -0.38058662, -1.01786438, -0.60001532,  0.07750687,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ],
       [-0.10855122,  0.5320839 ,  1.14200767, -0.11956547, -0.25655915,
        -0.48587717, -0.31496232, -0.17148831, 