In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Read the housing table; the path is resolved against the current working directory.
housing = pd.read_csv('housing.csv')

# Pull the regression target out into its own Series so that the feature frame
# handed to the pre-processing steps contains predictors only.
y = housing.pop('median_house_value')
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY


In [3]:
# Add the combined features
housing['rooms_per_person'] = np.round(housing['total_rooms'] / housing['population'], 2)
housing['rooms_per_person'] = housing['rooms_per_person'].map(lambda x: min(x, 4))
housing['bedrooms_per_room'] = np.round(housing['total_bedrooms'] / housing['total_rooms'], 2)
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_person,bedrooms_per_room
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY,2.73,0.15
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY,2.96,0.16
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY,2.96,0.13
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY,2.28,0.18
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY,2.88,0.17


In [4]:
columns = list(housing)

# Column groups are selected by name rather than by position, so they stay
# correct even if the column order of `housing` changes (e.g. a new feature is
# appended or the CSV gains a column).

# Geographic coordinates: each one is cut into 10 bins and the bin index is
# one-hot encoded into a dense array.
lon_lat_features = ['longitude', 'latitude']
lon_lat_transformer = KBinsDiscretizer(n_bins=10, encode='onehot-dense')

# Single categorical column, one-hot encoded.
# handle_unknown='ignore': a category that never appears in the training split
# is encoded as all zeros at transform time instead of raising an error.
categorical_features = ['ocean_proximity']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Every remaining column is treated as numeric (original column order kept):
# missing values are replaced by the column median, then the column is
# standardised.
_non_numeric = set(lon_lat_features) | set(categorical_features)
numeric_features = [col for col in columns if col not in _non_numeric]
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [5]:
# Route each column group to its own transformer. The transformed groups are
# concatenated in the order listed; columns that belong to no group are dropped
# (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        ('lon_lat', lon_lat_transformer, lon_lat_features),
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
)

##### O

In [6]:
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore every score computed from it, is the same on each
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    housing, y, test_size=0.2, random_state=42
)

In [7]:
# Linear-regression model: the pre-processing step followed by the regressor.
lin_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])
lin_reg.fit(X_train, y_train)

# Predict on both splits first, then score them.
train_predictions = lin_reg.predict(X_train)
test_predictions = lin_reg.predict(X_test)

# RMSE = square root of the mean squared error, in the units of the target.
train_RMSE = np.sqrt(MSE(y_train, train_predictions))
test_RMSE = np.sqrt(MSE(y_test, test_predictions))

print('RMSE on training set:', train_RMSE)
print('RMSE on test set:', test_RMSE)

RMSE on training set: 62776.564458393426
RMSE on test set: 62543.83989329687


In [8]:
# Decision-tree model behind the same pre-processing recipe.
tree_reg = Pipeline([
    # Pipeline fits its steps in place, so this pipeline gets its own unfitted
    # copy of the preprocessor; fitting it no longer re-fits the instance that
    # lin_reg holds.
    ('preprocessor', clone(preprocessor)),
    # Fixed seed so that the fitted tree, and its scores, are reproducible.
    ('regressor', DecisionTreeRegressor(random_state=42))
])

tree_reg.fit(X_train, y_train)
train_predictions = tree_reg.predict(X_train)
train_RMSE = np.sqrt(MSE(y_train, train_predictions))
test_predictions = tree_reg.predict(X_test)
test_RMSE = np.sqrt(MSE(y_test, test_predictions))

# NOTE: no depth/leaf limits are set, so the tree can memorise the training
# rows (a training RMSE of 0.0 is the symptom); judge the model by the test RMSE.
print('RMSE on training set:', train_RMSE)
print('RMSE on test set:', test_RMSE)

RMSE on training set: 0.0
RMSE on test set: 73514.69772422555
