## Define Constants

In [78]:
import os
import random

# Single seed for the notebook: it seeds Python's built-in `random` module here
# and is reused as `random_state` for `train_test_split` further down.
random_seed = 32
random.seed(random_seed)

## Load Data

In [79]:
import pandas as pd

# Load the raw housing table; the path is relative, so it is resolved against
# the kernel's current working directory.
housing = pd.read_csv("../datasets/housing/housing.csv")

## Analyzing the Data

In [80]:
# Preview the first rows to see which columns exist and what their values look like.
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [81]:
# Column dtypes and non-null counts: shows which columns are numeric and
# whether any column has missing entries.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [82]:
# Number of missing values per column.
housing.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [83]:
# Distinct categories found in the ocean_proximity column.
set(housing["ocean_proximity"].values)

{'<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'}

## Define Target and Explanatory Variables

In [84]:
import numpy as np

# Column the model is trained to predict.
target_variable = "median_house_value"

# Every remaining column of the frame is used as a predictor.
explanatory_variables = list(housing.columns)
explanatory_variables.remove(target_variable)

## Define Training and Test Sets

In [85]:
# Split the frame column-wise into the predictors (X) and the value to predict (y).
X = housing[explanatory_variables]
y = housing[target_variable]

# Report the dimensions of both pieces.
for part in (X, y):
    print(part.shape)

(20640, 9)
(20640,)


In [86]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the shared seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_seed,
)

# Report the dimensions of each piece, training set first.
for subset in (X_train, y_train, X_test, y_test):
    print(subset.shape)

(16512, 9)
(16512,)
(4128, 9)
(4128,)


## Define Preprocessing Pipeline

#### Define Numerical and Categorical Variables

In [87]:
# The categorical columns are listed by hand; every other predictor is
# treated as numerical, keeping the order of `explanatory_variables`.
categorical_variables = ["ocean_proximity"]
numerical_variables = [
    column
    for column in explanatory_variables
    if column not in categorical_variables
]

print(categorical_variables, numerical_variables, sep="\n")

['ocean_proximity']
['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']


#### Define Custom Transformer

In [126]:
from sklearn.base import BaseEstimator, TransformerMixin

class MyTransformer(BaseEstimator, TransformerMixin):
    """Append per-household ratio features to a numeric feature matrix.

    Three columns are added to the right of the input, in this order:
    rooms per household, bedrooms per household, population per household.

    The ``*_IDX`` constants are positions within ``numerical_variables``
    (longitude, latitude, housing_median_age, total_rooms, total_bedrooms,
    population, households, median_income). The transformer must therefore
    receive exactly those columns in that order; if the column list changes,
    the constants have to be updated to match.
    """

    # Column positions inside the incoming numeric matrix.
    HOUSEHOLDS_IDX = 6
    TOTAL_ROOMS_IDX = 3
    TOTAL_BEDROOMS_IDX = 4
    POPULATION_IDX = 5

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged.

        ``y`` is accepted and ignored so the signature matches what a
        scikit-learn ``Pipeline`` passes to its steps.
        """
        return self

    def transform(self, X):
        """Return ``X`` with the three ratio columns appended.

        Parameters
        ----------
        X : 2-D array-like of shape (n_samples, n_features)
            Numeric features laid out as described in the class docstring.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_features + 3)
        """
        # Positional slicing (X[:, i]) only works on arrays; coercing first
        # lets the transformer accept DataFrames and nested lists as well.
        # For an ndarray input this is a no-op.
        X = np.asarray(X)

        # NOTE: a row with zero households yields inf/nan in the new columns;
        # no guard is applied here.
        households = X[:, self.HOUSEHOLDS_IDX]
        rooms_per_household = X[:, self.TOTAL_ROOMS_IDX] / households
        bedrooms_per_household = X[:, self.TOTAL_BEDROOMS_IDX] / households
        population_per_household = X[:, self.POPULATION_IDX] / households

        # column_stack turns each 1-D ratio into a column and appends it to X.
        return np.column_stack((X,
                                rooms_per_household,
                                bedrooms_per_household,
                                population_per_household))

#### Define Numerical Pipeline

In [127]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Order matters: missing values are filled first so that the ratio features
# computed by MyTransformer are defined for every row, and scaling comes last
# so the engineered columns are standardised together with the original ones.
numerical_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("my_transformer", MyTransformer()),
    ("scaler", StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

#### Combine Numerical and Categorical Preprocessing

In [128]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Route each column group to its own preprocessing; the outputs are
# concatenated in the order listed (numeric block first, one-hot block second).
preprocessing_pipeline = ColumnTransformer([
    ("num_pipeline", numerical_pipeline, numerical_variables),
    # handle_unknown="ignore": a category that was absent when fitting (e.g. a
    # rare one that only landed in the test split or in one CV fold) is encoded
    # as all zeros instead of raising at transform/predict time.
    ("encoder", OneHotEncoder(handle_unknown="ignore"), categorical_variables)
])

In [129]:
# Fit the preprocessing on the training predictors only and inspect the result.
X_train_transformed = preprocessing_pipeline.fit_transform(X_train)
X_train_transformed

array([[-0.61053778,  1.34571519, -0.44773646, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.82224293, -0.93015644,  0.50551543, ...,  0.        ,
         0.        ,  0.        ],
       [-1.34440107,  1.01791475, -1.48042601, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.78871286,  1.63605272,  0.34664011, ...,  0.        ,
         0.        ,  0.        ],
       [-0.91007382,  0.34826528,  0.82326606, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.842212  , -0.87864494,  0.42607777, ...,  0.        ,
         0.        ,  0.        ]])

In [130]:
# Columns expected: the numerical variables, plus the 3 ratios added by
# MyTransformer, plus one one-hot column per ocean_proximity category.
print(X_train_transformed.shape)

(16512, 16)


In [131]:
# Sanity check: count NaNs left in the transformed matrix after imputation.
np.isnan(X_train_transformed).sum()

0

## Define Final Pipeline

In [132]:
from sklearn.linear_model import LinearRegression

# Chain preprocessing and the model so both are fitted and applied as one
# estimator, directly on the raw predictor columns.
regressor_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing_pipeline),
        ("regressor", LinearRegression()),
    ]
)

In [133]:
# Fit preprocessing and regressor together on the training split.
regressor_pipeline.fit(X_train, y_train)

In [134]:
# Score on the data the model was fitted on; for scikit-learn regressors
# `score` reports the coefficient of determination (R^2).
regressor_pipeline.score(X_train, y_train)

0.6501036450731024

In [135]:
# Same score on the held-out split, for comparison with the training score.
regressor_pipeline.score(X_test, y_test)

0.6234271879494226

In [136]:
# Predicted median house values for the training rows.
predictions_train = regressor_pipeline.predict(X_train)
predictions_train

array([101527.04535519, 237385.2033671 , 107948.97007421, ...,
       137596.10542161, 231307.11042126, 251907.6622615 ])

In [137]:
# Predicted median house values for the held-out rows.
predictions_test = regressor_pipeline.predict(X_test)
predictions_test

array([ 56871.21851644, 226332.23752046, 242297.26037942, ...,
       176840.71008199, 131000.87554031, 236905.10112538])

In [138]:
from sklearn.metrics import mean_squared_error as mse 

In [139]:
# Root mean squared error on the training split (same units as the target).
np.sqrt(mse(y_train, predictions_train))

68027.70633334377

In [140]:
# Root mean squared error on the held-out split (same units as the target).
np.sqrt(mse(y_test, predictions_test))

71755.78885808328

In [141]:
# Fitted linear coefficients, one per column of the preprocessed matrix.
regressor_pipeline.named_steps["regressor"].coef_

array([-56129.26737568, -57789.04349646,  12954.93765894, -12121.09288829,
        24409.3206628 , -49657.93750359,  42439.50904387,  71737.75614611,
         4731.00873036,   -611.8518738 ,   2565.77775251, -26237.12019129,
       -66169.31155292, 146146.6998871 , -30386.13739327, -23354.13074962])