In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, StandardScaler, RobustScaler

In [2]:
# Load the raw dataset. The path is relative, so housing.csv has to be in the
# kernel's working directory; the saved output of this cell is a
# FileNotFoundError, i.e. the file was missing when the notebook was last run.
housing = pd.read_csv("housing.csv")

FileNotFoundError: [Errno 2] File b'housing.csv' does not exist: b'housing.csv'

In [None]:
# Overview of the columns: names, dtypes and non-null counts
housing.info()

In [None]:
# Summary statistics of the numeric columns
housing.describe()

In [None]:
# Split off the regression target first, so the pre-processing steps that
# follow only ever touch the input features
y = housing['median_house_value']
housing = housing.drop(columns=['median_house_value'])

In [None]:
def run_linear_reg(X, y):
    """Fit a linear regression on (X, y) and return its RMSE.

    The model is scored on the same rows it was fitted on, so the value
    returned is the in-sample (training) error, not a held-out estimate.

    Parameters
    ----------
    X : feature matrix accepted by ``LinearRegression.fit``
    y : target values, one per row of ``X``

    Returns
    -------
    The square root of the mean squared error between ``y`` and the
    model's predictions for ``X``.
    """
    model = LinearRegression().fit(X, y)
    squared_error = MSE(y, model.predict(X))
    return np.sqrt(squared_error)

Imputation of missing values

In [None]:
# Learn the median of total_bedrooms, then use it to fill the missing entries
imputer = SimpleImputer(strategy='median').fit(housing[['total_bedrooms']])
housing['total_bedrooms'] = imputer.transform(housing[['total_bedrooms']])
# The non-null count of total_bedrooms should now match the other columns
housing.info()

In [None]:
# RMSE after only imputation (ignoring the ocean_proximity feature, which has
# not been encoded yet at this point)
temp = housing.drop(columns=['ocean_proximity'])
run_linear_reg(temp, y)

One-hot encoding of the ocean_proximity feature

In [None]:
# Use the encoder's default (sparse) output and densify it explicitly. The
# keyword that requests a dense array was renamed across scikit-learn releases
# (`sparse` -> `sparse_output`), whereas `.toarray()` works with either.
encoder = OneHotEncoder()
ocean_proximity_values = encoder.fit_transform(housing[['ocean_proximity']]).toarray()

# `categories_` holds one array per encoded input column. Take the single
# entry: passing the list itself makes pandas build a one-level MultiIndex, so
# the new column labels would be tuples instead of plain strings.
ocean_proximity_labels = encoder.categories_[0]
ocean_proximity_df = pd.DataFrame(ocean_proximity_values,
                                  columns=ocean_proximity_labels,
                                  index=housing.index)  # keep rows aligned for the concat

# Swap the text column for its indicator columns
housing = housing.drop('ocean_proximity', axis=1)
housing = pd.concat([housing, ocean_proximity_df], axis=1)
housing.head()

In [None]:
# In-sample RMSE after one-hot encoding the ocean_proximity feature
run_linear_reg(housing, y)

Discretization of the longitude and latitude features

In [None]:
# Number of bins requested for each coordinate
N_BINS = 10

# Bin each coordinate on its own and one-hot encode the bin membership.
binned_frames = {}
for feature in ('longitude', 'latitude'):
    # A fresh discretizer per feature; after the loop `encoder` is the one
    # fitted on latitude.
    encoder = KBinsDiscretizer(n_bins=N_BINS, encode='onehot-dense')
    binned_values = encoder.fit_transform(housing[[feature]])
    # Size the labels from the columns actually produced rather than from
    # N_BINS: per the scikit-learn docs the discretizer may drop bins that are
    # too narrow, and a fixed-length label list would then not fit the data.
    bin_labels = [f'{feature}_{i}' for i in range(binned_values.shape[1])]
    binned_frames[feature] = pd.DataFrame(binned_values,
                                          columns=bin_labels,
                                          index=housing.index)

longitude_df = binned_frames['longitude']
latitude_df = binned_frames['latitude']

# Swap the raw coordinates for their indicator columns
housing = housing.drop(['longitude', 'latitude'], axis=1)
housing = pd.concat([housing, longitude_df, latitude_df], axis=1)
housing.head()

In [None]:
# In-sample RMSE after discretizing longitude and latitude
run_linear_reg(housing, y)

Feature extraction

In [None]:
# Derive two ratio features from the raw counts, rounded to two decimals
housing['rooms_per_person'] = housing['total_rooms'].div(housing['population']).round(2)
housing['bedrooms_per_room'] = housing['total_bedrooms'].div(housing['total_rooms']).round(2)
# Correlation of every feature with the target, strongest positive first
housing.corrwith(y).sort_values(ascending=False)

In [None]:
# In-sample RMSE after adding rooms_per_person and bedrooms_per_room
run_linear_reg(housing, y)

Cap outliers

In [None]:
# Inspect the per-column min/max and quartiles before capping anything
housing.describe()

In [None]:
# Cap rooms per person to 4
# NOTE(review): the threshold is the author's choice; presumably read off the
# describe() output of the previous cell - confirm.
ROOMS_PER_PERSON_CAP = 4
# Vectorized clip instead of a Python-level lambda per row; values above the
# cap are replaced by it, everything else (including NaN) is left untouched.
housing['rooms_per_person'] = housing['rooms_per_person'].clip(upper=ROOMS_PER_PERSON_CAP)
housing.describe()

In [None]:
# In-sample RMSE after capping rooms_per_person
run_linear_reg(housing, y)

Feature scaling

In [None]:
# Standardize every feature column with StandardScaler's default settings
scaler = StandardScaler()
scaled_values = scaler.fit_transform(housing)
# Rebuild the frame around the scaled values, carrying over both the column
# labels and the row index so the rows stay aligned with y.
housing = pd.DataFrame(scaled_values,
                       columns=housing.columns,
                       index=housing.index)
housing.head()

In [None]:
# Final result: in-sample RMSE with every pre-processing step applied
run_linear_reg(housing, y)