In [34]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
housing = pd.read_csv("misc/housing.csv")

In [3]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [5]:
# Remove the target column before pre-processing
y = housing['median_house_value']
housing = housing.drop('median_house_value', axis=1)

In [6]:
def run_linear_reg(X, y):
    lin_reg = LinearRegression()
    lin_reg.fit(X, y)
    predictions = lin_reg.predict(X)
    RMSE = np.sqrt(MSE(y, predictions))
    return RMSE

Imputation of missing values

In [7]:
imputer = SimpleImputer(strategy='median')
housing['total_bedrooms'] = imputer.fit_transform(housing[['total_bedrooms']])
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20640 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(8), object(1)
memory usage: 1.4+ MB


In [8]:
# RMSE after only imputation (ignoriong the ocean_proximity feature)
temp = housing.drop('ocean_proximity', axis=1)
run_linear_reg(temp, y)

69658.1903557702

One-hot encoding of the ocean_proximity feature

In [9]:
encoder = OneHotEncoder(sparse=False)
ocean_proximity_values = encoder.fit_transform(housing[['ocean_proximity']])
ocean_proximity_labels = encoder.categories_
ocean_proximity_df = pd.DataFrame(ocean_proximity_values, 
                                  columns=ocean_proximity_labels)

housing = housing.drop('ocean_proximity', axis=1)
housing = pd.concat([housing, ocean_proximity_df], axis=1)
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,"(<1H OCEAN,)","(INLAND,)","(ISLAND,)","(NEAR BAY,)","(NEAR OCEAN,)"
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,0.0,0.0,0.0,1.0,0.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,0.0,0.0,0.0,1.0,0.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,0.0,0.0,0.0,1.0,0.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,0.0,0.0,0.0,1.0,0.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,0.0,0.0,0.0,1.0,0.0


In [10]:
# RMSE after encoding the ocean proximity
run_linear_reg(housing, y)

68709.3255776217

Discretization of the longitude and latitude features

In [11]:
from sklearn.preprocessing import KBinsDiscretizer

encoder = KBinsDiscretizer(n_bins=10, encode='onehot-dense')

longitude_values = encoder.fit_transform(housing[['longitude']])
longitude_labels = [f'longitude_{i}' for i in range(10)]
longitude_df = pd.DataFrame(longitude_values, 
                            columns=longitude_labels)

latitude_values = encoder.fit_transform(housing[['latitude']])
latitude_labels = [f'latitude_{i}' for i in range(10)]
latitude_df = pd.DataFrame(latitude_values, 
                           columns=latitude_labels)

housing = housing.drop(['longitude', 'latitude'], axis=1)
housing = pd.concat([housing, longitude_df, latitude_df], axis=1)
housing.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,"(<1H OCEAN,)","(INLAND,)","(ISLAND,)","(NEAR BAY,)",...,latitude_0,latitude_1,latitude_2,latitude_3,latitude_4,latitude_5,latitude_6,latitude_7,latitude_8,latitude_9
0,41.0,880.0,129.0,322.0,126.0,8.3252,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,52.0,1467.0,190.0,496.0,177.0,7.2574,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,52.0,1274.0,235.0,558.0,219.0,5.6431,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,52.0,1627.0,280.0,565.0,259.0,3.8462,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
run_linear_reg(housing, y)

65578.56633900401

Feature extraction

In [13]:
housing['rooms_per_person'] = np.round(housing['total_rooms'] / housing['population'], 2)
housing['bedrooms_per_room'] = np.round(housing['total_bedrooms'] / housing['total_rooms'], 2)
housing.corrwith(y).sort_values(ascending=False)

median_income         0.688075
(<1H OCEAN,)          0.256617
longitude_5           0.250819
rooms_per_person      0.209483
(NEAR BAY,)           0.160284
(NEAR OCEAN,)         0.141862
total_rooms           0.134153
latitude_7            0.121279
latitude_4            0.120799
longitude_1           0.116234
housing_median_age    0.105623
longitude_0           0.101134
latitude_1            0.089770
households            0.065843
latitude_6            0.065464
longitude_7           0.064645
latitude_3            0.050537
total_bedrooms        0.049457
(ISLAND,)             0.023416
latitude_0            0.002911
longitude_8          -0.002722
population           -0.024650
latitude_8           -0.025595
longitude_2          -0.032645
latitude_2           -0.042314
longitude_6          -0.067724
longitude_4          -0.090616
longitude_9          -0.125899
latitude_5           -0.130278
longitude_3          -0.203610
bedrooms_per_room    -0.232856
latitude_9           -0.251953
(INLAND,

In [14]:
run_linear_reg(housing, y)

64783.4390515549

Remove outliers

In [15]:
housing.describe()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,"(<1H OCEAN,)","(INLAND,)","(ISLAND,)","(NEAR BAY,)",...,latitude_2,latitude_3,latitude_4,latitude_5,latitude_6,latitude_7,latitude_8,latitude_9,rooms_per_person,bedrooms_per_room
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,...,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,28.639486,2635.763081,536.838857,1425.476744,499.53968,3.870671,0.442636,0.317393,0.000242,0.11095,...,0.098643,0.101114,0.100194,0.100242,0.099176,0.100242,0.099612,0.101066,1.976998,0.213786
std,12.585558,2181.615252,419.391878,1132.462122,382.329753,1.899822,0.49671,0.465473,0.015563,0.314077,...,0.29819,0.301487,0.300265,0.30033,0.298906,0.30033,0.29949,0.301423,1.145992,0.065291
min,1.0,2.0,1.0,3.0,1.0,0.4999,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04
25%,18.0,1447.75,297.0,787.0,280.0,2.5634,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.52,0.18
50%,29.0,2127.0,435.0,1166.0,409.0,3.5348,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.94,0.2
75%,37.0,3148.0,643.25,1725.0,605.0,4.74325,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.3,0.24
max,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,55.22,2.82


In [16]:
# Cap rooms per person to 4
housing['rooms_per_person'] = housing['rooms_per_person'].map(lambda x: min(x, 4))
housing.describe()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,"(<1H OCEAN,)","(INLAND,)","(ISLAND,)","(NEAR BAY,)",...,latitude_2,latitude_3,latitude_4,latitude_5,latitude_6,latitude_7,latitude_8,latitude_9,rooms_per_person,bedrooms_per_room
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,...,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,28.639486,2635.763081,536.838857,1425.476744,499.53968,3.870671,0.442636,0.317393,0.000242,0.11095,...,0.098643,0.101114,0.100194,0.100242,0.099176,0.100242,0.099612,0.101066,1.928969,0.213786
std,12.585558,2181.615252,419.391878,1132.462122,382.329753,1.899822,0.49671,0.465473,0.015563,0.314077,...,0.29819,0.301487,0.300265,0.30033,0.298906,0.30033,0.29949,0.301423,0.620104,0.065291
min,1.0,2.0,1.0,3.0,1.0,0.4999,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04
25%,18.0,1447.75,297.0,787.0,280.0,2.5634,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.52,0.18
50%,29.0,2127.0,435.0,1166.0,409.0,3.5348,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.94,0.2
75%,37.0,3148.0,643.25,1725.0,605.0,4.74325,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.3,0.24
max,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,2.82


In [17]:
run_linear_reg(housing, y)

62969.55412746703

Feature scaling

In [18]:
scaler = StandardScaler()
scaled_values = scaler.fit_transform(housing)
housing = pd.DataFrame(scaled_values,
                       columns=housing.columns)
housing.head()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,"(<1H OCEAN,)","(INLAND,)","(ISLAND,)","(NEAR BAY,)",...,latitude_2,latitude_3,latitude_4,latitude_5,latitude_6,latitude_7,latitude_8,latitude_9,rooms_per_person,bedrooms_per_room
0,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,-0.891156,-0.681889,-0.015566,2.830742,...,-0.330815,-0.335393,-0.333692,-0.333782,-0.331806,-0.333782,3.006478,-0.335304,1.2918,-0.976975
1,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,-0.891156,-0.681889,-0.015566,2.830742,...,-0.330815,-0.335393,-0.333692,-0.333782,-0.331806,-0.333782,3.006478,-0.335304,1.662714,-0.82381
2,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,-0.891156,-0.681889,-0.015566,2.830742,...,-0.330815,-0.335393,-0.333692,-0.333782,-0.331806,-0.333782,3.006478,-0.335304,1.662714,-1.283304
3,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,-0.891156,-0.681889,-0.015566,2.830742,...,-0.330815,-0.335393,-0.333692,-0.333782,-0.331806,-0.333782,3.006478,-0.335304,0.566098,-0.51748
4,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,-0.891156,-0.681889,-0.015566,2.830742,...,-0.330815,-0.335393,-0.333692,-0.333782,-0.331806,-0.333782,3.006478,-0.335304,1.533701,-0.670645


In [19]:
# Final result
run_linear_reg(housing, y)

62969.57705222518

In [36]:
housing = pd.read_csv("misc/housing.csv")

impute_bedrooms = Pipeline([
    ("impute_total_bedrooms",SimpleImputer(strategy='median'))
])

one_hot_encoder =  Pipeline([
    ("ocean_proximity_one_hot_encoder", OneHotEncoder(sparse=False)),
])

preprocessor = ColumnTransformer([
    ('fix_bedrooms', impute_bedrooms, ['total_bedrooms']),
    ('ocean_prox_ohe', one_hot_encoder, ['ocean_proximity'])
])


clf = Pipeline([
    ('pre', preprocessor),
    ('cls', LinearRegression())
])

x = housing.drop('median_house_value', axis= 1)
y = housing[['median_house_value']]

X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.2)

clf.fit(X_train, y_train)
print(f"Model score: {np.round(clf.score(X_test, y_test), 3)}")

# encoder = OneHotEncoder(sparse=False).fit_transform(housing[['ocean_proximity']])
# ocean_proximity_values = 
# ocean_proximity_labels = encoder.categories_
# ocean_proximity_df = pd.DataFrame(ocean_proximity_values, 
#                                   columns=ocean_proximity_labels)

# housing = housing.drop('ocean_proximity', axis=1)
# housing = pd.concat([housing, ocean_proximity_df], axis=1)
# housing.head()


Model score: 0.242
