In [57]:
import pandas as pd
import numpy as np

In [58]:
data = pd.read_csv('testing.csv')
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [59]:
data.shape

(20640, 10)

The dataset contains 20640 observations and 10 attributes (9 predictors and 1 response). Below is a list of the variables with descriptions taken from the original Kaggle site given above.

* **longitude**: A measure of how far west a house is; a higher value is farther west
* **latitude**: A measure of how far north a house is; a higher value is farther north
* **housing_median_age**: Median age of a house within a block; a lower number is a newer building
* **total_rooms**: Total number of rooms within a block
* **total_bedrooms**: Total number of bedrooms within a block
* **population**: Total number of people residing within a block
* **households**: Total number of households, a group of people residing within a home unit, for a block
* **median_income**: Median income for households within a block of houses (measured in tens of thousands of US Dollars)
* **ocean_proximity**: Location of the house with respect to the ocean/sea
* **median_house_value**: Median house value for households within a block (measured in US Dollars)

In [60]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# For this dataset nothing is removed: the row count is the same before and after.
data = data.drop_duplicates(keep='first')
data.shape

(20640, 10)

In [61]:
#Split the data into test and train data

from sklearn.model_selection import train_test_split

In [62]:
train, test = train_test_split(data, test_size =0.1, random_state = 20) #using 10% of the data as the test data

In [63]:
print('The shape of train data is: {}\nThe shape of test data is: {}'.format(train.shape[0], test.shape[0]))

The shape of train data is: 18576
The shape of test data is: 2064


In [64]:
train_data = train.copy()
test_data = test.copy()

1. Perform Your EDA
2. Check and Fill Missing Data

In [65]:
train_data.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        186
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [66]:
test_data.isna().sum()

longitude              0
latitude               0
housing_median_age     0
total_rooms            0
total_bedrooms        21
population             0
households             0
median_income          0
median_house_value     0
ocean_proximity        0
dtype: int64

In [67]:
# Fill missing values in 'total_bedrooms'.
# The median is computed on the training split only and reused for the test
# split, so no test-set statistic is used for imputation.
train_median = train_data['total_bedrooms'].median()

# Assign the filled column back rather than calling fillna(..., inplace=True)
# on the selected column: that chained in-place form acts on an intermediate
# object and does not update the frame when pandas copy-on-write is active.
train_data['total_bedrooms'] = train_data['total_bedrooms'].fillna(train_median)
test_data['total_bedrooms'] = test_data['total_bedrooms'].fillna(train_median)

3. Feature Engineering

In [68]:
train_data['ocean_proximity'].value_counts()

<1H OCEAN     8231
INLAND        5896
NEAR OCEAN    2384
NEAR BAY      2061
ISLAND           4
Name: ocean_proximity, dtype: int64

In [69]:
test_data['ocean_proximity'].value_counts()

<1H OCEAN     905
INLAND        655
NEAR OCEAN    274
NEAR BAY      229
ISLAND          1
Name: ocean_proximity, dtype: int64

In [70]:
def convert_ocean(DF):
    """Replace the text labels in ``DF['ocean_proximity']`` with integer codes.

    The frame is modified in place (the column is overwritten) and nothing is
    returned. Labels that are not in the mapping are left unchanged.

    Mapping: 'ISLAND' -> 7, 'NEAR BAY' -> 4, 'NEAR OCEAN' -> 3,
    '<1H OCEAN' -> 2, 'INLAND' -> 1.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame containing an 'ocean_proximity' column.
    """
    # NOTE(review): these codes impose a numeric order and spacing on a
    # categorical label; confirm that is intended for the downstream model.
    ocean_codes = {'ISLAND': 7, 'NEAR BAY': 4, 'NEAR OCEAN': 3, '<1H OCEAN': 2, 'INLAND': 1}

    # Assign the result back instead of using replace(..., inplace=True) on the
    # selected column: the chained in-place call works on an intermediate object
    # and leaves DF untouched when pandas copy-on-write is active.
    DF['ocean_proximity'] = DF['ocean_proximity'].replace(ocean_codes)

In [71]:
convert_ocean(train_data)

In [72]:
convert_ocean(test_data)

In [73]:
train_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
8101,-118.21,33.8,41.0,1251.0,279.0,1053.0,278.0,3.2778,150800.0,3
9757,-121.44,36.51,31.0,1636.0,380.0,1468.0,339.0,3.2219,114700.0,2
16837,-122.48,37.59,29.0,5889.0,959.0,2784.0,923.0,5.3991,273000.0,3
11742,-121.15,38.91,23.0,1654.0,299.0,787.0,299.0,4.2723,193100.0,1
1871,-119.95,38.94,24.0,2180.0,517.0,755.0,223.0,2.5875,173400.0,1


4. Preprocess your Data

In [74]:
train_data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [75]:
# Predictor columns passed to the scaler and the model.
input_col = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity',
]

# Response column the model is trained to predict.
target_col = 'median_house_value'

In [88]:
from sklearn.preprocessing import MinMaxScaler

In [89]:
scaler = MinMaxScaler()

In [90]:
# Fit the min-max scaler on the training frame, then apply the same fitted
# ranges to the test frame so the test data does not influence the scaling.
# NOTE(review): both frames are overwritten, so the unscaled feature values are
# no longer available from train_data / test_data after this cell.
# NOTE(review): the scaler is fit on all of train_data, before it is split into
# X_train / X_val later on, so validation rows contribute to the fitted
# min/max. Consider fitting on X_train only.
train_data[input_col] = scaler.fit_transform(train_data[input_col])
test_data[input_col] = scaler.transform(test_data[input_col])

In [91]:
train_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
8101,0.622718,0.1339,0.784314,0.031767,0.043141,0.036761,0.045552,0.191577,150800.0,0.333333
9757,0.295132,0.421892,0.588235,0.041559,0.058814,0.05129,0.055583,0.187722,114700.0,0.166667
16837,0.189655,0.536663,0.54902,0.149728,0.148665,0.097364,0.15162,0.337871,273000.0,0.333333
11742,0.324544,0.676939,0.431373,0.042016,0.046245,0.027448,0.049005,0.260162,193100.0,0.0
1871,0.446247,0.680128,0.45098,0.055394,0.080074,0.026328,0.036507,0.14397,173400.0,0.0


5. Creating The Holdout set

In [92]:
X = train_data[input_col] #Splitting the train data into train and validation set
y = train_data[target_col]

In [94]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state=42)

In [95]:
print(f'The shape of Train set is: {X_train.shape[0]}\nThe shape of Validation set is: {X_val.shape[0]}')

The shape of Train set is: 14860
The shape of Validation set is: 3716


6. Creating A Baseline Model

In [96]:
from sklearn.linear_model import LinearRegression

In [97]:
model = LinearRegression()

In [98]:
model.fit(X_train, y_train)

LinearRegression()

In [104]:
y_pred_train = model.predict(X_train)

In [99]:
y_pred = model.predict(X_val)

In [102]:
from sklearn.metrics import mean_squared_error

In [103]:
np.sqrt(mean_squared_error(y_val, y_pred)) #validation RMSE

68455.0775609644

In [105]:
np.sqrt(mean_squared_error(y_train, y_pred_train)) #train RMSE

69208.92006302692

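Our model is severely underfitting: the training RMSE (≈69,209) and the validation RMSE (≈68,455) are nearly equal and both high, so the linear baseline is too simple to capture the signal in either set.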