In [302]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'

In [303]:
import pandas as pd 
import numpy as np
import requests
import matplotlib.pyplot as plt 
import seaborn as sns 

In [304]:
response = requests.get(data)

In [305]:
with open('housing.csv','wb')as csv_file:
    csv_file.write(response.content)

PermissionError: [Errno 13] Permission denied: 'housing.csv'

In [None]:
df = pd.read_csv('housing.csv')

In [None]:
df.shape

In [None]:
df.median_house_value[df.median_house_value<10000].values

In [None]:
sns.histplot(df.median_house_value.values,color='blue',alpha=0.5,bins=40);

In [None]:
oc_df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]

In [None]:
oc_df.shape

In [306]:
columns = ['latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','median_house_value']

In [307]:
oc_df = oc_df[columns]

In [308]:
oc_df.shape

(15687, 9)

In [309]:
oc_df.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [310]:
oc_df.population.median()

1195.0

In [311]:
n = len(oc_df)

n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [312]:
n_train,n

(9413, 15687)

In [313]:
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

In [314]:
df_train = oc_df.iloc[idx[:n_train]]
df_val = oc_df.iloc[idx[n_train:n_train+n_val]]
df_test = oc_df.iloc[idx[n_train+n_val:]]

In [315]:
len(df_test),len(df_val),len(df_train)

(3137, 3137, 9413)

In [316]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [317]:
y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

In [318]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

# Linear Regression

In [319]:
def linear_regression(X,y):
    ones = np.ones([X.shape[0]])
    X = np.column_stack([ones,X])
    
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0],w_full[1:]

# RMSE

In [320]:
def rmse(y_train,y_pred):
    se = (y_train-y_pred)**2
    mse = se.mean()
    score = np.sqrt(mse)
    return score
    

# With 0

In [321]:
def prepare_zero(df):
    df_num = df.fillna(0)
    X = df_num.values
    return X

In [322]:
X_train = prepare_zero(df_train)
w0,w = linear_regression(X_train,y_train)

X_val = prepare_zero(df_val)
y_pred = w0+X_val.dot(w)
rmse(y_val,y_pred)

0.3408479034178568

# With Mean

In [323]:
mean = df_train.total_bedrooms.mean()

In [324]:
def prepare_mean(df):
    df_num = df.fillna(mean)
    X = df_num.values
    return X

In [325]:
X_train = prepare_mean(df_train)
w0,w = linear_regression(X_train,y_train)

X_val = prepare_mean(df_val)
y_pred = w0+X_val.dot(w)
rmse(y_val,y_pred)

0.3405699801421053

# Regularization

In [326]:
def rmse_reg(y_train,y_pred):
    se = (y_train-y_pred)**2
    mse = se.mean()
    score = np.sqrt(mse)
    return round(score,2)

In [327]:
def linear_regression_reg(X,y,r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones,X])
    XTX = X.T.dot(X)
    XTX_R = XTX + r*np.eye(XTX.shape[0])
    XTX_inv = np.linalg.inv(XTX_R)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0],w_full[1:]

In [328]:
X_train = prepare_zero(df_train)
w0,w = linear_regression_reg(X_train,y_train,r=0.001)

X_val = prepare_zero(df_val)
y_pred = w0+X_val.dot(w)
rmse_reg(y_val,y_pred)

0.34

In [329]:
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    X_train = prepare_zero(df_train)
    w0, w = linear_regression_reg(X_train, y_train, r=r)

    X_val = prepare_zero(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse_reg(y_val, y_pred)
    
    print(r, w0, score)
    

0 -9.763249477337387 0.34
1e-06 -9.763228830643307 0.34
0.0001 -9.761185235453269 0.34
0.001 -9.742646249363403 0.34
0.01 -9.561056192740143 0.34
0.1 -8.058889769357727 0.34
1 -3.13315427840502 0.34
5 -0.8410867974437348 0.35
10 -0.4381172315623696 0.35


In [335]:
std = []
for c in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:

    pointer = np.arange(n)
    np.random.seed(c)
    np.random.shuffle(pointer)
    
    df_point = oc_df.iloc[idx]
    
    df_train_1 = df_point.iloc[idx[:n_train]].reset_index(drop=True).copy()
    df_val_1 = df_point.iloc[idx[n_train:n_train+n_val]].reset_index(drop=True).copy()
    df_test_1 = df_point.iloc[idx[n_train+n_val:]].reset_index(drop=True).copy()
    
    y_train_1 = np.log1p(df_train_1.median_house_value.values)
    y_val_1 = np.log1p(df_val_1.median_house_value.values)
    y_test_1 = np.log1p(df_val_1.median_house_value.values)
    
    del df_train_1['median_house_value']
    del df_val_1['median_house_value']
    del df_test_1['median_house_value']
     
    X_train_1 = prepare_zero(df_train_1)
    w0,w = linear_regression(X_train_1,y_train_1)

    X_val = prepare_zero(df_val_1)
    y_pred = w0+X_val.dot(w)
    score = rmse(y_val,y_pred)
    std.append(score)
    print(c,w0,score)

0 -10.127978073432953 0.7303485265317138
1 -10.127978073432953 0.7303485265317138
2 -10.127978073432953 0.7303485265317138
3 -10.127978073432953 0.7303485265317138
4 -10.127978073432953 0.7303485265317138
5 -10.127978073432953 0.7303485265317138
6 -10.127978073432953 0.7303485265317138
7 -10.127978073432953 0.7303485265317138
8 -10.127978073432953 0.7303485265317138
9 -10.127978073432953 0.7303485265317138


In [331]:
std

[0.7303485265317138,
 0.7303485265317138,
 0.7303485265317138,
 0.7303485265317138,
 0.7303485265317138,
 0.7303485265317138,
 0.7303485265317138,
 0.7303485265317138,
 0.7303485265317138,
 0.7303485265317138]

In [332]:
np.round(np.std(std),3)

0.0

In [360]:
pointer = np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)

df_shuffled = oc_df.iloc[idx]
    
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()
df_full = pd.concat([df_train, df_val]).reset_index(drop=True)

df_test = df_test.reset_index(drop=True)

y_full = df_full.median_house_value.values
y_test= df_test.median_house_value.values

y_full = np.log1p(y_full)
y_test = np.log1p(y_test)

del df_full['median_house_value']
del df_test['median_house_value']
     

In [361]:
X_full = prepare_zero(df_full)
w0, w = linear_regression_reg(X_full, y_full, r=0.001)

X_test = prepare_zero(df_test)
y_pred = w0 + X_test.dot(w)

np.round(rmse(y_test, y_pred),2)

0.33