In [273]:
import pandas as pd
import numpy as np

In [274]:
columns = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value']

In [275]:
df = pd.read_csv('housing.csv')
df = df[columns]

In [276]:
df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [277]:
df.describe()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,35.631861,-119.569704,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.135952,2.003532,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,32.54,-124.35,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,33.93,-121.8,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,34.26,-118.49,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,37.71,-118.01,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,41.95,-114.31,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [299]:
20640.000000 - 20433.000000

207.0

## Question 1

In [278]:
df.total_bedrooms.isna().sum()

207

## Question 2

In [301]:
df.population.median()

1166.0

In [280]:
def split_train_test(df_source, seed=42):
    df = df_source.copy()
    np.random.seed(seed)

    n = len(df)

    n_train = int(n * 0.6)
    n_val = int(n * 0.2)
    n_test = n - n_train - n_val

    idx = np.arange(n)
    np.random.shuffle(idx)

    df_shuffled = df.iloc[idx]

    df_train = df_shuffled.iloc[:n_train]
    df_val = df_shuffled.iloc[n_train:n_train + n_val]
    df_test = df_shuffled.iloc[n_train + n_val:]

    y_train = df_train['median_house_value'].values
    y_val = df_val['median_house_value'].values
    y_test = df_test['median_house_value'].values

    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']
    return df_train, y_train, df_val, y_val, df_test, y_test


## Question 3

In [281]:
def fill_data(df, type, columns):
    df_copy = df.copy()
    if type == 'zero':
        df_copy[columns] = df_copy[columns].fillna(0)
    elif type == 'mean':
        df_copy[columns] = df_copy[columns].fillna(df_copy[columns].mean())
    return df_copy

In [282]:
df.describe()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,35.631861,-119.569704,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.135952,2.003532,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,32.54,-124.35,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,33.93,-121.8,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,34.26,-118.49,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,37.71,-118.01,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,41.95,-114.31,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [283]:
df["median_house_value"] = np.log(df["median_house_value"])

In [284]:
df_filled_zeros = fill_data(df, 'zero', ['total_bedrooms'])
df_filled_mean = fill_data(df, 'mean', ['total_bedrooms'])

In [285]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [286]:
def rmse(y, y_pred):
    return np.sqrt(np.mean((y - y_pred) ** 2))

In [287]:
X_train, y_train, X_val, y_val, X_test, y_test = split_train_test(df_filled_zeros, seed=42)
w0, w = train_linear_regression(X_train, y_train)
y_pred = w0 + X_val.dot(w)
round(rmse(y_val, y_pred), 2)

0.33

In [288]:
X_train, y_train, X_val, y_val, X_test, y_test = split_train_test(df_filled_mean)
w0, w = train_linear_regression(X_train, y_train)
y_pred = w0 + X_val.dot(w)
round(rmse(y_val, y_pred),2)

0.33

# Question 4

In [289]:
r_s = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

In [290]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [291]:
rmse_list = []
for r in r_s:
    w0, w = train_linear_regression_reg(X_train, y_train, r)
    y_pred = w0 + X_val.dot(w)
    rmse_list.append(rmse(y_val, y_pred))
    print(r, round(rmse(y_val, y_pred),2))

0 0.33
1e-06 0.33
0.0001 0.33
0.001 0.33
0.01 0.33
0.1 0.33
1 0.33
5 0.34
10 0.34


In [292]:
min(rmse_list)

0.32902171756617815

# Question 5

In [293]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_list = []
for seed in seeds:
    df_filled_zeros = fill_data(df, 'zero', ['total_bedrooms'])
    X_train, y_train, X_val, y_val, X_test, y_test = split_train_test(df_filled_zeros, seed)
    w0, w = train_linear_regression(X_train, y_train)
    y_pred = w0 + X_val.dot(w)
    rmse_list.append(rmse(y_val, y_pred))
    print(seed, round(rmse(y_val, y_pred),3))

0 0.339
1 0.336
2 0.332
3 0.341
4 0.339
5 0.343
6 0.345
7 0.34
8 0.347
9 0.337


In [294]:
round(np.std(rmse_list), 3)

0.004

# Question 6

In [295]:
df_filled_zeros = fill_data(df, 'zero', columns)
X_train, y_train, X_val, y_val, X_test, y_test = split_train_test(df_filled_zeros, 9)


In [296]:
w0, w = train_linear_regression_reg(np.concatenate([X_train,X_val]), np.concatenate([y_train,y_val]), r=0.001)


In [297]:
y_pred = w0 + X_test.dot(w)

In [298]:
rmse(y_pred, y_test)

0.34531955085041294