# Imports

In [2]:
from scipy.stats import norm
import seaborn as sns
import pandas as pd
import numpy as np

# Data loading and EDA

In [1]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'

In [3]:
!wget $data 

--2022-09-19 18:36:14--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv’


2022-09-19 18:36:14 (23.4 MB/s) - ‘housing.csv’ saved [1423529/1423529]



In [42]:
df = pd.read_csv('/content/housing.csv')
df.shape
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Q1 missing values

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


total_bedrooms have 217 missing values

## Q2

In [10]:
df["population"].median()

1166.0

1166

# Preprocessing

## Sklearn Version

In [43]:
from sklearn.utils import shuffle

df_shuffled = shuffle(df, random_state=42)
df_shuffled = df_shuffled.drop(columns="ocean_proximity")
df_shuffled["median_house_value"] = np.log1p(df_shuffled["median_house_value"])

n = len(df_shuffled)

n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test
print(n_val, n_test, n_train)

df_train = df_shuffled.iloc[:n_train]
df_val = df_shuffled.iloc[n_train:n_train+n_val]
df_test = df_shuffled.iloc[n_train+n_val:]

4128 4128 12384


## Notebook version

In [46]:
df_shuffled = df.drop(columns="ocean_proximity")
df_shuffled["median_house_value"] = np.log1p(df_shuffled["median_house_value"].values)

n = len(df_shuffled)
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

df_train = df_shuffled.iloc[idx[:n_train]]
df_val = df_shuffled.iloc[idx[n_train:n_train+n_val]]
df_test = df_shuffled.iloc[idx[n_train+n_val:]]

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Train and eval

In [None]:
def prepare_X(X, val_for_fill):

  return X

In [39]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

train_Y = df_train["median_house_value"]
train_X = df_train.drop(columns="median_house_value")

test_Y = df_test["median_house_value"]
test_X = df_test.drop(columns="median_house_value")
valid_Y = df_val["median_house_value"]
valid_X = df_val.drop(columns="median_house_value")

# zero insert
train_X["total_bedrooms"] = train_X["total_bedrooms"].fillna(0)
valid_X["total_bedrooms"] = valid_X["total_bedrooms"].fillna(0)
reg = LinearRegression().fit(train_X, train_Y)
valid_preds = reg.predict(valid_X)
print(mean_squared_error(valid_Y, valid_preds))

# median insert
med = train_X["total_bedrooms"].median()
train_X["total_bedrooms"] = train_X["total_bedrooms"].fillna(med)
valid_X["total_bedrooms"] = valid_X["total_bedrooms"].fillna(med)
reg = LinearRegression().fit(train_X, train_Y)
valid_preds = reg.predict(valid_X)
print(mean_squared_error(valid_Y, valid_preds))

0.1085920221600239
0.1085920221600239


In [48]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [52]:
train_X["total_bedrooms"] = train_X["total_bedrooms"].fillna(0)
valid_X["total_bedrooms"] = valid_X["total_bedrooms"].fillna(0)

w0, w = train_linear_regression(train_X, train_Y)
y_pred = w0 + valid_X.dot(w)
rmse(valid_Y, y_pred)

0.32953303652286225

## Q4 train regul regression

In [53]:
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [59]:
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
  w0, w = train_linear_regression_reg(train_X, train_Y, r)
  y_pred = w0 + valid_X.dot(w)
  print(f"r: {r}, rmse: {round(rmse(valid_Y, y_pred), 2)}")

r: 0, rmse: 0.33
r: 1e-06, rmse: 0.33
r: 0.0001, rmse: 0.33
r: 0.001, rmse: 0.33
r: 0.01, rmse: 0.33
r: 0.1, rmse: 0.33
r: 1, rmse: 0.33
r: 5, rmse: 0.34
r: 10, rmse: 0.34


## Q5 different random seed

In [65]:
def shuffle_split_with_seed(df, n_val=0.2, n_test=0.2, seed=42):
  df_shuffled = df.drop(columns="ocean_proximity")
  df_shuffled["total_bedrooms"] = df_shuffled["total_bedrooms"].fillna(0)
  df_shuffled["median_house_value"] = np.log1p(df_shuffled["median_house_value"].values)

  n = len(df_shuffled)
  n_val = int(n * n_val)
  n_test = int(n * n_test)
  n_train = n - n_val - n_test

  idx = np.arange(n)
  np.random.seed(seed)
  np.random.shuffle(idx)

  df_train = df_shuffled.iloc[idx[:n_train]]
  df_val = df_shuffled.iloc[idx[n_train:n_train+n_val]]
  df_test = df_shuffled.iloc[idx[n_train+n_val:]]

  df_train = df_train.reset_index(drop=True)
  df_val = df_val.reset_index(drop=True)
  df_test = df_test.reset_index(drop=True)

  train_Y = df_train["median_house_value"]
  train_X = df_train.drop(columns="median_house_value")

  test_Y = df_test["median_house_value"]
  test_X = df_test.drop(columns="median_house_value")
  valid_Y = df_val["median_house_value"]
  valid_X = df_val.drop(columns="median_house_value")

  return train_X, train_Y, test_X, test_Y, valid_X, valid_Y

In [71]:
rmses = []
for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
  train_X, train_Y, test_X, test_Y, valid_X, valid_Y = shuffle_split_with_seed(df, seed=seed)
  w0, w = train_linear_regression(train_X, train_Y)
  y_pred = w0 + valid_X.dot(w)
  rmses.append(rmse(valid_Y, y_pred))
  print(f"seed: {seed}, rmse: {rmse(valid_Y, y_pred)}")

print(round(np.std(rmses), 3))

seed: 0, rmse: 0.33884304805312054
seed: 1, rmse: 0.3362387255956152
seed: 2, rmse: 0.3320912318841953
seed: 3, rmse: 0.34051536090333206
seed: 4, rmse: 0.3389024066572496
seed: 5, rmse: 0.3434866725718243
seed: 6, rmse: 0.3451980953099226
seed: 7, rmse: 0.33959899274099314
seed: 8, rmse: 0.3466230873184584
seed: 9, rmse: 0.33659261241692773
0.004


## Q6

In [78]:
train_X, train_Y, test_X, test_Y, valid_X, valid_Y = shuffle_split_with_seed(df, seed=9)
train_Xc = pd.concat([train_X, valid_X])
train_Yc = pd.concat([train_Y, valid_Y])

w0, w = train_linear_regression_reg(train_Xc, train_Yc, r=0.001)
y_pred = w0 + test_X.dot(w)
print(f"rmse: {round(rmse(test_Y, y_pred), 2)}")

rmse: 0.35
