# Week 2 Homework Linear Regression

In [1]:
import numpy as np 
import pandas as pd

California Housing Prices

The goal of the homework is to create a regression model for predicting housing prices ('median_house_value')

## EDA

Load the data

In [2]:
df = pd.read_csv('house_price.csv')

Look at the median_house_price variable. Does it have a long tail?

In [3]:
df.median_house_value.tail()

20635    78100.0
20636    77100.0
20637    92300.0
20638    84700.0
20639    89400.0
Name: median_house_value, dtype: float64

## Features

For the rest of the homework, you'll need to use only these columns:
- latitute
- longitude
- housing_median_age
- total_rooms
- population
- households
- median_income
- median_house_value

In [4]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
del df['ocean_proximity']

In [6]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


## Question 1

Find a feature with missing values. How many missing values does it have?

In [7]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

## Question 2

What's the median (50% percentile) for variable population?

In [8]:
df.population.describe()

count    20640.000000
mean      1425.476744
std       1132.462122
min          3.000000
25%        787.000000
50%       1166.000000
75%       1725.000000
max      35682.000000
Name: population, dtype: float64

In [9]:
df.population.median()

1166.0

## Split the data

1.- Shuffle the initial dataset, use seed 42

In [10]:
n = len(df)
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)
idx

array([20046,  3024, 15663, ...,  5390,   860, 15795])

2.- Split your data in train/val/test sets, with 60%/ 20%/20% distribution

In [11]:
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [12]:
n_val, n_test, n_train

(4128, 4128, 12384)

In [13]:
df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
df_val = df.iloc[idx[n_train:n_train+n_val]].reset_index(drop=True)
df_test = df.iloc[idx[n_train+n_val:]].reset_index(drop=True)

In [14]:
df_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0
1,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0
2,-122.44,37.8,52.0,3830.0,,1310.0,963.0,3.4801,500001.0
3,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0
4,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.725,278000.0


3.- Apply the log transformation to the media_house_value variable using the np.log1p() function

In [15]:
y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

4.- Make sure that the target value ('median_house_value') is not in your dataframe

In [16]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [17]:
len(y_train), len(y_val), len(y_test)

(12384, 4128, 4128)

## Question 3

- We need to deal with missing values for the column from Q1
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization using the code from the lessons.
- For computing the mean, use the training only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)
- Which option gives better RMSE?

### Fill the missing values with 0

In [18]:
df_train_zero = df_train.copy()

In [19]:
df_train_zero.total_bedrooms = df_train_zero.total_bedrooms.fillna(0)

In [20]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    return w_full[0], w_full[1:]

In [21]:
w_zero, w_full_zero = train_linear_regression(df_train_zero, y_train)

In [22]:
w_zero, w_full_zero

(-11.686975241845731,
 array([-2.76255120e-01, -2.82087184e-01,  2.91329621e-03, -8.64531514e-06,
         1.50811922e-04, -2.08205659e-04,  5.69546573e-04,  1.71308139e-01]))

In [23]:
y_pred_zero = w_zero + df_train_zero.dot(w_full_zero)

In [24]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [25]:
round(rmse(y_train, y_pred_zero), 2) 

0.34

### Fill the missing values with the mean

In [26]:
df_train_mean = df_train.copy()

In [27]:
df_train_mean.total_bedrooms = df_train_mean.total_bedrooms.fillna(df_train_mean.total_bedrooms.mean())

In [28]:
w_mean, w_full_mean = train_linear_regression(df_train_mean, y_train)

In [29]:
w_mean, w_full_mean

(-11.759590872784889,
 array([-2.76809637e-01, -2.82182103e-01,  2.93033801e-03, -1.48543785e-05,
         2.48054046e-04, -2.03763456e-04,  4.87004092e-04,  1.73086593e-01]))

In [30]:
y_pred_mean = w_mean + df_train_mean.dot(w_full_mean)

In [31]:
round(rmse(y_train, y_pred_mean), 2)

0.34

## Question 4

- Now let's train a regularized linear regression.
- For this question, fill the NA's with 0.
- Try different values of "r" from this list: [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10].
- Use RMSE to evaluate the model on the validation dataset.
- Round the RMSE scores to 2 decimal digits.
- Which r gives the best RMSE?

If there are multiple options, select the smallest r.

In [32]:
df_train_zero = df_train.copy()
df_train_zero.total_bedrooms = df_train_zero.total_bedrooms.fillna(0)

In [33]:
df_val_zero = df_val.copy()
df_val_zero.total_bedrooms = df_val_zero.total_bedrooms.fillna(0)

In [34]:
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    
    XTX = XTX + r * np.eye(len(XTX))
    
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    return w_full[0], w_full[1:]

In [35]:
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w0, w =train_linear_regression_reg(df_train_zero, y_train)
    y_pred = w0 + df_val_zero.dot(w)
    score = round(rmse(y_val, y_pred), 2)
    print(r, score)

0 0.33
1e-06 0.33
0.0001 0.33
0.001 0.33
0.01 0.33
0.1 0.33
1 0.33
5 0.33
10 0.33


## Question 5

- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? 
- Round the result to 3 decimal digits (round(std, 3))

In [36]:
scores = []
for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    n = len(df)
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)
    df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
    df_val = df.iloc[idx[n_train:n_train+n_val]].reset_index(drop=True)
    df_test = df.iloc[idx[n_train+n_val:]].reset_index(drop=True)
    df_train.total_bedrooms = df_train.total_bedrooms.fillna(0)
    df_val.total_bedrooms = df_val.total_bedrooms.fillna(0)
    df_test.total_bedrooms = df_test.total_bedrooms.fillna(0)
    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)
    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']
    w0, w = train_linear_regression(df_train, y_train)
    y_pred = w0 + df_val.dot(w)
    score = rmse(y_val, y_pred)
    scores.append(score)

In [37]:
scores_std = round(np.array(scores).std(), 3)
scores_std

0.004

## Question 6

- Split the dataset like previously, use seed 9.
- Combine train and validation datasets.
- Fill the missing values with 0 and train a model with r=0.001
- What's the RMSE on the test dataset

In [38]:
np.random.seed(9)
np.random.shuffle(idx)
df_train = df.iloc[idx[:n_train+n_val]].reset_index(drop=True)
df_test = df.iloc[idx[n_train+n_val:]].reset_index(drop=True)
df_train.total_bedrooms = df_train.total_bedrooms.fillna(0)
df_test.total_bedrooms = df_test.total_bedrooms.fillna(0)
y_train = np.log1p(df_train.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)
del df_train['median_house_value']
del df_test['median_house_value']
w0, w = train_linear_regression_reg(df_train, y_train, r = 0.001)
y_pred = w0 + df_test.dot(w)
score = rmse(y_test, y_pred)
round(score, 3)

0.35