In [27]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

**EDA**

In [None]:
# loda data
# wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
df = pd.read_csv ('data/housing.csv')
df.shape

In [None]:
df.columns

In [None]:
sns.histplot(df['median_house_value'], bins=50)

In [None]:
df

In [None]:
df = df[(df['ocean_proximity'] == '<1H OCEAN') | (df['ocean_proximity'] == 'INLAND')]
df

In [None]:
df.shape

In [None]:
cols = ['latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','median_house_value']
cols

In [None]:
df = df[cols]
df

**1. Missing values**

In [None]:
df.isnull().sum() 

In [None]:
print(df.columns[df.isna().any()].tolist())

**2. Median**

In [None]:
print(df.population.median())

**Prepare and split the dataset**

In [None]:
# calculate how many rows for 60%, 20% and 20% split
n = len(df)

n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [None]:
print('n=',n)
print('n_val', n_val)
print('n_test', n_test)
print('n_train', n_train)
print('total = ', n_val + n_test + n_train)

In [None]:
#split df into df_train (60%), df_val(20%) and df_test (20%)
df_train = df.iloc[:n_train]
df_val = df.iloc[n_train:n_train+n_val]
df_test = df.iloc[n_train+n_val:]

In [None]:
# create random index
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

In [None]:
# select rows based on random index
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

In [None]:
print('df_train', df_train.shape)
print('df_val', df_val.shape)
print('df_test', df_test.shape)

In [None]:
df_train.head()

In [None]:
print('df_train', df_train.shape)
print('df_val', df_val.shape)
print('df_test', df_test.shape)

In [None]:
# reset index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
# populate y_train, y_val and y_test
y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

In [None]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

**3. Missing values**

In [None]:
def load_df():
    # loda data
    # data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'
    # !wget $data
    df = pd.read_csv ('data/housing.csv')
    
    return df

In [None]:
def prepare_df(df, fill):
    df = df.copy()
      
    df = df[(df['ocean_proximity'] == '<1H OCEAN') | (df['ocean_proximity'] == 'INLAND')]
        
    cols = ['latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','median_house_value']
    df = df[cols]
    
    df = df.fillna(fill)
  
    return df

In [None]:
def split_df(df):
    # calculate how many rows for 60%, 20% and 20% split
    n = len(df)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test
    
    # create random index
    idx = np.arange(n)
    np.random.seed(42)
    np.random.shuffle(idx)
    
    # select rows based on random index
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]
    
    # reset index
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
        
    # assign y values
    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)
    
    # remove y from data frame
    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']
    
    # assign X values
    X_train = df_train.values
    X_val = df_val.values
    X_test = df_test.values
    
    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [None]:
# RMSE
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

*3.1 Fill missing vales with zero*

In [None]:
df = load_df()

In [None]:
# fillna with 0
df = prepare_df(df, 0)

In [None]:
# split dataframe into train, test and val datasets
X_train, X_val, X_test, y_train, y_val, y_test = split_df(df)

In [None]:
# train model
w0, w = train_linear_regression(X_train, y_train)

y_pred = w0 + X_train.dot(w)

In [None]:
# calculate train RMSE
with_zero = round(rmse(y_train, y_pred),2)
with_zero

*3.2 Fill missing vales with mean*

In [None]:
df = load_df()

In [None]:
# store mean for use later
mean = df_train.total_bedrooms.mean()
mean

In [None]:
# fillna with mean
df = prepare_df(df, mean)

In [None]:
# split dataframe into train, test and val datasets
X_train, X_val, X_test, y_train, y_val, y_test = split_df(df)

In [None]:
# train model
w0, w = train_linear_regression(X_train, y_train)

y_pred = w0 + X_train.dot(w)

In [None]:
# calculate train RMSE
with_mean = round(rmse(y_train, y_pred),2)
with_mean

**4: Regularization**

In [None]:
def load_df():
    # loda data
    # data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'
    # !wget $data
    df = pd.read_csv ('data/housing.csv')
    
    return df

In [None]:
def prepare_df(df_all, fill):
    # df = df.copy()
    
    df = df_all[(df_all['ocean_proximity'] == '<1H OCEAN') | (df_all['ocean_proximity'] == 'INLAND')]
        
    cols = ['latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','median_house_value']
    # df = df[cols]
  
    df = df[cols]
    df = df.fillna(fill)
    # X = df_num.values

    return df

In [None]:
def split_df(df):
    # calculate how many rows for 60%, 20% and 20% split
    n = len(df)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test
    
    # create random index
    idx = np.arange(n)
    np.random.seed(42)
    np.random.shuffle(idx)
    
    # select rows based on random index
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]
    
    # reset index
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
        
    # assign y values
    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)
    
    # remove y from data frame
    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']
    
    # assign X values
    # X_train = df_train.values
    # X_val = df_val.values
    # X_test = df_test.values
    
    return df_train, df_val, df_test, y_train, y_val, y_test

In [None]:
# update linera regression to add small value r along diagonal values of XTX

def train_linear_regression(X, y, r):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg
       
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [None]:
# RMSE
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [None]:
df_all = load_df()

In [None]:
df = prepare_df(df_all, 0)

In [None]:
# split dataframe into train, test and val datasets
df_train, df_val, df_test, y_train, y_val, y_test = split_df(df)

In [None]:
# train model
X_train = df_train.values
X_val = df_val.values

for r in [0.0, 0.00001, 0.0001, 0.001, 0.1, 1, 10]:
    w0, w = train_linear_regression(X_train, y_train, r=r)
    
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    
    print(r, w0, score)

**5.  Shuffling**

In [28]:
def load_df():
    # loda data
    # data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'
    # !wget $data
    df = pd.read_csv ('data/housing.csv')
    
    return df

In [29]:
def prepare_df(df_all, fill):
    # df = df.copy()
    
    df = df_all[(df_all['ocean_proximity'] == '<1H OCEAN') | (df_all['ocean_proximity'] == 'INLAND')]
        
    cols = ['latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','median_house_value']
    # df = df[cols]
  
    df = df[cols]
    df = df.fillna(fill)
    # X = df_num.values

    return df

In [30]:
def split_df(df, seed):
    # calculate how many rows for 60%, 20% and 20% split
    n = len(df)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test
    
    # create random index
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)
    
    # select rows based on random index
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]
    
    # reset index
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
        
    # assign y values
    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)
    
    # remove y from data frame
    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']
    
    # assign X values
    # X_train = df_train.values
    # X_val = df_val.values
    # X_test = df_test.values
    
    return df_train, df_val, df_test, y_train, y_val, y_test

In [31]:
def train_linear_regression(X, y):
    # ones = np.ones(X.shape[0])
    # X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    # XTX = XTX + r * np.eye(XTX.shape[0])
       
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [32]:
# RMSE
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [33]:
df_all = load_df()

In [34]:
df = prepare_df(df_all, 0)

In [38]:
score = []
for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    df_all = load_df()
    df = prepare_df(df_all, 0)
    df_train, df_val, df_test, y_train, y_val, y_test = split_df(df, seed)
    
    # train model
    X_train = df_train.values
    X_val = df_val.values
    
    y_pred = w0 + X_val.dot(w)
    score.append(rmse(y_val, y_pred))
    
    print(seed, w0, score)
    

0 -0.4381172315677186 [0.3453699489452713]
1 -0.4381172315677186 [0.3453699489452713, 0.34631637474686067]
2 -0.4381172315677186 [0.3453699489452713, 0.34631637474686067, 0.34655766545040345]
3 -0.4381172315677186 [0.3453699489452713, 0.34631637474686067, 0.34655766545040345, 0.33864362370327566]
4 -0.4381172315677186 [0.3453699489452713, 0.34631637474686067, 0.34655766545040345, 0.33864362370327566, 0.34764384003919324]
5 -0.4381172315677186 [0.3453699489452713, 0.34631637474686067, 0.34655766545040345, 0.33864362370327566, 0.34764384003919324, 0.3477260175521713]
6 -0.4381172315677186 [0.3453699489452713, 0.34631637474686067, 0.34655766545040345, 0.33864362370327566, 0.34764384003919324, 0.3477260175521713, 0.34196295955962464]
7 -0.4381172315677186 [0.3453699489452713, 0.34631637474686067, 0.34655766545040345, 0.33864362370327566, 0.34764384003919324, 0.3477260175521713, 0.34196295955962464, 0.350767655885401]
8 -0.4381172315677186 [0.3453699489452713, 0.34631637474686067, 0.3465576

In [44]:
round(np.array(score).std(),3)

0.005

**6. RMSE on test**

In [45]:
def load_df():
    # loda data
    # data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'
    # !wget $data
    df = pd.read_csv ('data/housing.csv')
    
    return df

In [46]:
def prepare_df(df_all, fill):
    # df = df.copy()
    
    df = df_all[(df_all['ocean_proximity'] == '<1H OCEAN') | (df_all['ocean_proximity'] == 'INLAND')]
        
    cols = ['latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','median_house_value']
    # df = df[cols]
  
    df = df[cols]
    df = df.fillna(fill)
    # X = df_num.values

    return df

In [47]:
def split_df(df):
    # calculate how many rows for 60%, 20% and 20% split
    n = len(df)
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test
    
    # create random index
    idx = np.arange(n)
    np.random.seed(9)
    np.random.shuffle(idx)
    
    # select rows based on random index
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]
    
    # reset index
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
        
    # assign y values
    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)
    
    # remove y from data frame
    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']
    
    # assign X values
    # X_train = df_train.values
    # X_val = df_val.values
    # X_test = df_test.values
    
    return df_train, df_val, df_test, y_train, y_val, y_test

In [48]:
# update linera regression to add small value r along diagonal values of XTX

def train_linear_regression(X, y, r):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg
       
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [49]:
# RMSE
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [50]:
df_all = load_df()

In [51]:
df = prepare_df(df_all, 0)

In [52]:
# split dataframe into train, test and val datasets
df_train, df_val, df_test, y_train, y_val, y_test = split_df(df)

In [57]:
# train model
df_merged = pd.concat([df_train, df_val], axis=0)
y_merged = np.concatenate((y_train, y_val))

X_merged = df_merged.values
X_test = df_test.values

w0, w = train_linear_regression(X_merged, y_merged, r=0.001)

y_pred = w0 + X_test.dot(w)
score = rmse(y_test, y_pred)

print(round(score,2))

0.33
