In [79]:
# todo: 
#     [] resturcutre project in productionisable python project format
#     [] experiment with including lat-long & best feature engineering approaches to model

Linear regression using

Boston Housing Data

dataset docs: https://www.kaggle.com/code/prasadperera/the-boston-housing-dataset/data

In [80]:
# import packages
import pandas as pd
import numpy as np
from pathlib import Path
import math

from sklearn.datasets import fetch_california_housing

import math, copy
from sklearn.metrics import mean_squared_error

import plotly.express as px

In [81]:
# load & describe data
housing = fetch_california_housing()

print(str(housing['DESCR']))

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [82]:
# helpers - data cleaning helpers

def load_data():
    housing  = fetch_california_housing()
    
    df = pd.DataFrame(data= np.c_[housing['data'], housing['target']],
                     columns= housing['feature_names'] + ['target'])
    df = df.drop(columns=['Latitude', 'Longitude'])

    return df
    

def train_test_split(df):
    n = len(df)
    
    # train test split (2/3 train, 1/3 test)
    n_train = round(2/3*n)

    train_df = df[:n_train]
    test_df = df[n_train:]
    
    return train_df, test_df



def initial_rand(X):
    
    np.random.seed(1)
    
    m = X.shape[0]
    n = X.shape[1]
    
    w = np.random.randn(n).reshape(n,1) * 0.01
    b = np.random.randint(0,100) * 0.01 
    
    return w, b

def initial_zeros(X):
    
    np.random.seed(1)
    
    # m = number of training examples
    m = X.shape[0]
    
    # n = number of features
    n = X.shape[1]
    
    w = np.zeros(n).reshape(n,1).T
    b = 0
    
    return w, b

def set_train_vars(X_df):
    # m = number of training examples
    m = X_df.values.shape[0]

    # n = number of features
    n = len(X_df.drop(columns='target').columns)

    # X should be a matrix of with m (number training examples) rows and n (number features) columns 
    X = X_df.drop(columns='target').values.reshape(m,n)

    # Y should be a matrix with 1 row and n columns
    Y = X_df['target'].values.reshape(1,m) 
    
    return X, Y, m, n


In [83]:
# define gradient descent functions

def forward_prop(X, w, b):
    n = X.shape[0]
    # reshape step important for later functions
    Y_hat = np.dot(w, X.T) + b

    return Y_hat


def calculate_cost(X, Y, w, b):
    m = X.shape[0]
    Y_hat = forward_prop(X, w, b)
    cost = np.sum((Y_hat - Y)**2 ) / (2*m)
    return cost


def calculate_grads(X, Y, w, b):
    m, n = X.shape
    Y_hat = forward_prop(X, w, b)
    db = np.mean(Y_hat - Y)
    dw = np.sum(((Y_hat - Y) * X.T), axis=1) / m
    return db, dw

def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters): 
    """
    Performs batch gradient descent to learn theta. Updates theta by taking 
    num_iters gradient steps with learning rate alpha
    
    Args:
      X (ndarray (m,n))   : Data, m examples with n features
      y (ndarray (m,))    : target values
      w_in (ndarray (n,)) : initial model parameters  
      b_in (scalar)       : initial model parameter
      cost_function       : function to compute cost
      gradient_function   : function to compute the gradient
      alpha (float)       : Learning rate
      num_iters (int)     : number of iterations to run gradient descent
      
    Returns:
      w (ndarray (n,)) : Updated values of parameters 
      b (scalar)       : Updated value of parameter 
      """
    
    # An array to store cost J and w's at each iteration primarily for graphing later
    J_history = []
    w = copy.deepcopy(w_in)  #avoid modifying global w within function
    b = b_in
    
    for i in range(num_iters):

        # Calculate the gradient and update the parameters
        dj_db,dj_dw = gradient_function(X, y, w, b)   ##None
            
        # Update Parameters using w, b, alpha and gradient
        w = w - alpha * dj_dw               ##None
        b = b - alpha * dj_db               ##None
      
        # Save cost J at each iteration
        if i<100000:      # prevent resource exhaustion 
            J_history.append( cost_function(X, y, w, b))

        # Print cost every at intervals 10 times or as many iterations if < 10
        if i% math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i:4d}: Cost {J_history[-1]:8.2f}   ")

        
    return w, b, J_history #return final w,b and J history for graphing

In [84]:
# run clean data functions

df = load_data()

train_df, test_df = train_test_split(df) 


In [85]:
# linear regression no feature engineering run through

X, Y, m, n = set_train_vars(train_df)

w, b = initial_zeros(X)

dw = np.zeros_like(w)
db = np.array([0.])

cost_history = []
num_iters = 15000
learning_rate = 5*10**-7
# run gradient descent 
w_final, b_final, J_hist = gradient_descent(X, Y, w, b,
                                                    calculate_cost, calculate_grads, 
                                                    learning_rate, num_iters)
print(f"b,w found by gradient descent: {b_final},{w_final} ")
m,_ = X.shape


Iteration    0: Cost     2.03   
Iteration 1500: Cost     1.03   
Iteration 3000: Cost     0.87   
Iteration 4500: Cost     0.80   
Iteration 6000: Cost     0.77   
Iteration 7500: Cost     0.75   
Iteration 9000: Cost     0.74   
Iteration 10500: Cost     0.73   
Iteration 12000: Cost     0.73   
Iteration 13500: Cost     0.72   
b,w found by gradient descent: 0.002127289354550842,[[0.01979033 0.04302802 0.01631023 0.00236143 0.0002922  0.00226104]] 


In [86]:
test_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,target
13760,6.3308,33.0,7.183386,1.015674,1760.0,2.758621,2.306
13761,4.0556,37.0,8.833333,1.500000,27.0,2.250000,1.313
13762,2.5156,24.0,5.264929,1.068966,3132.0,2.634146,1.030
13763,1.3906,38.0,5.403846,1.057692,151.0,2.903846,1.208
13764,4.9688,4.0,6.333333,1.049383,1341.0,2.759259,1.632
...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,0.847


In [87]:
# testing funcitons

def set_test_vars(X_df):
    m,n = X_df.shape
    X_test = X_df.drop(columns=['target']).values
    Y_test = X_df['target'].values
    
    return X_test, Y_test, m, n


def predict(X_in, w_arr, b_val):
    Y_hat = np.dot(w_arr, X_in.T) + b_val
    return Y_hat



In [88]:
# predict all prices for test dataset

X_test, Y_test, m, n = set_test_vars(test_df)

Y_hat_test = predict(X_test, w_final, b_final)

Y_hat_test



array([[2.18741376, 1.83501835, 2.09411192, ..., 1.15430112, 1.12459915,
        1.23749767]])

In [89]:
print('Yhat_test should return: array([[2.54048579, 2.18452727, 1.8916837 , ..., 1.22712008, 1.242251751.35354649]')

Yhat_test should return: array([[2.54048579, 2.18452727, 1.8916837 , ..., 1.22712008, 1.242251751.35354649]


In [90]:
# predicitons to df
data = {'Y_test':Y_test, 'Y_hat_raw_features':Y_hat_test[0]}
results_df = pd.DataFrame(data)
results_df

Unnamed: 0,Y_test,Y_hat_raw_features
0,2.306,2.187414
1,1.313,1.835018
2,1.030,2.094112
3,1.208,1.806036
4,1.632,0.776431
...,...,...
6875,0.781,1.446375
6876,0.771,1.041145
6877,0.923,1.154301
6878,0.847,1.124599


mape = mean ( (actual - forecast) / actual )

In [91]:
# sey Y & X shape (so calc below don't get cross)
m, n = X_test.shape

Y_test = test_df.target.values.reshape(1,m)
Y_test.shape, Y_hat_test.shape

((1, 6880), (1, 6880))

In [92]:
# calculate error metrics
print('mape',  np.mean(abs((Y_test - Y_hat_test) / Y_hat_test)))
print('sk_rmse', math.sqrt(mean_squared_error(Y_test, Y_hat_test)))
print('rmse', np.sqrt(np.mean((Y_hat_test - Y_test )**2)))

mape 0.608141111282366
sk_rmse 1.2674367675275848
rmse 1.2674367675275848


pre feature scaling/engineering error metrics <br>
 - mape: 0.14065267307846 <br>
 - sk_rmse: 1.051720814580968 <br>
 - rmse: 1.051720814580968 <br>

### repeating the modelling with feature scaling/enginerring

In [93]:
 # def feature scaling funcitons
    
def scale_features(X):
    max_features = np.array([np.max(X.T[:][i]) for i in range(X.shape[1])])
    scaled_features = np.array([X.T[:][0] / max_features[i] for i in range(X.shape[1])]).T

    return max_features, scaled_features

def set_scaled_vars(train_df):
    X, Y, m, n = set_train_vars(train_df)
    X_max_features, X = scale_features(X)
    return X, Y, m, n


In [94]:
# implement linear regression with sclaed features

# set vars
# X, Y, m, n = set_train_vars(train_df)
# X_max_features, X = scale_features(X)
X, Y, m, n = set_scaled_vars(train_df)
# calculate model weights with functions

w, b = initial_zeros(X)

dw = np.zeros_like(w)
db = np.array([0.])

cost_history = []
num_iters = 1500
learning_rate = 5*10**-3
# run gradient descent 
w, b, J_hist = gradient_descent(X, Y, w, b,
                                calculate_cost, calculate_grads,
                                learning_rate, num_iters)

print(f"b,w found by gradient descent: {w},{b}")
m,_ = X.shape





Iteration    0: Cost     2.63   
Iteration  150: Cost     0.97   
Iteration  300: Cost     0.64   
Iteration  450: Cost     0.57   
Iteration  600: Cost     0.55   
Iteration  750: Cost     0.54   
Iteration  900: Cost     0.54   
Iteration 1050: Cost     0.53   
Iteration 1200: Cost     0.52   
Iteration 1350: Cost     0.52   
b,w found by gradient descent: [[1.06867797e+00 3.08274546e-01 1.12961589e-01 4.70556059e-01
  5.61166296e-04 2.67298559e-02]],1.6804103172049092


In [95]:
# predict all prices for test dataset
Y_hat_test = np.dot(w, scale_features(X_test)[1].T) + b
Y_hat_test

array([[2.5582937 , 2.24279483, 2.02924513, ..., 1.91614699, 1.93933239,
        2.01163422]])

In [96]:
# calculate error metrics
print('mape',  np.mean(abs((Y_test - Y_hat_test) / Y_hat_test)))
print('sk_rmse', math.sqrt(mean_squared_error(Y_test, Y_hat_test)))
print('rmse', np.sqrt(np.mean((Y_hat_test - Y_test )**2)))
print('\nshould return:','mape 0.38216067257255476','\nsk_rmse 1.0399675929681738', '\nrmse 1.0399675929681738')

mape 0.38216067257255476
sk_rmse 1.0399675929681738
rmse 1.0399675929681738

should return: mape 0.38216067257255476 
sk_rmse 1.0399675929681738 
rmse 1.0399675929681738


In [97]:
# predicitons to df
results_df['Y_hat_scaled_features'] = Y_hat_test[0]

In [98]:
results_df

Unnamed: 0,Y_test,Y_hat_raw_features,Y_hat_scaled_features
0,2.306,2.187414,2.558294
1,1.313,1.835018,2.242795
2,1.030,2.094112,2.029245
3,1.208,1.806036,1.873243
4,1.632,0.776431,2.369427
...,...,...,...
6875,0.781,1.446375,1.896775
6876,0.771,1.041145,2.034958
6877,0.923,1.154301,1.916147
6878,0.847,1.124599,1.939332
