In [9]:
# todo: 
#     [] resturcutre project in productionisable python project format
#     [] experiment with including lat-long & best feature engineering approaches to model

Linear regression using

Boston Housing Data

dataset docs: https://www.kaggle.com/code/prasadperera/the-boston-housing-dataset/data

In [38]:
# import packages
import pandas as pd
import numpy as np
from pathlib import Path
import math

from sklearn.datasets import fetch_california_housing

import math, copy
from sklearn.metrics import mean_squared_error

import plotly.express as px

In [33]:
# load & describe data
housing = fetch_california_housing()

print(str(housing['DESCR']))

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [34]:
# helpers - data cleaning helpers

def load_data():
    housing  = fetch_california_housing()
    
    df = pd.DataFrame(data= np.c_[housing['data'], housing['target']],
                     columns= housing['feature_names'] + ['target'])
    df = df.drop(columns=['Latitude', 'Longitude'])

    return df
    

def train_test_split(df):
    n = len(df)
    
    # train test split (2/3 train, 1/3 test)
    n_train = round(2/3*n)

    train_df = df[:n_train]
    test_df = df[n_train:]
    
    return train_df, test_df



def initial_rand(X):
    
    np.random.seed(1)
    
    m = X.shape[0]
    n = X.shape[1]
    
    w = np.random.randn(n).reshape(n,1) * 0.01
    b = np.random.randint(0,100) * 0.01 
    
    return w, b

def initial_zeros(X):
    
    np.random.seed(1)
    
    # m = number of training examples
    m = X.shape[0]
    
    # n = number of features
    n = X.shape[1]
    
    w = np.zeros(n).reshape(n,1).T
    b = 0
    
    return w, b




In [35]:
# run clean data functions

df = load_data()

train_df, test_df = train_test_split(df) 


In [36]:
# set m & n

# m = number of training examples
m = train_df.values.shape[0]

# n = number of features
n = len(train_df.drop(columns='target').columns)



In [37]:
# X should be of the dimensions m, n

# X should be a matrix of with m (number training examples) rows and n (number features) columns 
X = train_df.drop(columns='target').values.reshape(m,n)

# Y should be a matrix with 1 row and n columns
Y = train_df['target'].values.reshape(1,m) 

m, n, X.shape, Y.shape

(13760, 6, (13760, 6), (1, 13760))

In [46]:
px.scatter(train_df, 'target', 'AveRooms')

In [16]:
# define gradient descent functions

def forward_prop(X, w, b):
    n = X.shape[0]
    # reshape step important for later functions
    Y_hat = np.dot(w, X.T) + b

    return Y_hat


def calculate_cost(X, Y, w, b):
    m = X.shape[0]
    Y_hat = forward_prop(X, w, b)
    cost = np.sum((Y_hat - Y)**2 ) / (2*m)
    return cost


def calculate_grads(X, Y, w, b):
    m, n = X.shape
    Y_hat = forward_prop(X, w, b)
    db = np.mean(Y_hat - Y)
    dw = np.sum(((Y_hat - Y) * X.T), axis=1) / m
    return db, dw

def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters): 
    """
    Performs batch gradient descent to learn theta. Updates theta by taking 
    num_iters gradient steps with learning rate alpha
    
    Args:
      X (ndarray (m,n))   : Data, m examples with n features
      y (ndarray (m,))    : target values
      w_in (ndarray (n,)) : initial model parameters  
      b_in (scalar)       : initial model parameter
      cost_function       : function to compute cost
      gradient_function   : function to compute the gradient
      alpha (float)       : Learning rate
      num_iters (int)     : number of iterations to run gradient descent
      
    Returns:
      w (ndarray (n,)) : Updated values of parameters 
      b (scalar)       : Updated value of parameter 
      """
    
    # An array to store cost J and w's at each iteration primarily for graphing later
    J_history = []
    w = copy.deepcopy(w_in)  #avoid modifying global w within function
    b = b_in
    
    for i in range(num_iters):

        # Calculate the gradient and update the parameters
        dj_db,dj_dw = gradient_function(X, y, w, b)   ##None
            
        # Update Parameters using w, b, alpha and gradient
        w = w - alpha * dj_dw               ##None
        b = b - alpha * dj_db               ##None
      
        # Save cost J at each iteration
        if i<100000:      # prevent resource exhaustion 
            J_history.append( cost_function(X, y, w, b))

        # Print cost every at intervals 10 times or as many iterations if < 10
        if i% math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i:4d}: Cost {J_history[-1]:8.2f}   ")

        
    return w, b, J_history #return final w,b and J history for graphing

In [17]:
# calculate model weights with functions

w, b = initial_zeros(X)

dw = np.zeros_like(w)
db = np.array([0.])

cost_history = []
num_iters = 150000
learning_rate = 5*10**-7
# run gradient descent 
w_final, b_final, J_hist = gradient_descent(X, Y, w, b,
                                                    calculate_cost, calculate_grads, 
                                                    learning_rate, num_iters)
print(f"b,w found by gradient descent: {b_final:0.2f},{w_final} ")
m,_ = X.shape


Iteration    0: Cost     2.03   
Iteration 15000: Cost     0.72   
Iteration 30000: Cost     0.68   
Iteration 45000: Cost     0.64   
Iteration 60000: Cost     0.61   
Iteration 75000: Cost     0.58   
Iteration 90000: Cost     0.56   
Iteration 105000: Cost     0.54   
Iteration 120000: Cost     0.54   
Iteration 135000: Cost     0.54   
b,w found by gradient descent: 0.01,[[ 0.1295019   0.02873465  0.06584426  0.00627961  0.00016721 -0.00363167]] 


In [18]:
# import plotly.express as px
# px.line(J_hist )

In [19]:
test_df.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'target'],
      dtype='object')

In [20]:
# calculate model weights with np logic

w, b = initial_zeros(X)
dw = np.zeros_like(w)
db = np.array([0.])

cost_history = []
num_iters = 150000
learning_rate = 5*10**-7

# loop thorugh gradient descent steps for number of iterations
for i in range(num_iters):

    
    # set training_exp, features num 
    m, n = X.shape
    
    # forward_prop
    Y_hat = np.dot(w, X.T) + b

    # calc grads
    db = np.mean(Y_hat - Y)
    dw = np.sum(((Y_hat - Y) * X.T), axis=1) / m
    
    # Update Parameters using w, b, learning_rate and gradient
    w = w - learning_rate * dw               
    b = b - learning_rate * db   
    
    cost = np.sum((Y_hat - Y)**2 ) / (2*m)    
    
    if i<100000:      # prevent resource exhaustion 
            cost_history.append(cost)

    # Print cost every at intervals 10 times or as many iterations if < 10
    if i% math.ceil(num_iters / 10) == 0:
        print(f"Iteration {i:4d}: Cost {cost_history[-1]:8.2f}   ")
    
    


Iteration    0: Cost     2.65   
Iteration 15000: Cost     0.72   
Iteration 30000: Cost     0.68   
Iteration 45000: Cost     0.64   
Iteration 60000: Cost     0.61   
Iteration 75000: Cost     0.58   
Iteration 90000: Cost     0.56   
Iteration 105000: Cost     0.54   
Iteration 120000: Cost     0.54   
Iteration 135000: Cost     0.54   


In [21]:
# import plotly.express as px
# px.line(cost_history)

In [22]:
# set test variables
test_df
X_test = test_df.drop(columns=['target']).values
Y_test = test_df['target'].values
X_test.shape, Y_test.shape

m,n = X_test.shape

In [23]:
# predict all prices for test dataset
Y_hat_test = np.dot(w, X_test.T) + b
Y_hat_test

array([[2.54048579, 2.18452727, 1.8916837 , ..., 1.22712008, 1.24225175,
        1.35354649]])

In [24]:
# predicitons to df
data = {'Y_test':list(Y_test)[0], 'Y_hat_test':list(Y_hat_test)[0]}
results_df = pd.DataFrame(data)
results_df

Unnamed: 0,Y_test,Y_hat_test
0,2.306,2.540486
1,2.306,2.184527
2,2.306,1.891684
3,2.306,1.657910
4,2.306,1.404975
...,...,...
6875,2.306,1.400506
6876,2.306,1.316108
6877,2.306,1.227120
6878,2.306,1.242252


mape = mean ( (actual - forecast) / actual )

In [25]:
# sey Y & X shape (so calc below don't get cross)
m, n = X_test.shape

Y_test = test_df.target.values.reshape(1,m)
Y_test.shape, Y_hat_test.shape

((1, 6880), (1, 6880))

In [26]:
# calculate error metrics
print('mape',  np.mean((Y_test - Y_hat_test) / Y_hat_test))

import math
print('sk_rmse', math.sqrt(mean_squared_error(Y_test, Y_hat_test)))
print('rmse', np.sqrt(np.mean((Y_hat_test - Y_test )**2)))

mape 0.14065267307846
sk_rmse 1.051720814580968
rmse 1.051720814580968


pre feature scaling/engineering error metrics <br>
 - mape: 0.14065267307846 <br>
 - sk_rmse: 1.051720814580968 <br>
 - rmse: 1.051720814580968 <br>

In [27]:
# repeating the analytsis with feature scaling/enginerring

In [28]:
def scale_features(X):
    max_features = np.array([np.max(X.T[:][i]) for i in range(X.shape[1])])
    scaled_features = np.array([X.T[:][0] / max_features[i] for i in range(X.shape[1])]).T
    
    return max_features, scaled_features
