## Multivariate Linear Regression

In [122]:
# Imports
import numpy as np
import pandas as pd

In [123]:
def normalize(data):
    num_columns = data[0, ...].size
    num_rows = data[..., 0].size
    column_ranges = np.array([])
    column_minimums = np.array([])

    for i in range(num_columns - 1):
        column_range = np.ptp(data[:, i])
        column_minimum = np.min(data[:, i])
        data[:, i] = (data[:, i] - column_minimum) / column_range

        column_ranges = np.append(column_ranges, [column_range])
        column_minimums = np.append(column_minimums, [column_minimum])
    
    return (data, column_minimums, column_ranges)

##### Importing from california_housing.csv

In [124]:
data = np.loadtxt("washington_housing_prices.csv", delimiter=",", dtype=str, skiprows = 1)
data = np.delete(data, [0, 13, 14, 15, 16, 17], axis = 1)

data = data.astype(float)

# Moving Output Column to the End
data = np.roll(data, -1, axis = 1)

# Normalizing Data (Using Feature Scaling)
num_columns = data[0, ...].size
num_rows = data[..., 0].size
(data, column_minimums, column_ranges) = normalize(data)

print(column_minimums)
print(column_ranges)

# Creating Training Set and Test Set
training_data = data[0:int(num_rows/2), :]
test_data = data[int(num_rows/2):data.size, :]

print("Number of Total Columns (Input Columns + 1 Output Price Column): ")
print(training_data[0, ...].size)
print("\n")

print("Data: ")
print(training_data)

[0.00e+00 0.00e+00 3.70e+02 6.38e+02 1.00e+00 0.00e+00 0.00e+00 1.00e+00
 3.70e+02 0.00e+00 1.90e+03]
[9.00000e+00 8.00000e+00 1.31700e+04 1.07358e+06 2.50000e+00 1.00000e+00
 4.00000e+00 4.00000e+00 9.04000e+03 4.82000e+03 1.14000e+02]
Number of Total Columns (Input Columns + 1 Output Price Column): 
12


Data: 
[[3.33333333e-01 1.87500000e-01 7.36522399e-02 ... 0.00000000e+00
  4.82456140e-01 3.13000000e+05]
 [5.55555556e-01 3.12500000e-01 2.49050873e-01 ... 5.80912863e-02
  1.84210526e-01 2.38400000e+06]
 [3.33333333e-01 2.50000000e-01 1.18451025e-01 ... 0.00000000e+00
  5.78947368e-01 3.42000000e+05]
 ...
 [4.44444444e-01 3.43750000e-01 1.41989370e-01 ... 0.00000000e+00
  6.66666667e-01 5.80379000e+05]
 [4.44444444e-01 3.12500000e-01 2.35383447e-01 ... 0.00000000e+00
  8.33333333e-01 8.75000000e+05]
 [3.33333333e-01 3.12500000e-01 2.27790433e-01 ... 3.52697095e-02
  4.47368421e-01 5.15000000e+05]]


In [125]:
# Parameters w (w_1 ... w_n) and b
start_w = np.zeros(training_data[0].size - 1)
start_b = 0

print("Number of w parameters: ")
print(start_w.size)

Number of w parameters: 
11


In [126]:
def estimate(input, w, b):
    return np.dot(w, input) + b

def cost_function(data, w, b):
    sum = 0
    
    for line in data:
        y_hat = estimate(line[0:-1], w, b)
        error = y_hat - line[-1]
        squared_error = error * error

        sum = sum + squared_error
    
    cost = sum * (1 / (2 * data[..., 0].size))
    return cost

def gradient_descent(data, w, b, alpha, numIterations):
    num_terms = data[..., 0].size

    for i in range(0, numIterations):
        
        est = 0
        derivative_terms_w = 0
        derivative_term_b = 0

        for line in data:
            est = estimate(line[0:-1], w, b)
            derivative_terms_w = derivative_terms_w + (est - line[-1]) * line[0:-1]
            derivative_term_b = derivative_term_b + (est - line[-1])
        
        derivative_terms_w = (1 / num_terms) * derivative_terms_w
        derivative_term_b = (1 / num_terms) * derivative_term_b

        temp_w = w - alpha * derivative_terms_w
        temp_b = b - alpha * derivative_term_b

        w = temp_w
        b = temp_b

    return (w, b)

def root_mean_squared_error(data, w, b):
    sum = 0
    
    for line in data:
        y_hat = estimate(line[0:-1], w, b)
        error = y_hat - line[-1]
        squared_error = error * error

        sum = sum + squared_error
    
    cost = sum * (1 / (data[..., 0].size))
    cost = np.sqrt(cost)
    return cost

In [127]:
(w, b) = gradient_descent(training_data, start_w, start_b, 0.01, 10000)
print("\n")

print("w parameter values: ")
print(w)
print("\n")

print("b parameter value: ")
print(b)



w parameter values: 
[ 227685.29375138  523364.14605992  700169.22572192   -1916.31045547
  257712.38307458  367768.72311961  392732.83143734  117325.95983674
  811371.34136231  391375.88731171 -269103.71833865]


b parameter value: 
95195.96899821016


In [128]:
error_test_data = root_mean_squared_error(test_data, w, b)
print("Cost with Test Data: " + str(error_test_data))

Cost with Test Data: 670986.357716846


In [129]:
from sklearn.metrics import mean_squared_error

temp = np.array([])

for sample in training_data:
    temp = np.append(temp, np.dot(sample[0:-1], w) + b)

linear_rmse = mean_squared_error(training_data[:, -1], temp, squared = False)

linear_rmse

60702575379.395706

## Testing Sample Data

In [130]:
# sample_data = [44, 15]
# print(sample_data)

# sample_data_normalized = (sample_data - column_minimums) / column_ranges
# print(sample_data_normalized)

# print(estimate(sample_data_normalized, w, b))

In [131]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

print(training_data[:, 0:-1])

linear_regression = LinearRegression()
linear_regression.fit(training_data[:, 0:-1], training_data[:, -1])

[[0.33333333 0.1875     0.07365224 ... 0.10730088 0.         0.48245614]
 [0.55555556 0.3125     0.24905087 ... 0.33185841 0.05809129 0.18421053]
 [0.33333333 0.25       0.11845103 ... 0.17256637 0.         0.57894737]
 ...
 [0.44444444 0.34375    0.14198937 ... 0.20685841 0.         0.66666667]
 [0.44444444 0.3125     0.23538345 ... 0.34292035 0.         0.83333333]
 [0.33333333 0.3125     0.22779043 ... 0.3130531  0.03526971 0.44736842]]


In [132]:
w = linear_regression.coef_
b = linear_regression.intercept_

print(w)
print(b)

[-454795.19200139  382492.24854044 1355792.15153696 -812494.3516344
  137268.32098274  747320.28016367  187402.67311211  128578.96901819
 1642167.119187    624604.12412681 -323751.54848023]
228544.82521240273


In [133]:
housing_predictions = linear_regression.predict(training_data[:, 0:-1])

In [134]:
housing_predictions[0:5]

array([ 354769.04042907, 1319206.15307688,  516987.65524318,
        508277.07754848,  424246.68672525])

In [135]:
training_data[0:5, -1]

array([ 313000., 2384000.,  342000.,  420000.,  550000.])

In [136]:
from sklearn.metrics import mean_squared_error

linear_rmse = mean_squared_error(training_data[:, -1], housing_predictions)

linear_rmse

52794904162.41412

In [137]:
error_test_data = root_mean_squared_error(test_data, w, b)
print("Cost with Test Data: " + str(error_test_data))

Cost with Test Data: 669341.0409014781
