In [1]:
import numpy as np
import pandas as pd

import plotly.express as px

In [2]:
def normalize(data):
    for i in range(0,data.shape[1]-1):
        data[:,i] = ((data[:,i] - np.mean(data[:,i]))/np.std(data[:, i]))

In [3]:
def load_data(filename):
    df = pd.read_csv(filename, sep=";", index_col=False, header=None)
    data = np.array(df, dtype=float)
    normalize(data)
    return data[:,:-1], data[:, -1]

In [4]:
# load the dataset
x,y = load_data("data/housePrices.csv")

# y should become a 1D array
y = np.reshape(y, (len(y),1))

# add a column with only ones for the intercept
x = np.hstack((np.ones((x.shape[0],1)), x))

# make for every feature in x a beta (regression parameter)
# B0 = intercept
# the betas are your model, so you have to save them and update them
betas = [np.zeros((x.shape[1], 1))]
print(betas)

# keep track of the cost
cost=[]

[array([[0.],
       [0.],
       [0.]])]


In [5]:
def prediction(x):
    return np.matmul(x, betas)

In [6]:
def cost_function(x, y):
    return ((prediction(x)-y).T@(prediction(x)-y))/(2*y.shape[0])

In [11]:
def updateBetas(eventTuple, timePassed, learning_rate=0.1):

    # number of datapoints, so this is now just one
    # m = x.shape[0]

    # prediction based on the current betas
    predictedTimePassed = prediction(eventTuple)
    print('predicted house price: (should be 0 at first iteration) ', predictedTimePassed)

    # calculate how far you were off
    # error should have the shape [[error B0], [error B1], [error B2]]
    # eventTuple.T should have the shape [[1],[x_feature1], [x_feature2]]
    # predictedTimePassed and TimePassed should have the shape [[value]]
    error = np.reshape(eventTuple, (len(eventTuple),1))@(predictedTimePassed - timePassed)
    print('error:', error)
    # cost_function:
    cost.extend(cost_function(eventTuple, predictedTimePassed))

    # return the updated betas
    betas[0] = betas[0] - learning_rate * error

In [12]:
# run the model
[updateBetas(eventTuple, timePassed, learning_rate=0.1) for eventTuple, timePassed in zip(x, y)]

print(betas)

predicted house price: (should be 0 at first iteration)  [[333858.83421642]]
error: [[-66041.16578358]
 [ -8678.82767221]
 [ 14931.46957076]]
predicted house price: (should be 0 at first iteration)  [[289311.45240785]]
error: [[-40588.54759215]
 [ 20685.57570906]
 [  9176.8014102 ]]
predicted house price: (should be 0 at first iteration)  [[374436.89968596]]
error: [[ 5436.89968596]
 [ 2761.44864402]
 [-1229.24695918]]
predicted house price: (should be 0 at first iteration)  [[292296.64394279]]
error: [[ 60296.64394279]
 [-44841.23081799]
 [-93724.61506828]]
predicted house price: (should be 0 at first iteration)  [[426982.55509109]]
error: [[-112917.44490891]
 [-143526.0609114 ]
 [-124458.1912164 ]]
predicted house price: (should be 0 at first iteration)  [[345020.86172921]]
error: [[45120.86172921]
 [ -899.93787324]
 [49732.44693478]]
predicted house price: (should be 0 at first iteration)  [[273347.29775303]]
error: [[-41552.70224697]
 [ 24665.20714421]
 [  9394.79038297]]
predicted

In [10]:
print(cost)
fig = px.scatter(y=cost)
fig.show()

[array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]]), array([[0.]])]


ValueError: Data must be 1-dimensional