In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_validate
import sklearn.datasets as datasets
import pandas as pd
import numpy as np
import random

In [2]:
calif_housing = datasets.fetch_california_housing(as_frame = True)

In [3]:
x_train, x_test, y_train, y_test = train_test_split(calif_housing['data'],calif_housing['target'])

In [4]:
def getModelPredictions(x,weights):
    #time complexity is O(n*k), where n is number of datapoints, is number of values per input
    
    ans = []
    for i in range(len(x)):
        result = 0
        for j in range(len(x.columns)):
            result += x.iat[i, j] * weights[j]
        ans.append(result)
    return ans

In [5]:
def getLeastSquaredError(dataAnswers, modelPredictions) -> float:
    error = 0
    for i in range(len(dataAnswers)):
        error += (dataAnswers.iat[i] - modelPredictions[i])
    return error

In [6]:
def getLeastSquaresGradient(x_train,y_train,predictions,weight,weight_index):
    #takes in model predictions, weight to get the gradient of, and the index of that weight within "weights"
    ans_sum = 0
    gradient = 0
    x_col = x_train.iloc[:,weight_index]
    for i in range(len(y_train)):
        ans_sum += (y_train.iat[i] - predictions[i]) * x_col.iat[i]
    gradient = ans_sum * -2
    return gradient

In [7]:
def updateWeights(weights, x_train, y_train, learningRate):
    predictions = getModelPredictions(x_train, weights)
    for i in range(len(weights)):
        gradient = getLeastSquaresGradient(x_train, y_train, predictions, weights[i],i)
        weights[i] -= learningRate * gradient
    return weights

In [8]:
def trainModel(x_train, y_train, learningRate = 0.01, numIterations = 30):
    weights = [1] * len(x_train.columns)
    #Initialize the weights to random numbers
    for i in range(len(weights)):
        weights[i] = random.randint(0,100)
    for i in range(numIterations):
        weights = updateWeights(weights, x_train, y_train, learningRate)
    return weights        
    

In [9]:
def testModel(weights,x,y):
    predictions = getModelPredictions(x,weights)
    error = getLeastSquaredError(y, predictions)
    print("The error is " + str(error))

In [10]:
model = trainModel(x_train,y_train)

In [11]:
predictions = getModelPredictions(x_test,model)

In [12]:
testModel(model,x_test,y_test)

The error is -1.0463041286029545e+279


In [13]:
y_train.head()

12487    1.875
11895    0.822
667      1.846
9037     1.239
12410    0.934
Name: MedHouseVal, dtype: float64

In [14]:
x_test.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
7660,4.0687,38.0,4.83208,1.075188,1358.0,3.403509,33.84,-118.22
3148,2.3917,10.0,6.589686,1.356502,1167.0,2.616592,35.27,-118.34
6585,9.2463,39.0,8.12973,1.043243,1240.0,3.351351,34.22,-118.21
8351,3.1719,43.0,5.363764,0.994382,1505.0,2.113764,33.95,-118.32
15715,3.7188,46.0,4.434879,1.024283,761.0,1.679912,37.79,-122.45


In [15]:
y_test.head()

7660     1.60300
3148     0.79000
6585     5.00001
8351     1.83500
15715    5.00001
Name: MedHouseVal, dtype: float64

In [16]:
print(y_train)
print(y_train.iloc[0])

12487    1.875
11895    0.822
667      1.846
9037     1.239
12410    0.934
         ...  
9458     1.000
6659     1.731
18384    4.183
12656    0.841
1808     1.563
Name: MedHouseVal, Length: 15480, dtype: float64
1.875


In [17]:
print(len(x_train))
print(len(y_train))

15480
15480
