In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import normalize
import sklearn.datasets as datasets
import openml as oml
import numpy as np
import random

In [7]:
def modelPredict(weights, x):
    """Return the linear model's predictions for the input data.

    Parameters
    ----------
    weights : array-like
        Model coefficients, one per feature.
    x : array-like
        Input data; it is the left operand of the dot product, so its last
        axis must line up with ``weights``.

    Returns
    -------
    The result of ``np.dot(x, weights)``. No intercept term is added here.
    """
    return np.dot(x, weights)

Data normalization function, to prevent features whose values are typically large from having an outsized effect on the model. It min-max scales each feature (column) so that its values lie in the range [0, 1]:

In [8]:
def normalizeData(X):
    """Min-max scale every column of ``X`` into the range [0, 1].

    Each column is shifted by its minimum and divided by its range
    (``max - min``), both computed along axis 0.

    Parameters
    ----------
    X : array-like with ``min``/``max`` methods accepting ``axis=0``
        The data to scale; it is not modified.

    Returns
    -------
    A new object holding the scaled data. Columns whose values are all
    identical are mapped to 0 instead of NaN.
    """
    min_vals, max_vals = X.min(axis=0), X.max(axis=0)
    value_range = max_vals - min_vals
    # A constant column has zero range; dividing by it would produce NaN/inf
    # that silently poisons every later computation. Its numerator is already
    # 0, so dividing by 1 instead yields a clean 0 for that column.
    safe_range = np.where(value_range == 0, 1, value_range)
    return (X - min_vals) / safe_range

Function to calculate the loss. In this case we are using the Mean Squared Error (MSE):

In [9]:
def getMeanSquaredLoss(y_train, modelPredictions):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    y_train : array-like
        True target values.
    modelPredictions : array-like
        Predicted values, same shape as ``y_train``.

    Returns
    -------
    The mean of the squared differences ``(y_train - modelPredictions) ** 2``.

    Notes
    -----
    The earlier version printed the mean divided by the sample count a second
    time and returned nothing; the value is now returned so callers can use
    or display it.
    """
    squared_errors = (y_train - modelPredictions) ** 2
    return np.mean(squared_errors)

In [10]:
def calculateAccuracy(y, predictions):
    """Return the coefficient of determination (R^2) of the predictions.

    Computed as ``1 - RSS / TSS``, where RSS is the sum of squared residuals
    and TSS is the sum of squared deviations of ``y`` from its mean. A value
    of 1 means a perfect fit, 0 matches always predicting the mean of ``y``,
    and negative values are worse than that.

    Parameters
    ----------
    y : array-like providing ``mean()``
        True target values.
    predictions : array-like
        Predicted values, same shape as ``y``.
    """
    residuals = y - predictions
    deviations = y - y.mean()
    rss = (residuals ** 2).sum()
    tss = (deviations ** 2).sum()
    return 1 - (rss / tss)

Below is the gradient function, which computes the derivative of the MSE with respect to the weights; stepping against this gradient reduces the loss and so improves the model's accuracy:

In [11]:
def getMeanSquaredGradient(x_train, y_train, modelPredictions):
    """Return the gradient of the mean squared error w.r.t. the weights.

    Parameters
    ----------
    x_train : array-like
        Training inputs, multiplied on the right of the error vector.
    y_train : array-like
        True target values; its length is the averaging factor.
    modelPredictions : array-like
        Current predictions for ``x_train``.

    Returns
    -------
    ``-2 * (y_train - modelPredictions) . x_train / len(y_train)``
    """
    residuals = y_train - modelPredictions
    sample_count = len(y_train)
    return -2 * np.dot(residuals, x_train) / sample_count

This is the function that actually trains the model. It takes in the training data, with the learning rate and number of iterations set to be optional parameters.

In [12]:
def getModel(x_train, y_train, learningRate = 0.04, numIterations = 1000, seed = None):
    """Fit linear-model weights by batch gradient descent on the MSE.

    Parameters
    ----------
    x_train : 2-D array-like
        Training inputs; one weight is created per entry of ``x_train[0]``.
    y_train : array-like
        Training targets.
    learningRate : float, optional
        Step size applied to the gradient on every iteration.
    numIterations : int, optional
        Number of gradient-descent updates to perform.
    seed : optional
        When given, the initial weights are drawn from a private
        ``random.Random(seed)`` so the run is reproducible. When omitted,
        the module-level ``random`` generator is used, as before.

    Returns
    -------
    numpy.ndarray
        The learned weight vector. There is no separate intercept term; the
        model can only learn one if ``x_train`` contains a constant column.
    """
    # A private generator keeps a seeded run from disturbing (or being
    # disturbed by) other users of the global `random` state.
    rng = random if seed is None else random.Random(seed)
    numWeights = len(x_train[0])
    # Start every weight at a uniform random value in [0, 1).
    weights = np.array([rng.random() for _ in range(numWeights)])
    for _ in range(numIterations):
        predictions = modelPredict(weights, x_train)
        gradient = getMeanSquaredGradient(x_train, y_train, predictions)
        weights = weights - learningRate * gradient
    return weights

Here we try the model on OpenML's wine quality dataset (OpenML dataset ID 44136):

In [13]:
# Fetch OpenML dataset 44136 and unpack its features (X) and default target (Y);
# the two remaining return values of get_data are discarded.
# NOTE(review): the variable is called `no2` although the surrounding text
# describes this as the wine quality dataset -- confirm the dataset ID and name.
no2 = oml.datasets.get_dataset(44136)
X, Y, _, _ = no2.get_data(target=no2.default_target_attribute, dataset_format='array')
# Scaling uses the min/max of the full dataset, i.e. before the split, so the
# test rows also influence the scaling parameters.
X = normalizeData(X)
# No random_state is passed, so the train/test split changes from run to run.
x_train, x_test, y_train, y_test = train_test_split(X, Y)

In [40]:
# Load the California housing data as (features, target).
# NOTE: this rebinds X, Y and the four train/test names, replacing whatever an
# earlier dataset cell assigned to them; later cells use whichever ran last.
X, Y = datasets.fetch_california_housing(return_X_y=True)
# Scaling uses the min/max of the full dataset, i.e. before the split.
X = normalizeData(X)
# No random_state is passed, so the train/test split changes from run to run.
x_train, x_test, y_train, y_test = train_test_split(X, Y)

In [36]:
# Train with getModel's default learningRate and numIterations on the current split.
model = getModel(x_train, y_train)
# After getting familiar enough with numpy to replace most for loops with numpy
# matrix operations, training time on california_housing went down to seconds.

# It used to take about 5 minutes to train for 1000 iterations; now it takes
# only a few seconds to train for 100,000 iterations.
# NOTE(review): the call above does not pass numIterations, so it runs the
# default number of iterations, not 100,000.

In [37]:
# Show the learned weight vector (one weight per feature column of x_train).
# NOTE(review): the saved output lists 11 weights -- confirm which dataset cell
# was executed before training, since the cells were run out of order.
print(model)

[ 1.93451831 -1.30095191  1.11288399  0.81892639  1.64622807  2.43127209
  2.41980959  6.82786045  2.74634414  0.83613925  5.41946642]


In [38]:
# Score on the held-out test split. calculateAccuracy returns R^2
# (1 - RSS/TSS), so a negative value means the model does worse than always
# predicting the mean of y_test.
temp = modelPredict(model, x_test)
accuracy = calculateAccuracy(y_test, temp)
print(accuracy)

-0.15775008995284345


In [39]:
# Score on the training split for comparison with the test score.
# Reuses (and overwrites) the `temp` and `accuracy` names from the test-split cell.
temp = modelPredict(model, x_train)
accuracy = calculateAccuracy(y_train, temp)
print(accuracy)

-0.20255561278017797


In [33]:
# Baseline: fit scikit-learn's LinearRegression on the same training split.
lr = LinearRegression().fit(x_train,y_train)

In [34]:
# Baseline score on the test split, for comparison with the hand-written model above.
lr.score(x_test,y_test)

0.5967357176402427