In [724]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import normalize
import sklearn.datasets as datasets
import openml as oml
import numpy as np
import random

In [702]:
def modelPredict(weights, x):
    """Return the linear model's output for ``x``.

    The result is ``np.dot(x, weights)``: for a 2-D ``x`` (one sample per row)
    and a 1-D ``weights`` vector this is one prediction per row.
    """
    return np.dot(x, weights)

In [585]:
def normalizeData(X):
    """Min-max scale every column of ``X`` into the range [0, 1].

    Parameters
    ----------
    X : array-like supporting ``.min(axis=0)`` / ``.max(axis=0)``
        Samples in rows, features in columns.

    Returns
    -------
    ``(X - column_min) / (column_max - column_min)``, computed column-wise.
    A column whose values are all equal has zero range; it is mapped to
    all zeros instead of producing NaN from a 0/0 division.
    """
    min_vals, max_vals = X.min(axis=0), X.max(axis=0)
    ranges = max_vals - min_vals
    # Substitute 1 for zero ranges: the numerator is already 0 for those
    # columns, so the scaled value becomes 0 rather than NaN.
    safe_ranges = np.where(ranges == 0, 1, ranges)
    scaled_data = (X - min_vals) / safe_ranges
    return scaled_data

In [287]:
def getMeanSquaredLoss(y_train, modelPredictions):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    y_train : array-like
        True target values.
    modelPredictions : array-like
        Predicted values, same shape as ``y_train``.

    Returns
    -------
    float
        ``mean((y_train - modelPredictions) ** 2)``.

    The value is returned rather than printed, and it is divided by the
    number of samples exactly once.
    """
    squaredErrors = (np.asarray(y_train) - np.asarray(modelPredictions)) ** 2
    return squaredErrors.mean()

In [591]:
def calculateAccuracy(y,predictions):
    """Return ``1 - SS_res / SS_tot`` for the given targets and predictions.

    ``SS_res`` is the sum of squared residuals ``(y - predictions)`` and
    ``SS_tot`` is the sum of squared deviations of ``y`` from its mean, so the
    value is the coefficient of determination (R^2), not a classification
    accuracy. ``y`` must provide ``.mean()``.
    """
    residuals = y - predictions
    deviations = y - y.mean()
    ssRes = (residuals ** 2).sum()
    ssTot = (deviations ** 2).sum()
    return 1 - (ssRes / ssTot)

In [590]:
def getMeanSquaredGradient(x_train, y_train, modelPredictions):
    """Return the gradient of the mean squared error w.r.t. the weights.

    With residuals ``e = y_train - modelPredictions`` and ``N = len(y_train)``
    the result is ``-2 * np.dot(e, x_train) / N`` (one entry per column of
    ``x_train`` when ``x_train`` is 2-D and ``e`` is 1-D).
    """
    residuals = y_train - modelPredictions
    sampleCount = len(y_train)
    # d/dw mean((y - Xw)^2) = -(2/N) * X^T (y - Xw)
    weightedSum = -2 * np.dot(residuals, x_train)
    return weightedSum / sampleCount

In [726]:
def getModel(x_train, y_train, learningRate = 0.04, numIterations = 50000, seed = None):
    """Fit linear-model weights by full-batch gradient descent on the MSE.

    Parameters
    ----------
    x_train : 2-D array-like
        Design matrix, one sample per row. Include a column of ones if an
        intercept term is wanted.
    y_train : 1-D array-like
        Target values, one per row of ``x_train``.
    learningRate : float
        Step size applied to the gradient on every iteration.
    numIterations : int
        Number of gradient steps; there is no early stopping.
    seed : int or None
        When given, the initial weights are drawn from a private
        ``random.Random(seed)`` so repeated calls give identical results.
        When ``None`` (default) the module-level ``random`` generator is used.

    Returns
    -------
    numpy.ndarray
        Weight vector with one entry per column of ``x_train``.
    """
    # A private generator keeps seeded runs repeatable without disturbing the
    # global random state that other code may rely on.
    rng = random.Random(seed) if seed is not None else random
    # np.shape reads the column count without indexing into the first row.
    numWeights = np.shape(x_train)[1]
    # Start from uniform random weights in [0, 1).
    weights = np.array([rng.random() for _ in range(numWeights)])
    for _ in range(numIterations):
        predictions = modelPredict(weights, x_train)
        gradient = getMeanSquaredGradient(x_train, y_train, predictions)
        weights = weights - learningRate * gradient
    return weights

In [345]:
# Load OpenML dataset 44136, min-max scale its features and hold out a test split.
# NOTE(review): the California-housing cell that follows rebinds X, Y and all
# four split variables, so only one of the two loading cells should be run.
no2 = oml.datasets.get_dataset(44136)
X, Y, _, _ = no2.get_data(target=no2.default_target_attribute, dataset_format='array')
X = normalizeData(X)
# Fixed random_state so the split, and every score derived from it, is repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=42)

In [689]:
# Load the California housing data, min-max scale the features and hold out a test split.
# Note: scaling happens before the split, so the column min/max also reflect test rows.
X, Y = datasets.fetch_california_housing(return_X_y=True)
X = normalizeData(X)
# Fixed random_state so the split, and every score derived from it, is repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=42)

In [690]:
# Prepend a constant column of ones to both splits so that the first weight
# of the linear model acts as the intercept.
# Careful: x_train / x_test are rebound here, so running this cell a second
# time would prepend a further ones column.
x_train_ones = np.ones((len(x_train), 1))
x_test_ones = np.ones((len(x_test), 1))
x_train = np.concatenate((x_train_ones, x_train), axis=1)
x_test = np.concatenate((x_test_ones, x_test), axis=1)

In [718]:
# Fit the weights with the hand-written gradient descent, using getModel's
# default learning rate and iteration count.
model = getModel(x_train, y_train)
# Performance note: once most Python for-loops had been replaced with NumPy
# matrix operations, training on california_housing dropped to a few seconds.

# Before that change, 1,000 iterations took roughly 5 minutes;
# afterwards 100,000 iterations take only a few seconds.
# NOTE(review): this call runs getModel's default of 50,000 iterations, not
# 100,000 -- confirm which setting the timing above refers to.

In [719]:
# Show the learned weights; weight i multiplies column i of the design matrix
# (column 0 is the ones column when the intercept cell has been run).
print(model)

[ 4.1506551   5.42347779  0.48204707  0.34350154  3.29643769 -0.13875428
 -0.67817454 -4.37081073 -4.72232154]


In [720]:
# Score the gradient-descent model on the held-out test split.
# calculateAccuracy returns 1 - SS_res/SS_tot, i.e. R^2, not a classification accuracy.
temp = modelPredict(model, x_test)
accuracy = calculateAccuracy(y_test, temp)
print(accuracy)

0.6101694992010812


In [721]:
# Same R^2 score on the training split, for comparison with the test score.
# This rebinds `temp` and `accuracy`, replacing the test-split values.
temp = modelPredict(model, x_train)
accuracy = calculateAccuracy(y_train, temp)
print(accuracy)

0.594034971591298


In [722]:
# Reference fit with scikit-learn's LinearRegression on the same training data.
# NOTE(review): x_train already carries the ones column, and LinearRegression
# presumably also fits its own intercept by default -- redundant; confirm.
lr = LinearRegression().fit(x_train,y_train)

In [723]:
# Score of the scikit-learn model on the test split (R^2 per scikit-learn's
# documentation), to compare with the calculateAccuracy(y_test, ...) result.
lr.score(x_test,y_test)

0.6186151898817117