In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from DataDefinitionFunctions import Functions
from LinearRegression import LinearRegression

# Testing Univariate Linear Regression

## Test 1
In this first test, the model is tasked with learning a very simple relation: $2x=y$.
For univariate linear regression, the hypothesis function is $y =\theta_0 + \theta_1x$.
Ideally the model sets $(\theta_0, \theta_1) = (0,2)$.

#### Data and Training


In [2]:
# Data definition
# Use a seeded generator so that "Restart Kernel -> Run All" draws the same
# samples every time; the assertions in the testing cell then check a
# reproducible run instead of a fresh random one.
rng = np.random.default_rng(69)
X = rng.uniform(size=400)
Y = Functions.double(X)

# Data split: hold out 30% of the samples for testing (fixed shuffle seed)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=69)

# Training
# NOTE(review): the two positional arguments are presumably
# (learning_rate, epochs) -- the progress bar reports 2000 iterations; confirm
# against LinearRegression.gradient_descent.
trained_parameters = LinearRegression.gradient_descent(X_train, y_train, 0.05, 2000)

100%|███████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 90058.70it/s]


#### Testing

In [3]:
# The learned parameters should be close to the ideal (theta_0, theta_1) = (0, 2).
parameter_diff = trained_parameters - np.array([0, 2])
assert abs(parameter_diff).sum() <= 0.1, f"parameters too far from (0, 2): {trained_parameters}"

# Vectorized hypothesis y = theta_0 + theta_1 * x over the whole test set.
# This replaces the per-sample loop that wrote into np.empty_like(y_test),
# which would silently inherit (and truncate to) the dtype of the targets.
theta = np.ravel(trained_parameters)
y_pred = theta[0] + theta[1] * X_test

# sklearn's documented argument order is (y_true, y_pred).
error = mean_squared_error(y_test, y_pred)
assert error <= 0.001, f"test MSE too high: {error}"

## Test 2
In the second test, the model is tasked with learning a slightly more complex relation: $2x+1=y$.  
Ideally the model sets $(\theta_0, \theta_1) = (1,2)$.

#### Data and training

In [4]:
# Data definition
# Use a seeded generator so that "Restart Kernel -> Run All" draws the same
# samples every time; the assertions in the testing cell then check a
# reproducible run instead of a fresh random one.
rng = np.random.default_rng(69)
X = rng.uniform(size=400)
Y = Functions.double_plus_one(X)

# Data split: hold out 30% of the samples for testing (fixed shuffle seed)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=69)

# Training with the default hyperparameters of gradient_descent
# (the progress bar reports 1000 iterations for this call).
trained_parameters = LinearRegression.gradient_descent(X_train, y_train)

100%|██████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 110507.28it/s]


#### Testing

In [5]:
# The learned parameters should be close to the ideal (theta_0, theta_1) = (1, 2).
parameter_diff = trained_parameters - np.array([1, 2])
assert abs(parameter_diff).sum() <= 0.1, f"parameters too far from (1, 2): {trained_parameters}"

# Vectorized hypothesis y = theta_0 + theta_1 * x over the whole test set.
# This replaces the per-sample loop that wrote into np.empty_like(y_test),
# which would silently inherit (and truncate to) the dtype of the targets.
theta = np.ravel(trained_parameters)
y_pred = theta[0] + theta[1] * X_test

# sklearn's documented argument order is (y_true, y_pred).
error = mean_squared_error(y_test, y_pred)
assert error <= 0.001, f"test MSE too high: {error}"

## Test 3
In the third test, the model is tasked with learning a more complex relation: $-x_1 + 2x_2 =y$.
Several combinations of parameters may reproduce the behavior of this relation closely, so predicting the exact values of the trained parameters is not meaningful here. The best way to evaluate this model is to test its accuracy on held-out data.

#### Data and training

In [6]:
# Data definition
# Use a seeded generator so that "Restart Kernel -> Run All" draws the same
# samples every time; the assertion in the testing cell then checks a
# reproducible run instead of a fresh random one.
rng = np.random.default_rng(69)
X_1 = rng.uniform(size=400)                   # first feature, sampled from [0, 1)
X_2 = rng.uniform(low=-1, high=0, size=400)   # second feature, sampled from [-1, 0)
Y = Functions.negate_plus_double(X_1, X_2)
# Stack the two features column-wise: one row per sample, shape (400, 2)
X = np.stack([X_1, X_2], axis=-1)

# Data split: hold out 30% of the samples for testing (fixed shuffle seed)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=69)

# Training
trained_parameters = LinearRegression.gradient_descent(X_train, y_train, epochs=5000)

100%|██████████████████████████████████████████████████████████████████████████| 5000/5000 [00:00<00:00, 166481.59it/s]


#### Testing

In [7]:
# Vectorized hypothesis y = theta_0 + X . theta_{1..n} over the whole test set.
# This replaces the per-sample loop (and its per-row np.insert allocation)
# that wrote into np.empty_like(y_test), which would silently inherit (and
# truncate to) the dtype of the targets.
theta = np.ravel(trained_parameters)
y_pred = theta[0] + X_test @ theta[1:]

# sklearn's documented argument order is (y_true, y_pred).
error = mean_squared_error(y_test, y_pred)
assert error <= 0.001, f"test MSE too high: {error}"

## Test 4
As the tests above show, the model performs well on relatively simple relations in the data. It will now be tested against an actual data set.

#### Data and training

In [8]:
import pandas as pd
from sklearn.datasets import load_diabetes

In [9]:
# Load the diabetes regression data set as pandas objects.
X_df, Y_df = load_diabetes(return_X_y=True, as_frame=True)

# Convert to plain NumPy arrays through each object's own to_numpy() method.
# The previous pd.DataFrame.to_numpy(obj) form called the DataFrame method
# unbound, which relies on pandas internals when obj is not a DataFrame
# (sklearn documents the target as a Series when as_frame=True).
X = X_df.to_numpy()
Y = Y_df.to_numpy()

# Data split: hold out 30% of the samples for testing (fixed shuffle seed)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=69)

In [10]:
# Fit the gradient-descent model on the diabetes training split.
trained_parameters = LinearRegression.gradient_descent(
    X_train,
    y_train,
    learning_rate=0.1,
    epochs=3000,
)

100%|██████████████████████████████████████████████████████████████████████████| 3000/3000 [00:00<00:00, 125467.77it/s]


#### Testing

In [11]:
# Vectorized hypothesis y = theta_0 + X . theta_{1..n} over the whole test set.
# This replaces the per-sample loop (and its per-row np.insert allocation)
# that wrote into np.empty_like(y_test), which would silently inherit (and
# truncate to) the dtype of the targets.
theta = np.ravel(trained_parameters)
y_pred = theta[0] + X_test @ theta[1:]

# Test-set MSE of the gradient-descent model; sklearn's documented argument
# order is (y_true, y_pred).
my_error = mean_squared_error(y_test, y_pred)

In [12]:
# Import sklearn's estimator under an alias: importing it as `LinearRegression`
# would shadow the project's own LinearRegression class imported at the top of
# the notebook, so re-running any earlier training cell afterwards would call
# the wrong class.
from sklearn.linear_model import LinearRegression as SklearnLinearRegression

# Reference model: ordinary least squares fitted on the same training split.
reg = SklearnLinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Test-set MSE of the reference model; sklearn's documented argument order is
# (y_true, y_pred).
sklearn_error = mean_squared_error(y_test, y_pred)

In [13]:
# Report both test-set errors side by side, rounded to the nearest integer.
summary = f"""The error of my model is {round(my_error)} and the error of the sklearn linear regression model is {round(sklearn_error)}. Quite close!"""
print(summary)

The error of my model is 2639 and the error of the sklearn linear regression model is 2502. Quite close!
