# Linear regression

We have:
* n - number of rows, samples, observations
* d - number of columns, features

### Equation
  $y = Xw$

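* X - entries, data - matrix of shape (n, d+1); the extra column is a column of ones for the intercept
* w - weights - vector of shape (d+1,): the intercept plus one weight per feature
* y - ground truth / target - vector of shape (n,)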

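### After transformation (normal equation)
Minimizing the squared error $\lVert Xw - y \rVert^2$ with respect to $w$ gives the closed-form solution:

  $w = (X^T X)^{-1} X^T y$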


In [626]:
# imports
from common.consts import DATA_PATH, CATEGORICAL_COLUMN_NAMES

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt
 

In [627]:
# Read the semicolon-separated dataset and strip stray whitespace from the header names.
data = pd.read_csv(DATA_PATH, sep=";").rename(columns=str.strip)

# Remove the columns listed in CATEGORICAL_COLUMN_NAMES
# (presumably leaving only numeric features - verify against common.consts).
data_numeric = data.drop(columns=CATEGORICAL_COLUMN_NAMES)

data_numeric


Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,122.0,127.3,20,0,0,0,0,0.000000,0,0,0,0,0,0.000000,0,10.8,1.4,1.74
1,160.0,142.5,19,0,6,6,6,14.000000,0,0,6,6,6,13.666667,0,13.9,-0.3,0.79
2,122.0,124.8,19,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,10.8,1.4,1.74
3,122.0,119.6,20,0,6,8,6,13.428571,0,0,6,10,5,12.400000,0,9.4,-0.8,-3.12
4,100.0,141.5,45,0,6,9,5,12.333333,0,0,6,6,6,13.000000,0,13.9,-0.3,0.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,125.0,122.2,19,0,6,7,5,13.600000,0,0,6,8,5,12.666667,0,15.5,2.8,-4.06
4420,120.0,119.0,18,0,6,6,6,12.000000,0,0,6,6,2,11.000000,0,11.1,0.6,2.02
4421,154.0,149.5,30,0,7,8,7,14.912500,0,0,8,9,1,13.500000,0,13.9,-0.3,0.79
4422,180.0,153.8,20,0,5,5,5,13.800000,0,0,5,6,5,12.000000,0,9.4,-0.8,-3.12


In [628]:
# Target: the admission grade.
y = data_numeric["Admission grade"]

# Features: every other column, passed through StandardScaler.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split further down, so test-set statistics leak into the training features.
# Consider splitting first and fitting the scaler on the training part only.
X = StandardScaler().fit_transform(data_numeric.drop(columns="Admission grade"))

In [629]:
class CustomLinearRegression:
    """Shared plumbing for the hand-written linear regression models.

    The class itself does not learn anything: subclasses are expected to
    provide ``fit`` and to store the learned weights in ``self.coefficients``
    with the intercept as the first entry, matching the column of ones that
    this class prepends to every feature matrix.
    """

    def __init__(self):
        # Learned weights (intercept first); stays None until a fit() sets it.
        self.coefficients = None

    @staticmethod
    def _design_matrix(X):
        """Return ``X`` as an array with a leading column of ones (intercept term)."""
        # np.asarray accepts DataFrames, ndarrays and plain nested lists alike,
        # so no per-type branching is needed.
        X_array = np.asarray(X)
        return np.c_[np.ones((X_array.shape[0], 1)), X_array]

    def fit_base(self, X, y):
        """Convert training data into the matrices used by the solvers.

        Parameters
        ----------
        X : array-like (DataFrame, ndarray or nested list), one row per sample
        y : array-like (Series, ndarray or list), one target value per sample

        Returns
        -------
        (X_matrix, y_matrix) : tuple of ndarrays
            ``X_matrix`` is ``X`` with a column of ones prepended;
            ``y_matrix`` is ``y`` reshaped into a single column.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of rows.
        """
        X_matrix = self._design_matrix(X)
        y_matrix = np.asarray(y).reshape(-1, 1)

        # Fail early with a readable message instead of a matmul shape error later.
        if X_matrix.shape[0] != y_matrix.shape[0]:
            raise ValueError(
                f"X has {X_matrix.shape[0]} rows but y has {y_matrix.shape[0]} values."
            )

        return X_matrix, y_matrix

    def predict(self, X):
        """Return predictions for ``X`` as the product of its design matrix and the weights.

        Raises
        ------
        ValueError
            If ``self.coefficients`` is still None, i.e. the model was never fitted.
        """
        if self.coefficients is None:
            raise ValueError("Model is not trained.")

        return self._design_matrix(X) @ self.coefficients

Separate data into training and test sets

In [630]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

My implementation of the closed form solution for linear regression

In [631]:
class ClosedFormSolution(CustomLinearRegression):
    """Ordinary least squares solved directly, without iterative optimisation."""

    def fit(self, X, y):
        """Compute the least-squares weights for ``X`` and ``y``.

        Parameters
        ----------
        X : array-like of features, one row per sample
        y : array-like of targets, one value per sample

        The result is stored in ``self.coefficients`` as a column vector with
        the intercept first.
        """
        X_matrix, y_matrix = self.fit_base(X, y)

        # Solve min ||Xw - y||^2 with lstsq instead of forming inv(X^T X):
        # for full-rank data the answer is the same as the normal equation,
        # but lstsq also copes with collinear (rank-deficient) features, where
        # the explicit inverse raises LinAlgError or amplifies rounding error.
        self.coefficients, *_ = np.linalg.lstsq(X_matrix, y_matrix, rcond=None)

My implementation of gradient descent for linear regression

In [632]:
class GradientDescent(CustomLinearRegression):
    """Linear regression fitted by (mini-)batch gradient descent on the mean squared error."""

    def fit(self, X, y, iterations = 500, batch_size = None, learning_rate = 0.002,
            random_state = None):
        """Run gradient descent starting from all-zero weights.

        Parameters
        ----------
        X : array-like of features, one row per sample
        y : array-like of targets, one value per sample
        iterations : int
            Number of update steps.
        batch_size : int or None
            Rows sampled (without replacement) for each step. ``None`` or 0
            uses every row on every step. Values larger than the number of
            rows are clamped to it.
        learning_rate : float
            Step size applied to the gradient.
        random_state : int or None
            Seed for mini-batch sampling. ``None`` keeps drawing from NumPy's
            global random state, so ``np.random.seed`` still controls the runs.

        The result is stored in ``self.coefficients`` as a column vector with
        the intercept first.
        """
        X_matrix, y_matrix = self.fit_base(X, y)
        samples_count, coef_count = X_matrix.shape

        # Sampling more rows than exist without replacement would raise inside
        # choice(); treat an oversized batch as "use every row".
        if batch_size:
            batch_size = min(batch_size, samples_count)
        rows_per_step = batch_size if batch_size else samples_count

        # d/dw of mean((Xw - y)^2) is (2 / m) * X^T (Xw - y); the scale does not
        # change between iterations, so compute it once.
        factor = 2 / rows_per_step

        # Both the legacy np.random module and a Generator expose
        # choice(a, size, replace=...), so either can serve as the sampler.
        sampler = np.random if random_state is None else np.random.default_rng(random_state)

        self.coefficients = np.zeros((coef_count, 1))

        for _ in range(iterations):
            if batch_size:
                indices = sampler.choice(samples_count, batch_size, replace = False)
                X_batch, y_batch = X_matrix[indices], y_matrix[indices]
            else:
                X_batch, y_batch = X_matrix, y_matrix

            residuals = X_batch @ self.coefficients - y_batch
            # Scale the small gradient vector rather than the whole X^T matrix.
            gradients = factor * (X_batch.T @ residuals)

            self.coefficients -= learning_rate * gradients
            

Closed form solution linear regression

In [633]:
# Fit the closed-form model on the training split, then predict the held-out test rows.
closed_form_solution = ClosedFormSolution()
closed_form_solution.fit(X_train, y_train)
y_pred_closed = closed_form_solution.predict(X_test)

Gradient descent linear regression

In [None]:
# Run 1: full-batch gradient descent with the default settings
# (500 iterations, learning rate 0.002).
gradient_descent1 = GradientDescent()
gradient_descent1.fit(X_train, y_train)
y_pred_gradient1 = gradient_descent1.predict(X_test)

# Run 2: full batch again, but with 3000 iterations.
gradient_descent2 = GradientDescent()
gradient_descent2.fit(X_train, y_train, iterations=3000)
y_pred_gradient2 = gradient_descent2.predict(X_test)

# Runs 3 and 4 sample random mini-batches through NumPy's global random state.
# Seed it so that a fresh Restart & Run All reproduces the same numbers.
np.random.seed(30)

# Run 3: mini-batches of 500 rows.
gradient_descent3 = GradientDescent()
gradient_descent3.fit(X_train, y_train, iterations=3000, batch_size=500)
y_pred_gradient3 = gradient_descent3.predict(X_test)

# Run 4: smaller mini-batches (100 rows) with a ten times larger learning rate.
gradient_descent4 = GradientDescent()
gradient_descent4.fit(X_train, y_train, iterations=3000, batch_size=100, learning_rate = 0.02)
y_pred_gradient4 = gradient_descent4.predict(X_test)


Sklearn linear regression

In [635]:
# Reference model: scikit-learn's LinearRegression fitted on the same training split.
sklearn_linear_model = LinearRegression()
sklearn_linear_model.fit(X_train, y_train)
y_pred_sklearn = sklearn_linear_model.predict(X_test)

Comparison

In [636]:
# --- Closed-form solution: test-set MSE and R2 ---
mse_closed, r2_closed = (
    mean_squared_error(y_test, y_pred_closed),
    r2_score(y_test, y_pred_closed),
)

print(
    "===== CLOSED FORM SOLUTION LINEAR REGRESSION =====",
    f"MSE: {mse_closed!s}",
    f"R2 score: {r2_closed!s}",
    sep="\n",
)

# --- Gradient descent: one (MSE, R2) pair per run, in the order the runs were fitted ---
mse_gradient1, r2_gradient1 = (
    mean_squared_error(y_test, y_pred_gradient1),
    r2_score(y_test, y_pred_gradient1),
)
mse_gradient2, r2_gradient2 = (
    mean_squared_error(y_test, y_pred_gradient2),
    r2_score(y_test, y_pred_gradient2),
)
mse_gradient3, r2_gradient3 = (
    mean_squared_error(y_test, y_pred_gradient3),
    r2_score(y_test, y_pred_gradient3),
)
mse_gradient4, r2_gradient4 = (
    mean_squared_error(y_test, y_pred_gradient4),
    r2_score(y_test, y_pred_gradient4),
)

print(
    "\n===== GRADIENT DESCENT LINEAR REGRESSION =====",
    f"MSE: {mse_gradient1!s}",
    f"R2 score: {r2_gradient1!s}",
    f"\nMSE: {mse_gradient2!s}",
    f"R2 score: {r2_gradient2!s}",
    f"\nMSE: {mse_gradient3!s}",
    f"R2 score: {r2_gradient3!s}",
    f"\nMSE: {mse_gradient4!s}",
    f"R2 score: {r2_gradient4!s}",
    sep="\n",
)

# --- scikit-learn reference model ---
mse_sklearn, r2_sklearn = (
    mean_squared_error(y_test, y_pred_sklearn),
    r2_score(y_test, y_pred_sklearn),
)

print(
    "\n===== SKLEARN LINEAR REGRESSION =====",
    f"MSE: {mse_sklearn!s}",
    f"R2: {r2_sklearn!s}",
    sep="\n",
)

===== CLOSED FORM SOLUTION LINEAR REGRESSION =====
MSE: 139.8611049198467
R2 score: 0.3689051071428803

===== GRADIENT DESCENT LINEAR REGRESSION =====
MSE: 440.79174800684655
R2 score: -0.9889834356743208

MSE: 139.45085171303336
R2 score: 0.3707562916001068

MSE: 139.48083999668953
R2 score: 0.3706209755472857

MSE: 139.20334859722996
R2 score: 0.3718730992532371

===== SKLEARN LINEAR REGRESSION =====
MSE: 139.8611049198464
R2: 0.3689051071428816


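The results are basically the same: the closed-form solution and sklearn agree to within floating-point error, and the gradient-descent runs with 3000 iterations land very close to them. The only exception is the first gradient-descent run (default 500 iterations, learning rate 0.002), which has not converged yet and therefore scores much worse.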