In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score




In [17]:
class MatrixPredict:
    def __init__(self, l2=0.0):
        self.l2 = l2

    def fit(self, X_train, y_train):
        X = np.insert(X_train, 0, 1, axis=1)
        
        _, n_features = X.shape

        reg_matrix = self.l2 * np.eye(n_features)
        reg_matrix[0, 0] = 0
        
        weights = np.linalg.multi_dot([np.linalg.inv(X.T @ X + reg_matrix), X.T, y_train])
        self.bias, self.weights = weights[0], weights[1:]
    
    def predict(self, X):
        return np.matmul(X, self.weights) + self.bias

class GBLR:
    def __init__(self, tolerance=0.001, learning_rate=0.01, l1=0.0):
        self.learning_rate = learning_rate
        self.tolerance = tolerance
        self.l1 = l1
    
    def fit(self, X_train, y_train):
        n_samples, n_features = X_train.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        p_dw = np.zeros(n_features)
        p_db = 0

        while True:
            y_pred = X_train @ self.weights + self.bias
            error = y_pred - y_train

            db = 1 / n_samples * np.sum(error)
            dw = 1 / n_samples * X_train.T @ error

            dw += self.l1*np.sign(self.weights)

            self.bias -= self.learning_rate * db
            self.weights -= self.learning_rate * dw
            
            abs_db_diff = np.abs(db - p_db)
            abs_dw_diff = np.abs(dw - p_dw)

            if abs_db_diff < self.tolerance and abs_dw_diff.all() < self.tolerance:
                break

            p_db, p_dw = db, dw
        
    def predict(self, X_test):
        return X_test @ self.weights + self.bias

In [3]:
df_path = "dataset_income.csv"


income = pd.read_csv(df_path)
X1, y1 = income.iloc[:, :-1].values, income.iloc[:, -1].values
X1_scaled = scale(X1)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state=0)
X1_train_s, X1_test_s, y1_train, y1_test = train_test_split(X1_scaled, y1, random_state=0)
income

Unnamed: 0,age,experience,income
0,25,1,30450
1,30,3,35670
2,47,2,31580
3,32,5,40130
4,43,10,47830
5,51,7,41630
6,28,5,41340
7,33,4,37650
8,37,5,40250
9,39,8,45150


In [15]:
best_r2_score, best_lr_mape = 0, 100

best_matrix_r2 = MatrixPredict(0)
best_matrix_lr_mape = MatrixPredict(0)

for l in range(100):

    matrix_linear_reg = MatrixPredict(l)
    matrix_linear_reg.fit(X1_train, y1_train)
    matrix_pred = matrix_linear_reg.predict(X1_test)
    
    matrix_lr_r2 = r2_score(y1_test, matrix_pred)
    matrix_lr_mape = mean_absolute_percentage_error(y1_test, matrix_pred)

    if matrix_lr_r2 > best_r2_score: 
        best_r2_score = matrix_lr_r2
        best_matrix_r2 = matrix_linear_reg

    if matrix_lr_mape < best_lr_mape: 
        best_lr_mape = matrix_lr_mape
        best_matrix_lr_mape = matrix_linear_reg
    
print(f'Matrix Linear regression best R2 score: {best_r2_score} {best_matrix_r2.l2}')
print(f'Matrix Linear regression best MAPE: {best_lr_mape} {best_matrix_lr_mape.l2}', '\n')

print(f'weights: {best_matrix_r2.bias, *best_matrix_r2.weights} {best_matrix_r2.l2}')
print(f'weights: {best_matrix_lr_mape.bias, *best_matrix_lr_mape.weights} {best_matrix_lr_mape.l2}')

Matrix Linear regression best R2 score: 0.9310681763428674 4
Matrix Linear regression best MAPE: 0.0441578715890813 14 

weights: (np.float64(31558.350420520463), np.float64(-95.27965249488973), np.float64(2118.510516969198)) 4
weights: (np.float64(31161.937354674355), np.float64(-67.57490211394614), np.float64(2002.677910229932)) 14


In [27]:
l1_values = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 110, 120, 400]

best_r2_score, best_lr_mape = 0, 100

best_reg_r2 = GBLR(0.0001)
best_reg_lr_mape = GBLR(0.0001)

for l1_t in l1_values: 
    linear_reg = GBLR(l1=l1_t)
    linear_reg.fit(X1_train_s, y1_train)
    lr_pred = linear_reg.predict(X1_test_s)
    
    lr_r2 = r2_score(y1_test, lr_pred)
    lr_mape = mean_absolute_percentage_error(y1_test, lr_pred)

    if lr_r2 > best_r2_score: 
        best_r2_score = lr_r2
        best_reg_r2 = linear_reg

    if lr_mape < best_lr_mape: 
        best_lr_mape = lr_mape
        best_reg_lr_mape = linear_reg

print(f'Matrix Linear regression best R2 score: {best_r2_score} {best_reg_r2.l1}')
print(f'Matrix Linear regression best MAPE: {best_lr_mape} {best_reg_lr_mape.l1}', '\n')

print(f'weights: {best_reg_r2.bias, *best_reg_r2.weights} {best_reg_r2.l1}')
print(f'weights: {best_reg_lr_mape.bias, *best_reg_lr_mape.weights} {best_reg_lr_mape.l1}')

Matrix Linear regression best R2 score: 0.9325100994988575 400
Matrix Linear regression best MAPE: 0.04553425491012968 120 

weights: (np.float64(40841.69615997375), np.float64(-184.97069352489228), np.float64(7821.103276645644)) 400
weights: (np.float64(40898.17951055769), np.float64(-790.3418310911172), np.float64(8449.466032449627)) 120
