In [1]:
import numpy as np
import pandas as pd

import random

from sklearn.datasets import make_regression

In [2]:
X, y = make_regression(n_samples=1000, n_features=14, n_informative=10, noise=15, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]

In [3]:
display(X.head())
print(X.shape)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13
0,1.191261,-0.30896,0.675137,-1.460501,0.053059,-1.886129,2.710794,-1.716033,0.86529,0.138078,-0.063745,-2.104583,-0.476876,1.677116
1,1.071476,-1.424766,-1.10975,-0.457677,0.399997,1.587401,-1.54757,0.323247,0.165859,-0.302097,0.203944,-0.212452,0.836991,0.368498
2,-0.045929,1.868603,-0.016568,-0.484258,1.089905,-1.14716,0.590744,0.683325,-0.571184,-0.802199,-0.220114,0.034808,0.043829,0.955803
3,-1.487154,2.220322,0.718332,1.682888,-0.420986,-0.054746,1.900832,-0.101198,0.090042,-0.202924,0.340865,0.606237,-0.037008,-0.841048
4,0.344054,0.657763,0.348342,-0.41743,-0.589112,1.057814,-0.487705,-0.89783,-0.935596,-1.186993,1.074333,-0.069532,-0.177918,-0.912811


(1000, 14)


In [4]:
display(y.head())
print(y.shape)

0    -48.005272
1    145.801614
2    -49.114775
3     24.902238
4   -152.611643
dtype: float64

(1000,)


In [5]:
class MyLineReg():

    def __init__(self, n_iter = 100, learning_rate = 0.1, metric = None, reg = None, l1_coef = 0, l2_coef = 0):
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.metric = metric
        self.reg = reg
        self.l1_coef = l1_coef
        self.l2_coef = l2_coef
        self.weights = None


    def __repr__(self):
        return f'MyLineReg: n_iter={self.n_iter}, learning_rate={self.learning_rate}'


    def fit(self, X: pd.DataFrame, y: pd.Series, verbose: int = False):
        X = X.copy()
        X.insert(loc=0, column='x0', value=1)
        self.weights = np.ones(X.shape[1])

        for iter in range(1, self.n_iter+1):
            pred = np.dot(X, self.weights)
            grad = 2*(np.dot((pred-y), X)/len(y))

            if self.reg:
                if self.reg == 'l1':
                    grad += self.l1_coef * np.sign(self.weights)
                if self.reg == 'l2':
                    grad += self.l2_coef * 2 * self.weights
                if self.reg == 'elasticnet':
                    grad += self.l1_coef * np.sign(self.weights) + self.l2_coef * 2 * self.weights

            self.weights -= grad*self.learning_rate

            if verbose:

                if self.reg:
                    if self.reg == 'l1':
                        reg = self.l1_coef * np.sum(np.abs(self.weights))
                    if self.reg == 'l2':
                        reg = self.l2_coef * np.sum((self.weights)**2)
                    if self.reg == 'elasticnet':
                        reg = self.l1_coef * np.sum(np.abs(self.weights))
                        reg += self.l2_coef * np.sum((self.weights)**2)
                else:
                    reg = 0

                if iter==1:
                    loss = np.mean((pred-y)**2) + reg
                    res = f'start | loss: {loss}'
                    print_flag=True

                if iter%verbose==0:
                    pred = np.dot(X, self.weights)
                    loss = np.mean((pred-y)**2) + reg
                    res = f'{iter} | loss: {loss}'
                    print_flag=True
                    
                if self.metric:
                    metric_val = self.metric_calc(pred, y)
                    res += f' | {self.metric}: {metric_val}'
                    
                if print_flag:
                    print(res)
                    print_flag=False

    
    def metric_calc(self, pred: pd.Series, y: pd.Series):
        if self.metric:
            if self.metric == 'mae':
                return np.mean(np.abs(y - pred))
            if self.metric == 'mse':
                return np.mean((y-pred)**2)
            if self.metric == 'rmse':
                return np.mean((y-pred)**2)**(1/2)
            if self.metric == 'mape':
                return 100*np.mean(np.abs((y-pred)/y))
            if self.metric == 'r2':
                return 1 - (np.sum((y-pred)**2)/(np.sum((y-np.mean(y))**2)))


    def predict(self, X: pd.DataFrame):
        X = X.copy()
        X.insert(loc=0, column='x0', value=1)
        pred = np.dot(X, self.weights)
        return pred


    def get_coef(self):
        return self.weights[1:]
    

    def get_best_score(self):
        return self.metric_calc(self.predict(X), y)

In [6]:
mlg = MyLineReg(metric='rmse', reg='elasticnet', l1_coef=0.1, l2_coef=0.1)
print(mlg)

MyLineReg: n_iter=100, learning_rate=0.1


In [7]:
mlg.fit(X, y, 10)

start | loss: 20727.484011132256 | rmse: 143.60045138779506
10 | loss: 2388.1165579522212 | rmse: 29.589506906380585
20 | loss: 2233.6016513193213 | rmse: 21.115549756870802
30 | loss: 2231.8768123405744 | rmse: 20.393172570876242
40 | loss: 2231.853419687286 | rmse: 20.319748292100446
50 | loss: 2231.8530479223996 | rmse: 20.311627749252818
60 | loss: 2231.8530414638185 | rmse: 20.310678469476322
70 | loss: 2231.853041346001 | rmse: 20.31056269697826
80 | loss: 2231.8530413437943 | rmse: 20.31054810662919
90 | loss: 2231.853041343752 | rmse: 20.3105462211666
100 | loss: 2231.8530413437516 | rmse: 20.310545972861664


In [8]:
mlg.get_coef()

array([ 3.87293813e+01,  1.48469147e+01,  3.32686384e-02,  5.95640127e+01,
        4.31644596e+01,  5.60334822e+01,  2.19319082e-01, -6.86737980e-01,
        5.38818702e+01,  4.87116304e+01, -2.17163306e-01,  1.28972267e+01,
        1.49632544e+01,  4.32133170e+01])

In [9]:
mlg.predict(X)[:5]

array([ -55.00051504,  119.06857942,  -47.36597507,   21.17791684,
       -118.39194512])

In [10]:
mlg.get_best_score()

20.310545972861664