In [1]:
import numpy as np
import pandas as pd

import random

from sklearn.datasets import make_regression

In [2]:
# Synthetic regression problem: 1000 rows, 14 features of which 10 are informative.
# random_state is fixed so the dataset is identical on every run.
X, y = make_regression(
    n_samples=1000,
    n_features=14,
    n_informative=10,
    noise=15,
    random_state=42,
)
# Wrap the raw arrays in pandas containers; features are named col_0 ... col_13.
X = pd.DataFrame(X, columns=[f'col_{idx}' for idx in range(X.shape[1])])
y = pd.Series(y)

In [3]:
# Sanity check of the feature matrix: first rows (rich display) and overall shape.
display(X.head())
print(X.shape)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13
0,1.191261,-0.30896,0.675137,-1.460501,0.053059,-1.886129,2.710794,-1.716033,0.86529,0.138078,-0.063745,-2.104583,-0.476876,1.677116
1,1.071476,-1.424766,-1.10975,-0.457677,0.399997,1.587401,-1.54757,0.323247,0.165859,-0.302097,0.203944,-0.212452,0.836991,0.368498
2,-0.045929,1.868603,-0.016568,-0.484258,1.089905,-1.14716,0.590744,0.683325,-0.571184,-0.802199,-0.220114,0.034808,0.043829,0.955803
3,-1.487154,2.220322,0.718332,1.682888,-0.420986,-0.054746,1.900832,-0.101198,0.090042,-0.202924,0.340865,0.606237,-0.037008,-0.841048
4,0.344054,0.657763,0.348342,-0.41743,-0.589112,1.057814,-0.487705,-0.89783,-0.935596,-1.186993,1.074333,-0.069532,-0.177918,-0.912811


(1000, 14)


In [4]:
# Sanity check of the target vector: first values and length.
display(y.head())
print(y.shape)

0    -48.005272
1    145.801614
2    -49.114775
3     24.902238
4   -152.611643
dtype: float64

(1000,)


In [105]:
class MyLineReg():
    """Linear regression trained with full-batch gradient descent on the MSE loss.

    Parameters
    ----------
    n_iter : int, default 100
        Number of gradient-descent steps performed by ``fit``.
    learning_rate : float, default 0.1
        Step size the gradient is multiplied by on every step.
    metric : str or None, default None
        Optional quality metric shown in the training log and returned by
        ``get_best_score``. Supported names: 'mae', 'mse', 'rmse', 'mape', 'r2'.
    """

    # Supported metrics, each a function of (pred, y) in the same argument
    # order as ``metric_calc``.
    # NOTE: 'mape' divides by y, so targets equal to zero produce inf/nan.
    _METRICS = {
        'mae': lambda pred, y: np.mean(np.abs(y - pred)),
        'mse': lambda pred, y: np.mean((y - pred)**2),
        'rmse': lambda pred, y: np.mean((y - pred)**2)**(1/2),
        'mape': lambda pred, y: 100*np.mean(np.abs((y - pred)/y)),
        'r2': lambda pred, y: 1 - (np.sum((y - pred)**2)/(np.sum((y - np.mean(y))**2))),
    }

    def __init__(self, n_iter = 100, learning_rate = 0.1, metric = None):
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.metric = metric
        # Weight vector; element 0 is the intercept. None until fit() is called.
        self.weights = None
        # Metric value on the training data after the last step; set by fit().
        self._best_score = None


    def __repr__(self):
        return f'MyLineReg: n_iter={self.n_iter}, learning_rate={self.learning_rate}'


    @staticmethod
    def _design_matrix(X):
        """Return ``X`` as a NumPy array with a leading column of ones (intercept term)."""
        augmented = X.copy()
        # allow_duplicates keeps this working when X already has a column named 'x0'.
        augmented.insert(loc=0, column='x0', value=1, allow_duplicates=True)
        return np.asarray(augmented)


    def _metric_fn(self):
        """Return the function for ``self.metric``, or None when no metric is configured.

        Raises
        ------
        ValueError
            If ``self.metric`` is set to a name that is not supported.
        """
        if not self.metric:
            return None
        if self.metric not in self._METRICS:
            raise ValueError(
                f'Unknown metric {self.metric!r}; expected one of {sorted(self._METRICS)}'
            )
        return self._METRICS[self.metric]


    def _log(self, label, pred, y):
        """Print one training-log line: the MSE loss and, if configured, the metric."""
        loss = np.mean((pred-y)**2)
        line = f'{label} | loss: {loss}'
        if self.metric:
            line += f' | {self.metric}: {self.metric_calc(pred, y)}'
        print(line)


    def fit(self, X: pd.DataFrame, y: pd.Series, verbose: int = False):
        """Fit the weights by gradient descent, starting from a vector of ones.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix; an intercept column is added internally.
        y : pd.Series
            Target values, aligned with the rows of ``X`` by position.
        verbose : int, default False
            When truthy, log the loss (and metric) before the first update
            ("start") and after every ``verbose``-th iteration.

        Raises
        ------
        ValueError
            If ``self.metric`` is not a supported metric name.
        """
        # Fail fast on a misspelled metric instead of after the whole training run.
        self._metric_fn()

        # Convert once; the original re-converted the DataFrame on every np.dot call.
        design = self._design_matrix(X)
        self.weights = np.ones(design.shape[1])

        for step in range(1, self.n_iter+1):
            pred = np.dot(design, self.weights)

            if verbose and step == 1:
                # "start" reflects the initial weights, before any update.
                self._log('start', pred, y)

            grad = 2*(np.dot((pred-y), design)/len(y))
            self.weights -= grad*self.learning_rate

            if verbose and step%verbose == 0:
                # Periodic lines reflect the weights after this step's update.
                self._log(step, np.dot(design, self.weights), y)

        # Remember the final training metric so get_best_score() does not need
        # access to the training data (or to any notebook-level globals).
        self._best_score = self.metric_calc(np.dot(design, self.weights), y)


    def metric_calc(self, pred: pd.Series, y: pd.Series):
        """Compute the configured metric for predictions ``pred`` against targets ``y``.

        Returns
        -------
        float or None
            The metric value, or None when no metric is configured.

        Raises
        ------
        ValueError
            If ``self.metric`` is not a supported metric name.
        """
        metric_fn = self._metric_fn()
        if metric_fn is None:
            return None
        return metric_fn(pred, y)


    def predict(self, X: pd.DataFrame):
        """Return predictions for ``X`` as a NumPy array (intercept added internally)."""
        return np.dot(self._design_matrix(X), self.weights)


    def get_coef(self):
        """Return the learned feature weights, excluding the intercept."""
        return self.weights[1:]


    def get_best_score(self):
        """Return the configured metric on the training data after the last fit.

        Returns None when no metric is configured or ``fit`` has not been called.
        """
        return self._best_score

In [106]:
# Model with default n_iter / learning_rate and MAPE as the reported metric.
# print() goes through __repr__, which does not include the metric.
mlg = MyLineReg(metric='mape')
print(mlg)

MyLineReg: n_iter=100, learning_rate=0.1


In [107]:
# Train on the full dataset; verbose=10 logs the loss and metric at the start and every 10th iteration.
mlg.fit(X, y, 10)

start | loss: 20621.089638778492 | mape: 101.14798658984488
10 | loss: 510.87011170243284 | mape: 66.20238338348034
20 | loss: 228.98517225684174 | mape: 65.54462635969739
30 | loss: 223.78598384435142 | mape: 65.98506101799835
40 | loss: 223.66811333728646 | mape: 66.03351840331861
50 | loss: 223.6650536997468 | mape: 66.03644241795023
60 | loss: 223.6649675479 | mape: 66.03633092136458
70 | loss: 223.66496500640952 | mape: 66.03624592963196
80 | loss: 223.66496492945188 | mape: 66.03622450939804
90 | loss: 223.6649649270873 | mape: 66.03622013492243
100 | loss: 223.66496492701407 | mape: 66.0362193137032


In [108]:
# Learned feature weights; the intercept (first weight) is excluded.
mlg.get_coef()

array([42.91964777, 16.59743276,  0.49967378, 65.37467671, 47.59325078,
       61.77751235,  0.24742523, -0.10563466, 60.01750602, 53.89234067,
       -0.47496694, 14.42781294, 17.6888767 , 47.31351483])

In [109]:
# First five in-sample predictions.
mlg.predict(X)[:5]

array([ -61.95121506,  131.6819176 ,  -52.07607356,   23.4480151 ,
       -131.73760946])

In [110]:
# Configured metric (MAPE here) of the fitted model on X, y.
mlg.get_best_score()

66.0362193137032