In [1]:
import numpy as np
import pandas as pd

import random

from sklearn.datasets import make_classification

In [2]:
# Reproducible synthetic binary-classification problem: 1000 samples,
# 14 features (10 of them informative), fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=14,
    n_informative=10,
    random_state=42,
)
# Wrap the raw arrays in pandas containers; features are named col_0 … col_13.
X = pd.DataFrame(X, columns=[f'col_{idx}' for idx in range(X.shape[1])])
y = pd.Series(y)

In [3]:
# Display the generated feature frame (pandas truncates the repr for long frames).
X

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13
0,0.212170,0.588157,0.373931,-1.213137,-0.431857,-1.805413,0.374652,-0.328778,1.662872,0.682909,-0.635783,-0.096290,-0.706476,1.475155
1,-2.136309,-0.340340,-1.518135,2.791709,-0.348785,-0.697299,-3.616860,-0.644757,-3.150791,0.153453,-1.975852,1.927038,-0.225723,1.335919
2,-2.995246,0.418912,-1.147293,4.705204,0.109306,-0.134241,0.297226,2.162918,-6.801806,1.573732,0.234367,-0.348181,-3.033989,-2.326364
3,-4.434309,1.959684,0.313601,0.497666,0.864826,2.565846,-1.654235,-1.603219,1.411960,-0.621943,-2.532930,-0.387911,0.313242,4.148565
4,-6.564804,0.302972,0.394640,1.189341,-2.472628,4.819816,-1.126806,-1.453735,-2.691496,-0.259630,-1.587911,-0.205920,-0.589160,0.759591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.526731,-1.008107,-4.400507,4.747371,-0.875473,-3.828352,-2.089767,-2.770453,-3.079070,1.040369,-0.038261,3.565952,0.084973,1.630007
996,-2.103575,0.431284,-0.754849,1.570126,-0.625290,2.505303,1.858279,-0.591713,2.611486,-2.494160,-0.771243,1.455433,-1.174142,0.142873
997,-7.547149,-0.604030,-3.230908,1.485347,-1.663776,1.225091,-2.065999,2.810627,-4.968167,-0.477255,-3.990299,-1.503262,0.219204,-1.434965
998,-4.256086,-1.225535,-2.306923,0.987779,1.735099,3.331157,2.363935,1.343506,1.423395,0.452345,-2.211194,0.828361,0.134801,-1.086124


In [4]:
# Display the target labels.
y

0      0
1      0
2      0
3      1
4      1
      ..
995    0
996    1
997    1
998    0
999    0
Length: 1000, dtype: int32

In [11]:
class MyLogReg:
    """Binary logistic regression trained with full-batch gradient descent.

    An intercept term is handled internally: a column of ones is prepended to
    the feature matrix, so ``weights[0]`` is the intercept and ``weights[1:]``
    are the per-feature coefficients.

    Parameters
    ----------
    n_iter : int, default 10
        Number of gradient-descent steps performed by ``fit``.
    learning_rate : float, default 0.1
        Step size applied to the gradient at every iteration.

    Attributes
    ----------
    weights : numpy.ndarray or None
        Learned parameters (intercept first); ``None`` until ``fit`` is called.
    """

    def __init__(self, n_iter: int = 10, learning_rate: float = 0.1):
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.weights = None

    def __str__(self) -> str:
        """Return a one-line summary of the hyperparameters."""
        # Learned state is skipped by name rather than by truthiness: testing
        # `if value` on a fitted weights array raises "truth value of an array
        # is ambiguous", and it would also hide legitimate zero-valued
        # hyperparameters.
        params = [
            f'{key}={value}'
            for key, value in self.__dict__.items()
            if key != 'weights'
        ]
        return 'MyLogReg class: ' + ', '.join(params)

    @staticmethod
    def _design_matrix(X) -> np.ndarray:
        """Return a float matrix of ``X`` with a leading column of ones.

        The caller's object is never modified.
        """
        if isinstance(X, pd.DataFrame):
            # Ignore an 'ones_col' left behind by code that used to add the
            # intercept column to the caller's frame in place.
            X = X.drop(columns='ones_col', errors='ignore')
        features = np.asarray(X, dtype=float)
        return np.hstack([np.ones((features.shape[0], 1)), features])

    @staticmethod
    def _sigmoid(z):
        """Logistic function 1 / (1 + exp(-z)), applied element-wise."""
        return 1 / (1 + np.exp(-z))

    def fit(self, X: pd.DataFrame, y: pd.Series, verbose=False):
        """Fit the model by gradient descent on the mean log-loss.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix, one row per sample. It is not modified.
        y : pandas.Series
            Binary targets (0/1), one per row of ``X``.
        verbose : int or bool, default False
            When falsy nothing is printed. Otherwise the loss is printed at
            the first iteration (labelled ``start``) and at every iteration
            whose number is divisible by ``verbose``. The reported loss is
            evaluated before that iteration's weight update.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` are empty or have different numbers of rows.
        """
        design = self._design_matrix(X)
        target = np.asarray(y, dtype=float)
        n_samples = target.shape[0]
        if n_samples == 0 or design.shape[0] != n_samples:
            raise ValueError(
                f'X and y must be non-empty and have the same number of rows '
                f'(got {design.shape[0]} and {n_samples})'
            )

        eps = 1e-15  # keeps log() finite when a probability saturates at 0 or 1
        self.weights = np.ones(design.shape[1])
        for step in range(1, self.n_iter + 1):
            y_pred = self._sigmoid(design @ self.weights)
            # The loss is only needed for logging, so skip it on silent steps.
            if verbose and (step == 1 or step % verbose == 0):
                logloss = -np.mean(
                    target * np.log(y_pred + eps)
                    + (1 - target) * np.log(1 - y_pred + eps)
                )
                label = 'start' if step == 1 else step
                print(f'{label} | loss: {logloss}')
            gradient = (y_pred - target) @ design / n_samples
            self.weights = self.weights - self.learning_rate * gradient

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of ``X``.

        ``X`` is not modified. The result is a 1-D numpy array.
        """
        return self._sigmoid(self._design_matrix(X) @ self.weights)

    def predict(self, X):
        """Return hard 0/1 predictions as a Series (1 where probability > 0.5)."""
        return pd.Series((self.predict_proba(X) > 0.5).astype('int64'))

    def get_coef(self):
        """Return the feature coefficients, i.e. the weights without the intercept."""
        return self.weights[1:]

In [6]:
# 50 gradient-descent steps; learning_rate keeps its default.
mlr = MyLogReg(n_iter=50)
print(mlr)

MyLogReg class: n_iter=50, learning_rate=0.1


In [7]:
# Train on the full dataset, printing the loss at the first step and then every 10th iteration.
mlr.fit(X, y, verbose=10)

start | loss: 3.6736886876615538
10 | loss: 1.8729518266222953
20 | loss: 1.1626945313705068
30 | loss: 0.8288847284427477
40 | loss: 0.654884590222696
50 | loss: 0.5606818660628202


In [12]:
# Hard 0/1 predictions for the training data; predict() returns a pandas Series.
# NOTE(review): the recorded output of this cell is a single number rather than
# a Series repr, so it looks stale — re-run the notebook top to bottom to refresh it.
mlr.predict(X)

611

In [9]:
# Mean of the learned feature coefficients (get_coef() leaves out the intercept weight).
np.mean(mlr.get_coef())

0.3330418371591503

In [10]:
# Learned coefficient for each feature column, in column order (intercept excluded).
mlr.get_coef()

array([ 0.05247374,  0.63579113,  0.35707211,  0.20338172, -0.71400341,
        0.65306157, -0.30353191,  0.74286901,  0.14726892,  0.16403292,
        0.81238906,  0.0167088 ,  0.67663333,  1.21843872])