In [262]:
import numpy as np
import pandas as pd

import random

from sklearn.datasets import make_classification

In [263]:
# Synthetic binary-classification problem: 1000 rows, 14 features (10 informative).
X, y = make_classification(
    n_samples=1000,
    n_features=14,
    n_informative=10,
    random_state=42,
)
# Wrap the raw arrays in pandas containers; features are named col_0 ... col_13.
X = pd.DataFrame(X, columns=[f'col_{idx}' for idx in range(X.shape[1])])
y = pd.Series(y)

In [264]:
# Preview the first rows of the feature frame and confirm its dimensions.
display(X.head())
print(X.shape)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13
0,0.21217,0.588157,0.373931,-1.213137,-0.431857,-1.805413,0.374652,-0.328778,1.662872,0.682909,-0.635783,-0.09629,-0.706476,1.475155
1,-2.136309,-0.34034,-1.518135,2.791709,-0.348785,-0.697299,-3.61686,-0.644757,-3.150791,0.153453,-1.975852,1.927038,-0.225723,1.335919
2,-2.995246,0.418912,-1.147293,4.705204,0.109306,-0.134241,0.297226,2.162918,-6.801806,1.573732,0.234367,-0.348181,-3.033989,-2.326364
3,-4.434309,1.959684,0.313601,0.497666,0.864826,2.565846,-1.654235,-1.603219,1.41196,-0.621943,-2.53293,-0.387911,0.313242,4.148565
4,-6.564804,0.302972,0.39464,1.189341,-2.472628,4.819816,-1.126806,-1.453735,-2.691496,-0.25963,-1.587911,-0.20592,-0.58916,0.759591


(1000, 14)


In [265]:
# Preview the first target labels and confirm the series length.
display(y.head())
print(y.shape)

0    0
1    0
2    0
3    1
4    1
dtype: int32

(1000,)


In [273]:
class MyLogReg():
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    n_iter : int
        Number of gradient-descent steps performed by ``fit``.
    learning_rate : float
        Step size applied to the gradient on every iteration.
    metric : str or None
        Optional quality metric reported during verbose training and returned
        by ``get_best_score``. Supported values: ``'accuracy'``,
        ``'precision'``, ``'recall'``, ``'f1'``, ``'roc_auc'``.
    """

    # Small constant that keeps log() arguments and denominators away from zero.
    _EPS = 1e-15
    _METRICS = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')

    def __init__(self, n_iter = 100, learning_rate = 0.01, metric = None):
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.metric = metric
        self.weights = None
        # Metric value on the training data, filled in by fit().
        self._best_score = None


    def __repr__(self) -> str:
        return f'MyLogReg class: n_iter={self.n_iter}, learning_rate={self.learning_rate}'


    @staticmethod
    def _add_bias(X):
        """Return ``X`` as a float matrix with a leading column of ones."""
        features = np.asarray(X, dtype=float)
        return np.column_stack((np.ones(features.shape[0]), features))


    def _log_loss(self, proba, target):
        """Mean binary cross-entropy; probabilities are padded with ``_EPS``."""
        eps = self._EPS
        return -np.mean(target*np.log(proba + eps) + (1 - target)*np.log(1 - proba + eps))


    def _report(self, label, proba, target):
        """Print one training-log line: loss and, if configured, the metric."""
        line = f'{label} | loss: {self._log_loss(proba, target)}'
        if self.metric:
            line += f' | {self.metric}: {self.metric_calc(proba, target)}'
        print(line)


    def fit(self, X: pd.DataFrame, y: pd.Series, verbose: int = False):
        """Fit the weights by gradient descent on the log-loss.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix, one row per sample. The input is not modified.
        y : pd.Series
            Binary targets (0/1), matched to the rows of ``X`` by position.
        verbose : int
            When truthy, print the loss (and metric, if set) once before
            training starts and then after every ``verbose``-th iteration.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` contain a different number of samples.
        """
        # Plain arrays avoid pandas index alignment and a clash with an
        # existing column name when the bias column is added.
        design = self._add_bias(X)
        target = np.asarray(y, dtype=float)
        if design.shape[0] != target.shape[0]:
            raise ValueError(
                f'X has {design.shape[0]} rows but y has {target.shape[0]} values'
            )

        self.weights = np.ones(design.shape[1])

        if verbose:
            # Baseline with the initial weights, logged before any update.
            self._report('start', self.sigmoid(np.dot(design, self.weights)), target)

        for step in range(1, self.n_iter + 1):
            proba = self.sigmoid(np.dot(design, self.weights))
            grad = np.dot(proba - target, design)/len(target)
            self.weights -= grad*self.learning_rate

            if verbose and step % verbose == 0:
                self._report(step, self.sigmoid(np.dot(design, self.weights)), target)

        # Remember the final training score so get_best_score() does not
        # depend on variables that live outside the model.
        if self.metric:
            final_proba = self.sigmoid(np.dot(design, self.weights))
            self._best_score = self.metric_calc(final_proba, target)
        else:
            self._best_score = None


    def predict_proba(self, X:pd.DataFrame):
        """Return the predicted probability of class 1 for each row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.weights is None:
            raise RuntimeError('MyLogReg is not fitted yet; call fit() first')
        return pd.Series(self.sigmoid(np.dot(self._add_bias(X), self.weights)))


    def predict(self, X:pd.DataFrame):
        """Return hard 0/1 labels: 1 where the probability exceeds 0.5."""
        return (self.predict_proba(X) > 0.5).astype(int)


    def metric_calc(self, pred: pd.Series, y: pd.Series):
        """Evaluate ``self.metric`` for predicted probabilities against labels.

        Parameters
        ----------
        pred : pd.Series
            Predicted probabilities of class 1.
        y : pd.Series
            True binary labels, matched to ``pred`` by position.

        Returns
        -------
        float or None
            The metric value, or ``None`` when no metric is configured.

        Raises
        ------
        ValueError
            If ``self.metric`` is not one of the supported names.
        """
        if self.metric is None:
            return None
        if self.metric not in self._METRICS:
            raise ValueError(
                f'Unknown metric {self.metric!r}; expected one of {self._METRICS}'
            )

        scores = np.asarray(pred, dtype=float)
        labels = np.asarray(y)

        if self.metric == 'roc_auc':
            return self._roc_auc(scores, labels)

        # Confusion-matrix counts at the 0.5 decision threshold.
        predicted_pos = scores > 0.5
        actual_pos = labels == 1
        actual_neg = labels == 0
        TP = int(np.sum(predicted_pos & actual_pos))
        TN = int(np.sum(~predicted_pos & actual_neg))
        FP = int(np.sum(predicted_pos & actual_neg))
        # Everything else is counted as a false negative.
        FN = len(labels) - TP - TN - FP

        if self.metric == 'accuracy':
            return (TP+TN)/(TP+TN+FP+FN)

        precision = TP/(TP+FP+self._EPS)
        recall = TP/(TP+FN+self._EPS)
        if self.metric == 'precision':
            return precision
        if self.metric == 'recall':
            return recall
        # Only 'f1' remains.
        return (2*precision*recall)/(precision+recall+self._EPS)


    @staticmethod
    def _roc_auc(scores, labels):
        """ROC AUC from the rank-sum (Mann-Whitney U) statistic.

        A positive/negative pair with equal scores contributes 0.5, which the
        average-rank treatment of ties provides. Returns ``nan`` when only one
        class is present, because the AUC is undefined in that case.
        """
        positive = labels == 1
        n_pos = int(np.sum(positive))
        n_neg = len(labels) - n_pos
        if n_pos == 0 or n_neg == 0:
            return float('nan')
        ranks = pd.Series(scores).rank(method='average').to_numpy()
        u_stat = ranks[positive].sum() - n_pos*(n_pos + 1)/2
        return float(u_stat/(n_pos*n_neg))


    def sigmoid(self, value):
        """Logistic function 1 / (1 + exp(-value)), applied element-wise."""
        return 1/(1+np.exp(-value))


    def get_coef(self):
        """Return the learned feature weights without the bias term."""
        return self.weights[1:]


    def get_best_score(self):
        """Return the configured metric on the training data after ``fit``.

        Returns ``None`` when no metric is configured or the model has not
        been fitted.
        """
        return self._best_score

In [274]:
# Positional arguments are n_iter=50, learning_rate=0.1, metric='roc_auc'.
log_reg = MyLogReg(50, 0.1, 'roc_auc')
print(log_reg)

MyLogReg class: n_iter=50, learning_rate=0.1


In [275]:
# Train on the full dataset; verbose=10 logs loss and metric every 10th iteration.
log_reg.fit(X, y, verbose=10)

start | loss: 3.6742056254341415 | roc_auc: 0.5326141304565218
10 | loss: 1.7627744144564494 | roc_auc: 0.6219904879619519
20 | loss: 1.1191187560143865 | roc_auc: 0.729254917019668
30 | loss: 0.8058902605465909 | roc_auc: 0.8050832203328814
40 | loss: 0.6428121701954602 | roc_auc: 0.8475193900775603
50 | loss: 0.5538400820040549 | roc_auc: 0.8720714882859532


In [276]:
# Average of the learned feature weights (get_coef() excludes the bias term).
np.mean(log_reg.get_coef())

0.33304183715915026

In [277]:
# Hard 0/1 predictions for the training features (probability threshold 0.5).
log_reg.predict(X)

0      1
1      1
2      0
3      1
4      1
      ..
995    0
996    1
997    0
998    0
999    1
Length: 1000, dtype: int64

In [278]:
# Score of the fitted model under its configured metric (roc_auc here).
log_reg.get_best_score()

0.8720714882859532