In [4]:
import numpy as np
import pandas as pd
import random

In [5]:
class MyLogReg:
    def __init__(self, n_iter = 10 , learning_rate = 0.1, metric = None, reg = None, l1_coef = 0, l2_coef = None, sgd_sample = None, random_state = 42):
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.weights = None
        self.metric = metric
        self.best_score = None
        self.reg = reg
        self.l1_coef = l1_coef
        self.l2_coef = l2_coef
        self.sgd_sample = sgd_sample
        self.random_state = random_state
    
    def __str__(self):
        return f"MyLogReg class: n_iter={self.n_iter}, learning_rate={self.learning_rate}"
    def _log_loss(self, y_true, y_pred, n):
        epsilon =  1e-15
        if self.reg:
            if self.reg == 'l1':
                return (-1/n) * np.sum(y_true * np.log(y_pred + epsilon) + (1-y_true) * np.log(1 - y_pred + epsilon)) * self.l1_coef * np.sum(np.abs(self.weights))
            elif self.reg == 'l2':
                return (-1/n) * np.sum(y_true * np.log(y_pred + epsilon) + (1-y_true) * np.log(1 - y_pred + epsilon)) * self.l2_coef * np.sum(self.weights ** 2)
            elif self.reg == "elasticnet":
                return (-1/n) * np.sum(y_true * np.log(y_pred + epsilon) + (1-y_true) * np.log(1 - y_pred + epsilon)) * self.l1_coef * np.sum(np.abs(self.weights)) + self.l2_coef * np.sum(self.weights ** 2)
            else:
                raise ValueError("неправильный метод регуляризации!")
        return (-1/n) * np.sum(y_true * np.log(y_pred + epsilon) + (1-y_true) * np.log(1 - y_pred + epsilon))
        
    def _gradient(self, y_true, y_pred, X, n):
        if self.reg:
            if self.reg == 'l1':
                return (1/n) * (y_pred - y_true) @ X + self.l1_coef * np.sign(self.weights)
            elif self.reg == 'l2':
                return (1/n) * (y_pred - y_true) @ X + self.l2_coef * 2 * (self.weights)
            elif self.reg == 'elasticnet':
                return (1/n) * (y_pred - y_true) @ X +  self.l1_coef * np.sign(self.weights)+ self.l2_coef * 2 * (self.weights)
        return (1/n) * (y_pred - y_true) @ X

    def _get_learning_rate(self, iteration):
        if callable(self.learning_rate):
            return (float(self.learning_rate(iteration)))
        else:
            return(float(self.learning_rate))
            
    def _confusion_matrix(self, y_true, y_pred):
        pred = np.where(y_pred > 0.5, 1, 0)
        
        tp = np.sum((y_true == 1) & (pred == 1))
        tn = np.sum((y_true == 0) & (pred == 0))
        fp = np.sum((y_true == 0) & (pred == 1))
        fn = np.sum((y_true == 1) & (pred == 0))
        return tp, tn, fp, fn
    
    def _accuracy(self, y_true, y_pred):
        tp, tn, fp, fn = self._confusion_matrix(y_true, y_pred) 
        return (tp + tn) / (tp + tn + fp + fn)
        
    def _precision(self,  y_true, y_pred):
        tp, tn, fp, fn = self._confusion_matrix(y_true, y_pred)
        return tp / (tp + fp)
    def _recall(self,  y_true, y_pred):
        tp, tn, fp, fn = self._confusion_matrix(y_true, y_pred)
        return tp / (tp + fn)
    def _f1(self,  y_true, y_pred):
        tp, tn, fp, fn = self._confusion_matrix(y_true, y_pred)
        return 2 * tp / (2 * tp + fp + fn)
        
    def _roc_auc(self, y_true, y_pred):
        Y_true = y_true.copy()
        Y_pred = y_pred.copy()
        
        sorted_indices = np.argsort(Y_pred)[::-1]
        Y_pred = Y_pred[sorted_indices]
        Y_true = Y_true[sorted_indices]
        
        Y_pred = np.round(Y_pred, 10)

        s = 0
        p = np.sum(Y_true == 1)
        n = np.sum(Y_true == 0)
        for i in range(len(Y_true)):
            if Y_true[i] == 0:
                positives = 0
                for j in range(i):
                    if Y_true[j] == 1:
                        positives += 1
                positives_same_score = 0
                for j in range(len(Y_pred)):
                    if i != j and Y_pred[i] == Y_pred[j] and Y_true[j] == 1:
                        positives_same_score += 1
                s += positives + positives_same_score * 0.5
        return s / (p * n)
    def _calculate_metric(self, y_true, y_pred):
         if self.metric == "accuracy":
            return self._accuracy(y_true, y_pred)
         elif self.metric == "precision":
            return self._precision(y_true, y_pred)
         elif self.metric == "recall":
            return self._recall(y_true, y_pred)
         elif self.metric == "f1":
            return self._f1(y_true, y_pred)
         elif self.metric == "roc_auc":
            return self._roc_auc(y_true, y_pred)
         else:
            raise ValueError("Нет такой метрики!")    
        
        
    def fit(self, X, y, verbose = False):
        random.seed(self.random_state)
        if isinstance(X, pd.DataFrame):
            X_array = X.values
        else:
             X_array = np.asarray(X)
            
        if isinstance(y, pd.DataFrame):
            y_array = y.values.flatten()
        else:
            y_array = np.asarray(y)
            
        n_samples, n_features = X.shape
        X_array = np.c_[np.ones(n_samples), X_array]
        self.weights = np.ones(n_features + 1)
        for i in range(1, self.n_iter + 1):
            
           
            lr = self._get_learning_rate(i)
            z = X_array @ self.weights
            y_pred = 1 / (1 + np.exp(-z)) 
            
            if self.sgd_sample:
                if isinstance(self.sgd_sample, int):
                    sgd_sample = self.sgd_sample
                elif isinstance(self.sgd_sample, float):
                    sgd_sample = round(X.shape[0] * self.sgd_sample)
                else:
                    raise ValueError("SGD sample должен быть целым или дробным числом!")
                sample_rows_idx = random.sample(range(X.shape[0]), sgd_sample)
                X_train = X_array[sample_rows_idx]
                y_train = y_array[sample_rows_idx]
                z_batch = X_train @ self.weights
                y_pred_batch = 1 / (1 + np.exp(-z_batch))
                grad = self._gradient(y_train, y_pred_batch, X_train, len(X_train))
            else:
                grad = self._gradient(y_array, y_pred, X_array, n_samples)
                
            loss = self._log_loss(y_array, y_pred, n_samples) 
            self.weights -= lr * grad
            if self.metric:
                metric_value = self._calculate_metric(y_array, y_pred)
            if verbose and i % verbose == 0:
                if i == 0:
                    print(f"start | loss: {loss:.2f}", f"| {self.metric} {metric_value}" if self.metric else "")
                else:
                    print(f" {i} | loss: {loss:.2f}",  f"| {self.metric}  {metric_value}" if self.metric else "")
        if self.metric:
            z = X_array @ self.weights
            y_final = 1 / (1 + np.exp(-z))
            self.best_score = self._calculate_metric(y_array, y_final)
    
    def get_coef(self):
        return self.weights[1:]
        
    def predict_proba(self, X):
        
        if isinstance(X, pd.DataFrame):
            X_array = X.values
        else:
             X_array = np.asarray(X)
        X_array = np.c_[np.ones(len(X_array)), X_array] 
        z = X_array @ self.weights
        y_pred_proba = 1 / (1 + np.exp(-z))
        return y_pred_proba
        
    def predict(self, X):
        y_pred_proba = self.predict_proba(X)
        y_pred = np.where(y_pred_proba > 0.5, 1, 0)
        return y_pred
        
    def get_best_score(self):
        return self.best_score

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, roc_auc_score
# Фиксируем сид для воспроизводимости
np.random.seed(42)

# Синтетические данные
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_informative=5,
    n_redundant=0,
    n_classes=2
)

# Train / test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)



In [7]:
model = MyLogReg(
    learning_rate=0.1,
    n_iter=300,
    metric="accuracy",
    sgd_sample = 0.2# ← передаём название метрики
)

model.fit(X_train, y_train, verbose = 100, )
print(model.get_coef())

 100 | loss: 0.42 | accuracy  0.8285714285714286
 200 | loss: 0.42 | accuracy  0.8385714285714285
 300 | loss: 0.41 | accuracy  0.8357142857142857
[-0.76415499  0.84129879  0.76160111 -0.28972576 -0.68540487]


In [287]:
y_pred = model.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
roc_auc = roc_auc_score(y_train, y_pred)
print(roc_auc)

0.836934958486746


In [242]:
y_true = np.array([0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0])
y_pred = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.15, 0.7, 0.25, 0.6, 0.9, 
                     0.3, 0.1, 0.4, 0.7, 0.2, 0.6, 0.85, 0.15, 0.55, 0.3])

In [243]:
print(roc_auc_score(y_true, y_pred))

0.9949494949494949


In [244]:
sorted_pred = np.argsort(y_pred)[::-1]
y_true = y_true[sorted_pred]
y_pred = y_pred[sorted_pred]

In [252]:
y_pred

array([0.9 , 0.85, 0.8 , 0.7 , 0.7 , 0.6 , 0.6 , 0.55, 0.4 , 0.4 , 0.35,
       0.3 , 0.3 , 0.25, 0.2 , 0.2 , 0.15, 0.15, 0.1 , 0.1 ])

In [254]:
s = 0

p = np.sum(y_true == 1)
n = np.sum(y_true == 0)
for i in range(len(y_true)):
    if y_true[i] == 0:
        positives = 0
        for j in range(i):
            if y_true[j] == 1:
                positives += 1
        positives_same_score = 0
        for j in range(len(y_pred)):
            if i != j and y_pred[i] == y_pred[j] and y_true[j] == 1:
                positives_same_score += 1
        s += positives + positives_same_score * 0.5
print(s / (p * n))
            
                  

0.9949494949494949
