In [1]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_classification
from sklearn.base import BaseEstimator
from sklearn.metrics import log_loss

import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Synthetic binary-classification problem: 10000 samples with 20 features,
# every one of them informative (no redundant features); seeded for reproducibility.
X_data, y_data = make_classification(
    n_samples=10000,
    n_classes=2,
    n_features=20,
    n_informative=20,
    n_redundant=0,
    random_state=42,
)

In [197]:
class MyLogisticRegression(BaseEstimator):
    """Binary logistic regression trained with mini-batch stochastic gradient descent.

    Parameters
    ----------
    C : float, default=1.0
        Regularization strength. The penalty gradient is multiplied by ``C``,
        so a larger ``C`` means stronger regularization (note: this is the
        opposite of scikit-learn's ``LogisticRegression``, where ``C`` is the
        inverse strength).
    random_state : int or None, default=None
        Seed for the ``numpy.random.RandomState`` that drives weight
        initialization and mini-batch sampling.
    iters : int, default=100
        Number of SGD steps (one mini-batch per step).
    batch_size : int, default=1000
        Number of samples drawn (with replacement) for every step.
    step : float, default=0.1
        Constant learning rate.
    penalty : {'l1', 'l2'}, default='l2'
        Which regularizer gradient to add to the weight gradient.

    Attributes set by ``fit``
    -------------------------
    w : ndarray of shape (n_features,)
        Learned weights.
    w0 : float
        Learned intercept (not regularized).
    learning_curve : ndarray of shape (iters,)
        Log-loss of the current mini-batch, measured before each update.
    """

    def __init__(self, C=1.0, random_state=None, iters=100,
                 batch_size=1000, step=0.1, penalty='l2'
                ):
        # scikit-learn estimators must only store constructor arguments here.
        self.C = C
        self.random_state = random_state
        self.iters = iters
        self.batch_size = batch_size
        self.step = step
        self.penalty = penalty

    @staticmethod
    def sigmoid(z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise."""
        # For very negative z, exp(-z) overflows to +inf and the quotient
        # becomes exactly 0.0, which is the correct limit. Only the overflow
        # warning is silenced; the returned values are unchanged.
        with np.errstate(over='ignore'):
            return 1.0 / (1 + np.exp(-z))

    # Internal helper: probability of class 1, i.e. sigmoid(<w, x> + w0).
    def __predict(self, X):
        return self.sigmoid(np.dot(X, self.w) + self.w0)

    # scikit-learn needs predict() to return class labels, so the
    # probabilities from __predict are thresholded at 0.5 here.
    def predict(self, X):
        """Return predicted labels (1.0 where the probability is >= 0.5, else 0.0)."""
        proba = self.__predict(X)
        proba[proba >= 0.5] = 1
        proba[proba < 0.5] = 0
        return proba

    # Gradient of the regularizer with respect to the weights.
    def der_reg(self):
        """Return the penalty gradient for the current weights.

        Raises
        ------
        ValueError
            If ``self.penalty`` is neither ``'l1'`` nor ``'l2'``.
        """
        if self.penalty == 'l1':
            return self.C * np.sign(self.w)
        if self.penalty == 'l2':
            return self.C * self.w
        # Fail loudly instead of returning None, which would otherwise surface
        # as an obscure TypeError inside fit().
        raise ValueError(
            "Unsupported penalty {!r}; expected 'l1' or 'l2'.".format(self.penalty)
        )

    # The stochastic gradient is computed on a whole mini-batch at once
    # rather than on a single sample (this is more efficient).
    def der_loss(self, x, y):
        """Return the mean log-loss gradient over a batch as ``(d/dw, d/dw0)``.

        ``x`` has shape (batch_size, n_features) and ``y`` has shape
        (batch_size,).
        """
        # Per-sample residual between predicted probability and label.
        d = self.__predict(x) - y
        # Average the per-sample gradients so the scale does not depend on
        # the batch size.
        ders_w = np.mean(x * d[:, np.newaxis], axis=0)
        der_w0 = np.mean(d)
        return ders_w, der_w0

    def fit(self, X_train, y_train):
        """Fit the model with ``iters`` steps of mini-batch SGD and return ``self``.

        ``X_train`` is a 2-D array-like of shape (n_samples, n_features) and
        ``y_train`` holds the labels encoded as 0/1 (the residual in
        ``der_loss`` and the ``labels=(0, 1)`` argument below rely on that).
        """
        # Accept lists / DataFrames as well: the integer-array indexing below
        # needs plain ndarrays. For ndarray input this is a no-op.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)

        # Dedicated RandomState for reproducibility.
        random_gen = np.random.RandomState(self.random_state)

        # Dimensions of the design matrix.
        size, dim = X_train.shape

        # Random initialization of weights and intercept.
        self.w = random_gen.rand(dim)
        self.w0 = random_gen.randn()
        self.learning_curve = np.zeros(self.iters)
        for i in range(self.iters):
            # Draw a random mini-batch (sampling with replacement).
            rand_indices = random_gen.choice(size, self.batch_size)
            X = X_train[rand_indices]
            y = y_train[rand_indices]
            # Record the batch loss before the parameters are updated.
            self.learning_curve[i] = log_loss(y, self.__predict(X), labels=(0, 1))
            # Gradients of the loss plus the regularizer (weights only).
            der_w, der_w0 = self.der_loss(X, y)
            der_w += self.der_reg()
            # Step along the negative gradient.
            self.w -= der_w * self.step
            self.w0 -= der_w0 * self.step
        # scikit-learn expects fit() to return self.
        return self

In [198]:
# Baseline: scikit-learn's SGD-trained logistic regression with a constant learning rate.
# random_state is pinned so the baseline score is reproducible between runs
# (SGDClassifier shuffles the training data, so an unseeded run varies).
# NOTE(review): scikit-learn >= 1.1 spells this loss 'log_loss' ('log' was removed in 1.3);
# update the literal when upgrading the library.
clf = SGDClassifier(loss='log', learning_rate='constant', eta0=0.1, max_iter=1000, random_state=42)

# Custom mini-batch SGD implementation evaluated on the same task.
my_clf = MyLogisticRegression(penalty='l2', step=0.01, batch_size=100, iters=1000, C=1, random_state=42)

In [199]:
# Mean 2-fold cross-validated accuracy: the sklearn baseline first, then the custom model.
for estimator in (clf, my_clf):
    fold_scores = cross_val_score(estimator, X_data, y_data, scoring='accuracy', cv=2)
    print(fold_scores.mean())

0.7125
0.7895


In [205]:
%%time
# Hyperparameter grid for the exhaustive search:
# 4 * 3 * 4 * 3 * 4 * 2 = 1152 candidates, each fitted on 2 folds.
# NOTE(review): 'random_state' is searched like a hyperparameter, so the best
# score is partly a seed-selection effect — consider fixing the seed instead.
param_grid = {
    'C': [2,1,0.5,0.1],
    'random_state': [1, 25, 42],
    'iters': [100,1000,5000,10000],
    'batch_size': [100,500,1000],
    'step': [0.005,0.01,0.1,0.5],
    'penalty': ['l1','l2'],
}

# Evaluate every candidate by 2-fold CV accuracy, using all available cores.
grid_pipe = GridSearchCV(my_clf, param_grid, 
                         scoring='accuracy',
                         n_jobs=-1, cv=2,
                         verbose=1).fit(X_data, y_data)

Fitting 2 folds for each of 1152 candidates, totalling 2304 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 222 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 472 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 822 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 1272 tasks      | elapsed: 25.7min
[Parallel(n_jobs=-1)]: Done 1822 tasks      | elapsed: 38.6min
[Parallel(n_jobs=-1)]: Done 2304 out of 2304 | elapsed: 51.4min finished


CPU times: user 11.4 s, sys: 757 ms, total: 12.2 s
Wall time: 51min 27s


In [206]:
# Best mean cross-validated accuracy and the parameter combination that achieved it.
(
    grid_pipe.best_score_,
    grid_pipe.best_params_,
)

(0.7995,
 {'C': 0.1,
  'batch_size': 100,
  'iters': 5000,
  'penalty': 'l2',
  'random_state': 25,
  'step': 0.1})