# Logistic Regression

In [1]:
import scipy as sp
import numpy as np
import pandas as pd

%run basic_model.ipynb

In [6]:
class LogisticRegression(BasicModel):
    def __init__(
        self,
        penalty='l2',
        tol=1e-4,
        C=1.0,
        fit_intercept=True,
        max_iter=100
    ):
        super().check_value_and_set(
            'penalty',
            penalty,
            ['l1', 'l2', None]
        )
        
        super().check_value_type_and_set(
            'tol',
            tol,
            (int, float)
        )
        
        super().check_value_type_and_set(
            'C',
            C,
            (int, float)
        )
        
        super().check_value_type_and_set(
            'fit_intercept',
            fit_intercept,
            bool
        )
        
        super().check_value_type_and_set(
            'max_iter',
            max_iter,
            int
        )
        
        self.w = None
    
    def __get_l1_penalty(self):
        def l1_penalty(w):
            return 1/self.C * np.abs(w)
        
        def der_l1_penalty(w):
            # ignoring the fact that the limit
            # of the derivative modulus at zero does not exist
            return 1/self.C * ((w > 0) * 1 + (w <= 0) * -1)
        
        return l1_penalty, der_l1_penalty
    
    def __get_l2_penalty(self):
        def l2_penalty(w):
            return 1/self.C * np.multiply(w, w)
        
        def der_l2_penalty(w):
            return 2/self.C * w
        
        return l2_penalty, der_l2_penalty
    
    def __get_None_penalty(self):
        return None, None
    
    def fit(self, X, y, debug=False):
        X = super().check_and_transform_X(X)
        y = super().check_and_transform_y(X, y)
        
        if self.fit_intercept:
            X = np.hstack((
                X, 
                np.ones(
                    (X.shape[0], 1)
                )
            ))
        
        args = [X, y]
        
        args.extend(
            getattr(
                self,
                '_LogisticRegression__get_' + str(self.penalty) + '_penalty'
            )()
        )
        
        self.w = np.ones((X.shape[1], 1))
#         self.w = np.random.rand(X.shape[1], 1)
        
        if debug:
            return args
        
        result = sp.optimize.minimize(
            self.__cost,
            self.w,
            args,
            'L-BFGS-B',
            self.__gradient,
            tol=self.tol,
            options={
                'maxiter': self.max_iter
            }
        )
        
        assert result.success, result.message
        
        self.w = result.x
    
    @staticmethod
    def __predict(X, w):
        def predict_real(x, w):
            return x @ w

        def sigmoid(z):
            return 1 / (1 + np.exp(-z))
        
        return sigmoid(predict_real(X, w))
    
    @staticmethod
    def __cost(w, args):
        X, y, penalty, _ = args
        
        predictions = LogisticRegression.__predict(X, w)
        
        m = X.shape[0]
        
        cost0 = -(1 - y).T @ np.log(1 - predictions)
        cost1 = -y.T @ np.log(predictions)
        
        penalty_part = penalty(w).sum() if penalty else 0
        
        final_cost = (cost0 + cost1).sum() / m + penalty_part
        
        return final_cost
    
    def predict(self, X):
        assert self.w != None, "Not fitted"
        
        X = super().check_and_transform_X(X)
        
        if self.fit_intercept:
            X = np.hstack((
                X, 
                np.ones(
                    (X.shape[0], 1)
                )
            ))
        return self.__predict(X, self.w)
    
    @staticmethod
    def __gradient(w, args):
        X, y, _, der_penalty = args
        w = w.reshape((-1, 1))
        
        predictions = LogisticRegression.__predict(X, w)
        
        penalty_part = der_penalty(w) if der_penalty else 0
        
        return X.T @ (predictions - y) + penalty_part

# Testing

In [3]:
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
import unittest

def dummy_dataset():
    X, y = make_classification(100, 20)
    y = y.reshape((100, 1))
    return X, y

def prepare(debug=True, penalty=None):
    X, y = dummy_dataset()

    lr = LogisticRegression(penalty=penalty)

    args = lr.fit(X, y, debug)
    
    return lr, X, y, args

class TestLogisticRegression(unittest.TestCase):
    def test_gradient(self):
        lr, X, y, args = prepare()
        
        self.assertEqual(
            lr._LogisticRegression__gradient(lr.w, args).shape,
            (21, 1)
        )
        
    def test_cost(self):
        lr, X, y, args = prepare()
        
        self.assertEqual(
            type(lr._LogisticRegression__cost(lr.w, args)),
            np.float64
        )
    
    def test_None(self):
        lr, X, y, args = prepare(False)
        score = roc_auc_score(y, lr.predict(X))
        print("Score: {}".format(score))
        
    def test_l1(self):
        lr, X, y, args = prepare(False, 'l1')
        score = roc_auc_score(y, lr.predict(X))
        print("Score: {}".format(score))
    
    def test_l2(self):
        lr, X, y, args = prepare(False, 'l2')
        score = roc_auc_score(y, lr.predict(X))
        print("Score: {}".format(score))

In [4]:
unittest.main(argv=['first-arg-is-ignored', '--verbose'], exit=False)

test_None (__main__.TestLogisticRegression) ... ok
test_cost (__main__.TestLogisticRegression) ... ok
test_gradient (__main__.TestLogisticRegression) ... ok
test_l1 (__main__.TestLogisticRegression) ... ok
test_l2 (__main__.TestLogisticRegression) ... 

Score: 0.966
Score: 0.9975990396158463


FAIL

FAIL: test_l2 (__main__.TestLogisticRegression)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-3-30bff43b0f2f>", line 47, in test_l2
    lr, X, y, args = prepare(False, 'l2')
  File "<ipython-input-3-30bff43b0f2f>", line 15, in prepare
    args = lr.fit(X, y, debug)
  File "<ipython-input-2-7fccd8e6be4e>", line 102, in fit
    assert result.success, result.message
AssertionError: b'ABNORMAL_TERMINATION_IN_LNSRCH'

----------------------------------------------------------------------
Ran 5 tests in 0.040s

FAILED (failures=1)


<unittest.main.TestProgram at 0x7fd9345c3c50>

Sometimes this happens due to too large l2 normalization.

In [5]:
unittest.main(argv=['first-arg-is-ignored', '--verbose'], exit=False)

ok
test_cost (__main__.TestLogisticRegression) ... ok
test_gradient (__main__.TestLogisticRegression) ... ok
test_l1 (__main__.TestLogisticRegression) ... ok
test_l2 (__main__.TestLogisticRegression) ... 

Score: 1.0
Score: 0.9819927971188476
Score: 0.961984793917567


ok

----------------------------------------------------------------------
Ran 5 tests in 0.024s

OK


<unittest.main.TestProgram at 0x7fd934614850>