In [3]:
class BasicModel:
    def check_value_and_set(self, name, value, allowed):
        message = "Wrong {}: '{}'\n\nAllowed values: {}"
        
        if value not in allowed:
            raise ValueError(message.format(
                name,
                value,
                allowed
            ))
        
        setattr(self, name, value)
        
    def check_value_type_and_set(self, name, value, allowed):
        message = "Wrong {} type: '{}'\n\nAllowed types: {}"
        
        if not isinstance(value, allowed):
            raise TypeError(message.format(
                name,
                value,
                allowed
            ))
        
        setattr(self, name, value)

# Logistic Regression

In [3]:
import scipy as sp
import numpy as np
import pandas as pd

In [2]:
# train_df = pd.read_csv(
#     '../lab01/nyc-taxi-trip-duration/cleaned_train.csv'
# )

In [18]:
class LogisticRegression(BasicModel):
    def __init__(
        self,
        penalty='l2',
        tol=1e-4,
        C=1.0,
        fit_intercept=True,
        max_iter=100
    ):
        super().check_value_and_set(
            'penalty',
            penalty,
            ['l1', 'l2', None]
        )
        
        super().check_value_type_and_set(
            'tol',
            tol,
            (int, float)
        )
        
        super().check_value_type_and_set(
            'C',
            C,
            (int, float)
        )
        
        super().check_value_type_and_set(
            'fit_intercept',
            fit_intercept,
            bool
        )
        
        super().check_value_type_and_set(
            'max_iter',
            max_iter,
            int
        )
    
    def __get_l1_penalty(self):
        def l1_penalty(w):
            return 1/self.C * np.abs(w)
        
        def der_l1_penalty(w):
            # ignoring zeros existence
            return 1/self.C * ((w > 0) * 1 + (w <= 0) * -1)
        
        return l1_penalty, der_l1_penalty
    
    def __get_l2_penalty(self):
        def l2_penalty(w):
            return 1/self.C * np.multiply(w, w)
        
        def der_l2_penalty(w):
            return 2/self.C * w
        
        return l2_penalty, der_l2_penalty
    
    def __get_None_penalty(self):
        return None, None
    
    def fit(self, X, y, debug=False):
        assert \
            len(X.shape) == 2, \
            "X should be 2D vector"
        assert \
            y.shape == (X.shape[0], 1), \
            "y should be 2D vector and should correspond to X"
        
        if self.fit_intercept:
            X = np.hstack((
                X, 
                np.ones(
                    (X.shape[0], 1)
                )
            ))
        
        args = [X, y]
        
        args.extend(
            getattr(
                self,
                '_LogisticRegression__get_' + str(self.penalty) + '_penalty'
            )()
        )
        
        self.w = np.ones((X.shape[1], 1))
        
        if debug:
            return args
        
        result = sp.optimize.minimize(
            self.__cost,
            self.w,
            args,
            'L-BFGS-B',
            self.__gradient,
            tol=self.tol,
            options={
                'maxiter': self.max_iter
            }
        )
        
        assert result.success, result.message
        
        self.w = result.x
    
    @staticmethod
    def __predict(X, w):
        def predict_real(x, w):
            return x @ w

        def sigmoid(z):
            return 1 / (1 + np.exp(-z))
        
        return sigmoid(predict_real(X, w))
    
    @staticmethod
    def __cost(w, args):
        X, y, penalty, _ = args
        
        predictions = LogisticRegression.__predict(X, w)
        
        m = X.shape[0]
        
        cost0 = -(1 - y).T @ np.log(1 - predictions)
        cost1 = -y.T @ np.log(predictions)
        
        penalty_part = penalty(w).sum() if penalty else 0
        
        final_cost = (cost0 + cost1).sum() / m + penalty_part
        
        return final_cost
    
    def predict(self, X):
        if self.fit_intercept:
            X = np.hstack((
                X, 
                np.ones(
                    (X.shape[0], 1)
                )
            ))
        return self.__predict(X, self.w)
    
    @staticmethod
    def __gradient(w, args):
        X, y, _, der_penalty = args
        w = w.reshape((-1, 1))
        
        predictions = LogisticRegression.__predict(X, w)
        
        penalty_part = der_penalty(w) if der_penalty else 0
        
        return X.T @ (predictions - y) + penalty_part

In [19]:
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
import unittest

def dummy_dataset():
    X, y = make_classification(100, 20)
    y = y.reshape((100, 1))
    return X, y

def prepare(debug=True, penalty=None):
    X, y = dummy_dataset()

    lr = LogisticRegression(penalty=penalty)

    args = lr.fit(X, y, debug)
    
    return lr, X, y, args

class TestLogisticRegression(unittest.TestCase):
    def test_gradient(self):
        lr, X, y, args = prepare()
        
        self.assertEqual(
            lr._LogisticRegression__gradient(lr.w, args).shape,
            (21, 1)
        )
        
    def test_cost(self):
        lr, X, y, args = prepare()
        
        self.assertEqual(
            type(lr._LogisticRegression__cost(lr.w, args)),
            np.float64
        )
    
    def test_None(self):
        lr, X, y, args = prepare(False)
        score = roc_auc_score(y, lr.predict(X))
        print("Score: {}".format(score))
        
    def test_l1(self):
        lr, X, y, args = prepare(False, 'l1')
        score = roc_auc_score(y, lr.predict(X))
        print("Score: {}".format(score))
    
    def test_l2(self):
        lr, X, y, args = prepare(False, 'l2')
        score = roc_auc_score(y, lr.predict(X))
        print("Score: {}".format(score))

In [20]:
unittest.main(argv=['first-arg-is-ignored', '--verbose'], exit=False)

ok
test_cost (__main__.TestLogisticRegression) ... ok
test_gradient (__main__.TestLogisticRegression) ... ok
test_l1 (__main__.TestLogisticRegression) ... ok
test_l2 (__main__.TestLogisticRegression) ... 

Score: 1.0
Score: 0.9863945578231293


FAIL

FAIL: test_l2 (__main__.TestLogisticRegression)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-19-30bff43b0f2f>", line 47, in test_l2
    lr, X, y, args = prepare(False, 'l2')
  File "<ipython-input-19-30bff43b0f2f>", line 15, in prepare
    args = lr.fit(X, y, debug)
  File "<ipython-input-18-12246dbbe0cf>", line 104, in fit
    assert result.success, result.message
AssertionError: b'ABNORMAL_TERMINATION_IN_LNSRCH'

----------------------------------------------------------------------
Ran 5 tests in 0.038s

FAILED (failures=1)


<unittest.main.TestProgram at 0x7f505ad0f5d0>

Sometimes this happens due to too large l2 normalization.

In [27]:
unittest.main(argv=['first-arg-is-ignored', '--verbose'], exit=False)

ok
test_cost (__main__.TestLogisticRegression) ... ok
test_gradient (__main__.TestLogisticRegression) ... ok
test_l1 (__main__.TestLogisticRegression) ... ok
test_l2 (__main__.TestLogisticRegression) ... 

Score: 1.0
Score: 0.9867947178871549
Score: 1.0


ok

----------------------------------------------------------------------
Ran 5 tests in 0.042s

OK


<unittest.main.TestProgram at 0x7f505acaf910>

# DecisionTree

In [6]:
from collections import namedtuple
import functools

# CART DT (maybe)
class DecisionTree(BasicModel):
    class Node:
        def __init__(self, feature, value, observation_indexes):
            self.feature = feature
            self.split_value = value
            self.left = None
            self.right = None
            self.answer = None
            self.observation_indexes = observation_indexes
            
        def is_leaf(self):
            return self.left or self.right
    
    criterion_name_to_calculator = {
        'gini': '_DecisionTree__gini_index'
    }
    
    criterion_name_to_cmp = {
        'gini': lambda x, y: x-y
    }
    
    criterion_name_to_task = {
        'gini': 'classification',
        'entropy': 'classification',
        'gain_info': 'classification',
        'mse': 'regression',
        'mae': 'regression'
    }
    
    def __init__(
        self,
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        max_features=None,
        random_state=42,
        allow_pruning=False  # node saves corresponding to it observation indexes
    ):
        super().check_value_and_set(
            'criterion',
            criterion,
            ['gini', 'entropy', 'gain_info', 'mse', 'mae']
        )
        
        self.__cmp_criterion_values = \
            self.criterion_name_to_cmp[criterion]
        self.__calc_criterion_value = \
            self.criterion_name_to_calculator[criterion]
        
        self.__task = criterion_name_to_task[criterion]
        
        super().check_value_and_set(
            'splitter',
            splitter,
            ['best', 'random']
        )
        
        super().check_value_type_and_set(
            'max_depth',
            max_depth,
            (int, type(None))
        )
        
        super().check_value_type_and_set(
            'min_samples_split',
            min_samples_split,
            (int, float)
        )
        
        super().check_value_type_and_set(
            'min_weight_fraction_leaf',
            min_weight_fraction_leaf,
            float
        )
        
        super().check_value_type_and_set(
            'max_features',
            max_features,
            (int, float, str, type(None))
        )
        
        if type(max_features) == str:
            super().check_value_and_set(
                'max_features',
                max_features,
                ['auto', 'sqrt', 'log2']
            )
        if max_features == 'auto':
            max_features = 'sqrt'
            
        super().check_value_type_and_set(
            'random_state',
            random_state,
            (np.random.RandomState, int)
        )
        if type(random_state) == int:
            self.random_state = np.random.RandomState(random_state)
            
        super().check_value_type_and_set(
            'allow_pruning',
            allow_pruning,
            bool
        )
    
    def fit(self, X, y):
        assert \
            len(X.shape) == 2, \
            "X should be 2D vector"
        assert \
            y.shape == (X.shape[0], 1), \
            "y should be 2D vector and should correspond to X"
        
        self.X, self.y = X.to_numpy(), y.to_numpy()
        
        if max_features != None:
            if type(max_features) == float:
                max_features = np.floor(X.shape[1] * max_features)
            elif type(max_features) == str:
                max_features = np.floor(
                    getattr(np, max_features)(X.shape[1])
                )
        
        self.__construct_tree()
        
    def predict(self, X):
        predictions = []
        
        for el in X:
            prediction = self.__predict_observation(el)
            predictions.append(prediction)
        
        return np.array(predictions).reshape((-1, 1))
            
    def __predict_observation(self, el):
        node = self.root
        
        while not node.is_leaf():
            prev_node = node
            
            if el[node.feature] <= node.split_value:
                node = node.left
            else:
                node = node.right
        
        return node.answer
    
    def __gini_impurity(self, observation_indexes):        
        node_y = self.y[[observation_indexes], :]
        
        p_0 = (node_y == 0).sum()
        p_1 = (node_y == 1).sum()
        
        return 1 - p_0 * p_0 - p_1 * p_1
    
    def __gini_index(
        self, 
        left_split_indexes, 
        right_split_indexes
    ):
        w_left = len(left_split_indexes)
        w_right = len(right_split_indexes)
        
        W = w_left + w_right
        
        gini_index = w_left/W * self.__gini_impurity(left_split_indexes)
        
        gini_index += w_right/W * self.__gini_impurity(right_split_indexes)
        
        return gini_index
    
    def __get_split_pairs_gen(
        self,
        observation_indexes
    ):
        node_x = self.X[[observation_indexes], [feature]]
        
        observation_indexes, node_x = zip(
            *sorted(
                zip(
                    observation_indexes,
                    node_x.ravel()
                ), 
                key=lambda x: x[1]
            )
        )
        
        # to use numpy views
        observation_indexes = np.array(observation_indexes)
        
        uniques = list(dict.fromkeys(node_x).keys())
        
        last_i = 0
        id_unique = 0
        
        while id_unique < len(uniques) - 1:
            for i in range(last_i, len(node_x)):
                if node_x[i] > uniques[id_unique]:
                    last_i = i
                    break
            
            left_split_indexes = observation_indexes[:last_i]
            right_split_indexes = observation_indexes[last_i:]
            
            yield left_split_indexes, right_split_indexes, uniques[id_unique]
            
            id_unique += 1
    
    BestFeatureSplit = namedtuple(
        'BestFeatureSplit', 
        [
            'criterion_value',
            'split_value',
            'left_split_indexes',
            'right_split_indexes'
        ]
    )
    
    # output can be (None, None)
    def __find_best_split_by_feature(
        self, 
        feature, 
        observation_indexes
    ):
        split_pairs_gen = self.__get_split_pairs_gen(
            observation_indexes
        )
        
        # find best split
        best_split = DecisionTree.BestFeatureSplit(
            None,
            None,
            None,
            None
        )
        
        for left_split_indexes, right_split_indexes, split_value in split_pairs_gen:
            criterion_value = self.__calc_criterion_value(
                left_split_indexes,
                right_split_indexes
            )
            
            current_split = DecisionTree.BestFeatureSplit(
                criterion_value,
                split_value,
                left_split_indexes,
                right_split_indexes
            )
            
            if best_criterion_value is None or \
               self.__cmp_criterion_values(
                   criterion_value, 
                   best_criterion_value
               ) < 0:
                best_split = current_split
        
        return best_split

    def __construct_tree_helper(self, observation_indexes, depth):
        if depth > self.max_depth or \
           observation_indexes.shape[0] < self.min_samples_split:
            return None
        
        features = np.arange(self.X.shape[1])
        if self.splitter == 'random':
            self.random_state.shuffle(features)
        
        best_feature_split = None
        best_feature = None
    
        feature_counter = 0 if self.max_features else float('inf')
        
        cmp_criterion_values = self.__get_criterion_cmp()
        
        for feature in features:
            if best_feature and feature_counter > self.max_features:
                break
            
            feature_split = self.__find_best_split_by_feature(
                feature,
                observation_indexes
            )
            
            if feature_split.criterion_value is None:
                continue
            
            if best_feature is None or \
               cmp_criterion_values(
                   feature_split.criterion_value,
                   best_feature_split.criterion_value
               ) < 0:
                best_feature_split = feature_split
                best_feature = feature
            
            feature_counter += 1
        
        if best_feature is None or \
           best_feature_split.criterion_value == 0:
            return None
        
        node = DecisionTree.Node(
            best_feature,
            best_feature_split.split_value,
            observation_indexes
        )
        
        node.left = self.__construct_tree_helper(
            best_feature_split.left_split_indexes,
            depth + 1
        )
        
        node.right = self.__construct_tree_helper(
            best_feature_split.right_split_indexes,
            depth + 1
        )
        
        if node.is_leaf():
            node.answer = self.__get_answer(
                observation_indexes
            )
        
        return node
    
    def __get_answer(self, observation_indexes):
        if self.__task == 'classification':
            ones = self.y[[observation_indexes]].sum()
            threshold = y.shape[0] / 2
            
            if ones > threshold:
                return 1
            elif ones < threshold:
                return 0
            else:
                return self.random_state.randint(0, 1)
        else:
            return np.mean(y)
    
    def __construct_tree(self):
        observation_indexes = np.arange(self.y.shape[0])
        
        self.root = self.__construct_tree_helper(
            observation_indexes, 
            1
        )
        
        if self.root is None:
            self.root = DecisionTree.Node(
                None,
                None,
                observation_indexes
            )
            
            self.answer = self.__get_answer(
                observation_indexes
            )

In [48]:
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
import unittest

def dummy_dataset():
    X, y = make_classification(100, 20)
    y = y.reshape((100, 1))
    return X, y

def fit_and_score(*args, **kwargs):
    X, y = dummy_dataset()

    dt = DecisionTree(*args, **kwargs)

    dt.fit(X, y)
    
    return dt, X, y, args

class TestLogisticRegression(unittest.TestCase):
    def test_gini(self):
        lr, X, y, args = prepare()
        
        self.assertEqual(
            lr._LogisticRegression__gradient(lr.w, args).shape,
            (21, 1)
        )
        
    def test_cost(self):
        lr, X, y, args = prepare()
        
        self.assertEqual(
            type(lr._LogisticRegression__cost(lr.w, args)),
            np.float64
        )
    
    def test_None(self):
        lr, X, y, args = prepare(False)
        score = roc_auc_score(y, lr.predict(X))
        print("Score: {}".format(score))
        
    def test_l1(self):
        lr, X, y, args = prepare(False, 'l1')
        score = roc_auc_score(y, lr.predict(X))
        print("Score: {}".format(score))
    
    def test_l2(self):
        lr, X, y, args = prepare(False, 'l2')
        score = roc_auc_score(y, lr.predict(X))
        print("Score: {}".format(score))

<bound method DecisionTree.__gini_index of <__main__.DecisionTree object at 0x7f505ad43c10>>