In [50]:
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

## Data Preprocessing

In [51]:
class Preprocessor():
    def __init__(self, _data_path: str, _label_col: str):
        self.label_col = _label_col
        self.data = pd.read_csv(_data_path)
        self.x = None
        self.y = None
    
    def clean_data(self):
        self.data = self.data.dropna(subset=[self.label_col])
        self.data = self.data.drop_duplicates()

    def encode_data(self):
        label_encoder = LabelEncoder()
        for column in self.x.columns:
            if self.x[column].dtype == 'object':
                if len(self.x[column].unique()) <= 2:
                    self.x[column] = label_encoder.fit_transform(self.x[column])
                else:
                    self.x = pd.get_dummies(self.x, columns=[column])
    
    def scale_data(self, _mode = 'standard'):
        numerical_columns = []
        for column in self.x.columns:
            if self.x[column].dtype not in [type(object), type(bool)]:
                numerical_columns.append(column)
        if _mode == 'standard':
            scaler = StandardScaler()
        elif _mode == 'minmax':
            scaler = MinMaxScaler()
        self.x[numerical_columns] = scaler.fit_transform(self.x[numerical_columns])
    
    def preprocess_data(self):
        if self.x == None and self.y == None:
            self.clean_data()
            self.x = self.data.drop(columns=[self.label_col])
            self.y = self.data[self.label_col]
            self.y = LabelEncoder().fit_transform(self.y)
            self.encode_data()
            self.scale_data()
        return np.array(self.x), np.array(self.y)


In [52]:
class TelcoPreprocessor(Preprocessor):
    def __init__(self, _data_path: str, _label_col: str):
        super().__init__(_data_path, _label_col)

    def preprocess_data(self):
        self.data = self.data.drop(columns=['customerID'])
        self.data['TotalCharges'] = pd.to_numeric(self.data['TotalCharges'], errors='coerce')
        self.data = self.data.dropna(subset=['TotalCharges'])
        return super().preprocess_data() 

## Logistic Regression

In [53]:
class Model(ABC):
    @abstractmethod
    def fit(self, lr: float, epoch: int, batch_size: int, verbose: bool):
        pass
    
    @abstractmethod
    def predict(self, x: np.ndarray, threshold: float):
        pass

    @abstractmethod
    def test(self, x: np.ndarray, y: np.ndarray):
        pass

In [54]:
class BinaryClassifier(Model):
    @abstractmethod
    def h(self, x: np.ndarray):
        pass

    def __calculate_confusion_matrix(self, _y_pred: np.ndarray, _y_true: np.ndarray):
        tp = np.sum((_y_pred == 1) & (_y_true == 1))
        tn = np.sum((_y_pred == 0) & (_y_true == 0))
        fp = np.sum((_y_pred == 1) & (_y_true == 0))
        fn = np.sum((_y_pred == 0) & (_y_true == 1))
        return tp, tn, fp, fn
    
    def __calculate_aupr(self, _y_prob: np.ndarray, _y_true: np.ndarray):
        thresholds = np.sort(np.unique(_y_prob))
        precision = []
        recall = []
        for threshold in thresholds:
            y_pred = np.where(_y_prob > threshold, 1, 0)
            tp, tn, fp, fn = self.__calculate_confusion_matrix(y_pred, _y_true)
            if tp + fp == 0:
                precision.append(1)
            else:
                precision.append(tp / (tp + fp))
            if tp + fn == 0:
                recall.append(1)
            else:
                recall.append(tp / (tp + fn))
        aupr = 0
        for i in range(1, len(precision)):
            aupr += (recall[i] - recall[i-1]) * precision[i]
        return aupr

    def __calculate_auroc(self, _y_prob: np.ndarray, _y_true: np.ndarray):
        thresholds = np.sort(np.unique(_y_prob))
        tpr = []
        fpr = []
        for threshold in thresholds:
            y_pred = np.where(_y_prob > threshold, 1, 0)
            tp, tn, fp, fn = self.__calculate_confusion_matrix(y_pred, _y_true)
            if tp + fn == 0:
                tpr.append(1)
            else:
                tpr.append(tp / (tp + fn))
            if tn + fp == 0:
                fpr.append(1)
            else:
                fpr.append(fp / (tn + fp))
        auroc = 0
        for i in range(1, len(fpr)):
            auroc += (fpr[i] - fpr[i-1]) * tpr[i]
        return auroc
    
    def calculate_metrics(self, _x: np.ndarray, _y: np.ndarray):
        y_true = _y 
        y_prob = self.h(_x)
        y_pred = self.predict(_x)
        tp, tn, fp, fn = self.__calculate_confusion_matrix(y_pred, y_true)

        metrics = {}
        metrics['accuracy'] = (tp + tn) / (tp + tn + fp + fn)
        if tp + fn == 0:
            metrics['sensitivity'] = 1
        else:
            metrics['sensitivity'] = tp / (tp + fn)
        if tn + fp == 0:
            metrics['specificity'] = 1
        else:
            metrics['specificity'] = tn / (tn + fp)
        if tp + fp == 0:
            metrics['precision'] = 1
        else:
            metrics['precision'] = tp / (tp + fp)
        metrics['f1'] = 2 * metrics['precision'] * metrics['sensitivity'] / (metrics['precision'] + metrics['sensitivity'])
        metrics['auroc'] = self.__calculate_auroc(y_prob, y_true)
        metrics['aupr'] = self.__calculate_aupr(y_prob, y_true)
        return metrics
 

In [55]:
class LogisticRegressor(BinaryClassifier):
    def __init__(self, x: np.ndarray, y: np.ndarray):
        self.x = x
        self.y = y
        self.__w = np.zeros(self.x.shape[1])
   
    def __miniBGD(self, _lr: float, _epoch: int, _batch_size: int, _verbose: bool):
        for ep in range(_epoch):
            for i in range(0, self.y.size, _batch_size):
                self.__w -= _lr * self.x[i:i+_batch_size].T @ (self.h(self.x)[i:i+_batch_size] - self.y[i:i+_batch_size]) / _batch_size
                if _verbose:
                    print(f'Epoch {ep+1}/{_epoch} | Batch {i//_batch_size+1}/{self.y.size//_batch_size} - Loss: {self.__negative_log_likelihood(self.x[i:i+_batch_size], self.y[i:i+_batch_size])}')
            metrics = self.calculate_metrics(self.x, self.y)
            if _verbose:
                print(f'Epoch {ep+1}/{_epoch} - Loss: {self.__negative_log_likelihood(self.x, self.y)} | Accuracy: {metrics["accuracy"]} | Sensitivity: {metrics["sensitivity"]} | Specificity: {metrics["specificity"]} | Precision: {metrics["precision"]} | F1: {metrics["f1"]} | AUROC: {metrics["auroc"]} | AUPR: {metrics["aupr"]}')
    
    def __negative_log_likelihood(self, _x: np.ndarray, _y: np.ndarray):
        return -np.sum(_y * np.log(self.h(_x)) + (1 - _y) * np.log(1 - self.h(_x))) / _y.size

    def h(self, x: np.ndarray):
        assert x.shape[1] == self.__w.size
        return 1 / (1 + np.exp(-x @ self.__w))
    
    def fit(self, lr: float = 0.01, epoch: int = 3, batch_size: int = 1, verbose: bool = False):
        self.__miniBGD(lr, epoch, batch_size, verbose)
        return self.calculate_metrics(self.x, self.y)
    
    def predict(self, x: np.ndarray, threshold: float = 0.5):
        p = self.h(x)
        return np.where(p > threshold, 1, 0)
    
    def test(self, x: np.ndarray, y: np.ndarray):
        return self.calculate_metrics(x, y)


## Ensemble

In [56]:
class Bootstrapper:
    def __init__(self, x: np.ndarray, y: np.ndarray, n_estimators: int = 9):
        self.x = x
        self.y = y
        self.n_estimators = n_estimators
        self.estimators = []
    
    def __bootstrap_sample(self):
        indices = np.random.choice(self.y.size, size=self.y.size, replace=True)
        return self.x[indices], self.y[indices]
    
    def __fit_estimators(self, _lr: float, _epoch: int, _batch_size: int, _verbose: bool):
        self.estimators = []
        for i in range(self.n_estimators):
            if _verbose:
                print(f'\nFitting estimator {i+1}/{self.n_estimators}')
            x_sample, y_sample = self.__bootstrap_sample()
            estimator = LogisticRegressor(x_sample, y_sample)
            estimator.fit(lr = _lr, epoch = _epoch, batch_size = _batch_size, verbose = _verbose)
            self.estimators.append(estimator)
    
    def get_estimators(self, lr: float = 0.01, epoch: int = 3, batch_size: int = 1, verbose: bool = False):
        self.__fit_estimators(lr, epoch, batch_size, verbose)
        return self.estimators
 

In [57]:
class MeanEnsembler(BinaryClassifier):
    def __init__(self, x: np.ndarray, y: np.ndarray, n_estimators: int = 9):
        self.x = x 
        self.y = y 
        self.n_estimators = n_estimators
        self.__bootstrapper = Bootstrapper(self.x, self.y, self.n_estimators)
        self.__estimators = []
    
    def h(self, x: np.ndarray):
        assert self.__estimators != []
        return np.mean([estimator.h(x) for estimator in self.__estimators], axis=0)
    
    def fit(self, lr: float = 0.01, epoch: int = 3, batch_size: int = 1, verbose: bool = False):
        self.__estimators = self.__bootstrapper.get_estimators(lr, epoch, batch_size, verbose)
        return self.calculate_metrics(self.x, self.y)
    
    def predict(self, x: np.ndarray, threshold: float = 0.5):
        p = self.h(x)
        return np.where(p > threshold, 1, 0)
    
    def test(self, x: np.ndarray, y: np.ndarray):
        return self.calculate_metrics(x, y)

In [58]:
class VotingEnsembler(BinaryClassifier):
    def __init__(self, x: np.ndarray, y: np.ndarray, n_estimators: int = 9):
        self.x = x 
        self.y = y 
        self.n_estimators = n_estimators
        self.__bootstrapper = Bootstrapper(self.x, self.y, self.n_estimators)
        self.__estimators = []
    
    def h(self, x: np.ndarray):
        assert self.__estimators != []
        return np.median([estimator.h(x) for estimator in self.__estimators], axis=0)
    
    def fit(self, lr: float = 0.01, epoch: int = 3, batch_size: int = 1, verbose: bool = False):
        self.__estimators = self.__bootstrapper.get_estimators(lr, epoch, batch_size, verbose)
        return self.calculate_metrics(self.x, self.y)
    
    def predict(self, x: np.ndarray, threshold: float = 0.5):
        votes = np.array([estimator.predict(x) for estimator in self.__estimators])
        return np.where(np.sum(votes, axis=0) > self.n_estimators / 2, 1, 0) 
    
    def test(self, x: np.ndarray, y: np.ndarray):
        return self.calculate_metrics(x, y)

In [59]:
class StackingEnsembler(BinaryClassifier):
    def __init__(self, x: np.ndarray, y: np.ndarray, n_estimators: int = 9):
        self.x = x 
        self.y = y 
        self.n_estimators = n_estimators
        self.__bootstrapper = Bootstrapper(self.x, self.y, self.n_estimators)
        self.__base_learners = []
        self.__meta_x = None
        self.__meta_y = None
        self.__meta_learner = None
    
    def __generate_metaset(self):
        self.__meta_x = np.array([base_learner.h(self.x) for base_learner in self.__base_learners]).T
        self.__meta_y = self.y
    
    def __fit_meta_learner(self, _lr: float, _epoch: int, _batch_size: int, _verbose: bool):
        self.__generate_metaset()
        self.__meta_learner = LogisticRegressor(self.__meta_x, self.__meta_y)
        if _verbose:
            print('\nFitting meta learner')
        self.__meta_learner.fit(lr = _lr, epoch = _epoch, batch_size = _batch_size, verbose = _verbose)

    def h(self, x: np.ndarray):
        assert self.__base_learners != [] and self.__meta_learner != None
        meta_x = np.array([base_learner.h(x) for base_learner in self.__base_learners]).T
        return self.__meta_learner.h(meta_x)
    
    def fit(self, lr: float = 0.01, epoch: int = 3, batch_size: int = 1, verbose: bool = False):
        self.__base_learners = self.__bootstrapper.get_estimators(lr, epoch, batch_size, verbose)
        self.__fit_meta_learner(lr, epoch, batch_size, verbose)
        return self.calculate_metrics(self.x, self.y)
    
    def predict(self, x: np.ndarray, threshold: float = 0.5):
        meta_x = np.array([base_learner.h(x) for base_learner in self.__base_learners]).T
        return self.__meta_learner.predict(meta_x)
    
    def test(self, x: np.ndarray, y: np.ndarray):
        return self.calculate_metrics(x, y)

In [60]:
tp = TelcoPreprocessor('./Telco-Customer-Churn.csv', 'Churn')
x, y = tp.preprocess_data()
print(x.shape, y.shape)
print(y[:10])

logistic_regressor = LogisticRegressor(x, y)
mean_ensembler = MeanEnsembler(x, y)
voting_ensembler = VotingEnsembler(x, y)
stacking_ensembler = StackingEnsembler(x, y)

# print(logistic_regressor.fit(lr = 0.01, epoch = 3, batch_size = 500, verbose=True))
# print(mean_ensembler.fit(lr = 0.01, epoch = 3, batch_size = 500, verbose=True))
# print(voting_ensembler.fit(lr = 0.01, epoch = 3, batch_size = 500, verbose=True))
print(stacking_ensembler.fit(lr = 0.01, epoch = 3, batch_size = 500, verbose=True))

(7010, 40) (7010,)
[0 0 1 0 1 1 0 0 1 0]

Fitting estimator 1/9
Epoch 1/3 | Batch 1/14 - Loss: 0.6882897410014102
Epoch 1/3 | Batch 2/14 - Loss: 0.686590431054059
Epoch 1/3 | Batch 3/14 - Loss: 0.6820442231906936
Epoch 1/3 | Batch 4/14 - Loss: 0.6796879127625499
Epoch 1/3 | Batch 5/14 - Loss: 0.6761480758484647
Epoch 1/3 | Batch 6/14 - Loss: 0.6748086171648982
Epoch 1/3 | Batch 7/14 - Loss: 0.670702251945238
Epoch 1/3 | Batch 8/14 - Loss: 0.6688926692890517
Epoch 1/3 | Batch 9/14 - Loss: 0.6668003429193714
Epoch 1/3 | Batch 10/14 - Loss: 0.6647940668313185
Epoch 1/3 | Batch 11/14 - Loss: 0.6625042107564314
Epoch 1/3 | Batch 12/14 - Loss: 0.6620706830390891
Epoch 1/3 | Batch 13/14 - Loss: 0.6647760338021278
Epoch 1/3 | Batch 14/14 - Loss: 0.6644287139448833
Epoch 1/3 | Batch 15/14 - Loss: 0.6799427108793037
Epoch 1/3 - Loss: 0.6573899454689763 | Accuracy: 0.6597717546362339 | Sensitivity: 0.8728579325594251 | Specificity: 0.5856566044991348 | Precision: 0.42287091590787357 | F1: 0.56972