In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

## Data Preprocessing

In [2]:
def load_data(_path: str):
    data = pd.read_csv(_path)
    return data

In [3]:
def clean(_data: pd.DataFrame):
    _data = _data.dropna()
    _data = _data.drop_duplicates()
    return _data

In [4]:
def encode(_data: pd.DataFrame):
    label_encoder = LabelEncoder()
    for column in _data.columns:
        if _data[column].dtype == 'object':
            if len(_data[column].unique()) <= 2:
                _data[column] = label_encoder.fit_transform(_data[column])
            else:
                _data = pd.get_dummies(_data, columns=[column])
    return _data

In [5]:
def normalize(_data: pd.DataFrame, _mode: str = 'minmax'):
    # normalize only numerical columns
    numerical_columns = []
    for column in _data.columns:
        if _data[column].dtype != 'object':
            numerical_columns.append(column)
    if _mode == 'minmax':
        scaler = MinMaxScaler()
    elif _mode == 'standard':
        scaler = StandardScaler()
    _data[numerical_columns] = scaler.fit_transform(_data[numerical_columns])
    return _data

In [6]:
def separate_label(_data: pd.DataFrame, _label: str):
    label = _data[_label]
    features = _data.drop(columns=[_label])
    return features, label

In [7]:
def preprocess_features(_features: pd.DataFrame):
    _features = encode(_features)
    _features = normalize(_features)
    return _features

def preprocess_label(_label: pd.Series):
    label_encoder = LabelEncoder()
    _label = label_encoder.fit_transform(_label)
    return _label

def preprocess_data(_data: pd.DataFrame, _label: str):
    _data = clean(_data)
    features, label = separate_label(_data, _label)
    x = preprocess_features(features)
    y = preprocess_label(label)
    x = np.array(x)
    y = np.array(y)
    return x, y

In [8]:
def preprocess_telco(_path: str):
    data = load_data(_path)
    x, y = preprocess_data(data, 'Churn')
    return x, y

x, y = preprocess_telco('./Telco-Customer-Churn.csv')

In [9]:
print(x.shape, y.shape)

(7043, 13613) (7043,)


## Logistic Regression

In [28]:
class LogisticRegression:
    def __init__(self, _x: np.ndarray, _y: np.ndarray):
        self.x = _x
        self.y = _y
        self.__w = np.zeros(self.x.shape[1])
    
    def __h(self, _x: np.ndarray):
        assert _x.shape[1] == self.__w.size
        return 1 / (1 + np.exp(-_x @ self.__w))
    
    def __BGD(self, _lr: float, _epoch: int):
        for _ in range(_epoch):
            self.__w -= _lr * self.x.T @ (self.__h(self.x) - self.y) / self.y.size
    
    def __SGD(self, _lr: float, _epoch: int):
        for _ in range(_epoch):
            for i in range(self.y.size):
                self.__w -= _lr * self.x[i].T * (self.__h(self.x)[i] - self.y[i])
    
    def __miniBGD(self, _lr: float, _epoch: int, _batch_size: int):
        for ep in range(_epoch):
            for i in range(0, self.y.size, _batch_size):
                self.__w -= _lr * self.x[i:i+_batch_size].T @ (self.__h(self.x)[i:i+_batch_size] - self.y[i:i+_batch_size]) / _batch_size
                print(f'Epoch {ep+1}/{_epoch} | Batch {i//_batch_size+1}/{self.y.size//_batch_size} - Loss: {self.__negative_log_likelihood(self.x[i:i+_batch_size], self.y[i:i+_batch_size])}')
            metrics = self.__calculate_metrics(self.x, self.y)
            print(f'Epoch {ep+1}/{_epoch} - Loss: {self.__negative_log_likelihood(self.x, self.y)} | Accuracy: {metrics["accuracy"]} | Sensitivity: {metrics["sensitivity"]} | Specificity: {metrics["specificity"]} | Precision: {metrics["precision"]} | F1: {metrics["f1"]} | AUROC: {metrics["auroc"]} | AUPR: {metrics["aupr"]}')
    
    def __calculate_confusion_matrix(self, _y_pred: np.ndarray, _y_true: np.ndarray):
        tp = np.sum((_y_pred == 1) & (_y_true == 1))
        tn = np.sum((_y_pred == 0) & (_y_true == 0))
        fp = np.sum((_y_pred == 1) & (_y_true == 0))
        fn = np.sum((_y_pred == 0) & (_y_true == 1))
        return tp, tn, fp, fn
    
    def __calculate_aupr(self, _y_prob: np.ndarray, _y_true: np.ndarray):
        thresholds = np.sort(np.unique(_y_prob))
        precision = []
        recall = []
        for threshold in thresholds:
            y_pred = np.where(_y_prob > threshold, 1, 0)
            tp, tn, fp, fn = self.__calculate_confusion_matrix(y_pred, _y_true)
            if tp + fp == 0:
                precision.append(1)
            else:
                precision.append(tp / (tp + fp))
            if tp + fn == 0:
                recall.append(1)
            else:
                recall.append(tp / (tp + fn))
        aupr = 0
        for i in range(1, len(precision)):
            aupr += (recall[i] - recall[i-1]) * precision[i]
        return aupr

    def __calculate_auroc(self, _y_prob: np.ndarray, _y_true: np.ndarray):
        thresholds = np.sort(np.unique(_y_prob))
        tpr = []
        fpr = []
        for threshold in thresholds:
            y_pred = np.where(_y_prob > threshold, 1, 0)
            tp, tn, fp, fn = self.__calculate_confusion_matrix(y_pred, _y_true)
            if tp + fn == 0:
                tpr.append(1)
            else:
                tpr.append(tp / (tp + fn))
            if tn + fp == 0:
                fpr.append(1)
            else:
                fpr.append(fp / (tn + fp))
        auroc = 0
        for i in range(1, len(fpr)):
            auroc += (fpr[i] - fpr[i-1]) * tpr[i]
        return auroc
    
    def __calculate_metrics(self, _x: np.ndarray, _y: np.ndarray):
        y_true = _y 
        y_prob = self.__h(_x)
        y_pred = self.predict(_x)
        tp, tn, fp, fn = self.__calculate_confusion_matrix(y_pred, y_true)

        metrics = {}
        metrics['accuracy'] = (tp + tn) / (tp + tn + fp + fn)
        if tp + fn == 0:
            metrics['sensitivity'] = 1
        else:
            metrics['sensitivity'] = tp / (tp + fn)
        if tn + fp == 0:
            metrics['specificity'] = 1
        else:
            metrics['specificity'] = tn / (tn + fp)
        if tp + fp == 0:
            metrics['precision'] = 1
        else:
            metrics['precision'] = tp / (tp + fp)
        metrics['f1'] = 2 * metrics['precision'] * metrics['sensitivity'] / (metrics['precision'] + metrics['sensitivity'])
        metrics['auroc'] = self.__calculate_auroc(y_prob, y_true)
        metrics['aupr'] = self.__calculate_aupr(y_prob, y_true)
        return metrics
    
    def __negative_log_likelihood(self, _x: np.ndarray, _y: np.ndarray):
        return -np.sum(_y * np.log(self.__h(_x)) + (1 - _y) * np.log(1 - self.__h(_x))) / _y.size
    
    
    def fit(self, _lr: float, _epoch: int, _batch_size: int = 1):
        self.__miniBGD(_lr, _epoch, _batch_size)
        return self.__calculate_metrics(self.x, self.y)
    
    def predict(self, _x: np.ndarray, _threshold: float = 0.5):
        p = self.__h(_x)
        return np.where(p > _threshold, 1, 0)
    
    def test(self, _x: np.ndarray, _y: np.ndarray):
        return self.__calculate_metrics(_x, _y)


In [29]:
lr = LogisticRegression(x, y)
print(lr.fit(0.01, 10, 500))

Epoch 1/10 | Batch 1/14 - Loss: 0.6895932253045496
Epoch 1/10 | Batch 2/14 - Loss: 0.6859629641264079
Epoch 1/10 | Batch 3/14 - Loss: 0.6838905237275854
Epoch 1/10 | Batch 4/14 - Loss: 0.6796088304508049
Epoch 1/10 | Batch 5/14 - Loss: 0.6759149806171987
Epoch 1/10 | Batch 6/14 - Loss: 0.6733608608460622
Epoch 1/10 | Batch 7/14 - Loss: 0.6718499475252069
Epoch 1/10 | Batch 8/14 - Loss: 0.6707242614480841
Epoch 1/10 | Batch 9/14 - Loss: 0.6634996610864116
Epoch 1/10 | Batch 10/14 - Loss: 0.6630093587706335
Epoch 1/10 | Batch 11/14 - Loss: 0.6612308712728877
Epoch 1/10 | Batch 12/14 - Loss: 0.660104067142682
Epoch 1/10 | Batch 13/14 - Loss: 0.6552880953219625
Epoch 1/10 | Batch 14/14 - Loss: 0.6558634217391023
Epoch 1/10 | Batch 15/14 - Loss: 0.6585263284330837
Epoch 1/10 - Loss: 0.6532128756858274 | Accuracy: 0.7346301292063041 | Sensitivity: 0.0 | Specificity: 1.0 | Precision: 1 | F1: 0.0 | AUROC: -0.7593339790279582 | AUPR: -0.47906775161065757
Epoch 2/10 | Batch 1/14 - Loss: 0.649430

KeyboardInterrupt: 