# Logistic Regression

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#from sklearn.linear_model import LogisticRegression

In [2]:
class _LogisticRegression:
    
    def __init__(self, alpha=0.0001, epsilon=0.1, max_iter=300):
        self.alpha = alpha
        self.epsilon = epsilon
        self.max_iter = max_iter
        self.intercept_ = None
        self.coef_ = None
        self.iter = None
        
    def fit(self, X, y):
        X = np.array(X)
        y = y.reshape(-1, 1)
        self.intercept_ = np.random.sample()
        self.coef_ = np.random.sample((len(X.T), 1))
        self.iter = 0

        while True:
            self.iter += 1
            if self.iter > self.max_iter:
                break

            e = np.ones(len(X))
            z = self._z(X)
            y_cap = self._sigmoid(z)
            
            delta_b = -self.alpha * -e.dot(y - y_cap)
            delta_w = -self.alpha * -X.T.dot(y - y_cap)
            self.intercept_ += delta_b[0]
            self.coef_ += delta_w
            if (delta_b[0] ** 2) + (delta_w ** 2).sum() < self.epsilon:
                break
                
        return self
        
    # weighted input function
    def _z(self, X):
        return X.dot(self.coef_) + self.intercept_
        
    # logistic function
    def _sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    # quantizer
    def _q(self, a):
        return np.where(a < 0.5, 0, 1)
    
    def predict(self, X):
        z = self._z(X)
        y_cap = self._sigmoid(z)
        q = self._q(y_cap)    
        return q.reshape(1, -1)[0]
    
    def score(self, X, y):
        return accuracy_score(y, self.predict(X))

## Heart Disease

### Import

In [3]:
columns = ['age', 'sex', 'cp', 'trestbps','chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat', delimiter=' ', names=columns)

### Preprocessing

In [4]:
df = pd.get_dummies(df, columns=['sex', 'fbs', 'exang'])
target_attribute = ['num']
all_fields = list(df.drop(target_attribute, axis=1))
df[target_attribute] = np.where(df[target_attribute] == 1, 0, 1)

### Slicing

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df[all_fields], df[target_attribute], test_size=0.2)
y_train = np.array(y_train).reshape(1,-1)[0]
y_test = np.array(y_test).reshape(1,-1)[0]

### Fitting

In [6]:
model = _LogisticRegression(alpha=0.00003, epsilon=0.001).fit(X_train, y_train)
#model = LogisticRegression(solver='liblinear').fit(X_train, y_train)

### Performance

In [7]:
print('total iterations', model.iter)
print('train set accuracy', model.score(X_train, y_train))
print('test set accuracy', model.score(X_test, y_test))

total iterations 301
train set accuracy 0.6157407407407407
test set accuracy 0.6481481481481481
