# Logistic Regression

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#from sklearn.linear_model import LogisticRegression

In [2]:
class _LogisticRegression:
    
    def __init__(self, alpha=0.0001, epsilon=0.1, max_iter=300):
        self.alpha = alpha
        self.epsilon = epsilon
        self.max_iter = max_iter
        self.intercept_ = None
        self.coef_ = None
        self.iter = None
        
    def fit(self, X, y):
        X = np.array(X)
        y = y.reshape(-1, 1)
        self.intercept_ = np.random.sample()
        self.coef_ = np.random.sample((len(X.T), 1))
        self.iter = 0

        while True:
            self.iter += 1
            if self.iter > self.max_iter:
                break

            e = np.ones(len(X))
            z = self._z(X)
            y_cap = self._sigmoid(z)
            
            delta_b = -self.alpha * -e.dot(y - y_cap)
            delta_w = -self.alpha * -X.T.dot(y - y_cap)
            self.intercept_ += delta_b
            self.coef_ += delta_w
            
            if (delta_b ** 2) + (delta_w ** 2).sum() < self.epsilon:
                break
                
        return self
        
    # weighted input function
    def _z(self, X):
        return X.dot(self.coef_) + self.intercept_
        
    # logistic function
    def _sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    # quantizer
    def _q(self, a):
        return np.where(a < 0.5, 0, 1)
    
    def predict(self, X):
        z = self._z(X)
        y_cap = self._sigmoid(z)
        q = self._q(y_cap)    
        return q.reshape(1, -1)[0]
    
    def score(self, X, y):
        return accuracy_score(y, self.predict(X))

## Heart Disease

### Import

In [3]:
columns = ['age', 'sex', 'cp', 'trestbps','chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat', delimiter=' ', names=columns)

### Preprocessing

In [4]:
real_attributes = [0,3,4,7,9,11]
ordered_attributes = [10]
binary_attributes = [1,5,8]
nominal_attributes = [6,2,12]
target_attribute = 'num'

In [5]:
df[target_attribute] = np.where(df[target_attribute] == 1, 0, 1)

In [6]:
df.iloc[:,real_attributes] = StandardScaler().fit_transform(df.iloc[:,real_attributes])
df.iloc[:,ordered_attributes] = Normalizer().fit_transform(df.iloc[:,ordered_attributes])
df = pd.get_dummies(df, columns=df.columns[binary_attributes+nominal_attributes])

In [7]:
all_fields = list(df.drop(target_attribute, axis=1))

In [8]:
df.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,slope,ca,num,sex_0.0,sex_1.0,...,restecg_0.0,restecg_1.0,restecg_2.0,cp_1.0,cp_2.0,cp_3.0,cp_4.0,thal_3.0,thal_6.0,thal_7.0
0,1.712094,-0.07541,1.402212,-1.759208,1.181012,1.0,2.472682,1,0,1,...,0,0,1,0,0,0,1,1,0,0
1,1.38214,-0.916759,6.093004,0.446409,0.481153,1.0,-0.711535,0,1,0,...,0,0,1,0,0,1,0,0,0,1
2,0.282294,-0.41195,0.219823,-0.375291,-0.656118,1.0,-0.711535,1,0,1,...,1,0,0,0,1,0,0,0,0,1
3,1.052186,-0.18759,0.258589,-1.932198,-0.7436,1.0,0.349871,0,0,1,...,1,0,0,0,0,0,1,0,0,1
4,2.152032,-0.63631,0.37489,-1.240239,-0.7436,1.0,0.349871,0,1,0,...,0,0,1,0,1,0,0,1,0,0


### Slicing

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df[all_fields], np.array(df[target_attribute]), test_size=0.2)

### Fitting

In [10]:
model = _LogisticRegression(alpha=0.01, epsilon=0.001).fit(X_train, y_train)

### Performance

In [11]:
print('total iterations', model.iter)
print('train set accuracy', model.score(X_train, y_train))
print('test set accuracy', model.score(X_test, y_test))

total iterations 28
train set accuracy 0.8611111111111112
test set accuracy 0.8888888888888888
