In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from csv import reader
from sklearn.preprocessing import StandardScaler
from random import randrange
import operator
%matplotlib inline  


# prediction values
def pred_val(theta, X, hard=True):
    """Predict with the logistic model defined by ``theta``.

    Args:
        theta: Model weights, forwarded unchanged to ``logistic_val_func``.
        X: Feature matrix, forwarded unchanged to ``logistic_val_func``.
        hard: When true (the default) return 0/1 class labels; otherwise
            return the raw probabilities.

    Returns:
        An array of 0/1 labels (1 where the probability is strictly
        greater than 0.5) or the probabilities from ``logistic_val_func``.
    """
    probabilities = logistic_val_func(theta, X)
    if not hard:
        return probabilities
    # Strict threshold: a probability of exactly 0.5 maps to class 0.
    return np.where(probabilities > 0.5, 1, 0)


def logistic_grad_func(theta, x, y):
    """Return the gradient of the mean logistic loss with respect to theta.

    Args:
        theta: Model weights, forwarded to ``logistic_val_func``.
        x: Feature matrix with one row per sample (no bias column).
        y: Target labels; the caller in this file passes them as a column
            vector so they line up with the predictions.

    Returns:
        The per-weight gradient, summed over samples (axis 0) and divided
        by the number of samples. The first entry belongs to the bias.
    """
    n_samples = x.shape[0]
    residual = logistic_val_func(theta, x) - y
    # Prepend the all-ones bias column so the bias weight gets a gradient too.
    design = np.c_[np.ones(n_samples), x]
    return (1.0 / n_samples) * np.sum(residual * design, axis=0)

def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` elementwise.

    The value is computed from ``exp(-|x|)``, which always lies in (0, 1],
    so large-magnitude negative inputs no longer overflow ``np.exp`` (the
    direct formula evaluates ``exp(+|x|)`` there and emits a RuntimeWarning).

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``x`` with the same shape as the input; a scalar
        input yields a NumPy scalar.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x).  x < 0: e^x / (1 + e^x), the same value
    # rewritten so that only a non-positive exponent is ever evaluated.
    sig = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves
    # higher-dimensional arrays untouched.
    return sig[()]


def logistic_val_func(theta, x):
    """Return the logistic model's predicted probabilities for ``x``.

    A column of ones is prepended to ``x`` so that the first weight in
    ``theta`` acts as the bias; the product of that matrix with
    ``theta.T`` is then passed through ``sigmoid``.
    """
    bias_column = np.ones(x.shape[0])
    design = np.c_[bias_column, x]
    return sigmoid(np.dot(design, theta.T))


def logistic_cost_func(theta, x, y):
    """Return the mean cross-entropy (log-loss) of the logistic model.

    The loss is evaluated directly from the linear scores ``z`` rather than
    from the sigmoid output: once the sigmoid saturates to exactly 0.0 or
    1.0, ``np.log(y_hat)`` / ``np.log(1 - y_hat)`` become ``-inf`` and the
    cost turns into ``inf`` or ``nan``. The identities

        -log(sigmoid(z))     = log(1 + exp(-z))
        -log(1 - sigmoid(z)) = log(1 + exp(z))

    give the same value and stay finite for any ``z``.

    Args:
        theta: Weights, bias first; used as ``theta.T`` in the matrix product.
        x: Feature matrix with one row per sample (no bias column).
        y: 0/1 labels shaped to line up with the scores (a column vector
            when ``theta`` is a row vector, as in this file's caller).

    Returns:
        The per-sample losses summed over axis 0 and divided by the number
        of samples.
    """
    m = x.shape[0]
    # Linear scores, built the same way logistic_val_func builds the
    # argument it hands to the sigmoid.
    z = np.dot(np.c_[np.ones(m), x], theta.T)
    # np.logaddexp(0, t) computes log(exp(0) + exp(t)) = log(1 + exp(t))
    # without overflowing for large t.
    per_sample = y * np.logaddexp(0.0, -z) + (1 - y) * np.logaddexp(0.0, z)
    cost = np.sum(per_sample, axis=0)
    cost *= 1.0 / m
    return cost

# def logistic_cost_func(theta, X, y):
#     # compute cost (loss)
#     y_hat = logistic_val_func(theta, X)
    
#     # since sigmoid(4e+01) is already 1.0 in python
#     # np.log(1-y_hat) cannot be calculated properly
#     # -log(1-h) = log(1+exp(z))
#     # where h = 1/(1+exp(-z))
#     # --> avoid RuntimeWarning: divide by zero encountered in log
#     #       and RuntimeWarning: invalid value encountered in double_scalars
    
#     z = np.dot(np.c_[np.ones(X.shape[0]), X], theta.T)
# #     cost = -np.sum(y * np.log(1 + np.exp(-z))) + np.sum((1 - y) * np.log(1+np.exp(z)))
#     cost = np.sum(y * np.log(y_hat)) - np.sum((1 - y) * np.log(1+np.exp(z))) # very slow
# #     cost = np.sum(y * np.log(y_hat)) + np.sum((1 - y) * np.log(1-y_hat)) # very fast
#     cost *= 1.0 / X.shape[0]
# #     print("logistic_cost_func completed")
#     return -cost


def logistic_grad_desc(theta, X_train, Y_train, lr, max_iter, tolerance):
    """Fit logistic-regression weights by batch gradient descent.

    Args:
        theta: Initial weights. They are copied, so the caller's array is
            left untouched and can be reused for another run.
        X_train: Training features, forwarded to the cost/gradient helpers.
        Y_train: Training labels, forwarded to the cost/gradient helpers.
        lr: Step size applied to the gradient on every update.
        max_iter: Iteration cap. The counter starts at 1, so at most
            ``max_iter - 1`` updates are performed.
        tolerance: Stop once the absolute change in cost between two
            consecutive iterations is no longer greater than this value.

    Returns:
        A ``(theta, cost_iter)`` tuple: the fitted weights and the list of
        costs, starting with the cost of the initial weights.
    """
    # The update below is in place; without a copy it would overwrite the
    # array the caller passed in. The float dtype also keeps `-=` valid
    # when the initial weights are integers.
    theta = np.array(theta, dtype=float)

    cost = logistic_cost_func(theta, X_train, Y_train)
    cost_iter = [cost]
    cost_change = 1
    i = 1
    while cost_change > tolerance and i < max_iter:
        pre_cost = cost
        # Step against the gradient, then measure how much the cost moved.
        grad = logistic_grad_func(theta, X_train, Y_train)
        theta -= lr * grad
        cost = logistic_cost_func(theta, X_train, Y_train)
        cost_iter.append(cost)
        cost_change = abs(pre_cost - cost)
        i += 1
    return theta, cost_iter


def load_dataset(filename):
    """Read a comma-separated file into a NumPy array of strings.

    Fields may be quoted with double quotes. No header handling or numeric
    conversion happens here; every field is kept as the text read from the
    file.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        ``np.asarray`` of the parsed rows.
    """
    with open(filename, 'r') as csv_file:
        rows = list(reader(csv_file, delimiter=',', quotechar='"'))
    return np.asarray(rows)

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition the rows of ``dataset`` into ``n_folds`` folds.

    Every fold receives ``int(len(dataset) / n_folds)`` rows drawn without
    replacement via ``randrange``; rows left over after filling all folds
    are not assigned to any fold. The input itself is not modified.

    Args:
        dataset: Sequence of rows.
        n_folds: Number of folds to build.

    Returns:
        A list of ``n_folds`` lists of rows.
    """
    remaining = list(dataset)
    rows_per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        # Each pop removes the chosen row from the pool, so no row can be
        # picked twice.
        picked = [remaining.pop(randrange(len(remaining)))
                  for _ in range(rows_per_fold)]
        folds.append(picked)
    return folds

def _precision_recall(predicted, actual):
    """Return ``(precision, recall)`` for 0/1 predictions against 0/1 labels."""
    predicted_pos = predicted == 1
    actual_pos = actual == 1
    # A true positive needs BOTH a positive prediction and a positive label;
    # counting every positive label would inflate both metrics.
    tp = np.sum(predicted_pos & actual_pos)
    fp = np.sum(predicted_pos & (actual == 0))
    fn = np.sum((predicted == 0) & actual_pos)
    # With an empty denominator the ratio is undefined; report 0.0 rather
    # than nan so the across-fold means stay meaningful.
    precision = tp * 1. / (tp + fp) if tp + fp else 0.0
    recall = tp * 1. / (tp + fn) if tp + fn else 0.0
    return precision, recall


def logistic_regression(dataset, n_folds, lr, max_iter, tolerance):
    """Cross-validate the gradient-descent logistic regression and report metrics.

    The rows are split into ``n_folds`` random folds. For every fold the
    model is trained on the remaining folds (features standardised with
    statistics fitted on the training part only), evaluated on the held-out
    fold, and compared against scikit-learn's ``LogisticRegression``.
    Per-fold and aggregate metrics are printed, and the cost curve of the
    first fold is plotted.

    Args:
        dataset: 2-D numeric array; the last column holds the 0/1 label and
            all other columns are features.
        n_folds: Number of cross-validation folds.
        lr: Learning rate forwarded to ``logistic_grad_desc``.
        max_iter: Iteration cap forwarded to ``logistic_grad_desc``.
        tolerance: Convergence tolerance forwarded to ``logistic_grad_desc``.

    Returns:
        ``(acc_test, acc_train)``: lists with one accuracy value per fold.
    """
    # split dataset into n-folds
    dataset_split = cross_validation_split(dataset, n_folds)
    acc_train = []
    acc_test = []
    sklearn_acc_test = []
    precision_test = []
    recall_test = []

    for i in range(n_folds):
        test = np.array(dataset_split[i])
        # Combine the rows of every fold except fold i into the training set.
        train = np.array([row
                          for j, fold in enumerate(dataset_split) if j != i
                          for row in fold])

        # Fit the scaler on the training features only, then apply the same
        # mean/std to the test features so no test statistics leak in.
        X_train = train[:, :-1]
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(test[:, :-1])

        # Labels as column vectors so they line up with the predictions.
        Y_train = train[:, -1][:, None]
        Y_test = test[:, -1][:, None]

        # Logistic regression: start gradient descent from all-zero weights
        # (one weight per feature plus the bias).
        theta = np.zeros((1, X_train.shape[1] + 1))
        fitted_theta, cost_iter = logistic_grad_desc(
            theta, X_train, Y_train, lr, max_iter, tolerance)

        # Plot the cost curve for the first fold only.
        if i == 0:
            plt.figure()
            plt.plot(cost_iter, label="cost_iter")
            plt.grid()
            plt.xlabel('Iteration')
            plt.ylabel('Logistic_cost')
            plt.legend(bbox_to_anchor=(1.05, 1), loc=2, shadow=True)
            plt.show()

        predict_test = pred_val(fitted_theta, X_test)
        predict_train = pred_val(fitted_theta, X_train)
        acc_test.append(np.sum(predict_test == Y_test) * 1.0 / X_test.shape[0])
        acc_train.append(np.sum(predict_train == Y_train) * 1.0 / X_train.shape[0])

        # Precision: TP / (TP + FP); Recall: TP / (TP + FN)
        precision, recall = _precision_recall(predict_test, Y_test)
        precision_test.append(precision)
        recall_test.append(recall)

        # Built-in logistic regression as a reference point
        classifier = linear_model.LogisticRegression()
        classifier.fit(X_train, np.ravel(Y_train))
        sklearn_hits = np.sum(classifier.predict(X_test) == np.ravel(Y_test))
        sklearn_acc_test.append(sklearn_hits * 1.0 / X_test.shape[0])

        # Single-argument print(...) calls emit the same text on Python 2
        # and Python 3.
        print('For fold {}'.format(i))
        print('Train Accuracy: {}'.format(acc_train[i]))
        print('Test Accuracy: {}'.format(acc_test[i]))
        print('Sklearn Test Accuracy: {}'.format(sklearn_acc_test[i]))
        print('Test Precision: {}'.format(precision_test[i]))
        print('Test Recall: {}'.format(recall_test[i]))
        print("")

    print('Overall Mean Train Accuracy Across Folds: {}'.format(np.sum(acc_train)*1./len(acc_train)))
    print('Overall Mean Test Accuracy Across Folds: {}'.format(np.sum(acc_test)*1. / len(acc_test)))
    print('For sklearn, Overall Mean Test Accuracy Across Folds: {}'.format(np.sum(sklearn_acc_test)*1. / len(sklearn_acc_test)))

    print('Overall Mean Test Precision Across Folds: {}'.format(np.sum(precision_test)*1./len(precision_test)))
    print('Overall Mean Test Recall Across Folds: {}'.format(np.sum(recall_test)*1./len(recall_test)))

    print('std of train accuracy: {}'.format(np.std(np.array(acc_train), axis=0)))
    print('std of test accuracy: {}'.format(np.std(np.array(acc_test), axis=0)))

    return acc_test, acc_train
    
def main():
    """Run 10-fold logistic regression on the three bundled CSV datasets.

    Each dataset is loaded from the working directory, converted to float,
    announced with a title line and handed to ``logistic_regression`` with
    its own learning rate and tolerance. A blank line is printed before the
    first run and after every run.
    """
    n_folds = 10
    # (csv file, title printed before the run, learning rate, tolerance)
    experiments = (
        ("spambase.csv", 'Spam dataset Logistic Regression', 0.1, 1e-4),
        ("breastcancer.csv", 'Breast cancer dataset Logistic Regression', 0.01, .00001),
        ("diabetes.csv", 'Diabetes dataset Logistic Regression', 0.01, .00001),
    )

    print ('')
    for csv_name, title, step_size, tol in experiments:
        dataset = load_dataset(csv_name).astype(float)
        print(title)
        logistic_regression(dataset, n_folds, lr=step_size, max_iter=5000, tolerance=tol)
        print ('')
    
    
# Call main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Spam dataset Logistic Regression
