In [179]:
#from project_helpers import load_csv_data, predict_labels
import csv
import numpy as np
from sklearn import datasets

In [180]:
def load_csv_data(data_path, sub_sample=False):
    """Load a labelled CSV file and split it into labels, features and ids.

    The file is expected to have one header row (skipped), the event id in
    column 0, the class label as a string in column 1, and numeric features
    in every remaining column.

    Args:
        data_path: path of the CSV file to read.
        sub_sample: when True, keep only every 50th row of the data.

    Returns:
        A tuple ``(yb, input_data, ids)`` where ``yb`` is a float array holding
        0 for rows labelled ``'b'`` and 1 for every other label, ``input_data``
        is the 2-D float feature matrix (columns 2 onwards) and ``ids`` is the
        integer array parsed from column 0.
    """
    # The file is parsed twice: once as text for the label column, once as
    # floats for everything else (the label column becomes NaN and is dropped).
    # atleast_1d / atleast_2d keep the indexing below valid when the file
    # contains a single data row, where genfromtxt collapses one dimension.
    y = np.atleast_1d(
        np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    )
    x = np.atleast_2d(np.genfromtxt(data_path, delimiter=",", skip_header=1))

    # `np.int` was removed from NumPy (1.24+); the builtin int is the same dtype.
    ids = x[:, 0].astype(int)
    input_data = x[:, 2:]

    # convert class labels from strings to binary (0,1)
    yb = np.ones(len(y))
    yb[y == 'b'] = 0

    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    return yb, input_data, ids

In [181]:
def sigmoid(z):
    """Logistic sigmoid of z, evaluated through the hyperbolic tangent.

    Uses the identity 1 / (1 + exp(-z)) == 0.5 * (1 + tanh(z / 2)); going
    through tanh avoids the overflow that exp(-z) hits for large negative z.
    Accepts scalars or NumPy arrays (applied element-wise).
    """
    half_z = .5 * z
    return .5 * (1 + np.tanh(half_z))

In [213]:
def loss_f_lr(h, y):
    """Summed binary cross-entropy between predictions h and labels y.

    A small constant (1e-5) is added inside both logarithms so that
    predictions of exactly 0 or 1 never produce log(0).
    """
    eps = 1e-5
    log_pos = np.log(h + eps)        # log-probability assigned to class 1
    log_neg = np.log(1 - h + eps)    # log-probability assigned to class 0
    return (-y * log_pos - (1 - y) * log_neg).sum()

In [214]:
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """Fit logistic regression with full-batch gradient descent.

    Args:
        y: array of binary (0/1) labels, one per row of ``tx``.
        tx: feature matrix, one sample per row.
        initial_w: starting weight vector.
        max_iters: number of gradient-descent steps to take.
        gamma: step size.

    Returns:
        ``(w, loss)`` where ``w`` is the weight vector after ``max_iters``
        updates and ``loss`` is ``loss_f_lr`` evaluated at that same ``w``.
    """
    w = initial_w

    for _ in range(max_iters):
        # Predicted probabilities for the current weights
        h = sigmoid(np.dot(tx, w))

        # Gradient of the summed (not averaged) cross-entropy loss
        gradient = np.dot(tx.T, h - y)

        # Update w according to gradient
        w = w - gamma * gradient

    # Re-evaluate the predictions with the final weights so that the reported
    # loss belongs to the returned w (and is also correct when max_iters == 0).
    loss = loss_f_lr(sigmoid(np.dot(tx, w)), y)

    return (w, loss)

In [215]:
def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """Fit L2-regularized logistic regression with full-batch gradient descent.

    Args:
        y: array of binary (0/1) labels, one per row of ``tx``.
        tx: feature matrix, one sample per row.
        lambda_: regularization strength; the penalty is ``(lambda_/2) * w.w``.
        initial_w: starting weight vector.
        max_iters: number of gradient-descent steps to take.
        gamma: step size.

    Returns:
        ``(w, loss)`` where ``w`` is the weight vector after ``max_iters``
        updates and ``loss`` is the penalized loss evaluated at that same ``w``.
    """
    w = initial_w

    for _ in range(max_iters):
        h = sigmoid(np.dot(tx, w))

        # Same gradient as the unregularized case plus the derivative of the
        # (lambda_/2) * ||w||^2 penalty.
        gradient = np.dot(tx.T, h - y) + lambda_ * w
        w = w - gamma * gradient

    # Both the data term and the penalty are evaluated at the final weights;
    # this also gives the right value when max_iters == 0.
    h = sigmoid(np.dot(tx, w))
    loss = loss_f_lr(h, y) + (lambda_ / 2) * np.dot(w.T, w)

    return (w, loss)

In [185]:
def predict_probs(X, w):
    """Return sigmoid(X . w): the model's output for every row of X."""
    scores = np.dot(X, w)
    return sigmoid(scores)

def predict(X, w, threshold=0.5):
    """Return a boolean per row of X: True where predict_probs(X, w) >= threshold."""
    probs = predict_probs(X, w)
    return probs >= threshold

In [186]:
# Toy binary problem built from iris: class 0 versus the rest, using only the
# first two feature columns.
# `datasets` is what the import cell binds (`from sklearn import datasets`);
# the bare name `sklearn` is never imported, so it cannot be used here.
iris = datasets.load_iris()
X = iris.data[:, :2]
y = (iris.target != 0) * 1
# Prepend a column of ones so the first weight acts as the intercept.
intercept = np.ones((X.shape[0], 1))
X = np.concatenate((intercept, X), axis=1)

In [187]:
# Train on the iris toy problem, starting from all-zero weights.
w1, loss1 = logistic_regression(
    y,
    X,
    initial_w=np.zeros(X.shape[1]),
    max_iters=100000,
    gamma=0.01,
)

In [188]:
# Fraction of iris samples whose thresholded prediction matches the label
# (measured on the same data the model was trained on).
preds = predict(X, w1)
np.mean(preds == y)

1.0

In [189]:
# Load the sub-sampled training set and prepend a column of ones for the intercept.
y_tr, tx, ids = load_csv_data('Data/train.csv', sub_sample=True)
intercept = np.ones((len(tx), 1))
tx = np.hstack((intercept, tx))

In [190]:
# Train on the sub-sampled CSV training data, starting from all-zero weights.
w2, loss2 = logistic_regression(
    y_tr,
    tx,
    initial_w=np.zeros(tx.shape[1]),
    max_iters=100000,
    gamma=0.01,
)

In [191]:
# Load the sub-sampled test set and prepend a column of ones for the intercept,
# mirroring the preprocessing applied to the training data.
y_te, tx_te, ids_te = load_csv_data('Data/test.csv', sub_sample=True)
intercept = np.ones((len(tx_te), 1))
tx_te = np.hstack((intercept, tx_te))

In [192]:
# Fraction of test rows whose thresholded prediction matches the loaded label.
preds = predict(tx_te, w2)
np.mean(preds == y_te)

0.6851737791465025