# Project 1 - Team BAK

## Step 1 - Getting started

In [4]:
#Import some libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import datetime
from helpfulfun import *
%load_ext autoreload
%autoreload 2

In [6]:
# Read both CSV files with identical parsing options; unpack=True makes
# genfromtxt hand the data back column by column instead of row by row.
csv_options = dict(dtype=None, delimiter=",", skip_header=1, unpack=True, encoding=None)

train_list = np.genfromtxt("train.csv", **csv_options)
test_list = np.genfromtxt("test.csv", **csv_options)

# Collapse the per-column results into one array each (first axis = CSV column).
train = np.array(train_list)
test = np.array(test_list)

In [7]:
# Skip the first two rows of `test` and transpose the rest, mirroring how
# x_tr is built from `train`.
x_te = np.transpose(test[2:])

In [8]:
# Row 1 of `train` holds the labels; rows 2 onward are the features,
# transposed so that each row of x_tr is one sample.
y, x_tr = train[1], train[2:].T
print("x: ", x_tr.shape, " y: ", y.shape)

x:  (250000, 30)  y:  (250000,)


## Step 2 - Preprocessing

In [9]:
# Encode the labels as integers: 1 where the label is "s", 0 otherwise.
y_tr = (y == "s").astype(int)
print(y_tr.shape, " and y[:5]: ", y_tr[:5])

(250000,)  and y[:5]:  [1 0 0 0 0]


In [18]:
# Delete features with more than 30% NaN values, in a single np.delete call.
# The indices refer to columns of the ORIGINAL x_tr / x_te. They are exactly the
# columns that the earlier chain of one-at-a-time deletions (column 0, then
# indices 4, 4, 4, 9 and seven times 18 on the progressively shrinking array)
# ended up removing; deleting them at once avoids twelve full-array copies and
# the hard-to-audit shifting indices.
# Column 0 is dropped for now as well -- infer this one later.
# NOTE(review): the list is hard-coded; confirm it matches the high-NaN columns
# of the dataset actually in use.
DROPPED_FEATURES = [0, 5, 6, 7, 13, 23, 24, 25, 26, 27, 28, 29]

x_tr_prep = np.delete(x_tr, DROPPED_FEATURES, axis=1).astype(float)
x_te_prep = np.delete(x_te, DROPPED_FEATURES, axis=1).astype(float)
x_tr_prep.shape

(250000, 18)

In [19]:
# Standardize the training features; mean_x / std_x are presumably the
# statistics used by `standardize` (helper from helpfulfun -- verify).
# NOTE(review): the test set is standardized by a separate call whose returned
# statistics are discarded, rather than reusing mean_x / std_x. Confirm this is
# intended -- train and test may otherwise end up on different scales.
# NOTE: x_te is rebound here (it previously held the raw test features), so
# re-running the feature-deletion cell after this one would start from the
# standardized array instead of the raw one.
x, mean_x, std_x = standardize(x_tr_prep)
x_te, _, _ = standardize(x_te_prep)

In [20]:
# Sanity check: x has one more column than x_tr_prep (19 vs. 18), so
# `standardize` evidently adds a column -- presumably a constant bias term;
# verify in helpfulfun.
x.shape

(250000, 19)

## Step 3 - Implement ML Methods

In [13]:
# Start every optimizer from the all-zero weight vector (one weight per column of x).
n_weights = x.shape[1]
initial_w = np.zeros(n_weights)

In [32]:
def get_predictions(x, best_w, ids=None):
    """Build the submission table for a linear model.

    Args:
        x: numpy array of shape=(N, D), the samples to label.
        best_w: numpy array with D entries, the model weights.
        ids: optional sequence of N sample ids. Defaults to ``test[0]``, the
            first row of the notebook-level ``test`` array.

    Returns:
        numpy string array of shape=(N + 1, 2): a header row
        ``["Id", "Prediction"]`` followed by one ``[id, label]`` row per
        sample, where label is "-1" if the linear output is below 0.5 and
        "1" otherwise.
    """
    if ids is None:
        ids = test[0]

    # Use the weights that were passed in; the previous version read an
    # unrelated global name (`best_ws`) and ignored this argument.
    preds = x.dot(best_w).reshape((x.shape[0],))
    y_te = np.where(preds < .5, -1, 1)

    # Convert both columns to strings explicitly so the result does not rely
    # on implicit number-to-string promotion.
    y_pred = np.c_[np.asarray(ids).astype(str), y_te.astype(str)]
    print(y_pred[0])

    # Stack the header on top; vstack widens the string dtype if needed, so
    # "Prediction" can never be truncated.
    header = np.array([["Id", "Prediction"]])
    return np.vstack([header, y_pred])

#### Linear regression using gradient descent

In [15]:
def mean_squared_error_gd(y, tx, initial_w, max_iters, gamma):
    """Linear regression fitted with full-batch gradient descent (GD).

    The gradient and the loss are obtained from the external helpers
    ``compute_gradient`` and ``compute_loss`` (not defined in this cell).

    Args:
        y: numpy array of shape=(N, )
        tx: numpy array of shape=(N, D)
        initial_w: numpy array of shape=(D, ). The initial guess (or the initialization) for the model parameters
        max_iters: a scalar denoting the total number of iterations of GD
        gamma: a scalar denoting the stepsize

    Returns:
        losses: a list of length max_iters; losses[i] is the loss evaluated at the weights BEFORE the i-th update
        ws: a list of length max_iters + 1; ws[0] is initial_w and ws[i + 1] holds the weights after the i-th update
    """
    # ws starts with the initial guess, so it is one element longer than losses.
    ws = [initial_w]
    losses = []
    w = initial_w
    for n_iter in range(max_iters):
        gradient = compute_gradient(y, tx, w)
        loss = compute_loss(y, tx, w)

        # Plain gradient step.
        w = w - gamma * gradient

        ws.append(w)
        losses.append(loss)
        # Only index w[1] when the model has a second weight, so that a
        # single-feature model does not crash inside the progress print.
        print("GD iter. {bi}/{ti}: loss={l}, w0={w0}, w1={w1}".format(
              bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0],
              w1=w[1] if np.size(w) > 1 else "n/a"))

    return losses, ws
    

In [16]:
# Fit the linear model with gradient descent on the standardized training data.
GD_MAX_ITERS = 150
GD_GAMMA = .005
losses, ws = mean_squared_error_gd(y_tr, x, initial_w, max_iters=GD_MAX_ITERS, gamma=GD_GAMMA)

GD iter. 0/149: loss=0.171334, w0=0.0017133399999999998, w1=0.00043645910164310744
GD iter. 1/149: loss=0.16499256885493627, w0=0.0033209621458787815, w1=0.0008386390404596991
GD iter. 2/149: loss=0.159648601737275, w0=0.004831742599413398, w1=0.0012094336335768635
GD iter. 3/149: loss=0.15513735265013393, w0=0.0062537896663250065, w1=0.0015514862427284625
GD iter. 4/149: loss=0.1513214094005602, w0=0.007594510397486251, w1=0.0018672114987907773
GD iter. 5/149: loss=0.1480861566518639, w0=0.008860671412039433, w1=0.002158815141566248
GD iter. 6/149: loss=0.14533599206028716, w0=0.010058454443722411, w1=0.0024283121383331407
GD iter. 7/149: loss=0.14299117049171758, w0=0.011193507068207447, w1=0.0026775432304924408
GD iter. 8/149: loss=0.14098517206359557, w0=0.01227098902953978, w1=0.0029081900446875147
GD iter. 9/149: loss=0.1392625070624069, w0=0.013295614547490146, w1=0.003121788892940366
GD iter. 10/149: loss=0.13777688521990059, w0=0.014271690954509888, w1=0.003319743375543079
GD 

GD iter. 109/149: loss=0.11557538852528586, w0=0.05534570705102231, w1=0.005549282840430491
GD iter. 110/149: loss=0.11550354915781064, w0=0.055603642293208695, w1=0.005526592933686017
GD iter. 111/149: loss=0.11543275785236541, w0=0.05585994598392998, w1=0.005503463699106767
GD iter. 112/149: loss=0.11536299887714385, w0=0.05611463075757812, w1=0.005479898607351094
GD iter. 113/149: loss=0.11529425673660583, w0=0.056367709120974135, w1=0.005455901092364148
GD iter. 114/149: loss=0.11522651616792012, w0=0.05661919345713772, w1=0.005431474552570557
GD iter. 115/149: loss=0.11515976213746121, w0=0.05686909602878472, w1=0.005406622351978639
GD iter. 116/149: loss=0.11509397983736047, w0=0.057117428981575805, w1=0.005381347821203719
GD iter. 117/149: loss=0.11502915468211089, w0=0.05736420434713748, w1=0.0053556542584174555
GD iter. 118/149: loss=0.11496527230522288, w0=0.05760943404587471, w1=0.0053295449302294805
GD iter. 119/149: loss=0.11490231855593183, w0=0.05785312988959291, w1=0.00

In [33]:
# ws[-1] is the last weight vector appended by the GD run, i.e. the weights
# after the final iteration; use it to label the standardized test set.
best_w = ws[-1]
pred = get_predictions(x_te, best_w)

['350000' '-1']


In [34]:
# Write the prediction table (header row included in `pred`) as comma-separated
# text; fmt="%s" because every entry of `pred` is written as a string.
np.savetxt("submission.csv", pred, fmt="%s", delimiter=",")

#### Linear regression using stochastic gradient descent


In [None]:
def mean_squared_error_sgd(y, tx, initial_w, max_iters, gamma, batch_size=1):
    """Linear regression fitted with stochastic gradient descent (SGD).

    Mini-batches come from the external helper ``batch_iter``; the stochastic
    gradient and the loss come from ``compute_stoch_gradient`` and
    ``compute_loss`` (none of them defined in this cell).

    Args:
        y: numpy array of shape=(N, )
        tx: numpy array of shape=(N, D)
        initial_w: numpy array of shape=(D, ). The initial guess (or the initialization) for the model parameters
        max_iters: a scalar denoting the total number of iterations of SGD
        gamma: a scalar denoting the stepsize
        batch_size: a scalar denoting the number of data points in a mini-batch used for computing
            the stochastic gradient. Defaults to 1. (Previously this name was used without being
            defined, so the function raised a NameError.)

    Returns:
        losses: a list with one loss value per performed update; each loss is evaluated on the
            full (y, tx) at the weights BEFORE that update
        ws: a list holding initial_w followed by the weights after each update
    """
    # ws starts with the initial guess, so it is one element longer than losses.
    ws = [initial_w]
    losses = []
    w = initial_w

    for n_iter in range(max_iters):
        # batch_iter is asked for a single shuffled mini-batch per iteration.
        for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size, num_batches=1, shuffle=True):
            gradient = compute_stoch_gradient(minibatch_y, minibatch_tx, w)
            # The loss is tracked on the full data set, not on the mini-batch.
            loss = compute_loss(y, tx, w)
            w = w - gamma * gradient
            ws.append(w)
            losses.append(loss)

        # Only index w[1] when the model has a second weight, so that a
        # single-feature model does not crash inside the progress print.
        print("SGD iter. {bi}/{ti}: loss={l}, w0={w0}, w1={w1}".format(
              bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0],
              w1=w[1] if np.size(w) > 1 else "n/a"))
    return losses, ws
    

#### Least squares regression using normal equations

In [None]:
def least_squares(y, tx):
    """Least squares regression solved through the normal equations.

    Solves (tx^T tx) w = tx^T y for w with np.linalg.solve.

    Args:
        y: numpy array of shape=(N, )
        tx: numpy array of shape=(N, D)

    Returns:
        opt_weights: numpy array of shape=(D, ), the solution of the normal equations
        mse: scalar, 1 / (2N) times the squared norm of the residual y - tx @ opt_weights

    Raises:
        numpy.linalg.LinAlgError: if tx^T tx is singular.
    """
    # np.linalg.solve takes the coefficient matrix AND the right-hand side;
    # the previous version passed only the matrix and failed with a TypeError.
    gram = tx.T.dot(tx)
    rhs = tx.T.dot(y)
    opt_weights = np.linalg.solve(gram, rhs)

    e = y - tx.dot(opt_weights)
    mse = 1 / (2 * len(y)) * e.T.dot(e)
    return opt_weights, mse

#### Ridge regression using normal equations

In [None]:
def ridge_regression(y, tx, lambda_):
    """Ridge regression solved through the regularized normal equations.

    Solves (tx^T tx + 2 * N * lambda_ * I) w = tx^T y, where N is the number
    of rows of tx and I is the identity of size D.

    Args:
        y: numpy array of shape=(N, )
        tx: numpy array of shape=(N, D)
        lambda_: scalar regularization strength

    Returns:
        numpy array of shape=(D, ), the ridge solution w
    """
    n_samples = tx.shape[0]
    n_features = tx.shape[1]

    # Scaled identity that is added to the Gram matrix as the penalty term.
    penalty = 2 * n_samples * lambda_ * np.identity(n_features)
    lhs = tx.T @ tx + penalty
    rhs = tx.T @ y
    return np.linalg.solve(lhs, rhs)

#### Logistic regression using gradient descent or SGD (y ∈ {0, 1})

In [None]:
def logistic_regression(y, tx, w):
    """Evaluate the logistic-regression quantities at the weights w.

    Delegates to the external helpers calculate_loss_lr, calculate_gradient_lr
    and calculate_hessian (not defined in this cell), in that order.

    Args:
        y: labels, passed through unchanged to the helpers
        tx: feature matrix, passed through unchanged to the helpers
        w: current weights, passed through unchanged to the helpers

    Returns:
        (loss, gradient, hessian): the three helper results, in this order
    """
    return (
        calculate_loss_lr(y, tx, w),
        calculate_gradient_lr(y, tx, w),
        calculate_hessian(y, tx, w),
    )

In [None]:
def logistic_regression_demo(y, x):
    """Fit logistic regression with Hessian-based (Newton-style) updates.

    A column of ones is prepended to x, the weights start at zero, and each
    iteration applies w -= gamma * solve(hessian, gradient) using the values
    returned by logistic_regression. The loop stops after max_iter iterations
    or as soon as two consecutive losses differ by less than threshold.

    Args:
        y: numpy array of labels with N entries (y ∈ {0, 1}).
            NOTE(review): w is created with shape (D + 1, 1); confirm whether
            the loss/gradient helpers expect y with shape (N, 1) rather than (N, ).
        x: numpy array of shape=(N, D), features WITHOUT a constant column
            (one is added here).

    Returns:
        w: numpy array of shape=(D + 1, 1), the weights after the last update
        losses: list with the loss recorded at each performed iteration
    """
    # init parameters
    max_iter = 50
    threshold = 1e-5
    gamma = 1.
    losses = []

    # build tx: constant column first, then the features
    tx = np.c_[np.ones((y.shape[0], 1)), x]
    w = np.zeros((tx.shape[1], 1))

    # start the logistic regression
    for n_iter in range(max_iter):
        # loss, gradient and Hessian at the current weights, then one
        # Hessian-scaled step (not a plain gradient step)
        loss, gradient, hessian = logistic_regression(y, tx, w)
        w -= gamma * np.linalg.solve(hessian, gradient)

        print("Current iteration={i}, the loss={l}".format(i=n_iter, l=loss))
        # convergence criterion: stop once the loss has stopped changing
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break

    # Hand the fitted model back to the caller instead of discarding it.
    return w, losses


In [None]:
# Pass the 0/1-encoded labels (y_tr); `y` still holds the raw string labels
# read from the CSV, which cannot be used in the logistic loss.
# NOTE(review): x already has one more column than the preprocessed features
# (presumably a bias column added by `standardize`), and the demo prepends
# another column of ones -- confirm this does not make the Hessian singular.
logistic_regression_demo(y_tr, x);