In [54]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the training data into feature matrix, class labels, and event ids:

In [111]:
from proj1_helpers import *
#Modify DATA_PATH if needed
DATA_TRAIN_PATH = '../../data_project1/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [112]:
import csv

def load_csv_data(data_path, sub_sample=False):
    """Loads data and returns y (class labels), tX (features) and ids (event ids)"""
    y = np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    x = np.genfromtxt(data_path, delimiter=",", skip_header=1)
    ids = x[:, 0].astype(np.int)
    input_data = x[:, 2:]

    # convert class labels from strings to binary (-1,1)
    yb = np.ones(len(y))
    yb[np.where(y=='b')] = -1
    
    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    return yb, input_data, ids


def predict_labels(weights, data):
    """Generates class predictions given weights, and a test data matrix"""
    y_pred = np.dot(data, weights)
    y_pred[np.where(y_pred <= 0)] = -1
    y_pred[np.where(y_pred > 0)] = 1
    
    return y_pred


def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    """
    with open(name, 'w') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})
            
def batch_iter(y, tx, batch_size, num_batches=1, shuffle=True):
    """
    Generate a minibatch iterator for a dataset.
    Takes as input two iterables (here the output desired values 'y' and the input data 'tx')
    Outputs an iterator which gives mini-batches of `batch_size` matching elements from `y` and `tx`.
    Data can be randomly shuffled to avoid ordering in the original data messing with the randomness of the minibatches.
    Example of use :
    for minibatch_y, minibatch_tx in batch_iter(y, tx, 32):
        <DO-SOMETHING>
    """
    data_size = len(y)

    if shuffle:
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_y = y[shuffle_indices]
        shuffled_tx = tx[shuffle_indices]
    else:
        shuffled_y = y
        shuffled_tx = tx
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        if start_index != end_index:
            yield shuffled_y[start_index:end_index], shuffled_tx[start_index:end_index]

def standardize(x):
    """Standardize the original data set."""
    mean_x = np.mean(x, axis=0)
    x = x - mean_x
    std_x = np.std(x, axis=0)
    x = x / std_x
    return x, mean_x, std_x

In [113]:
print("Training examples: ", tX, " & shape: ")
print("Targets: ", y)
print("Ids: ",ids)
print("Shapes of tX, y & Ids: ", tX.shape, y.shape, ids.shape)

Training examples:  [[ 138.47    51.655   97.827 ...    1.24    -2.475  113.497]
 [ 160.937   68.768  103.235 ... -999.    -999.      46.226]
 [-999.     162.172  125.953 ... -999.    -999.      44.251]
 ...
 [ 105.457   60.526   75.839 ... -999.    -999.      41.992]
 [  94.951   19.362   68.812 ... -999.    -999.       0.   ]
 [-999.      72.756   70.831 ... -999.    -999.       0.   ]]  & shape: 
Targets:  [ 1. -1. -1. ...  1. -1. -1.]
Ids:  [100000 100001 100002 ... 349997 349998 349999]
Shapes of tX, y & Ids:  (250000, 30) (250000,) (250000,)


In [114]:
#Preprocess data
tX, tX_mean, tX_std = standardize(tX)

train_valid_split = int(tX.shape[0] / 10)
print("Validation data size: ", train_valid_split)
tX_valid = tX[train_valid_split:,:]
y_valid = y[train_valid_split:]
id_valid = ids[train_valid_split:]

tX = tX[:train_valid_split]
y = y[:train_valid_split]
ids = ids[:train_valid_split]

print("Shapes of tX, y & Ids for Training: ", tX.shape, y.shape, ids.shape)
print("Shapes of tX, y & Ids for Validation: ", tX_valid.shape, y_valid.shape, id_valid.shape)


Validation data size:  25000
Shapes of tX, y & Ids for Training:  (25000, 30) (25000,) (25000,)
Shapes of tX, y & Ids for Validation:  (225000, 30) (225000,) (225000,)


## Do your thing crazy machine learning thing here :) ...

In [115]:
'''GRAD AND LOSS FUNCTIONS'''
def compute_loss(y, tx, w, typ):
    '''typ = <LOSS_TYPE(WITH CAPITAL LETTERS)>'''
    loss = 0
    N = y.shape[0]
    if typ == "MSE":
        loss = (1/(2*N))*np.sum(np.square(y - (tx@w)))        
    elif typ == "MAE":
        loss = (1/(2*N))*np.sum(np.abs(y - (tx@w)))
    return loss

def compute_gradient(y, tx, w):
    '''GRADIENT COMPUTATION'''
    N = y.shape[0]
    e = y - tx@w
    grad = (-1/N) * (tx.T@e)
    return grad

def compute_stoch_gradient(y, tx, w):
    '''STOCHASTIC GRADIENT DESCENT GRADIENT COMPUTATION''' 
    N = y.shape[0]
    e = y - tx@w
    grad = (-1/N)*(tx.T@e)
    return grad

def compute_rdg_loss(y, tx, w, lambda_):
    '''typ = <LOSS_TYPE>(CAPITAL LETTERS)'''
    loss = 0
    N = y.shape[0]
    loss = (1/(2*N))*np.sum(np.square(y - (tx@w))) + (lambda_*np.sum(np.square(w)))
    return loss

def sigmoid(tx):
    return 1 / (1 + np.exp(-tx))

def compute_log_loss(y, tx, w):
    '''LOGISTIC LOSS'''
    loss = 0
    sigm = sigmoid(tx)
    N = y.shape[0]
    loss = (-1/N)*(np.sum(np.log(hx).T * y - (np.log(1 - hx).T * (1-y))))
    return loss

In [145]:
def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    '''BATCH GRADIENT DESCENT'''
    w = initial_w
    lr_scheduler = [(max_iters) / 4, (2* max_iters) / 4,(3* max_iters) / 4]
    for n_iter in range(max_iters):
        if n_iter in lr_scheduler:
            gamma = gamma / 10
        loss = compute_loss(y, tx, w, "MSE")
        grad = compute_gradient(y, tx, w)
        w = w - (gamma * grad)
        print("Gradient Descent({bi}/{ti}): loss={l}".format(
              bi=n_iter, ti=max_iters - 1, l=loss))

    return (w, loss)

In [146]:
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    '''STOCHASTIC GRADIENT DESCENT'''
    w = initial_w 
    lr_scheduler = [(max_iters) / 4, (2* max_iters) / 4,(3* max_iters) / 4]
    for n_iter in range(max_iters):
        if n_iter in lr_scheduler:
            gamma = gamma / 10
        for minibatch_y, minibatch_tx in batch_iter(y, tx, 1):
            loss = compute_loss(minibatch_y, minibatch_tx, w, "MSE")
            grad = compute_gradient(minibatch_y, minibatch_tx, w)
            w = w - gamma * grad
            print("Stochastic Gradient Descent({bi}/{ti}): loss={l}".format(
              bi=n_iter, ti=max_iters - 1, l=loss))
    return (w, loss)

In [147]:
def least_squares(y, tx):
    '''COMPUTE W_STAR: WEIGHT FOR NORMAL EQUATIONS BY LINEAR EQUATION SOLVER'''
    w_star = np.linalg.solve(tx.T@tx, tx.T@y)
    loss = compute_loss(y, tx, w_star, "MSE")
    return (w_star,loss)

In [140]:
def ridge_regression(y, tx, lambda_):
    '''RIDGE REGRESSION WITH LAMBDA PARAMETER AS REGULARIZATION PARAMETER'''
    N = y.shape[0]
    w_ridge = np.linalg.solve((tx.T@tx)+(lambda_/(2*N))*np.identity(tx.shape[1]), tx.T@y)
    loss = compute_rdg_loss(y, tx, w_ridge, lambda_)
    return (w_ridge, loss)
    

In [141]:
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    '''FOR GRADIENT DESCENT'''
    w = initial_w
    lr_scheduler = [(max_iters) / 4, (2* max_iters) / 4,(3* max_iters) / 4]
    for n_iter in range(max_iters):
        if n_iter in lr_scheduler:
            gamma = gamma / 10
        loss = compute_loss(y, tx, w, "MSE")
        grad = compute_gradient(y, tx, w)
        w = w - (gamma * grad)
        print("Logistic Regression Gradient Descent({bi}/{ti}): loss={l}".format(
              bi=n_iter, ti=max_iters - 1, l=loss))

    return (w, loss)
    '''FOR STOCHASTIC GRADIENT DESCENT'''
    '''
    w = initial_w 
    lr_scheduler = [(max_iters) / 4, (2* max_iters) / 4,(3* max_iters) / 4]
    for n_iter in range(max_iters):
        if n_iter in lr_scheduler:
            gamma = gamma / 10
        for minibatch_y, minibatch_tx in batch_iter(y, tx, 1):
            loss = compute_loss(minibatch_y, minibatch_tx, w, "MSE")
            grad = compute_gradient(minibatch_y, minibatch_tx, w)
            w = w - gamma * grad
            print("Stochastic Gradient Descent({bi}/{ti}): loss={l}".format(
              bi=n_iter, ti=max_iters - 1, l=loss))
    return (w, loss)
    '''

## Generate predictions and save ouput in csv format for submission:

In [121]:
DATA_TEST_PATH = '../../data_project1/test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)
#Pre-process
tX_test, tX_test_mean, tX_test_std = standardize(tX_test)

In [122]:
ww = np.random.rand(tX.shape[1])
init_w = np.array(ww, dtype=np.float64)
#init_w = np.zeros(tX.shape[1])
print(init_w.shape)
print(init_w)

(30,)
[0.90005632 0.89003445 0.13229146 0.34351175 0.39691614 0.74837181
 0.52214846 0.24522845 0.62121985 0.09057787 0.92747648 0.1966588
 0.62592623 0.06981853 0.65645942 0.02013562 0.52883783 0.38868988
 0.1995837  0.32664805 0.38360278 0.26608571 0.58172377 0.44113436
 0.94304921 0.77617273 0.4856734  0.36996948 0.53185935 0.64121021]


In [142]:
max_iter = 500
alpha = 1e-6

In [143]:
'''PREDICTIONS FOR MODELS'''
'''BATCH GD'''
(w1,loss1) = least_squares_GD(y, tX, init_w, max_iter, alpha)
gd_tr_pred = predict_labels(w1, tX_valid)
print((gd_tr_pred == y_valid).mean())
gd_pred = predict_labels(w1, tX_test)

Gradient Descent(0/499): loss=28.886864455035116
Gradient Descent(1/499): loss=28.886232744128954
Gradient Descent(2/499): loss=28.88560104852366
Gradient Descent(3/499): loss=28.884969368218847
Gradient Descent(4/499): loss=28.884337703214154
Gradient Descent(5/499): loss=28.88370605350921
Gradient Descent(6/499): loss=28.88307441910363
Gradient Descent(7/499): loss=28.882442799997044
Gradient Descent(8/499): loss=28.881811196189066
Gradient Descent(9/499): loss=28.881179607679325
Gradient Descent(10/499): loss=28.88054803446745
Gradient Descent(11/499): loss=28.87991647655306
Gradient Descent(12/499): loss=28.879284933935782
Gradient Descent(13/499): loss=28.878653406615246
Gradient Descent(14/499): loss=28.87802189459107
Gradient Descent(15/499): loss=28.877390397862875
Gradient Descent(16/499): loss=28.87675891643029
Gradient Descent(17/499): loss=28.876127450292937
Gradient Descent(18/499): loss=28.875495999450443
Gradient Descent(19/499): loss=28.874864563902438
Gradient Descent(

Gradient Descent(247/499): loss=28.817171823118088
Gradient Descent(248/499): loss=28.81716552285143
Gradient Descent(249/499): loss=28.817159222586287
Gradient Descent(250/499): loss=28.817152922322673
Gradient Descent(251/499): loss=28.81714662206059
Gradient Descent(252/499): loss=28.817140321800032
Gradient Descent(253/499): loss=28.817134021540994
Gradient Descent(254/499): loss=28.81712772128349
Gradient Descent(255/499): loss=28.817121421027508
Gradient Descent(256/499): loss=28.817115120773046
Gradient Descent(257/499): loss=28.817108820520126
Gradient Descent(258/499): loss=28.817102520268712
Gradient Descent(259/499): loss=28.81709622001884
Gradient Descent(260/499): loss=28.817089919770485
Gradient Descent(261/499): loss=28.81708361952366
Gradient Descent(262/499): loss=28.817077319278358
Gradient Descent(263/499): loss=28.817071019034582
Gradient Descent(264/499): loss=28.817064718792334
Gradient Descent(265/499): loss=28.817058418551607
Gradient Descent(266/499): loss=28.8

In [78]:
'''SGD'''
(w2,loss2) = least_squares_SGD(y, tX, init_w, max_iter, alpha)
sgd_tr_pred = predict_labels(w2, tX_valid)
print((sgd_tr_pred == y_valid).mean())
sgd_pred = predict_labels(w2, tX_test)

Stochastic Gradient Descent(0/399): loss=26124267.57961738
Stochastic Gradient Descent(1/399): loss=236393.57089246958
Stochastic Gradient Descent(2/399): loss=235428.37692316354
Stochastic Gradient Descent(3/399): loss=306518.22554784344
Stochastic Gradient Descent(4/399): loss=122955.01948023823
Stochastic Gradient Descent(5/399): loss=78672.24840316524
Stochastic Gradient Descent(6/399): loss=307494.4617813148
Stochastic Gradient Descent(7/399): loss=393310.5708922117
Stochastic Gradient Descent(8/399): loss=85935.82027349809
Stochastic Gradient Descent(9/399): loss=250049.58807462105
Stochastic Gradient Descent(10/399): loss=319.6998117776728
Stochastic Gradient Descent(11/399): loss=619.5944011272667
Stochastic Gradient Descent(12/399): loss=3465.500642948747
Stochastic Gradient Descent(13/399): loss=9274.10150972572
Stochastic Gradient Descent(14/399): loss=13097.921205002787
Stochastic Gradient Descent(15/399): loss=2891.3434233741823
Stochastic Gradient Descent(16/399): loss=34

Stochastic Gradient Descent(167/399): loss=1603.0741576340995
Stochastic Gradient Descent(168/399): loss=1154.1783702557696
Stochastic Gradient Descent(169/399): loss=20607.506024152062
Stochastic Gradient Descent(170/399): loss=52611.14039924702
Stochastic Gradient Descent(171/399): loss=6504.896403543011
Stochastic Gradient Descent(172/399): loss=4153.678925378714
Stochastic Gradient Descent(173/399): loss=2153.086520660052
Stochastic Gradient Descent(174/399): loss=431230.52725592814
Stochastic Gradient Descent(175/399): loss=4864.457885314963
Stochastic Gradient Descent(176/399): loss=4728.023264150276
Stochastic Gradient Descent(177/399): loss=1079.960978758179
Stochastic Gradient Descent(178/399): loss=3895.970061389226
Stochastic Gradient Descent(179/399): loss=2825.336709296781
Stochastic Gradient Descent(180/399): loss=9055.373166258883
Stochastic Gradient Descent(181/399): loss=31438.131763013163
Stochastic Gradient Descent(182/399): loss=10358.96456481776
Stochastic Gradient

Stochastic Gradient Descent(313/399): loss=5025.404072217542
Stochastic Gradient Descent(314/399): loss=27929.420704331365
Stochastic Gradient Descent(315/399): loss=1390.1865268094764
Stochastic Gradient Descent(316/399): loss=19611.03805730012
Stochastic Gradient Descent(317/399): loss=457.4532372349228
Stochastic Gradient Descent(318/399): loss=26010.473132848925
Stochastic Gradient Descent(319/399): loss=36.107388128437606
Stochastic Gradient Descent(320/399): loss=19450.286980854024
Stochastic Gradient Descent(321/399): loss=9984.041973335463
Stochastic Gradient Descent(322/399): loss=41617.73711723526
Stochastic Gradient Descent(323/399): loss=1074.853425932241
Stochastic Gradient Descent(324/399): loss=2827.210440820649
Stochastic Gradient Descent(325/399): loss=1.746214124732272
Stochastic Gradient Descent(326/399): loss=913.6519793159086
Stochastic Gradient Descent(327/399): loss=4059.844608849279
Stochastic Gradient Descent(328/399): loss=3285.868664464841
Stochastic Gradient

In [79]:
'''LS WITH NORMAL EQ'''
(w3,loss3) = least_squares(y, tX)
ls_tr_pred = predict_labels(w3, tX_valid)
print((ls_tr_pred == y_valid).mean())
ls_pred = predict_labels(w3, tX_test)

0.7462


In [80]:
'''RIDGE REGRESSION'''
'''CHOOSE BEST LAMBDA'''
lambda_ = np.logspace(-1, -6, 30)
min_loss = 1000000
ind = 0
for i in range(lambda_.shape[0]):
    (w4,loss4) = ridge_regression(y, tX, lambda_[i])
    if min_loss > loss4:
        min_loss = loss4
        ind = i
(w4,loss4) = ridge_regression(y, tX, lambda_[ind])       
rd_tr_pred = predict_labels(w4, tX_valid)
print((rd_tr_pred == y_valid).mean())
rd_pred = predict_labels(w4, tX_test)

0.7462


In [144]:
OUTPUT_PATH = '../../data_project1/results_least_sq.csv' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(w1, tX_test)
create_csv_submission(ids_test, gd_pred, OUTPUT_PATH)