In [1]:
from random import seed
from random import randrange
import random
from csv import reader
from math import exp
import numpy as np

## Define Functions

In [2]:
# Load a CSV file
def load_csv(filename):
    '''
    Load data from .csv file
    Arguments: 
          filename: File name [string]
    Outputs: 
        dataset: Loaded dataset [numpy array]
    '''
    dataset=np.genfromtxt(filename, delimiter=',', dtype="|U5")
    return dataset

In [3]:
# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset):
    '''
    Normalizing dataset in range of (0,1)
    Arguments: 
         dataset: The input data [numpy]
    Outputs: 
         dataset: Normalized dataset with all elements of each data point in range of (0,1) [numpy]
    '''
    dataset = (dataset-dataset.min(axis=1,keepdims=True))/(dataset.max(axis=1,keepdims=True)-dataset.min(axis=1,keepdims=True)) 
    return dataset
        

In [4]:
# Standardize dataset columns
def standardize_dataset(dataset):
    '''
    Standardizing dataset 
    Arguments: 
         dataset: The input data [numpy]
    Outputs: 
         dataset: Standardize dataset [numpy]
    '''
    mean_val = np.mean(dataset, axis=0)
    dataset = dataset - mean_val
    std_val = np.std(dataset, axis=0)
    dataset = dataset/std_val
    return dataset

In [5]:
              
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    '''
    Split data randomly into k folds for cross validation
    Arguments: 
         dataset: The input data [numpy],
         n_folds: Number of required folds to be split into [int]
    Outputs: 
         train_index: Indices of the random section chosen as the train data in each step of learning [numpy array], 
         test_index: Indices of the random section chosen as the test data in each step of learning [numpy array]
    '''
    
    # find size of each fold = find size of test slice
    test_size = dataset.shape[0] / n_folds
    test_size = np.rint(test_size)        
    test_size = test_size.astype(int)
        
    low_range = 0
    high_range = dataset.shape[0]
    
    # randomly devide data into test and train slices of specified sizes
    test_index = random.sample(range(low_range, high_range), test_size)
    all_indices = np.arange(low_range,high_range)
    train_index = np.delete(all_indices, test_index)
   
    
    return train_index, test_index 

In [6]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    '''
    Calculate the accuracy score according to the predicted and actual labels
    Arguments: 
         actual: Real labels of train data [int], 
         predicted: The implemented model's predicted labels [int]
    Outputs: 
         accuracy: Accuracy score in percentage [float] 
    '''
    tolerance = 1e-10
    accuracy = (np.abs(predicted - actual[:, np.newaxis]) < tolerance ).mean()*100
        
    return accuracy

In [32]:
# Find coefficients and apply on test_slice of training data
def find_coeffs(y, tx, y_test, tx_test, algorithm, *args):
    '''
    Finds W coefficients using specified algorithm and gradient descent
    Arguments: 
         y: Corresponding labels of the train data [numpy array],
         tx: The random section chosen as the train data in each step of learning [numpy array], 
         y_test: Corresponding labels of the test data [numpy array],
         tx_test: The random section chosen as the test data in each step of learning [numpy array],        
         algorithm: Chosen algorithm for classification; It could be 'least_squares'/ 'least_squares_GD'/
         'least_squares_SGD'/'logistic_regression'/'reg_logistic_regression'/'ridge_regression' [name of function]
    Outputs: 
         coef: Classification coefficients [float],
         loss: Loss function calculated for the method [float], 
         accuracy: Mean accuracy score of the used algorithm on test slice [float]
    '''
        
    # apply specified classification method on train slice and define W coefficients
    if (algorithm == 'least_squares_GD' ):
        func = least_squares_GD
        coef, loss = func(y, tx, initial_w, max_iters, gamma) 
        
        # calculate yhat predictions and apply sigmoid function on predicted labels
        y_hat = predict_sigmoid(tx_test, coef)
    
    if (algorithm == 'least_squares_SGD' ):
        func = least_squares_SGD
        coef, loss = func(y, tx, initial_w, max_iters, gamma)
        
        # calculate yhat predictions and apply sigmoid function on predicted labels
        y_hat = predict_sigmoid(tx_test, coef)
    
    if (algorithm == 'least_squares' ):
        func = least_squares
        coef, loss = func(y, tx)  
        
        # calculate yhat predictions and apply sigmoid function on predicted labels
        y_hat = predict_sigmoid(tx_test, coef)

    
    if (algorithm == 'ridge_regression' ):
        func = ridge_regression
        coef, loss = func(y, tx, lambda_) 
        
        # calculate yhat predictions and apply sigmoid function on predicted labels
        y_hat = predict_sigmoid(tx_test, coef)
        y_hat = y_hat[:, np.newaxis]

    
    if (algorithm == 'logistic_regression' ):
        func = logistic_regression
        coef, loss = func(y, tx, initial_w, max_iters, gamma)  
        
        # calculate yhat predictions and apply sigmoid function on predicted labels
        y_hat = predict_sigmoid(tx_test, coef)
    
    if (algorithm == 'reg_logistic_regression' ):
        func = reg_logistic_regression
        coef, loss = func(y, tx, lambda_, initial_w, max_iters, gamma) 
        
        # calculate yhat predictions and apply sigmoid function on predicted labels
        y_hat = predict_sigmoid(tx_test, coef)
    
        
   
    
    # cast predicted labels to binary class identifiers with 0.6 treshhold
    y_hat[y_hat>=0.6] = 1
    y_hat[y_hat<0.6] = 0
    
    
    # compute accuracy measure
    accuracy = accuracy_metric(y_test, y_hat)
    
    #predictions = np.append(predictions,yhat)
    return (coef, loss, accuracy)

In [9]:
def least_squares(y, tx):
    '''
    Finds W coefficients using least squares method
    Arguments: 
        tx: The random section chosen as the train data in each step of learning [numpy array], 
        y: Corresponding labels of the chosen data [numpy array],
    Outputs: 
        w: Classification coefficients [float],
        loss: Loss function calculated for the method [float]       
    '''
    
    # prepare data
    a = tx.T.dot(tx)
    b = tx.T.dot(y)
    
    # compute regression coefficients
    w = np.linalg.solve(a, b)
    
    # calculate predicted labels 
    yhat = np.matmul(tx,w)
    
    # calculate error
    error = (y[:, np.newaxis] - yhat[:, np.newaxis])
    
    # compute loss function
    loss = ((np.matmul(error.T,error))/(yhat.shape[0]))
    
    return w[:, np.newaxis], loss

In [10]:
# Estimate least square coefficients using gradient descent
def least_squares_GD(y, tx, initial_w, max_iters, gamma):
    '''
    Finds W coefficients using least square loss function with gradient descent
    Arguments: 
         y: The labels of randomly chosen train data [numpy array],
         tx: The random section chosen as the train data in each step of learning [numpy array], 
         initial_w: Initial value for the weight vector [numpy array],
         max_iters: Maximum number of steps to run [int],
         gamma: Learning rate of gradient descent method [float]
         
    Outputs: 
         w: Classification coefficients [float],
         loss: Loss function calculated for the method [float]
    '''  
    # Initilizing wights
    w = initial_w
    
    # apply gradient descent on least square loss function until meeting the max number of iterations
    for epoch in range(max_iters):
        # calculate yhat predictions and apply sigmoid function on predicted labels
        yhat = predict_sigmoid(tx, w) # This function should be changed
        # compute prediction error
        error = (y[:, np.newaxis] - yhat)

        # gradient descent
        gradient = np.matmul(error.T,tx)/(tx.shape[0])
        gradient = gamma * gradient

        # update coefficients
        w = w + gradient.T
     
    loss = ((np.matmul(error.T,error))/(yhat.shape[0]))
    return w, loss

In [11]:
# Estimate least square coefficients using stochastic gradient descent
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    '''
    Finds W coefficients using least square loss function with gradient descent
    Arguments: 
         y: The labels of randomly chosen train data [numpy array],
         tx: The random section chosen as the train data in each step of learning [numpy array], 
         initial_w: Initial value for the weight vector [numpy array],
         max_iters: Maximum number of steps to run [int],
         gamma: Learning rate of gradient descent method [float]
        
    Outputs: 
         w: Classification coefficients [float],
         loss: Loss function calculated for the method [float]
    '''  
    # Initilizing wights and batch size
    w = initial_w
    B_size = 1
    
    # apply gradient descent on least square loss function until meeting the max number of iterations
    for epoch in range(max_iters):
        # Selecting one 
        B_idx = random.sample(range(0, tx.shape[0]), B_size)
        # calculate yhat predictions and apply sigmoid function on predicted labels
        yhat = predict_sigmoid(tx_train[B_idx,:], w) # This function should be changed
        # compute prediction error
        error = (y_train[B_idx, np.newaxis] - yhat)

        # gradient descent
        gradient = np.matmul(error.T,tx_train[B_idx,:])/(tx_train[B_idx,:].shape[0])
        gradient = gamma * gradient

        # update coefficients
        w = w + gradient.T
     
    loss = ((np.matmul(error.T,error))/(yhat.shape[0]))
    return w, loss

In [12]:
# Estimate logistic regression coefficients using gradient descent
def logistic_regression (y, tx, initial_w, max_iters, gamma):
    '''
    Finds W coefficients using least square loss function with gradient descent
    Arguments: 
         y: The labels of randomly chosen train data [numpy array],
         tx: The random section chosen as the train data in each step of learning [numpy array], 
         initial_w: Initial value for the weight vector [numpy array],
         max_iters: Maximum number of steps to run [int],
         gamma: Learning rate of gradient descent method) [float]
        
    Outputs: 
        w: Classification coefficients [float],
        loss: Loss function calculated for the method [float]
    '''   
    # initialize coefficients with zero
    w = initial_w
    
    error =[]
    gradient =[]
    
    # apply gradient descent on least square loss function until meeting the max number of iterations
    for epoch in range(max_iters):
   
            #  calculate yhat predictions and apply sigmoid function on predicted labels
            yhat = predict_sigmoid(tx, w)
            
            # compute prediction error
            error = (y[:, np.newaxis] - yhat)
            
            # gradient descent
            multiplied = np.matmul(tx,w)
            sigma = 1/ (1 + np.exp(-multiplied))
            gradient = np.matmul(tx.T, sigma-yhat) 
            gradient = gamma * gradient
            
            # update coefficients
            w = w - gradient
        
    loss = (np.matmul(-y.T, np.log(yhat)) - np.matmul((1 -y.T), np.log(1 - yhat)))/(yhat.shape[0])     
    return  w, loss

In [13]:
# Estimate regularized logistic regression coefficients using gradient descent
def reg_logistic_regression (y, tx, lambda_, initial_w, max_iters, gamma):
    '''
    Finds W coefficients using least square loss function with gradient descent
    Arguments:      
         y: The labels of randomly chosen train data [numpy array],
         tx: The random section chosen as the train data in each step of learning [numpy array], 
         lambda_ : Regularization parameter [float],
         initial_w: Initial value for the weight vector [numpy array],
         max_iters: Maximum number of steps to run [int],
         gamma: Learning rate of gradient descent method) [float]
        
    Outputs: 
        w: Classification coefficients [float],
        loss: Loss function calculated for the method [float]
    '''   
    # initialize coefficients with zero
    w = initial_w
    
    error =[]
    gradient =[]
    
    # apply gradient descent on least square loss function until meeting the max number of iterations
    for epoch in range(max_iters):
   
            #  calculate yhat predictions and apply sigmoid function on predicted labels
            yhat = predict_sigmoid(tx, w)
            
            # compute prediction error
            error = (y[:, np.newaxis] - yhat)
            
            # gradient descent
            multiplied = np.matmul(tx,w)
            sigma = 1/ (1 + np.exp(-multiplied))
            gradient = np.matmul(tx.T, sigma-yhat) + lambda_ * w
            gradient = gamma * gradient
            
            # update coefficients
            w = w - gradient
        
    loss = (np.matmul(-y.T, np.log(yhat)) - np.matmul((1 -y.T), np.log(1 - yhat)))/(yhat.shape[0])  + lambda_/2 *(np.linalg.norm(w))   
    return  w, loss

In [14]:
# Estimate ridge regression coefficients using normal equations
def ridge_regression(y, tx, lambda_):
    '''
    Finds W coefficients using ridge regression loss function
    Arguments: 
        y: Corresponding labels of the chosen data [numpy array],
        tx: The random section chosen as the train data in each step of learning [numpy array], 
        lambda_ : Regularization parameter [float]
    Outputs: 
        w: Classification coefficients [float],
        loss: Loss function calculated for the method [float]       
    '''
    
    # apply regularization parameter on data
    aI = lambda_ * np.identity(tx.shape[1])
    a = tx.T.dot(tx) + aI
    b = tx.T.dot(y)
    
    # compute regression coefficients
    w = np.linalg.solve(a, b)
    
    # calculate predicted labels 
    yhat = np.matmul(tx,w)
    
    # calculate error
    error = (y[:, np.newaxis] - yhat[:, np.newaxis])
    
    # compute loss function
    loss = ((np.matmul(error.T,error))/(yhat.shape[0]))
    
    return w, loss

In [15]:
#  Calculate yhat predictions and apply sigmoid function on predicted labels
def predict_sigmoid(data, coefficients):
    '''
    Calculates yhat and applies sigmoid function on it
    Arguments: 
        data: Dataset to predict its labels [numpy array], 
        coefficients: Classification coefficients [float]
    Outputs: 
        yhat: Predicted labels [float]
    '''    
    yhat = np.matmul(data,coefficients)
    return 1.0 / (1.0 + np.exp(-yhat))

#### Data Cleaning Functions

In [16]:
# Bound outliers 
def data_bounded(X_d, times_SE=3.0):
    '''
    Arguments:
        X_d: Standardized data matrix to be bounded by 3 times of SE [Numpy array]
    Output:
        X_bounded: Bounded data matrix [Numpy Array]
    
    '''
    X_b = X_d
    for i in range(X_b.shape[1]):
        X_i = X_b[:,i]
        X_i[(X_i <-times_SE) | (X_i>times_SE)] = np.mean(X_i[(X_i >=-times_SE) & (X_i<=times_SE)])
        X_b[:,i] = X_i
    
    return X_b

In [17]:
# Clean undefined, unmeasured, or invalid data
def data_cleaning(X):
    '''
    Arguments:
        X: Data matrix [Numpy array]
    Outputs:
        X_c: Cleaned data [Numpy array]
    '''
    X_c = np.copy(X)
    for i in range(X.shape[1]):
        X_i = X[:,i]
        X_i[X_i == -999] = np.mean(X_i[X_i != -999])
        X_c[:,i] = X_i
        
    return X_c

## Load and Prepare Data

In [18]:
# Test the model
seed(1)

# load and prepare data
train_data = load_csv('train.csv')

# omit first row(header) and first column(data ids)
train_data = train_data[1:,1:]

# replace labels 's' and 'b' with '1' and '0'
train_data[:,0][train_data[:,0]=='s']='1'
train_data[:,0][train_data[:,0]=='b']='0'

# cast data type to float
train_data = train_data.astype(float)


## Clear Data

In [19]:
# Standardize and clear data

# separate labels
y = np.copy(train_data[:,0])

X = np.copy(train_data[:,1:])

# clean dataset
X = data_cleaning(X)

# standardize dataset
X = standardize_dataset(X)

# bound dataset
tx = data_bounded(X, times_SE = 3.0)


## Evaluate Model

In [46]:
# Evaluate an algorithm using a cross validation split


# number of folds for cross validation
n_folds = 5

# learning rate for gradient descent (step size)
gamma = 0.8

# maximum number of iterations to perform
max_iters = 100

# regularization parameter
lambda_ = 0.1

scores = []
num_folds = [n_folds]
    
# Initilize initial_w
initial_w = np.zeros((tx.shape[1], 1))
 

for fold in num_folds:
    # apply k-fold cross validation on train data and seperate it into two slices: train_slice, test_slice
    train_idx, test_idx  = cross_validation_split(tx, n_folds)
    
    tx_train, tx_test = tx[train_idx,:], tx[test_idx,:]
    y_train, y_test = y[train_idx], y[test_idx]
    
    # find W coefficients using the specified algorithm and calculate accuracy score using predicted data labels
    
    # Least Square GD
    coefficients, loss, accuracy = find_coeffs(y_train, tx_train, y_test, tx_test, 'least_squares_GD', initial_w, max_iters, gamma)
    
    # Least Square SGD
    #coefficients, loss, accuracy = find_coeffs(y_train, tx_train, y_test, tx_test, 'least_squares_SGD', initial_w, max_iters, gamma)
    
    # Least Square Normal Equation
    #coefficients, loss, accuracy = find_coeffs(y_train, tx_train, y_test, tx_test,'least_squares')
    
    # Ridge Regression
    #coefficients, loss, accuracy = find_coeffs(y_train, tx_train, y_test, tx_test, 'ridge_regression', lambda_)
    
    #Logistic Regression
    #coefficients, loss, accuracy = find_coeffs(y_train, tx_train, y_test, tx_test, 'logistic_regression', initial_w, max_iters, gamma)
    
    # Regularized Logistic Regression
    #coefficients, loss, accuracy = find_coeffs(y_train, tx_train, y_test, tx_test, 'reg_logistic_regression', lambda_, initial_w, max_iters, gamma)
    
    scores = np.append(scores, accuracy)
        
        
# report results
print('Mean Accuracy Score: %s' % np.mean(scores))
#print('The Last Weight Vector Of The Method:', coefficients)
print('Loss Value: %s' % loss[0][0])

Mean Accuracy Score: 74.17399999999999
Loss Value: 0.18612283382506395


## Run On Test Data 

In [28]:
# Run algorithm on train data

# load and prepare test data
test_data = load_csv('test.csv')

# omit first row(header) and the first two columns(data ids & unknown labels '?')
test_data = test_data[1:,2:]

# cast data type to float
test_data = test_data.astype(float)

In [29]:
# Standardize and clear data

# clear dataset
test_data = data_cleaning(test_data)

# standardize dataset
test_data = standardize_dataset(test_data)

# bound dataset
test_data = data_bounded(test_data, times_SE = 3.0)

In [30]:
# Predict tes labels

predictionsTest = []

# calculate yhat predictions and apply sigmoid function on predicted labels
yhat_test = predict_sigmoid(test_data, coefficients)

# cast predicted labels to binary class identifiers with 0.6 treshhold
yhat_test[yhat_test>=0.6] = 1
yhat_test[yhat_test<0.6] = 0

# predicted labels
predictionsTest = np.append(predictionsTest,yhat_test)


# set the labels to +/- 1
predictionsTest[predictionsTest == 0]= -1

## Make Output .csv File

In [31]:
# Make output .csv file
import csv
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: 
          ids: event ids associated with each prediction
          y_pred: predicted class labels 
          name: string name of .csv output file to be created
               
    Outputs: 
         testLabels: File containing predicted test lables [.csv]
    """
    with open(name, 'w') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})

            
ids = np.arange(350000, 918238)
create_csv_submission(ids, predictionsTest, 'testLabels')