# Run.ipynb
### This notebook lays out the extended pipeline, including all possibilities of the decision tree, where the run.py path will be extracted from. Preliminaries (Chapter 1) include:
1. Loading in the data
2. Creating the feature subsets
3. Laying out the methods

### After this, the pipeline (Chapter 2) can be ran where the different combinations of feature subsets, methods, and hyperparameters will be tested. The results (Chapter 3) of which will be visualized accordingly.

## Chapter 1

### 1.0 Imports and Constants

In [1]:
import numpy as np
import inspect

from validation import *
from proj1_helpers import *
from implementations import *

# Paths to train and test folders
DATA_TRAIN_PATH = "../data/train.csv"
DATA_TEST_PATH = "../data/test.csv"

### 1.1 Loading the data

In [2]:
labels, data, ids = load_csv_data(DATA_TRAIN_PATH)

### 1.2 Feature subsets
#### Including:
- All the features as is (naive)
- Merging highly (t = 0.96) correlated features

#### Mixed with:
- Categorical feature extraction
- Principal Component Analysis features

In [3]:
feature_subsets = {
    "All features" : list(range(data.shape[1])),
    "Without correlated features" : [0,1,2,3,4,5,7,8,9,10,11,13,
                                     14,15,16,17,18,19,20,21,22,23],
    "Without calculated features" : [0,1,2,3,4,5,7,8,9,10,11,13],
    "Without corr >.8" : [0,1,2,3,7,8,10,11,13,14,15,16,17,18,19,20]
}

### 1.3 Methods

In [4]:
methods = {
    'least_squares' : least_squares, 'least_squares_GD' : least_squares_GD,
    'least_squares_SGD' : least_squares_SGD, 'ridge_regression' : ridge_regression,
    'logistic_reg_GD' : logistic_reg_GD,
    'penalized_logistic_reg_GD' : penalized_logistic_reg_GD
    #, 'logistic_reg_newton' : logistic_reg_newton,
}

## Chapter 2

### 2.1 Helper-functions

In [7]:
"""
    Inputs a function and outputs a dictionary with the appropriate param, value pair
"""
def get_parameter_values(func, y, tX):
    values = {}
    mapping = {
        'y'         : {'y'         : y},
        'tx'        : {'tx'        : tX},
        'initial_w' : {'initial_w' : np.zeros(tX.shape[1])},
        'max_iters' : {'max_iters' : 100},
        'gamma'     : {'gamma'     : True},
        'lambda_'   : {'lambda_'   : True}
    }
    
    for param in inspect.signature(func).parameters:
        values.update(mapping[param])
        
    return values

"""
    Returns the prediction accuracy using the prototypes and the test set
"""
def pred_acc(y, tX, w):
    y_pred = np.squeeze(tX @ w)
    y_pred[np.where(y_pred <= 0)] = 0
    y_pred[np.where(y_pred > 0)] = 1
    
    return sum(y == y_pred) / len(y)

"""
    Performs z-score normalization on the data tX
"""
def normalize(tX):
    return (tX - np.mean(tX, axis=0)) / np.std(tX, axis=0)

""" 
    Convert labels from {-1, 1} to {0, 1}. 
"""
def normalize_labels(y):
    return np.round((y + 1) / 2)

"""
    Helper function for cross-validation
"""
def cross_validation_step(y, tx, indices, k, func, parameters):
    # get k'th subgroup in test, others in train
    test_indices = indices[k]
    train_indices = np.delete(indices, k, axis=0).flat
    test_tx, test_y = tx[test_indices], y[test_indices]
    train_tx, train_y = tx[train_indices], y[train_indices]
    parameters['tx'] = train_tx
    parameters['y'] = train_y
    # train model on training data
    w, _ = func(**parameters)
    # calculate the prediction accuracy for test data
    return pred_acc(test_y, test_tx, w)

"""
    Cross-validation
    Returns the mean accuracy of all the folds
"""
def cross_validation(y, tx, func, parameters):
    indices = build_k_indices(y, 10)
    accs = np.array([
        cross_validation_step(y, tx, indices, k, func, parameters)
        for k in range(len(indices))
    ])
    return np.mean(accs)

"""
    Performs hyperparameter optimization on a function
"""
def start(func, parameters):
    gammas = lambdas = [0.1, 0.05, 0.01]
    results = []
    
    tX = parameters['tx']
    y = parameters['y']
    
    if 'gamma' not in parameters and 'lambda_' not in parameters:
        w, _ = func(**parameters)
        return (w, '-', '-', cross_validation(y, tX, func, parameters))
    
    if 'gamma' in parameters and 'lambda_' in parameters:
        for gamma in gammas:
            for lambda_ in lambdas:
                parameters['gamma'] = gamma
                parameters['lambda_'] = lambda_
                w, _ = func(**parameters)
                results.append((w, gamma, lambda_, cross_validation(y, tX, func, parameters)))
        return results
    
    if 'gamma' in parameters: 
        for gamma in gammas:
            parameters['gamma'] = gamma
            w, _ = func(**parameters)
            results.append((w, gamma, '-', cross_validation(y, tX, func, parameters)))
        
    if 'lambda_' in parameters: 
        for lambda_ in lambdas:
            parameters['lambda_'] = lambda_
            w, _ = func(**parameters)
            results.append((w, '-', lambda_, cross_validation(y, tX, func, parameters)))
        
    return max(results, key=lambda x:x[3])

def feature_expansion(tX, degree):
    powers = [np.power(tX, deg) for deg in range(1, degree)]
    return np.concatenate(powers, axis=1)

### 2.2 Pipeline

In [8]:
y = normalize_labels(labels)
tX = normalize(data)

In [9]:
result = []
for name, features in feature_subsets.items():
    for method, func in methods.items():
        subset = tX[:, features]
        parameters = get_parameter_values(func, y, subset)
            
        w, gamma, lambda_, acc = start(func, parameters)
        
        print(f"{name} with {method} and gamma:{gamma}, lambda:{lambda_} = accuracy: {acc}")
        
    

All features with least_squares and gamma:-, lambda:- = accuracy: 0.7183999999999998
All features with least_squares_GD and gamma:0.1, lambda:- = accuracy: 0.7144039999999999
All features with least_squares_SGD and gamma:0.01, lambda:- = accuracy: 0.6096520000000001
All features with ridge_regression and gamma:-, lambda:0.01 = accuracy: 0.71634


  exp = np.exp(t)
  return exp / (1 + exp)


All features with logistic_reg_GD and gamma:0.1, lambda:- = accuracy: 0.0


ValueError: too many values to unpack (expected 4)

In [None]:
# 2, acc = 0.7313, 0.7310, 0.7312, 0.7315
# 3, acc = 0.7199, 0.7268, 0.7271, 0.7300
# 4, acc = 0.6921