# Run.ipynb
### This notebook lays out the extended pipeline, covering every branch of the decision tree from which the run.py path will be extracted. Preliminaries (Chapter 1) include:
1. Loading in the data
2. Creating the feature subsets
3. Laying out the methods

### After this, the pipeline (Chapter 2) can be run, testing the different combinations of feature subsets, methods, and hyperparameters. The results are then visualized in Chapter 3.

## Chapter 1

### 1.0 Imports and Constants

In [13]:
import numpy as np
import inspect

from validation import *
from proj1_helpers import *
from implementations import *

# CSV files holding the training and test sets, given relative to this notebook
DATA_TRAIN_PATH = '../data/train.csv'
DATA_TEST_PATH = '../data/test.csv'

### 1.1 Loading the data

In [9]:
# Read the training CSV. The three returned values are unpacked as y, tX and ids —
# presumably labels, feature matrix and sample ids; confirm against load_csv_data.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

### 1.2 Feature subsets
#### Including:
- All the features as is (naive)
- Merging highly (t = 0.96) correlated features

#### Mixed with:
- Categorical feature extraction
- Principal Component Analysis features

In [3]:
# Column indices of every candidate feature subset, keyed by a descriptive name
feature_subsets = {
    # All the features: columns 0 through 29
    "All features": list(range(30)),
    # Without highly correlated features: columns 0 through 23, leaving out 6 and 12
    "Without correlated features": [col for col in range(24) if col not in (6, 12)],
}

### 1.3 Methods

In [4]:
# Lookup table from a method's display name to the function implementing it
methods = {
    'least_squares': least_squares,
    'least_squares_GD': least_squares_GD,
    'least_squares_SGD': least_squares_SGD,
    'ridge_regression': ridge_regression,
    'logistic_reg_GD': logistic_reg_GD,
    'logistic_reg_newton': logistic_reg_newton,
    'penalized_logistic_reg_GD': penalized_logistic_reg_GD,
}

## Chapter 2

### 2.1 Helper-functions

In [5]:
def get_parameter_values(func, y, tX):
    """
    Build the keyword arguments for `func` from the names in its signature.

    Each parameter name that has a known default is mapped to that default;
    only the parameters `func` actually declares end up in the result.

    Parameters:
        func: the method whose signature is inspected
        y:    value passed for a parameter named 'y'
        tX:   value passed for a parameter named 'tx'; its second dimension
              sets the length of the all-zero 'initial_w'

    Returns:
        dict mapping parameter name -> value, ready for `func(**values)`

    Raises:
        KeyError: if `func` has a required parameter with no known default
    """
    # Scalar starting values only: the hyperparameter grid itself lives in start()
    defaults = {
        'y'         : y,
        'tx'        : tX,
        'initial_w' : np.zeros(tX.shape[1]),
        'max_iters' : 500,
        'gamma'     : 10 ** -4,
        'lambda_'   : 0.0001,
    }

    values = {}
    for name, param in inspect.signature(func).parameters.items():
        if name in defaults:
            values[name] = defaults[name]
        elif param.kind in (param.VAR_POSITIONAL, param.VAR_KEYWORD):
            # *args / **kwargs need no value
            continue
        elif param.default is param.empty:
            # A required parameter we cannot fill would only fail later inside func
            raise KeyError(
                f"No default value known for required parameter '{name}' "
                f"of {getattr(func, '__name__', func)}"
            )
        # Unknown parameters that carry their own default are left to func

    return values

def pred_acc(y, tX, w):
    """
    Returns the prediction accuracy of the linear model `w` on (tX, y).

    A sample is predicted as 1 when its score `tX @ w` is strictly positive
    and as 0 otherwise, so `y` is expected to hold labels in {0, 1}.

    Parameters:
        y:  labels, one per row of tX (any shape that flattens to n values)
        tX: data matrix
        w:  weight vector (flat or column-shaped)

    Returns:
        fraction of samples whose predicted label equals y
    """
    # Flatten both sides: comparing (n, 1) labels with (n,) predictions would
    # otherwise broadcast to an n x n matrix, and a single-sample score would
    # collapse to a 0-d array
    scores = np.ravel(tX @ w)
    y_pred = (scores > 0).astype(float)

    # Vectorised mean instead of the built-in sum, which loops in Python
    return np.mean(np.ravel(y) == y_pred)

def normalize(tX):
    """
    Performs z-score normalization on the data tX, column by column.

    Parameters:
        tX: data matrix; statistics are taken along axis 0

    Returns:
        array of the same shape with every column shifted to mean 0 and
        scaled to standard deviation 1; constant columns become all zeros
    """
    mean = np.mean(tX, axis=0)
    std = np.std(tX, axis=0)

    # A constant column has std 0; dividing by it would fill the column with
    # NaN/inf, so such columns are divided by 1 instead
    safe_std = np.where(std == 0, 1.0, std)

    return (tX - mean) / safe_std

def normalize_labels(y):
    """
    Map labels from {-1, 1} onto {0, 1}.

    The labels are shifted up by one, halved, and rounded to the nearest integer.
    """
    shifted = y + 1
    halved = shifted / 2
    return np.round(halved)

def start(func, parameters):
    """
    Performs hyperparameter optimization on a function.

    Grid-searches `gamma` and `lambda_` over a fixed list of candidates, but
    only for the hyperparameters that are present in `parameters`; a method
    with neither is simply run once.

    Parameters:
        func:       method to call as `func(**parameters)`; it must return a
                    pair whose first element is the weight vector
        parameters: keyword arguments for `func`. Entries 'y' and 'tx', when
                    present, are also the data the accuracy is measured on.
                    The dict itself is not modified.

    Returns:
        (w, accuracy) of the best-scoring grid point
    """
    candidates = [0.1, 0.01, 0.001, 0.0001, 0.00001]

    # A hyperparameter the method does not take contributes one placeholder
    # grid point, so the loops below still run
    gammas = candidates if 'gamma' in parameters else [None]
    lambdas = candidates if 'lambda_' in parameters else [None]

    # Score on exactly the data the method was fitted on (e.g. a feature
    # subset); fall back to the notebook-level arrays only if not supplied
    labels = parameters['y'] if 'y' in parameters else y
    features = parameters['tx'] if 'tx' in parameters else tX

    results = []

    # Gridsearch
    for gamma in gammas:
        for lambda_ in lambdas:
            # Work on a copy so the caller's dict keeps its original values
            trial = dict(parameters)
            if gamma is not None:
                trial['gamma'] = gamma
            if lambda_ is not None:
                trial['lambda_'] = lambda_

            w, _ = func(**trial)

            # TODO: replace this in-sample accuracy with k-fold cross validation
            # (e.g. via build_k_indices)
            results.append((w, pred_acc(labels, features, w)))

    return max(results, key=lambda entry: entry[1])

### 2.2 Pipeline

In [14]:
# The result: a list of tuples, [(feature subset name, method, acc, w), ...]
result = []
# Labels are mapped to {0, 1} and features are z-scored once, before any method is run
y = normalize_labels(y)
tX = normalize(tX)

# Run every method on every feature subset and record the best outcome of each
for name, features in feature_subsets.items():
    for method, func in methods.items():
        parameters = get_parameter_values(func, y, tX[:,features])

        w, acc = start(func, parameters)
        result.append((name, method, acc, w))

        print(f"{name} with {method} - accuracy: {acc}")

All features with least_squares - 
