# Run.ipynb
### This notebook lays out the extended pipeline, including every branch of the decision tree, from which the path used in run.py will be extracted. Preliminaries (Chapter 1) include:
1. Loading in the data
2. Creating the feature subsets
3. Laying out the methods

### After this, the pipeline (Chapter 2) can be run, testing the different combinations of feature subsets, methods, and hyperparameters. The results are then visualized in Chapter 3.

## Chapter 1

### 1.0 Imports and Constants

In [None]:
import numpy as np
import inspect

from validation import *
from proj1_helpers import *
from implementations import *

# Paths to the train and test CSV files
DATA_TRAIN_PATH = "../data/train.csv"
DATA_TEST_PATH = "../data/test.csv"

### 1.1 Loading the data

In [None]:
# Load the training CSV. load_csv_data returns three values; train_y is later
# passed to the methods as `y` and train_tX is used as the 2-D feature matrix.
# NOTE(review): train_ids is presumably the per-sample id column -- it is not
# used anywhere in the visible notebook; confirm against proj1_helpers.
train_y, train_tX, train_ids = load_csv_data(DATA_TRAIN_PATH)

### 1.2 Feature subsets
#### Including:
- All the features as is (naive)
- Merging highly correlated features (correlation threshold t = 0.96)

#### Mixed with:
- Categorical feature extraction
- Principal Component Analysis features

In [None]:
# Candidate feature matrices to run the pipeline on, keyed by the label that
# is printed next to each result.
feature_subsets = {
    # The training matrix exactly as loaded.
    "All features": train_tX,
    # The training matrix with the listed column indices removed.
    "Without highly correlated features": np.delete(
        train_tX, [24, 25, 6, 12, 26, 27, 28, 29], axis=1
    ),
}

### 1.3 Methods

In [None]:
# Training routines to compare, keyed by the name printed in the results.
# Each one is called with keyword arguments and must return a 2-tuple whose
# first element is the weight vector.
methods = {
    'least_squares': least_squares,
    'least_squares_GD': least_squares_GD,
    'least_squares_SGD': least_squares_SGD,
    'ridge_regression': ridge_regression,
    'logistic_reg_GD': logistic_reg_GD,
    'logistic_reg_newton': logistic_reg_newton,
}

## Chapter 2

### 2.1 Helper-functions

In [None]:
def get_parameter_values(func, tX):
    """Build the keyword arguments for ``func`` from its signature.

    Each parameter name of ``func`` is looked up in a fixed table of known
    arguments:

    * ``y``         -- the global ``train_y``
    * ``tx``        -- the ``tX`` passed in
    * ``initial_w`` -- a zero vector with one entry per column of ``tX``
    * ``max_iters`` -- 500
    * ``gamma``     -- grid of 10 log-spaced candidates from 1e-9 to 1e-7
    * ``lambda_``   -- list of candidates from 0.1 down to 0.00001

    Args:
        func: the callable whose signature is inspected.
        tX: 2-D feature matrix handed to ``func`` as ``tx``.

    Returns:
        dict mapping parameter name to value, suitable for ``func(**result)``.
        Parameters that are not in the table but declare their own default
        are left out so ``func`` falls back to that default.

    Raises:
        KeyError: if ``func`` has a required parameter that is not in the
            table.
    """
    # Factories instead of ready-made values: only what `func` actually asks
    # for is built, and `train_y` is only looked up when `func` takes a `y`.
    factories = {
        'y':         lambda: train_y,
        'tx':        lambda: tX,
        'initial_w': lambda: np.zeros(tX.shape[1]),
        'max_iters': lambda: 500,
        'gamma':     lambda: np.logspace(-9, -7, num=10),
        'lambda_':   lambda: [0.1, 0.01, 0.001, 0.0001, 0.00001],
    }

    values = {}
    for name, param in inspect.signature(func).parameters.items():
        if name in factories:
            values[name] = factories[name]()
        elif param.default is inspect.Parameter.empty:
            # A required argument we have no value for: fail loudly and say
            # which function and which parameter is the problem.
            func_name = getattr(func, '__name__', repr(func))
            raise KeyError(
                f"no value known for required parameter '{name}' of {func_name}"
            )
        # Otherwise the parameter has its own default; let func use it.

    return values

def pred_acc(y, tx, w):
    """Return the fraction of samples whose predicted label equals ``y``.

    Each sample is scored as ``tx @ w``. A score greater than 0 is predicted
    as 1, a score of 0 or less as -1.

    Args:
        y: true labels (-1 or 1), one per row of ``tx``.
        tx: feature matrix, one row per sample.
        w: weight vector; a single-column matrix is accepted as well.

    Returns:
        The accuracy as a float between 0 and 1.

    Raises:
        ZeroDivisionError: if ``y`` is empty.
    """
    # squeeze so that a column-vector w yields a flat vector of scores
    y_pred = np.squeeze(tx @ w)
    # Threshold in place. NaN scores match neither mask, stay NaN and are
    # therefore counted as misclassified.
    y_pred[y_pred <= 0] = -1
    y_pred[y_pred > 0] = 1
    return np.count_nonzero(y == y_pred) / len(y)

### 2.2 Pipeline

In [None]:
# Run every method on every feature subset and print its accuracy.
for data_set, tX in feature_subsets.items():
    for method, func in methods.items():
        # Keyword arguments matching func's signature; 'gamma' and 'lambda_'
        # (when present) hold the candidate values for the search.
        parameters = get_parameter_values(func, tX)

        # Placeholder until the hyperparameter optimization is written: pin
        # each hyperparameter to one value. gamma and lambda_ are assumed to
        # be independent.
        if 'gamma' in parameters:
            # 1e-8, not `10^-8`: in Python `^` is bitwise XOR, so the old
            # expression evaluated to the integer -14.
            parameters['gamma'] = 1e-8  # change

        if 'lambda_' in parameters:
            parameters['lambda_'] = 0.0001  # change

        w, _ = func(**parameters)

        # NOTE(review): test_y and test are not defined in any visible cell of
        # this notebook -- confirm where they come from. The features must
        # also have the same columns as tX for the current data_set.
        accuracy = pred_acc(test_y, test, w)
        print(f"{data_set} using {method} {accuracy}")
        