In [17]:
# Useful starting lines
%matplotlib inline
import numpy as np
%load_ext autoreload
%autoreload 2

#plot library
import seaborn as sns
import matplotlib.pyplot as plt

# Progression bar
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
from helpers import *
from cross_val import *
from preprocessing import *
from plot import *
from run import *

In [19]:
# DEBUG is forwarded as `sub_sample` to load_csv_data for both files
# (presumably loads only a subset of rows for quick runs -- confirm in helpers).
DEBUG = False
# Train / test CSV locations, relative to this notebook.
PATH_TRAIN= '../data/train.csv'
PATH_TEST = '../data/test.csv'
# load_csv_data returns a triple; the names suggest (labels, feature matrix, sample ids).
y, tX, ids = load_csv_data(PATH_TRAIN, sub_sample=DEBUG)
# Same triple for the test file.
y_test, tX_test, ids_test = load_csv_data(PATH_TEST, sub_sample=DEBUG)

## Notebook for parameter scanning for all the methods we have implemented.
Every notebook will be run for one group (indexed by jet_num). \
For ALL methods we try, if the parameter is relevant:
- degree from 1 to 10
- lambda from 1e-8 to 1e1, log-spaced (`np.logspace(-8, 1, 10)`)
- gamma from 1e-8 to 1e1, log-spaced (same grid as lambda)

Each generated PDF will be DENSE in information. I suggest looking at the plots at the end of each parameter scan to quickly find what you need. \
Once we have run them all, I think we'll be happy to have them for the end of the project, to target what to try and what not to try :) \
Gang

In [20]:
# Key of the group to scan: 'group_0', 'group_1', 'group_2' or 'group_3'.
# Used to index the dicts returned by preprocess_data_new in the scan cells.
GROUP = 'group_0'
# Sampling strategy handed to preprocess_data_new: 'under', 'over' or None.
SAMPLING = None
# Stored as the 'offset' entry of every parameter set built in ParameterScan: True or False.
OFFSET = True

## Constant values
# Seed passed to cross_tunning.
SEED = 1
# Number of folds passed to cross_tunning as k_fold.
K_FOLD = 5
# 'max_iters' entry of every parameter set for the iterative methods (GD, SGD, logistic).
MAX_ITERS = 100

In [21]:
def print_best_perfs(idxs, params, acc, std, top_k=5):
    """Print the best-ranked parameter sets with their validation accuracy.

    Args:
        idxs: sequence of indices into `params`, `acc` and `std`; entry 0 is
            reported as rank 1, entry 1 as rank 2, and so on.
        params: sequence of parameter sets; the selected entries are printed as-is.
        acc: validation accuracies, indexed like `params`.
        std: standard deviations of the validation accuracy, indexed like `params`.
        top_k: maximum number of ranks to print (defaults to 5).

    Returns:
        None. Everything is written to stdout.
    """
    # Clamp to the number of candidates: with fewer than `top_k` parameter sets
    # the previous fixed `range(5)` raised an IndexError.
    n_shown = min(top_k, len(idxs))
    for rank in range(n_shown):
        idx = idxs[rank]
        print('Ranked {}'.format(rank + 1))
        print('Parameters\n', params[idx])
        print('With validation accuracy {} std {}'.format(acc[idx], std[idx]))
        print('\n')

---
## In order, the methods to be implemented:
- Least Squares GD
- Least Squares SGD
- Least Squares
- Ridge Regression
- Logistic Regression
- Regularized Logistic Regression
---

In [22]:
def ParameterScan():
    """Cross-validate every implemented method over a parameter grid and plot the best scores.

    Reads the notebook globals `x`, `y` (data of the group being scanned) and
    `GROUP`, `SAMPLING`, `OFFSET`, `K_FOLD`, `SEED`, `MAX_ITERS`; they must be
    set before calling. For each method the five best parameter sets are
    printed, and the best validation accuracy of every method is shown in a
    final comparison plot.

    Grid:
        - degree: 1 to 10 (inclusive)
        - gamma / lambda: 10 log-spaced values from 1e-8 to 1e1

    Returns:
        None. Results are printed and plotted.
    """
    print('Starting parameter scan for: Group {}, sampling strategy {} and offset {}'.format(GROUP, SAMPLING, OFFSET))

    ## Scanning the following parameters
    # np.arange excludes its stop value, so 11 is needed to reach degree 10
    # (the previous stop of 10 silently ended the scan at degree 9).
    degrees = np.arange(1, 11)
    gammas = np.logspace(-8, 1, 10)
    lambdas = gammas

    def _scan_method(title, method, parameters, log):
        """Cross-validate `method` on every parameter set; return the best validation accuracy."""
        print('--------------------------')
        print('\nMethod: {}'.format(title))
        # cross_tunning yields five values; only the validation accuracy, its
        # std and the ranking are needed here (the other two are presumably
        # the training counterparts).
        _, acc_val, _, std_val, idx_sorted = cross_tunning(
            y, x, k_fold=K_FOLD, method=method, parameters=parameters, seed=SEED, log=log)
        print_best_perfs(idx_sorted, parameters, acc_val, std_val)
        return acc_val[idx_sorted[0]]

    ####################################
    parameters_GD = [{'gamma': g, 'degree': d, 'max_iters': MAX_ITERS, 'offset': OFFSET}
                     for d in degrees for g in gammas]
    best_GD = _scan_method('Least squares GD', least_squares_GD, parameters_GD, log=False)

    ####################################
    parameters_SGD = [{'gamma': g, 'degree': d, 'max_iters': MAX_ITERS, 'offset': OFFSET}
                      for d in degrees for g in gammas]
    best_SGD = _scan_method('Least squares SGD', least_squares_SGD, parameters_SGD, log=False)

    ####################################
    parameters_LS = [{'degree': d, 'offset': OFFSET} for d in degrees]
    try:
        best_LS = _scan_method('Least squares', least_squares, parameters_LS, log=False)
    except Exception as err:
        # Best effort: a failure of the unregularized solver must not abort the
        # whole scan. `Exception` (not a bare except) keeps Ctrl-C working and
        # the cause is reported instead of being hidden.
        print('Least squares did not work')
        print('Reason: {!r}'.format(err))
        # Placeholder score so the comparison plot can still be drawn.
        best_LS = .5

    ####################################
    parameters_RR = [{'lambda_': l, 'degree': d, 'offset': OFFSET}
                     for d in degrees for l in lambdas]
    best_RR = _scan_method('Ridge Regression', ridge_regression, parameters_RR, log=False)

    ####################################
    parameters_LR = [{'gamma': g, 'degree': d, 'max_iters': MAX_ITERS, 'offset': OFFSET, 'initial_w': None}
                     for d in degrees for g in gammas]
    best_LR = _scan_method('Logistic Regression', logistic_regression, parameters_LR, log=True)

    ####################################
    parameters_RLR = [{'gamma': g, 'lambda_': l, 'degree': d, 'max_iters': MAX_ITERS, 'offset': OFFSET, 'initial_w': None}
                      for d in degrees for g in gammas for l in lambdas]
    best_RLR = _scan_method('Regularized Logistic Regression', reg_logistic_regression, parameters_RLR, log=True)

    ####################################
    print('--------------------------')
    print('\nPlot of the best methods')
    methods = ['Least square GD', 'Least square SGD', 'Least square', 'Ridge Regression', 'Logistic', 'Regularized Logistic']
    accuracies = [best_GD, best_SGD, best_LS, best_RR, best_LR, best_RLR]

    # relplot returns a FacetGrid; methods go on x and accuracies on y, so the
    # axis labels below follow that order.
    grid = sns.relplot(x=methods, y=accuracies)
    grid.set_xticklabels(rotation=80)
    # The plotted values are cross-validation accuracies, not test accuracies.
    plt.title("Validation Accuracy Model Comparison \n Group {}, sampling {}, offset {}".format(GROUP, SAMPLING, OFFSET))
    plt.xlabel('Method')
    plt.ylabel('Validation Accuracy')
    plt.show()

---
# Parameter scan for Group_0, offset True, sampling None

In [None]:
# Key of the group to scan: 'group_0', 'group_1', 'group_2' or 'group_3'
GROUP = 'group_0'
# Can be 'under', 'over' or None
SAMPLING = None
# Can be True or False
OFFSET = True

# ParameterScan() reads the globals `x` and `y`, so `y` is overwritten below with
# this group's labels. Keep the full label vector under its own name; otherwise
# re-running this cell (or running another scan cell afterwards) would pair the
# full `tX` with already-subsetted labels. Refresh the copy whenever `y` still
# lines up with `tX`, i.e. right after the data has been (re)loaded.
if 'y_full' not in globals() or len(y) == len(tX):
    y_full = y

preprocessed_X, _, preprocessed_y, _ = preprocess_data_new(tX, tX_test, y_full, sampling_strategy=SAMPLING)
x, y = preprocessed_X[GROUP], preprocessed_y[GROUP]

ParameterScan()

Starting parameter scan for: Group group_0, sampling strategy None and offset True
--------------------------

Method: Least squares GD
Ranked 1
Parameters
 {'gamma': 0.1, 'degree': 2, 'max_iters': 100, 'offset': True}
With validation accuracy 0.827780578804074 std 0.0026614841885769674


Ranked 2
Parameters
 {'gamma': 0.1, 'degree': 3, 'max_iters': 100, 'offset': True}
With validation accuracy 0.8264999495815267 std 0.004192974362798449


Ranked 3
Parameters
 {'gamma': 0.1, 'degree': 6, 'max_iters': 100, 'offset': True}
With validation accuracy 0.824765554099022 std 0.00536113166650552


Ranked 4
Parameters
 {'gamma': 0.1, 'degree': 4, 'max_iters': 100, 'offset': True}
With validation accuracy 0.8244932943430474 std 0.004283256290052307


Ranked 5
Parameters
 {'gamma': 0.1, 'degree': 5, 'max_iters': 100, 'offset': True}
With validation accuracy 0.8234647574871433 std 0.003934527927705662


--------------------------

Method: Least squares SGD


---
# Parameter scan for Group_0, offset True, sampling 'under'

In [None]:
# Key of the group to scan: 'group_0', 'group_1', 'group_2' or 'group_3'
GROUP = 'group_0'
# Can be 'under', 'over' or None
SAMPLING = 'under'
# Can be True or False
OFFSET = True

# ParameterScan() reads the globals `x` and `y`, so `y` is overwritten below with
# this group's labels. Keep the full label vector under its own name; otherwise
# re-running this cell (or running another scan cell afterwards) would pair the
# full `tX` with already-subsetted labels. Refresh the copy whenever `y` still
# lines up with `tX`, i.e. right after the data has been (re)loaded.
if 'y_full' not in globals() or len(y) == len(tX):
    y_full = y

preprocessed_X, _, preprocessed_y, _ = preprocess_data_new(tX, tX_test, y_full, sampling_strategy=SAMPLING)
x, y = preprocessed_X[GROUP], preprocessed_y[GROUP]

ParameterScan()

---
# Parameter scan for Group_0, offset True, sampling 'over'

In [None]:
# Key of the group to scan: 'group_0', 'group_1', 'group_2' or 'group_3'
GROUP = 'group_0'
# Can be 'under', 'over' or None
SAMPLING = 'over'
# Can be True or False
OFFSET = True

# ParameterScan() reads the globals `x` and `y`, so `y` is overwritten below with
# this group's labels. Keep the full label vector under its own name; otherwise
# re-running this cell (or running another scan cell afterwards) would pair the
# full `tX` with already-subsetted labels. Refresh the copy whenever `y` still
# lines up with `tX`, i.e. right after the data has been (re)loaded.
if 'y_full' not in globals() or len(y) == len(tX):
    y_full = y

preprocessed_X, _, preprocessed_y, _ = preprocess_data_new(tX, tX_test, y_full, sampling_strategy=SAMPLING)
x, y = preprocessed_X[GROUP], preprocessed_y[GROUP]

ParameterScan()

---
# Parameter scan for Group_0, offset False, sampling None

In [None]:
# Key of the group to scan: 'group_0', 'group_1', 'group_2' or 'group_3'
GROUP = 'group_0'
# Can be 'under', 'over' or None
SAMPLING = None
# Can be True or False
OFFSET = False

# ParameterScan() reads the globals `x` and `y`, so `y` is overwritten below with
# this group's labels. Keep the full label vector under its own name; otherwise
# re-running this cell (or running another scan cell afterwards) would pair the
# full `tX` with already-subsetted labels. Refresh the copy whenever `y` still
# lines up with `tX`, i.e. right after the data has been (re)loaded.
if 'y_full' not in globals() or len(y) == len(tX):
    y_full = y

preprocessed_X, _, preprocessed_y, _ = preprocess_data_new(tX, tX_test, y_full, sampling_strategy=SAMPLING)
x, y = preprocessed_X[GROUP], preprocessed_y[GROUP]

ParameterScan()

---
# Parameter scan for Group_0, offset False, sampling 'under'

In [None]:
# Key of the group to scan: 'group_0', 'group_1', 'group_2' or 'group_3'
GROUP = 'group_0'
# Can be 'under', 'over' or None
SAMPLING = 'under'
# Can be True or False
OFFSET = False

# ParameterScan() reads the globals `x` and `y`, so `y` is overwritten below with
# this group's labels. Keep the full label vector under its own name; otherwise
# re-running this cell (or running another scan cell afterwards) would pair the
# full `tX` with already-subsetted labels. Refresh the copy whenever `y` still
# lines up with `tX`, i.e. right after the data has been (re)loaded.
if 'y_full' not in globals() or len(y) == len(tX):
    y_full = y

preprocessed_X, _, preprocessed_y, _ = preprocess_data_new(tX, tX_test, y_full, sampling_strategy=SAMPLING)
x, y = preprocessed_X[GROUP], preprocessed_y[GROUP]

ParameterScan()

---
# Parameter scan for Group_0, offset False, sampling 'over'

In [None]:
# Key of the group to scan: 'group_0', 'group_1', 'group_2' or 'group_3'
GROUP = 'group_0'
# Can be 'under', 'over' or None
SAMPLING = 'over'
# Can be True or False
OFFSET = False

# ParameterScan() reads the globals `x` and `y`, so `y` is overwritten below with
# this group's labels. Keep the full label vector under its own name; otherwise
# re-running this cell (or running another scan cell afterwards) would pair the
# full `tX` with already-subsetted labels. Refresh the copy whenever `y` still
# lines up with `tX`, i.e. right after the data has been (re)loaded.
if 'y_full' not in globals() or len(y) == len(tX):
    y_full = y

preprocessed_X, _, preprocessed_y, _ = preprocess_data_new(tX, tX_test, y_full, sampling_strategy=SAMPLING)
x, y = preprocessed_X[GROUP], preprocessed_y[GROUP]

ParameterScan()