In [478]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime
from helpers import *
from features_engineering import *
from cross_validation import *
from pre_processing import *
from group_by import *
from implementation import *
from knn import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading the dataset

In [479]:
y, x, ids = load_csv_data(data_path="datas/train.csv", sub_sample=False)

In [480]:
submission_y, submission_x, submission_ids = load_csv_data(data_path="datas/test.csv", sub_sample=False)

# Create sub dataset 
- Group by numjet column (categorical data : (0, 1, 2, 3))
- Group by the NaN columns

We obtain at the end 8 datasets, one for each numjet and for each of these, 2 according to the NaN columns removed.

In [481]:
sub_jet_num_x_dict, sub_jet_num_y_dict, sub_jet_num_ids_dict = group_by_jetnum_NaN(submission_x, submission_y, submission_ids)

0 (227458, 30) (227458,) (227458,)
1 (175338, 30) (175338,) (175338,)
2 (114648, 30) (114648,) (114648,)
3 (50794, 30) (50794,) (50794,)
num_jet: 0
(0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28) (59263, 19) (59263, 1) (59263, 1)
(4, 5, 6, 12, 23, 24, 25, 26, 27, 28) (168195, 20) (168195, 1) (168195, 1)
	Remove col : 
	 (0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 17 [ 0.  0.  0. ...,  0.  0.  0.]
	 (0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 18 [ 0.  0.  0. ...,  0.  0.  0.]
	 (4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 18 [ 0.  0.  0. ...,  0.  0.  0.]
	 (4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 19 [ 0.  0.  0. ...,  0.  0.  0.]
num_jet: 1
(4, 5, 6, 12, 26, 27, 28) (158095, 23) (158095, 1) (158095, 1)
(0, 4, 5, 6, 12, 26, 27, 28) (17243, 22) (17243, 1) (17243, 1)
	Remove col : 
	 (4, 5, 6, 12, 26, 27, 28) 18 [ 1.  1.  1. ...,  1.  1.  1.]
	 (0, 4, 5, 6, 12, 26, 27, 28) 17 [ 1.  1.  1. ...,  1.  1.  1.]
num_jet: 2
() (107905, 30) (107905, 1) (107905, 1)
(0,) (6743, 29) (6743, 1) (6743, 1)
	Remove col : 
	 ()

In [482]:
jet_num_x_dict, jet_num_y_dict, jet_num_ids_dict = group_by_jetnum_NaN(x, y, ids)

0 (99913, 30) (99913,) (99913,)
1 (77544, 30) (77544,) (77544,)
2 (50379, 30) (50379,) (50379,)
3 (22164, 30) (22164,) (22164,)
num_jet: 0
(4, 5, 6, 12, 23, 24, 25, 26, 27, 28) (73790, 20) (73790, 1) (73790, 1)
(0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28) (26123, 19) (26123, 1) (26123, 1)
	Remove col : 
	 (4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 18 [ 0.  0.  0. ...,  0.  0.  0.]
	 (4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 19 [ 0.  0.  0. ...,  0.  0.  0.]
	 (0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 17 [ 0.  0.  0. ...,  0.  0.  0.]
	 (0, 4, 5, 6, 12, 23, 24, 25, 26, 27, 28) 18 [ 0.  0.  0. ...,  0.  0.  0.]
num_jet: 1
(4, 5, 6, 12, 26, 27, 28) (69982, 23) (69982, 1) (69982, 1)
(0, 4, 5, 6, 12, 26, 27, 28) (7562, 22) (7562, 1) (7562, 1)
	Remove col : 
	 (4, 5, 6, 12, 26, 27, 28) 18 [ 1.  1.  1. ...,  1.  1.  1.]
	 (0, 4, 5, 6, 12, 26, 27, 28) 17 [ 1.  1.  1. ...,  1.  1.  1.]
num_jet: 2
() (47427, 30) (47427, 1) (47427, 1)
(0,) (2952, 29) (2952, 1) (2952, 1)
	Remove col : 
	 () 22 [ 2.  2.  2. ...,

# Build the best model for each of the sub dataset

In [483]:
def get_false(x, y, w, predict_threshold):
    """Get the ratio of negative predictions over wrong predictions"""
    
    # Get the predicted values
    pred_y = predict_labels(w, x, predict_threshold)
    # Initialize at 0
    false_count = 0
    count_negatif = 0
    
    # If prediction is wrong, add 1, if prediction is wrong and negative, add 1
    for index, yi in enumerate(y):
        pred_yi = pred_y[index]
        if pred_yi != yi:
            false_count += 1
            if pred_yi == -1:
                count_negatif += 1
                
    # Calculate which percentage of wrong predictions are due to negative value
    return count_negatif / false_count



def get_data_numjet(jet_num_x_dict, jet_num_y_dict, jet_num_ids_dict, numjet, index):
    # Get the column number of the features that wil be removed
    removed_col_key = list(jet_num_x_dict[numjet])[index]
    # Get the samples of the category numjet of PRI_num_jet and removed data
    x = jet_num_x_dict[numjet][removed_col_key]
    y = jet_num_y_dict[numjet][removed_col_key]
    ids = jet_num_ids_dict[numjet][removed_col_key]
    return x, y, ids

def build_features(x, numjet, index):
    """
    Calculate different features depending on the data (category of PRI_num_jet and nan or not)
    Which features are used has been done with trial and error to improve the loss
    1. Normalize data
    2. Build combinations
    """
    if numjet == 0 and index == 0:
        polynomial_x = normalize(x)
        polynomial_x = build_polynomial(polynomial_x, 3)
        polynomial_x = build_combinations_lvl(polynomial_x, 2, 8)
        polynomial_x = build_combinations_lvl(polynomial_x, 3, 8)
        polynomial_x = build_combinations_lvl(polynomial_x, 4, 8)
        polynomial_x = build_combinations_lvl(polynomial_x, 5, 8)
        polynomial_x = build_combinations_lvl(polynomial_x, 6, 8)
        polynomial_x = build_combinations_lvl(polynomial_x, 7, 8)
    elif numjet == 0 and index == 1:
        x_numjet0_index1 = normalize(x)
        polynomial_x = x_numjet0_index1
        polynomial_x = np.concatenate((polynomial_x, np.tanh(x_numjet0_index1)), axis=1)
        polynomial_x = np.concatenate((polynomial_x, np.sqrt(np.abs(x_numjet0_index1))), axis=1)
        polynomial_x = np.concatenate((polynomial_x, np.power(x_numjet0_index1, 2)), axis=1)
        polynomial_x = np.concatenate((polynomial_x, np.power(np.tanh(x_numjet0_index1), 2)), axis=1)
        polynomial_x = np.concatenate((polynomial_x, np.power(np.log(np.abs(x_numjet0_index1)), 2)), axis=1)
    elif numjet == 1 and index == 1:
        polynomial_x = normalize(x)
        polynomial_x = build_polynomial(polynomial_x, 3)
        polynomial_x = build_combinations_lvl(polynomial_x, 2, 10)
        polynomial_x = build_combinations_lvl(polynomial_x, 3, 10)
        polynomial_x = build_combinations_lvl(polynomial_x, 4, 10)
        polynomial_x = build_combinations_lvl(polynomial_x, 5, 10)
    elif numjet == 2 and index == 0:
        polynomial_x = normalize(x)
        polynomial_x = build_polynomial(polynomial_x, 3)
        polynomial_x = build_combinations_lvl(polynomial_x, 2, 10)
        polynomial_x = build_combinations_lvl(polynomial_x, 3, 10)
        polynomial_x = build_combinations_lvl(polynomial_x, 4, 10)
    elif numjet == 2 and index == 1:
        polynomial_x = normalize(x)
        polynomial_x = build_polynomial(polynomial_x, 2)
        polynomial_x = build_combinations_lvl(polynomial_x, 2, 10)
        polynomial_x = build_combinations_lvl(polynomial_x, 3, 10)
        polynomial_x = build_combinations_lvl(polynomial_x, 4, 10)
        polynomial_x = build_combinations_lvl(polynomial_x, 5, 10)
    elif numjet == 3 and index == 0:
        polynomial_x = normalize(x)
        polynomial_x = build_polynomial(polynomial_x, 5)
        polynomial_x = build_combinations_lvl(polynomial_x, 2, 8)
        polynomial_x = build_combinations_lvl(polynomial_x, 3, 8)
        polynomial_x = build_combinations_lvl(polynomial_x, 4, 8)
        polynomial_x = build_combinations_lvl(polynomial_x, 5, 8)
        polynomial_x = build_combinations_lvl(polynomial_x, 6, 8)
    elif numjet == 3 and index == 1:
        polynomial_x = normalize(x)
        polynomial_x = build_polynomial(polynomial_x, 6)
        polynomial_x = build_combinations_lvl(polynomial_x, 2, 10)
        polynomial_x = build_combinations_lvl(polynomial_x, 3, 10)
        polynomial_x = build_combinations_lvl(polynomial_x, 4, 10)
    else:
        polynomial_x = normalize(x)
        polynomial_x = build_polynomial(polynomial_x, 3)
        polynomial_x = build_combinations_lvl(polynomial_x, 2, 10)
        polynomial_x = build_combinations_lvl(polynomial_x, 3, 10)
        polynomial_x = build_combinations_lvl(polynomial_x, 4, 10)

    return polynomial_x

def build_best_model(x_, y_, numjet, index):
    """
    Build the best model with the best parameters
    """
    
    # Initialize k_fold and prediction threshold and build features
    k = 5
    predict_threshold = 0
    polynomial_x = build_features(x_, numjet, index)

    # Use the best lambda for best result
    if numjet == 0 and index == 0:
        lambda_ = 4.52035365636e-07
    elif numjet == 0 and index == 1:
        lambda_ = 1e-08
    elif numjet == 1 and index == 1:
        lambda_ = 0.137382379588
    elif numjet == 2 and index == 0:
        lambda_ = 2.39502661999e-07
    elif numjet == 2 and index == 1:
        lambda_ = 0.0417531893656
        predict_threshold = -0.0323232323232
    elif numjet == 3 and index == 0:
        lambda_ = 7.27895384398e-05
    elif numjet == 3 and index == 1:
        lambda_ = 0.529831690628
    else:
        lambda_ = 0.000001


    #Gest the accuracy of test and train using k_fold_corss_validation
    accuracy_train_k, accuracy_test_k = k_fold_cross_validation(y_, polynomial_x, k, lambda_, predict_threshold)
    # Find optimal weights and loss with ridge regression
    w, loss = ridge_regression(y_, polynomial_x, lambda_)

    
    print("\t Predicted -1 but was 1 :", get_false(polynomial_x, y_, w, predict_threshold))

    
    return w, predict_threshold, accuracy_train_k, accuracy_test_k

    

In [36]:
"""
Initialize variables to submit data, this includes the id.
It is important as the data will be separated depending on its features and category
"""
count = 0

accuracy_train = 0
accuracy_test = 0

submission_ids = []
submission_y = []

result_y = []
result_ids = []

# For each category in PRI_num_jet and if they have or not NA
for numjet in range(0, 4):
    for index in range(0, 2):
        # Get the x, y and ID
        x_, y_, ids_ = get_data_numjet(jet_num_x_dict, jet_num_y_dict, jet_num_ids_dict, numjet, index)
        
        # Get the optimal weights and accuracy
        w, predict_threshold, accuracy_train_k, accuracy_test_k = build_best_model(x_, y_, numjet, index)
        
        # Get the number of elements in that category
        number_of_el = len(y_)

        # Add the accuracy in proportion to the number of elements (max 1 if all elements in 1 category)
        accuracy_train += accuracy_train_k * number_of_el
        accuracy_test += accuracy_test_k * number_of_el
        
        # PRint training and testing accuracy
        print(numjet, index, "Train Accuracy: " + str(accuracy_train_k))
        print(numjet, index, "Test Accuracy: " + str(accuracy_test_k))
        
        # Count the number of elements
        count += number_of_el
  
        # Predict local
        removed_col_key = list(jet_num_x_dict[numjet])[index]
        sub_x2 = jet_num_x_dict[numjet][removed_col_key]
        sub_ids2 = jet_num_ids_dict[numjet][removed_col_key]

        sub_x2 = build_features(sub_x2, numjet, index)
        pred_y2 = predict_labels(w, sub_x2, predict_threshold)
        
        for sub_index, sub_id in enumerate(sub_ids2):
            result_ids.append(sub_id)
            result_y.append(pred_y2[sub_index])
        
        
        # Predict submission
        removed_col_key = list(jet_num_x_dict[numjet])[index]
        sub_x = sub_jet_num_x_dict[numjet][removed_col_key]
        sub_ids = sub_jet_num_ids_dict[numjet][removed_col_key]
        
        sub_x = build_features(sub_x, numjet, index)
        pred_y = predict_labels(w, sub_x, predict_threshold)
        for sub_index, sub_id in enumerate(sub_ids):
            submission_ids.append(sub_id)
            submission_y.append(pred_y[sub_index])
        
print("Count:", count)
print("Train Accuracy: " + str(accuracy_train / count))
print("Test Accuracy: " + str(accuracy_test / count))

2017-10-26 07:55:26.607268 combinations 2 : 0 / 28
2017-10-26 07:55:28.961267 combinations 3 : 0 / 56
2017-10-26 07:55:33.815268 combinations 3 : 50 / 56
2017-10-26 07:55:34.533263 combinations 4 : 0 / 70
2017-10-26 07:55:40.687307 combinations 4 : 50 / 70
2017-10-26 07:55:43.410314 combinations 5 : 0 / 56
2017-10-26 07:55:51.989276 combinations 5 : 50 / 56
2017-10-26 07:55:53.190269 combinations 6 : 0 / 28
2017-10-26 07:55:58.490264 combinations 7 : 0 / 8
	 Predicted -1 but was 1 : 0.634313005143277
0 0 Train Accuracy: 0.815689795365
0 0 Test Accuracy: 0.814351538149
2017-10-26 07:56:19.637266 combinations 2 : 0 / 28
2017-10-26 07:56:22.038264 combinations 3 : 0 / 56
2017-10-26 07:56:26.855265 combinations 3 : 50 / 56
2017-10-26 07:56:27.559264 combinations 4 : 0 / 70
2017-10-26 07:56:33.137265 combinations 4 : 50 / 70
2017-10-26 07:56:35.762263 combinations 5 : 0 / 56
2017-10-26 07:56:42.943262 combinations 5 : 50 / 56
2017-10-26 07:56:43.907268 combinations 6 : 0 / 28
2017-10-26 07:

2017-10-26 08:08:28.662991 combinations 3 : 50 / 120
2017-10-26 08:08:28.917017 combinations 3 : 100 / 120
2017-10-26 08:08:29.014027 combinations 4 : 0 / 210
2017-10-26 08:08:29.283025 combinations 4 : 50 / 210
2017-10-26 08:08:29.600027 combinations 4 : 100 / 210
2017-10-26 08:08:29.955028 combinations 4 : 150 / 210
2017-10-26 08:08:30.336995 combinations 4 : 200 / 210
2017-10-26 08:08:30.415989 combinations 5 : 0 / 252
2017-10-26 08:08:30.883992 combinations 5 : 50 / 252
2017-10-26 08:08:31.589993 combinations 5 : 100 / 252
2017-10-26 08:08:32.219991 combinations 5 : 150 / 252
2017-10-26 08:08:32.788988 combinations 5 : 200 / 252
2017-10-26 08:08:33.393025 combinations 5 : 250 / 252
2017-10-26 08:08:33.581989 combinations 2 : 0 / 45
2017-10-26 08:08:33.925989 combinations 3 : 0 / 120
2017-10-26 08:08:34.371992 combinations 3 : 50 / 120
2017-10-26 08:08:34.918992 combinations 3 : 100 / 120
2017-10-26 08:08:35.159025 combinations 4 : 0 / 210
2017-10-26 08:08:35.858025 combinations 4 :

In [37]:
# Get total accuracy in our train set
def get_accuracy_ids(result_y, result_ids, y, ids):
    stacked = np.column_stack((ids, y))
    stacked = stacked[stacked[:,0].argsort()]
    stacked_pred = np.column_stack((result_ids, result_y))
    stacked_pred = stacked_pred[stacked_pred[:,0].argsort()]
    
    print(len(stacked_pred), len(stacked))
    unique, counts = np.unique((stacked == stacked_pred)[:, 1], return_counts=True)
    return dict(zip(unique, counts))[True] / len(y)

get_accuracy_ids(result_y, result_ids, y, ids)

250000 250000


0.84057999999999999

In [38]:
# Create submission csv file
submission_stacked = np.column_stack((submission_ids, submission_y))
submission_stacked = submission_stacked[submission_stacked[:,0].argsort()]
create_csv_submission(submission_stacked[:,0], submission_stacked[:,1], "datas/submission.csv")
print('Submission file created !')

Submission file created !


# Find the best model for specific classifier

In [33]:
k = 5

# Manually choose the categories
x_, y_, ids_ = get_data_numjet(jet_num_x_dict, jet_num_y_dict, jet_num_ids_dict, 3, 0)

def build_polynomial2(x, max_degree):
    polynomial_x = x
    # Create new features with the tanh of the original data
    polynomial_x = np.concatenate((polynomial_x, np.tanh(x)), axis=1)
    # Create new features with the ln of the original data
    polynomial_x = np.concatenate((polynomial_x, np.log(np.abs(x))), axis=1)
    # Create new features with the square root of the original data
    polynomial_x = np.concatenate((polynomial_x, np.sqrt(np.abs(x))), axis=1)
    
    # Create polynomials of max_degree of the new data
    for degree in range(2, max_degree + 1):
        polynomial_x = np.concatenate((polynomial_x, np.power(x, degree)), axis=1)
        polynomial_x = np.concatenate((polynomial_x, np.power(np.tanh(x), degree)), axis=1)
        polynomial_x = np.concatenate((polynomial_x, np.power(np.log(np.abs(x)), degree)), axis=1)

    return polynomial_x

# Build combinations
polynomial_x = normalize(x_)
polynomial_x = build_polynomial(polynomial_x, 5)
polynomial_x = build_combinations_lvl(polynomial_x, 2, 8)
polynomial_x = build_combinations_lvl(polynomial_x, 3, 8)
polynomial_x = build_combinations_lvl(polynomial_x, 4, 8)
polynomial_x = build_combinations_lvl(polynomial_x, 5, 8)
polynomial_x = build_combinations_lvl(polynomial_x, 6, 8)

predict_threshold = -0.00

# 4.52035365636e-07 0.813863667164

2017-10-25 23:38:15.327153 combinations 2 : 0 / 45
2017-10-25 23:38:19.658188 combinations 3 : 0 / 120
2017-10-25 23:38:24.386155 combinations 3 : 50 / 120
2017-10-25 23:38:32.436804 combinations 3 : 100 / 120
2017-10-25 23:38:35.410679 combinations 4 : 0 / 210
2017-10-25 23:38:43.568814 combinations 4 : 50 / 210
2017-10-25 23:38:50.479413 combinations 4 : 100 / 210
2017-10-25 23:38:58.398380 combinations 4 : 150 / 210
2017-10-25 23:39:06.895377 combinations 4 : 200 / 210


In [34]:
# Here to test the accuracy of one specific classifier
# Find best lambdas
lambdas = np.logspace(-8, 0, 30)
best_accuracy = 0
best_lambda = 0
for lambda_ in lambdas:
    
    accuracy_train_k, accuracy_test_k = k_fold_cross_validation(y_, polynomial_x, k, lambda_, predict_threshold)

    if accuracy_test_k > best_accuracy:
        best_accuracy = accuracy_test_k
        best_lambda = lambda_
    print("Lambdas:", lambda_, "Train:", accuracy_train_k, " Test:", accuracy_test_k)

w, loss = ridge_regression(y_, polynomial_x, best_lambda)

print("\t Predicted -1 but was 1 :", get_false(polynomial_x, y_, w, predict_threshold))

print("BEST:", best_lambda, best_accuracy)
lambda_ = best_lambda


Lambdas: 1e-08 Train: 0.845624670532  Test: 0.839515023722
Lambdas: 1.88739182214e-08 Train: 0.845598313126  Test: 0.839557195572
Lambdas: 3.56224789026e-08 Train: 0.845603584607  Test: 0.839493937797
Lambdas: 6.7233575365e-08 Train: 0.845556141276  Test: 0.839451765946
Lambdas: 1.26896100317e-07 Train: 0.845471797575  Test: 0.839493937797
Lambdas: 2.39502661999e-07 Train: 0.845313653137  Test: 0.839599367422
Lambdas: 4.52035365636e-07 Train: 0.845345282024  Test: 0.839409594096
Lambdas: 8.53167852417e-07 Train: 0.845155508698  Test: 0.839578281497
Lambdas: 1.61026202756e-06 Train: 0.845150237217  Test: 0.839409594096
Lambdas: 3.03919538231e-06 Train: 0.845092250923  Test: 0.839325250395
Lambdas: 5.73615251045e-06 Train: 0.844902477596  Test: 0.839093305219
Lambdas: 1.08263673387e-05 Train: 0.844733790195  Test: 0.838713758566
Lambdas: 2.04335971786e-05 Train: 0.844602003163  Test: 0.838713758566
Lambdas: 3.85662042116e-05 Train: 0.844544016869  Test: 0.838755930416
Lambdas: 7.27895384

In [15]:
# Find best threshold
threshs = np.linspace(0, -0.4, num=100)
best_accuracy = 0
best_thresh = 0
for thresh in threshs:
    predict_threshold = thresh
    
    accuracy_train_k, accuracy_test_k = k_fold_cross_validation(y_, polynomial_x, k, lambda_, predict_threshold)

    if accuracy_test_k > best_accuracy:
        best_accuracy = accuracy_test_k
        best_thresh = thresh
    print("Thresh:", thresh, "Train:", accuracy_train_k, " Test:", accuracy_test_k)

w, loss = ridge_regression(y_, polynomial_x, best_lambda)

print("\t Predicted -1 but was 1 :", get_false(polynomial_x, y_, w, predict_threshold))

print("BEST:", best_thresh, best_accuracy)

Thresh: 0.0 Train: 0.938305084746  Test: 0.902372881356
Thresh: -0.0040404040404 Train: 0.938728813559  Test: 0.902372881356
Thresh: -0.00808080808081 Train: 0.938728813559  Test: 0.903389830508
Thresh: -0.0121212121212 Train: 0.938728813559  Test: 0.903050847458
Thresh: -0.0161616161616 Train: 0.938898305085  Test: 0.903050847458
Thresh: -0.020202020202 Train: 0.938898305085  Test: 0.903050847458
Thresh: -0.0242424242424 Train: 0.938983050847  Test: 0.903050847458
Thresh: -0.0282828282828 Train: 0.939237288136  Test: 0.90406779661
Thresh: -0.0323232323232 Train: 0.939491525424  Test: 0.904406779661
Thresh: -0.0363636363636 Train: 0.939830508475  Test: 0.904406779661
Thresh: -0.040404040404 Train: 0.94  Test: 0.90406779661
Thresh: -0.0444444444444 Train: 0.939915254237  Test: 0.903389830508
Thresh: -0.0484848484848 Train: 0.940084745763  Test: 0.902372881356
Thresh: -0.0525252525253 Train: 0.940338983051  Test: 0.902711864407
Thresh: -0.0565656565657 Train: 0.939745762712  Test: 0.9027

# Functions to implement

In [484]:
x_, y_, ids_ = get_data_numjet(jet_num_x_dict, jet_num_y_dict, jet_num_ids_dict, 3, 1)

In [485]:
y = 0
w = 0
x = 0
tx = 0

y__ = []
for i in y_:
    y__.append(i[0] if i == 1 else 0)
    

x_ = normalize(x_) 
initial_w = np.random.rand(x_.shape[1], 1)
calculate_loss(y_, x_, initial_w)

2741.7361147921183

## Regularized logistic regression

In [486]:
w, loss = reg_logistic_regression(y_, x_, 0.001, initial_w, 2, 1)

Current iteration=0, loss=-8745.913669069796
loss=nan


  """
  
  


## Logistic regression

In [487]:
w, loss = logistic_regression(y_, x_, initial_w, 200, 1)
loss


Current iteration=0, loss=-8745.924528557429
Current iteration=100, loss=nan


  
  
  """


nan

## Gradient descent

In [488]:
w, loss = least_squares_GD(y_, x_, initial_w, 100, 0.5)

Gradient Descent(0/99): loss=16.31680577135001		0.3744075829383886
Gradient Descent(1/99): loss=50.736759376294046		0.6269465132024373
Gradient Descent(2/99): loss=181.81340506469255		0.37576167907921465
Gradient Descent(3/99): loss=660.8932795968924		0.6276235612728503
Gradient Descent(4/99): loss=2408.6660456413883		0.3744075829383886
Gradient Descent(5/99): loss=8783.604796420854		0.6262694651320244
Gradient Descent(6/99): loss=32035.334534264457		0.3744075829383886
Gradient Descent(7/99): loss=116842.53028677976		0.6262694651320244
Gradient Descent(8/99): loss=426163.8384898581		0.37373053486797564
Gradient Descent(9/99): loss=1554365.9815739973		0.6262694651320244
Gradient Descent(10/99): loss=5669310.581684775		0.37373053486797564
Gradient Descent(11/99): loss=20677940.987653714		0.6262694651320244
Gradient Descent(12/99): loss=75419622.43471941		0.37373053486797564
Gradient Descent(13/99): loss=275081524.0263116		0.6262694651320244
Gradient Descent(14/99): loss=1003317739.867309

## Stochastic gradient descent

In [489]:
w, loss = least_squares_SGD(y_, x_, initial_w, 100, 0.5)

SGD (0/99): loss=103.7220312797626		0.49085985104942453
SGD (1/99): loss=634.9095254158576		0.4617467840216655
SGD (2/99): loss=712.8161550605626		0.4813811780636425
SGD (3/99): loss=22242.202312406975		0.5639810426540285
SGD (4/99): loss=36484.35925564048		0.5260663507109005
SGD (5/99): loss=365371.7919403375		0.45091401489505756
SGD (6/99): loss=667213.738941194		0.4752877454299255
SGD (7/99): loss=1678835.1118862384		0.45023696682464454
SGD (8/99): loss=30857173.477063492		0.5199729180771835
SGD (9/99): loss=33983658.9360351		0.5186188219363574
SGD (10/99): loss=61329227.89667664		0.5016926201760324
SGD (11/99): loss=1160543243.6444895		0.5497630331753555
SGD (12/99): loss=1493404993.3918626		0.4901828029790115
SGD (13/99): loss=24582257029.171535		0.5660121868652674
SGD (14/99): loss=153587856742.85718		0.4163845633039946
SGD (15/99): loss=159003153532.88318		0.4197698036560596
SGD (16/99): loss=431754571373.21985		0.44888287068381855
SGD (17/99): loss=3367053747642.421		0.41096817

## Least squares

In [490]:
w, loss = least_squares(y_, x_)
w.shape

Rank: 28


(28, 1)

## Ridge regression

In [491]:
w, loss = ridge_regression(y_, x_, 0.5)
w.shape

(28, 1)

In [296]:
x_, y_, ids_ = get_data_numjet(jet_num_x_dict, jet_num_y_dict, jet_num_ids_dict, 0, 1)

In [298]:
train_x, train_y, test_x, test_y = separate_set(x_, y_)
pred_y = knn_predict(train_x, train_y, test_x, 100)

13:55:49 0/5225
13:58:21 500/5225
14:00:54 1000/5225
14:03:29 1500/5225
14:06:05 2000/5225
14:08:38 2500/5225
14:17:57 3000/5225
14:20:43 3500/5225
14:23:21 4000/5225
14:25:57 4500/5225
14:28:39 5000/5225


In [300]:
correct_count = 0
for index, yi in enumerate(test_y):
    pred_yi = pred_y[index]
    if pred_yi == yi:
        correct_count += 1
        
print(correct_count / len(test_y))

0.9483253588516747


In [286]:
def k_fold_cross_validation_knn(y, x, k, knn_k):
    """return the accuracy of ridge regression."""

    # Return the array of which indeces go in each k interval
    k_indices = build_k_indices(y, k)

    # Create empty lists for the accuracy test data
    accuracy_te = []

    # Loop through each interval
    for i in range(0, k):

        # get k'th subgroup in test, others in train:
        x_test = x[k_indices[i]]
        y_test = y[k_indices[i]]
        x_train = np.array([]).reshape(0, x.shape[1])
        y_train = np.array([]).reshape(0, 1)

        for j in range(0, k):
            # If the index interval is different from test, put it in train
            if j != i:
                x_train = np.concatenate((x_train, x[k_indices[j]]))
                y_train = np.concatenate((y_train, y[k_indices[j]]))

        # KNN:
        pred_y = knn_predict(x_train, y_train, x_test, 100)
        
        # Calculate the accuracy
        correct_count = 0
        for index, yi in enumerate(y_test):
            pred_yi = pred_y[index]
            if pred_yi == yi:
                correct_count += 1
        accuracy = correct_count / len(y_test)

        # Append it to the list
        accuracy_te.append(accuracy)

    return np.mean(accuracy_te)

In [297]:
k_fold_cross_validation_knn(y_, x_, 5, 100)

13:52:54 0/5224
13:55:27 500/5224


KeyboardInterrupt: 

In [None]:
# 3.1 work better

In [327]:
x3, y3, ids3 = remove_NaN(x, y, ids, delete_columns=True, delete_rows=False)
logistic_regression(y3, x3, np.zeros((x3.shape[1], 1)), 1000, 0.1)


TypeError: 'int' object is not iterable