In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import datetime
import csv

## Loading data

In [2]:
#load the data 
def load_csv_data(data_path, sub_sample=False):
    """Loads data and returns y (class labels), tX (features) and ids (event ids)"""
    y = np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    x = np.genfromtxt(data_path, delimiter=",", skip_header=1)
    ids = x[:, 0].astype(np.int)
    input_data = x[:, 2:]

    yb = np.ones(len(y))
    yb[np.where(y == "b")] = -1

    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    return yb, input_data, ids

##############################

#standardize the data
def standardize(x):

    mean_x = np.mean(x)
    x = x - mean_x
    std_x = np.std(x)
    x = x / std_x
    return x, mean_x, std_x

##############################

#extract each feature of x
def extract(lst, nb):
    return [item[nb] for item in lst]

##############################

#create subplot for each feature
def subplot(y,x,n):
    feat = x[:,n] #extract(x,n)
    ax.plot(feat,y,'o')
    ax.set_title(f'Feature '+ str(n))
    
##############################

#compute the number of undefined values in a certain feature
def nb_undef_feat(x, n):
    feat = x[:,n] #extract(x,n)
    unique, frequency = np.unique(feat, return_counts = True) #ligne à changer
    
    if(unique[0] == -999): 
        return int(frequency[0]) 
    else: 
        return 0
    
##############################

#remove a column from the data
def remove_feat(x, n):
    d = np.delete(d,n, axis = 1)
    return d

##############################

#remove a feature from a data set that has more than 
#only put x as argument
def remove_feat_w_undef(x):
    xdel = []
    #xinter= x[0]
    for i in range(len(x[0])):
        nb = nb_undef_feat(x, i)
        #print(i)
        print(nb)
        if(nb > np.double(0.6*len(x))):
            xdel = xdel + [i]
        elif(nb > 0):
            taille = len(x[:, i])
            if(taille % 2 == 0):
                index1 = int((taille + nb)/2 - 0.5)
                index2 = int((taille + nb)/2 + 0.5)
                x_sorted = np.sort(x[:, i])
                xmedian = (x_sorted[index1] + x_sorted[index2]) / 2.0
            else:
                index = int((taille + nb)/2)
                xmedian = (np.sort(x[:, i]))[index]

            x[:,i][x[:,i] == -999] = xmedian
 
    x = np.delete(x,xdel,axis =1)
    return x, xdel
       
##############################

def Multi_Model(tx):
#change so that we also keep the indices
    tx0 = []
    tx1 = []
    tx2 = []
    tx3 = []
    index0 = []
    index1 = []
    index2 = []
    index3 = []
    
    for i in range(len(tx)):
        if(tx[i][22]==0):
            tx0 = tx0 + [tx[i]]
            index0 = index0 + [i]
        else:
            if(tx[i][22]==1):
                tx1 = tx1 + [tx[i]]
                index1 = index1 + [i]
            else:
                if(tx[i][22]==2):
                    tx2 = tx2 + [tx[i]]
                    index2 = index2 + [i]
                else:
                    if(tx[i][22]==3):
                        tx3 = tx3 + [tx[i]]
                        index3 = index3 + [i]
    
    return np.array(tx0), np.array(tx1), np.array(tx2), np.array(tx3), index0, index1, index2, index3

##############################
              
#create a csv submission
def create_csv_submission(ids, y_pred, name):

    with open(name, "w") as csvfile:
        fieldnames = ["Id", "Prediction"]
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({"Id": int(r1), "Prediction": int(r2)})
            
######################################
def build_model_data(height, weight):   
    
    y = weight
    x = height
    num_samples = len(y)
    tx = np.c_[np.ones(num_samples), x]
    return y, tx
######################################
#Cross-validation implementation
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold.
    
    Args:
        y:      shape=(N,)
        k_fold: K in K-fold, i.e. the fold num
        seed:   the random seed

    Returns:
        A 2D array of shape=(k_fold, N/k_fold) that indicates the data indices for each fold
    """
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval] for k in range(k_fold)]
    return np.array(k_indices)

In [22]:
train_data = load_csv_data('train.csv', sub_sample=False)
test_data = load_csv_data('test.csv', sub_sample=False)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ids = x[:, 0].astype(np.int)


In [23]:
type(train_data)
len(train_data)
print(train_data)

(array([ 1., -1., -1., ...,  1., -1., -1.]), array([[ 138.47 ,   51.655,   97.827, ...,    1.24 ,   -2.475,  113.497],
       [ 160.937,   68.768,  103.235, ..., -999.   , -999.   ,   46.226],
       [-999.   ,  162.172,  125.953, ..., -999.   , -999.   ,   44.251],
       ...,
       [ 105.457,   60.526,   75.839, ..., -999.   , -999.   ,   41.992],
       [  94.951,   19.362,   68.812, ..., -999.   , -999.   ,    0.   ],
       [-999.   ,   72.756,   70.831, ..., -999.   , -999.   ,    0.   ]]), array([100000, 100001, 100002, ..., 349997, 349998, 349999]))


In [24]:
y = train_data[0]

In [25]:
train_data[1]

array([[ 138.47 ,   51.655,   97.827, ...,    1.24 ,   -2.475,  113.497],
       [ 160.937,   68.768,  103.235, ..., -999.   , -999.   ,   46.226],
       [-999.   ,  162.172,  125.953, ..., -999.   , -999.   ,   44.251],
       ...,
       [ 105.457,   60.526,   75.839, ..., -999.   , -999.   ,   41.992],
       [  94.951,   19.362,   68.812, ..., -999.   , -999.   ,    0.   ],
       [-999.   ,   72.756,   70.831, ..., -999.   , -999.   ,    0.   ]])

In [26]:
train_data[2]

array([100000, 100001, 100002, ..., 349997, 349998, 349999])

In [27]:
#train_data[4] is out of range
train_data[1][1]
len(train_data[1][1])

30

In [28]:
len(train_data[1])

250000

In [29]:
train_data_std = train_data
# 1st column is 'y'
# 2nd column is 'X'
# 3rd column is 'ids'

In [30]:
train_data_std1, mean1, std1 = standardize(train_data[1]) #see to what it corresponds
#train_data_std2, mean2, std2 = standardize(train_data[2]) #see to what it corresponds

In [31]:
def get_with_index(x, idx):
    x_fin = []
    
    for i in range(len(idx)):
        x_fin += [x[idx[i]]]
    
    return np.array(x_fin)

def create_with_index(x0, x1, x2, x3, idx0, idx1, idx2, idx3):
    total_length = len(x0) + len(x1) + len(x2) + len(x3)
    x_fin = np.repeat(x0[0], total_length).reshape(total_length,)
    
    print(x0.shape)
    print(len(idx0))
    print(x_fin.shape)
    
    for i in range(len(x0)):
        x_fin[idx0[i]] = x0[i]
    for i in range(len(x1)):
        x_fin[idx1[i]] = x1[i]
    for i in range(len(x2)):
        x_fin[idx2[i]] = x2[i]
    for i in range(len(x3)):
        x_fin[idx3[i]] = x3[i]
        
    return x_fin

In [32]:
"""
zero, un, deux, trois, idx0, idx1, idx2, idx3 = Multi_Model(train_data[1])
x_fin = create_with_index(zero, un, deux, trois, idx0, idx1, idx2, idx3)

test = True

for i in range(len(x_fin)):
    for j in range(len(x_fin[0])):
        if(x_fin[i,j] != np.array(train_data[1])[i,j]):
            print(i)
            test = False
            break
        
print(test)
"""

'\nzero, un, deux, trois, idx0, idx1, idx2, idx3 = Multi_Model(train_data[1])\nx_fin = create_with_index(zero, un, deux, trois, idx0, idx1, idx2, idx3)\n\ntest = True\n\nfor i in range(len(x_fin)):\n    for j in range(len(x_fin[0])):\n        if(x_fin[i,j] != np.array(train_data[1])[i,j]):\n            print(i)\n            test = False\n            break\n        \nprint(test)\n'

## Removing columns with too many undefined the train data

In [33]:
#load data : already done
####

#separate the data in 4 according to feature index 22
zero, un, deux, trois, idx0, idx1, idx2, idx3 = Multi_Model(train_data[1])
print(train_data[1].shape)

y0 = get_with_index(y, idx0)
y1 = get_with_index(y, idx1)
y2 = get_with_index(y, idx2)
y3 = get_with_index(y, idx3)

#drop columns in each of the four subdivisions if they have columns with more than 60% of undefined data
zero_new, xdel0 = remove_feat_w_undef(zero)
print(zero_new.shape)
un_new, xdel1 = remove_feat_w_undef(un)
print(un_new)
deux_new, xdel2 = remove_feat_w_undef(deux)
print(deux_new)
trois_new, xdel3 = remove_feat_w_undef(trois)
print(trois_new)

#for each subdivision : see correlation plots

#for each column of each subdivision print the percent of -999 present 


#standardize the cleaned data
zero_std, _, _ = standardize(zero_new)
print(zero_std)
un_std, _, _ = standardize(un_new)
print(un_std)
deux_std, _, _ = standardize(deux_new)
print(deux_std)
trois_std, _, _ = standardize(trois_new)
print(trois_std.shape)

#drop columns with little correlation



(250000, 30)
26123
0
0
0
99913
99913
99913
0
0
0
0
0
99913
0
0
0
0
0
0
0
0
0
0
99913
99913
99913
99913
99913
99913
0
(99913, 20)
7562
0
0
0
77544
77544
77544
0
0
0
0
0
77544
0
0
0
0
0
0
0
0
0
0
0
0
0
77544
77544
77544
0
[[160.937   68.768  103.235  ...   0.725    1.158   46.226 ]
 [112.4055 162.172  125.953  ...   2.053   -2.028   44.251 ]
 [154.916   10.418   94.714  ...  -0.715   -1.724   30.638 ]
 ...
 [112.4055  78.256   79.699  ...  -0.852   -0.706   78.984 ]
 [133.457   77.54    88.989  ...  -1.234    2.521   70.969 ]
 [105.457   60.526   75.839  ...   1.8     -0.166   41.992 ]]
2952
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
[[ 1.38470e+02  5.16550e+01  9.78270e+01 ...  1.24000e+00 -2.47500e+00
   1.13497e+02]
 [ 1.48754e+02  2.88620e+01  1.07782e+02 ...  1.31000e-01 -2.76700e+00
   1.79877e+02]
 [ 1.41481e+02  7.36000e-01  1.11581e+02 ... -7.98000e-01 -2.78500e+00
   2.78009e+02]
 ...
 [ 1.19934e+02  2.00780e+01  8.87510e+01 ... -1.72500e+00 -2.75600e+00
   1.129

In [34]:
import implementations

_, tx0 = build_model_data(zero_std, zero_std)
_, tx1 = build_model_data(un_std, un_std)
_, tx2 = build_model_data(deux_std, deux_std)
_, tx3 = build_model_data(trois_std, trois_std)

print(tx0.shape)
print(tx1.shape)
print(tx2.shape)
print(tx3.shape)

#_, tc = build_model_data(test_data[1], test_data[1])

y0 = (y0 + 1)/2
y1 = (y1 + 1)/2
y2 = (y2 + 1)/2
y3 = (y3 + 1)/2

(99913, 21)
(77544, 24)
(50379, 31)
(22164, 31)


In [35]:
t0, t1, t2, t3, idx0, idx1, idx2, idx3 = Multi_Model(test_data[1])

t0_cleaned = np.delete(t0, xdel0, axis = 1)
t1_cleaned = np.delete(t1, xdel1, axis = 1)
t2_cleaned = np.delete(t2, xdel2, axis = 1)
t3_cleaned = np.delete(t3, xdel3, axis = 1)

_, t0_cleaned = build_model_data(t0_cleaned, t0_cleaned)
_, t1_cleaned = build_model_data(t1_cleaned, t1_cleaned)
_, t2_cleaned = build_model_data(t2_cleaned, t2_cleaned)
_, t3_cleaned = build_model_data(t3_cleaned, t3_cleaned)

Current iteration=0, loss=nan
Current iteration=100, loss=inf
Current iteration=200, loss=0.44301833010604125
Current iteration=300, loss=0.4408614402934563
Current iteration=400, loss=0.4402244456236474
Current iteration=500, loss=0.4398907081116085
Current iteration=600, loss=0.4396837105053276
Current iteration=700, loss=0.4395427563193815
Current iteration=800, loss=0.4394396935666764
Current iteration=900, loss=0.4393599456377966
Current iteration=1000, loss=0.43929543997665094
Current iteration=1100, loss=0.43924141169966774
Current iteration=1200, loss=0.43919488823507796
Current iteration=1300, loss=0.4391539446198477
Current iteration=1400, loss=0.43911725242980204
Current iteration=1500, loss=0.4390839017526276
Current iteration=1600, loss=0.43905322196830726
Current iteration=1700, loss=0.4390247194230413
Current iteration=1800, loss=0.4389980284339286
Current iteration=1900, loss=0.43897286097834615
Current iteration=2000, loss=0.4389489952660155
Current iteration=2100, los

Current iteration=16900, loss=0.43716378304174075
Current iteration=17000, loss=0.43715365984644483
Current iteration=17100, loss=0.4371435475023191
Current iteration=17200, loss=0.43713344597241394
Current iteration=17300, loss=0.4371233552199502
Current iteration=17400, loss=0.4371132752083194
Current iteration=17500, loss=0.43710320590108437
Current iteration=17600, loss=0.43709314754731704
Current iteration=17700, loss=0.43708309953976265
Current iteration=17800, loss=0.4370730624127052
Current iteration=17900, loss=0.4370630358450532
Current iteration=18000, loss=0.4370530195177676
Current iteration=18100, loss=0.43704301396281386
Current iteration=18200, loss=0.4370330188610254
Current iteration=18300, loss=0.4370230341773291
Current iteration=18400, loss=0.43701305987682004
Current iteration=18500, loss=0.4370030959247632
Current iteration=18600, loss=0.4369931422865919
Current iteration=18700, loss=0.43698319920814294
Current iteration=18800, loss=0.4369732660942642
Current ite

Current iteration=33500, loss=0.4356090145356612
Current iteration=33600, loss=0.43560029936603356
Current iteration=33700, loss=0.4355915906647448
Current iteration=33800, loss=0.4355828886388742
Current iteration=33900, loss=0.43557419327231806
Current iteration=34000, loss=0.4355655045490451
Current iteration=34100, loss=0.43555682223185416
Current iteration=34200, loss=0.4355481467476768
Current iteration=34300, loss=0.4355394778591221
Current iteration=34400, loss=0.43553081533020155
Current iteration=34500, loss=0.43552215936615
Current iteration=34600, loss=0.4355135101709403
Current iteration=34700, loss=0.4355048672895482
Current iteration=34800, loss=0.43549623092641837
Current iteration=34900, loss=0.4354876010661538
Current iteration=35000, loss=0.43547897769342553
Current iteration=35100, loss=0.4354703607929734
Current iteration=35200, loss=0.43546175013200256
Current iteration=35300, loss=0.4354531461309175
Current iteration=35400, loss=0.4354445483397849
Current iterati

Current iteration=50200, loss=0.43423562762077866
Current iteration=50300, loss=0.4342278473233907
Current iteration=50400, loss=0.4342200717799851
Current iteration=50500, loss=0.4342123008092699
Current iteration=50600, loss=0.4342045347492333
Current iteration=50700, loss=0.4341967734183851
Current iteration=50800, loss=0.4341890166362934
Current iteration=50900, loss=0.4341812645675207
Current iteration=51000, loss=0.434173517203918
Current iteration=51100, loss=0.4341657747088222
Current iteration=51200, loss=0.4341580365597714
Current iteration=51300, loss=0.43415030326307374
Current iteration=51400, loss=0.4341425746392388
Current iteration=51500, loss=0.43413485050982303
Current iteration=51600, loss=0.43412713120797747
Current iteration=51700, loss=0.4341194163851313
Current iteration=51800, loss=0.4341117062037993
Current iteration=51900, loss=0.4341040006560865
Current iteration=52000, loss=0.4340962997341256
Current iteration=52100, loss=0.4340886034300776
Current iteration

Current iteration=66900, loss=0.43299644309824986
Current iteration=67000, loss=0.4329893578796242
Current iteration=67100, loss=0.4329822763626559
Current iteration=67200, loss=0.4329751985425151
Current iteration=67300, loss=0.4329681244143868
Current iteration=67400, loss=0.43296105383882266
Current iteration=67500, loss=0.4329539869460801
Current iteration=67600, loss=0.432946923865636
Current iteration=67700, loss=0.432939864324032
Current iteration=67800, loss=0.43293280831711617
Current iteration=67900, loss=0.43292575610806566
Current iteration=68000, loss=0.4329187075581101
Current iteration=68100, loss=0.43291166252928176
Current iteration=68200, loss=0.4329046211505484
Current iteration=68300, loss=0.43289758341723333
Current iteration=68400, loss=0.432890549324674
Current iteration=68500, loss=0.4328835188682214
Current iteration=68600, loss=0.4328764920432406
Current iteration=68700, loss=0.4328694687130175
Current iteration=68800, loss=0.4328624490054249
Current iteration

Current iteration=83600, loss=0.4318609950841123
Current iteration=83700, loss=0.43185446789220117
Current iteration=83800, loss=0.43184794366044843
Current iteration=83900, loss=0.4318414225976837
Current iteration=84000, loss=0.43183490448924583
Current iteration=84100, loss=0.43182838933253825
Current iteration=84200, loss=0.43182187733548927
Current iteration=84300, loss=0.4318153683894891
Current iteration=84400, loss=0.4318088623865479
Current iteration=84500, loss=0.43180235942889733
Current iteration=84600, loss=0.4317958595135156
Current iteration=84700, loss=0.4317893625328931
Current iteration=84800, loss=0.4317828686931674
Current iteration=84900, loss=0.43177637778249933
Current iteration=85000, loss=0.43176988990239307
Current iteration=85100, loss=0.4317634050498636
Current iteration=85200, loss=0.4317569232219339
Current iteration=85300, loss=0.43175044431204657
Current iteration=85400, loss=0.43174396852456454
Current iteration=85500, loss=0.4317374956495076
Current it

Current iteration=300, loss=0.573598343329216
Current iteration=400, loss=0.5724009498832715
Current iteration=500, loss=0.5716658631433543
Current iteration=600, loss=0.5711533700087558
Current iteration=700, loss=0.5707768288962679
Current iteration=800, loss=0.5704917359087553
Current iteration=900, loss=0.5702711659520082
Current iteration=1000, loss=0.5700975117781796
Current iteration=1100, loss=0.5699587112241166
Current iteration=1200, loss=0.5698462214207887
Current iteration=1300, loss=0.5697538364138158
Current iteration=1400, loss=0.5696769585574303
Current iteration=1500, loss=0.5696121296214277
Current iteration=1600, loss=0.5695567176098396
Current iteration=1700, loss=0.5695087008370128
Current iteration=1800, loss=0.5694665150575341
Current iteration=1900, loss=0.569428942861763
Current iteration=2000, loss=0.569395032235847
Current iteration=2100, loss=0.5693640357462858
Current iteration=2200, loss=0.5693353646097211
Current iteration=2300, loss=0.5693085536865609
Cu

Current iteration=17300, loss=0.5667226341586704
Current iteration=17400, loss=0.5667077070286642
Current iteration=17500, loss=0.5666928043506441
Current iteration=17600, loss=0.5666779260648274
Current iteration=17700, loss=0.566663072111676
Current iteration=17800, loss=0.5666482424318279
Current iteration=17900, loss=0.5666334369661447
Current iteration=18000, loss=0.5666186556556955
Current iteration=18100, loss=0.566603898441736
Current iteration=18200, loss=0.5665891652657432
Current iteration=18300, loss=0.5665744560694009
Current iteration=18400, loss=0.5665597707945776
Current iteration=18500, loss=0.5665451093833531
Current iteration=18600, loss=0.566530471777997
Current iteration=18700, loss=0.5665158579210058
Current iteration=18800, loss=0.5665012677550432
Current iteration=18900, loss=0.5664867012229879
Current iteration=19000, loss=0.5664721582679002
Current iteration=19100, loss=0.5664576388330539
Current iteration=19200, loss=0.5664431428619066
Current iteration=19300

Current iteration=34100, loss=0.5645172839090817
Current iteration=34200, loss=0.5645057617494739
Current iteration=34300, loss=0.564494256311238
Current iteration=34400, loss=0.5644827675592579
Current iteration=34500, loss=0.5644712954585259
Current iteration=34600, loss=0.5644598399741199
Current iteration=34700, loss=0.5644484010712355
Current iteration=34800, loss=0.5644369787151458
Current iteration=34900, loss=0.5644255728712447
Current iteration=35000, loss=0.5644141835050093
Current iteration=35100, loss=0.5644028105820106
Current iteration=35200, loss=0.5643914540679464
Current iteration=35300, loss=0.5643801139285773
Current iteration=35400, loss=0.5643687901297826
Current iteration=35500, loss=0.5643574826375225
Current iteration=35600, loss=0.5643461914178691
Current iteration=35700, loss=0.564334916436987
Current iteration=35800, loss=0.5643236576611207
Current iteration=35900, loss=0.5643124150566255
Current iteration=36000, loss=0.5643011885899486
Current iteration=3610

Current iteration=50900, loss=0.5627917229652516
Current iteration=51000, loss=0.5627825845261123
Current iteration=51100, loss=0.5627734580649546
Current iteration=51200, loss=0.5627643435592443
Current iteration=51300, loss=0.5627552409864943
Current iteration=51400, loss=0.5627461503242811
Current iteration=51500, loss=0.5627370715502323
Current iteration=51600, loss=0.5627280046420304
Current iteration=51700, loss=0.5627189495774096
Current iteration=51800, loss=0.5627099063341553
Current iteration=51900, loss=0.5627008748901206
Current iteration=52000, loss=0.5626918552231988
Current iteration=52100, loss=0.5626828473113422
Current iteration=52200, loss=0.5626738511325595
Current iteration=52300, loss=0.5626648666648987
Current iteration=52400, loss=0.5626558938864804
Current iteration=52500, loss=0.5626469327754696
Current iteration=52600, loss=0.5626379833100735
Current iteration=52700, loss=0.5626290454685765
Current iteration=52800, loss=0.5626201192292901
Current iteration=52

Current iteration=67700, loss=0.5614085932430394
Current iteration=67800, loss=0.5614011886764666
Current iteration=67900, loss=0.5613937929564128
Current iteration=68000, loss=0.5613864060675829
Current iteration=68100, loss=0.5613790279947136
Current iteration=68200, loss=0.5613716587225619
Current iteration=68300, loss=0.5613642982359304
Current iteration=68400, loss=0.5613569465196575
Current iteration=68500, loss=0.5613496035586072
Current iteration=68600, loss=0.5613422693376713
Current iteration=68700, loss=0.5613349438417873
Current iteration=68800, loss=0.5613276270559221
Current iteration=68900, loss=0.5613203189650636
Current iteration=69000, loss=0.5613130195542498
Current iteration=69100, loss=0.561305728808538
Current iteration=69200, loss=0.5612984467130208
Current iteration=69300, loss=0.5612911732528358
Current iteration=69400, loss=0.5612839084131249
Current iteration=69500, loss=0.56127665217909
Current iteration=69600, loss=0.5612694045359572
Current iteration=69700

Current iteration=84500, loss=0.5602778302816738
Current iteration=84600, loss=0.5602717205948331
Current iteration=84700, loss=0.5602656175906093
Current iteration=84800, loss=0.5602595212582525
Current iteration=84900, loss=0.560253431587014
Current iteration=85000, loss=0.560247348566189
Current iteration=85100, loss=0.56024127218507
Current iteration=85200, loss=0.5602352024329913
Current iteration=85300, loss=0.5602291392993015
Current iteration=85400, loss=0.5602230827733728
Current iteration=85500, loss=0.5602170328445881
Current iteration=85600, loss=0.5602109895023669
Current iteration=85700, loss=0.5602049527361365
Current iteration=85800, loss=0.5601989225353523
Current iteration=85900, loss=0.5601928988894975
Current iteration=86000, loss=0.560186881788063
Current iteration=86100, loss=0.5601808712205674
Current iteration=86200, loss=0.5601748671765483
Current iteration=86300, loss=0.560168869645568
Current iteration=86400, loss=0.5601628786172088
Current iteration=86500, l

Current iteration=1400, loss=0.5500251985063901
Current iteration=1500, loss=0.5498469201749041
Current iteration=1600, loss=0.5496902777566551
Current iteration=1700, loss=0.5495507863533847
Current iteration=1800, loss=0.5494252429038806
Current iteration=1900, loss=0.5493113020647582
Current iteration=2000, loss=0.5492072028868206
Current iteration=2100, loss=0.5491115902278919
Current iteration=2200, loss=0.549023396420205
Current iteration=2300, loss=0.5489417617101572
Current iteration=2400, loss=0.5488659799351739
Current iteration=2500, loss=0.5487954608207861
Current iteration=2600, loss=0.5487297033577556
Current iteration=2700, loss=0.5486682766622721
Current iteration=2800, loss=0.5486108059604354
Current iteration=2900, loss=0.5485569621339201
Current iteration=3000, loss=0.5485064537793666
Current iteration=3100, loss=0.548459021071116
Current iteration=3200, loss=0.5484144309392365
Current iteration=3300, loss=0.5483724732229064
Current iteration=3400, loss=0.54833295755

Current iteration=18400, loss=0.5472894662257073
Current iteration=18500, loss=0.5472864832539917
Current iteration=18600, loss=0.5472835020662623
Current iteration=18700, loss=0.5472805226415057
Current iteration=18800, loss=0.5472775449597677
Current iteration=18900, loss=0.547274569002098
Current iteration=19000, loss=0.5472715947504958
Current iteration=19100, loss=0.5472686221878594
Current iteration=19200, loss=0.5472656512979391
Current iteration=19300, loss=0.5472626820652912
Current iteration=19400, loss=0.5472597144752351
Current iteration=19500, loss=0.5472567485138133
Current iteration=19600, loss=0.5472537841677525
Current iteration=19700, loss=0.5472508214244275
Current iteration=19800, loss=0.5472478602718273
Current iteration=19900, loss=0.5472449006985214
Current iteration=20000, loss=0.5472419426936301
Current iteration=20100, loss=0.5472389862467951
Current iteration=20200, loss=0.5472360313481515
Current iteration=20300, loss=0.5472330779883011
Current iteration=204

Current iteration=35200, loss=0.5468083407709399
Current iteration=35300, loss=0.5468055873318006
Current iteration=35400, loss=0.5468028351380116
Current iteration=35500, loss=0.5468000841887024
Current iteration=35600, loss=0.5467973344830025
Current iteration=35700, loss=0.5467945860200428
Current iteration=35800, loss=0.5467918387989558
Current iteration=35900, loss=0.546789092818874
Current iteration=36000, loss=0.5467863480789317
Current iteration=36100, loss=0.5467836045782637
Current iteration=36200, loss=0.5467808623160061
Current iteration=36300, loss=0.5467781212912961
Current iteration=36400, loss=0.5467753815032717
Current iteration=36500, loss=0.5467726429510715
Current iteration=36600, loss=0.546769905633836
Current iteration=36700, loss=0.5467671695507058
Current iteration=36800, loss=0.5467644347008231
Current iteration=36900, loss=0.5467617010833307
Current iteration=37000, loss=0.5467589686973723
Current iteration=37100, loss=0.5467562375420928
Current iteration=3720

Current iteration=52000, loss=0.5463625860909198
Current iteration=52100, loss=0.5463600303521967
Current iteration=52200, loss=0.5463574757248044
Current iteration=52300, loss=0.5463549222080123
Current iteration=52400, loss=0.5463523698010904
Current iteration=52500, loss=0.5463498185033097
Current iteration=52600, loss=0.5463472683139418
Current iteration=52700, loss=0.546344719232259
Current iteration=52800, loss=0.5463421712575345
Current iteration=52900, loss=0.546339624389042
Current iteration=53000, loss=0.5463370786260562
Current iteration=53100, loss=0.5463345339678524
Current iteration=53200, loss=0.5463319904137064
Current iteration=53300, loss=0.5463294479628948
Current iteration=53400, loss=0.5463269066146955
Current iteration=53500, loss=0.5463243663683862
Current iteration=53600, loss=0.5463218272232461
Current iteration=53700, loss=0.5463192891785545
Current iteration=53800, loss=0.5463167522335921
Current iteration=53900, loss=0.5463142163876398
Current iteration=5400

Current iteration=68800, loss=0.545948267151401
Current iteration=68900, loss=0.5459458884073801
Current iteration=69000, loss=0.5459435106615325
Current iteration=69100, loss=0.545941133913238
Current iteration=69200, loss=0.5459387581618772
Current iteration=69300, loss=0.5459363834068316
Current iteration=69400, loss=0.5459340096474828
Current iteration=69500, loss=0.5459316368832137
Current iteration=69600, loss=0.5459292651134069
Current iteration=69700, loss=0.5459268943374462
Current iteration=69800, loss=0.5459245245547157
Current iteration=69900, loss=0.5459221557646002
Current iteration=70000, loss=0.5459197879664851
Current iteration=70100, loss=0.5459174211597567
Current iteration=70200, loss=0.5459150553438007
Current iteration=70300, loss=0.5459126905180051
Current iteration=70400, loss=0.5459103266817569
Current iteration=70500, loss=0.5459079638344447
Current iteration=70600, loss=0.5459056019754573
Current iteration=70700, loss=0.5459032411041844
Current iteration=7080

Current iteration=85600, loss=0.5455621774532549
Current iteration=85700, loss=0.5455599581406719
Current iteration=85800, loss=0.5455577397298337
Current iteration=85900, loss=0.5455555222202093
Current iteration=86000, loss=0.5455533056112691
Current iteration=86100, loss=0.545551089902483
Current iteration=86200, loss=0.5455488750933218
Current iteration=86300, loss=0.5455466611832569
Current iteration=86400, loss=0.5455444481717596
Current iteration=86500, loss=0.5455422360583028
Current iteration=86600, loss=0.5455400248423588
Current iteration=86700, loss=0.5455378145234007
Current iteration=86800, loss=0.545535605100902
Current iteration=86900, loss=0.5455333965743372
Current iteration=87000, loss=0.5455311889431804
Current iteration=87100, loss=0.545528982206907
Current iteration=87200, loss=0.5455267763649924
Current iteration=87300, loss=0.5455245714169126
Current iteration=87400, loss=0.5455223673621437
Current iteration=87500, loss=0.5455201642001629
Current iteration=87600

Current iteration=2600, loss=0.5485408961852548
Current iteration=2700, loss=0.5485207363080257
Current iteration=2800, loss=0.5485019495320018
Current iteration=2900, loss=0.5484844023844954
Current iteration=3000, loss=0.5484679805258688
Current iteration=3100, loss=0.5484525847683581
Current iteration=3200, loss=0.5484381281202292
Current iteration=3300, loss=0.5484245335643521
Current iteration=3400, loss=0.5484117323669552
Current iteration=3500, loss=0.5483996627723224
Current iteration=3600, loss=0.5483882689809569
Current iteration=3700, loss=0.5483775003379426
Current iteration=3800, loss=0.5483673106787941
Current iteration=3900, loss=0.5483576577946012
Current iteration=4000, loss=0.5483485029886056
Current iteration=4100, loss=0.5483398107037172
Current iteration=4200, loss=0.5483315482057818
Current iteration=4300, loss=0.5483236853112324
Current iteration=4400, loss=0.5483161941505511
Current iteration=4500, loss=0.54830904896099
Current iteration=4600, loss=0.54830222590

Current iteration=19600, loss=0.5479089340461508
Current iteration=19700, loss=0.5479068595619596
Current iteration=19800, loss=0.5479047863942057
Current iteration=19900, loss=0.5479027145399649
Current iteration=20000, loss=0.5479006439963987
Current iteration=20100, loss=0.5478985747607501
Current iteration=20200, loss=0.5478965068303397
Current iteration=20300, loss=0.54789444020256
Current iteration=20400, loss=0.547892374874873
Current iteration=20500, loss=0.5478903108448051
Current iteration=20600, loss=0.547888248109945
Current iteration=20700, loss=0.5478861866679394
Current iteration=20800, loss=0.5478841265164899
Current iteration=20900, loss=0.5478820676533507
Current iteration=21000, loss=0.5478800100763251
Current iteration=21100, loss=0.5478779537832635
Current iteration=21200, loss=0.5478758987720604
Current iteration=21300, loss=0.5478738450406522
Current iteration=21400, loss=0.5478717925870151
Current iteration=21500, loss=0.5478697414091629
Current iteration=21600,

Current iteration=36400, loss=0.547577546990545
Current iteration=36500, loss=0.5475756713019811
Current iteration=36600, loss=0.5475737966922984
Current iteration=36700, loss=0.5475719231603895
Current iteration=36800, loss=0.5475700507051483
Current iteration=36900, loss=0.5475681793254695
Current iteration=37000, loss=0.5475663090202505
Current iteration=37100, loss=0.5475644397883893
Current iteration=37200, loss=0.5475625716287861
Current iteration=37300, loss=0.5475607045403419
Current iteration=37400, loss=0.5475588385219603
Current iteration=37500, loss=0.5475569735725451
Current iteration=37600, loss=0.5475551096910024
Current iteration=37700, loss=0.5475532468762396
Current iteration=37800, loss=0.547551385127166
Current iteration=37900, loss=0.5475495244426913
Current iteration=38000, loss=0.547547664821728
Current iteration=38100, loss=0.5475458062631893
Current iteration=38200, loss=0.5475439487659903
Current iteration=38300, loss=0.547542092329047
Current iteration=38400,

Current iteration=53300, loss=0.5472750436052097
Current iteration=53400, loss=0.5472733356538562
Current iteration=53500, loss=0.547271628614085
Current iteration=53600, loss=0.5472699224850153
Current iteration=53700, loss=0.5472682172657682
Current iteration=53800, loss=0.5472665129554658
Current iteration=53900, loss=0.5472648095532309
Current iteration=54000, loss=0.5472631070581879
Current iteration=54100, loss=0.5472614054694622
Current iteration=54200, loss=0.5472597047861804
Current iteration=54300, loss=0.5472580050074703
Current iteration=54400, loss=0.5472563061324612
Current iteration=54500, loss=0.5472546081602827
Current iteration=54600, loss=0.5472529110900666
Current iteration=54700, loss=0.5472512149209448
Current iteration=54800, loss=0.5472495196520515
Current iteration=54900, loss=0.5472478252825211
Current iteration=55000, loss=0.5472461318114898
Current iteration=55100, loss=0.5472444392380946
Current iteration=55200, loss=0.5472427475614737
Current iteration=553

Current iteration=70100, loss=0.5470002481030014
Current iteration=70200, loss=0.5469986818102207
Current iteration=70300, loss=0.5469971162964004
Current iteration=70400, loss=0.5469955515608358
Current iteration=70500, loss=0.5469939876028231
Current iteration=70600, loss=0.5469924244216595
Current iteration=70700, loss=0.5469908620166432
Current iteration=70800, loss=0.5469893003870732
Current iteration=70900, loss=0.5469877395322493
Current iteration=71000, loss=0.5469861794514725
Current iteration=71100, loss=0.5469846201440448
Current iteration=71200, loss=0.5469830616092685
Current iteration=71300, loss=0.5469815038464472
Current iteration=71400, loss=0.5469799468548857
Current iteration=71500, loss=0.5469783906338893
Current iteration=71600, loss=0.5469768351827642
Current iteration=71700, loss=0.5469752805008178
Current iteration=71800, loss=0.546973726587358
Current iteration=71900, loss=0.5469721734416941
Current iteration=72000, loss=0.5469706210631359
Current iteration=721

Current iteration=87000, loss=0.5467460746916694
Current iteration=87100, loss=0.5467446307253433
Current iteration=87200, loss=0.5467431874309557
Current iteration=87300, loss=0.5467417448079396
Current iteration=87400, loss=0.5467403028557289
Current iteration=87500, loss=0.5467388615737585
Current iteration=87600, loss=0.5467374209614635
Current iteration=87700, loss=0.5467359810182802
Current iteration=87800, loss=0.5467345417436452
Current iteration=87900, loss=0.5467331031369962
Current iteration=88000, loss=0.5467316651977712
Current iteration=88100, loss=0.5467302279254092
Current iteration=88200, loss=0.54672879131935
Current iteration=88300, loss=0.5467273553790338
Current iteration=88400, loss=0.5467259201039018
Current iteration=88500, loss=0.5467244854933954
Current iteration=88600, loss=0.5467230515469574
Current iteration=88700, loss=0.546721618264031
Current iteration=88800, loss=0.5467201856440599
Current iteration=88900, loss=0.5467187536864885
Current iteration=89000

In [77]:
#loss0, w0 = implementations.reg_logistic_regression(y0, tx0, 0.00000003, np.ones(len(tx0[0])), 10000, 0.4)
loss1, w1 = implementations.reg_logistic_regression(y1, tx1, 0.00003, np.ones(len(tx1[0])), 10000, 0.3)
loss2, w2 = implementations.reg_logistic_regression(y2, tx2, 0.00000003, np.ones(len(tx2[0])), 10000, 0.4)
loss3, w3 = implementations.reg_logistic_regression(y3, tx3, 0.00000003, np.ones(len(tx3[0])), 10000, 0.4)

r0 = t0_cleaned @ w0
r0_pred = np.sign(r0 - 0.5)
r1 = t1_cleaned @ w1
r1_pred = np.sign(r1 - 0.5)
r2 = t2_cleaned @ w2
r2_pred = np.sign(r2 - 0.5)
r3 = t3_cleaned @ w3
r3_pred = np.sign(r3 - 0.5)

pred4V = create_with_index(r0_pred, r1_pred, r2_pred, r3_pred, idx0, idx1, idx2, idx3)

Current iteration=0, loss=nan
Current iteration=100, loss=0.5808548122596098
Current iteration=200, loss=0.5738400709387022
Current iteration=300, loss=0.5722649123387612
Current iteration=400, loss=0.5714367344671613
Current iteration=500, loss=0.5709195761437915
Current iteration=600, loss=0.5705748468290636
Current iteration=700, loss=0.5703351881877274
Current iteration=800, loss=0.5701628925317792
Current iteration=900, loss=0.5700352115622971
Current iteration=1000, loss=0.5699377316258138
Current iteration=1100, loss=0.5698610122135596
Current iteration=1200, loss=0.5697987284983158
Current iteration=1300, loss=0.5697465754099149
Current iteration=1400, loss=0.5697015888395598


KeyboardInterrupt: 

In [64]:
loss0, w0 = implementations.mean_squared_error_gd(y0, tx0, np.ones(len(tx0[0])), 10000, 0.024)
loss1, w1 = implementations.mean_squared_error_gd(y1, tx1, np.ones(len(tx1[0])), 10000, 0.024)
loss2, w2 = implementations.mean_squared_error_gd(y2, tx2, np.ones(len(tx2[0])), 10000, 0.024)
loss3, w3 = implementations.mean_squared_error_gd(y3, tx3, np.ones(len(tx3[0])), 10000, 0.024)

r0 = t0_cleaned @ w0
r0_pred = np.sign(r0 - 0.5)
r1 = t1_cleaned @ w1
r1_pred = np.sign(r1 - 0.5)
r2 = t2_cleaned @ w2
r2_pred = np.sign(r2 - 0.5)
r3 = t3_cleaned @ w3
r3_pred = np.sign(r3 - 0.5)

predGD = create_with_index(r0_pred, r1_pred, r2_pred, r3_pred, idx0, idx1, idx2, idx3)

Gradient Descent: loss=0.07267646825097837, w0=2.1069176813709087, w1=-0.00480152257548914
Gradient Descent: loss=0.09781101068067323, w0=2.179084905487993, w1=0.06915964306935678
Gradient Descent: loss=0.09596711329361693, w0=2.603282709303064, w1=0.12734931941200586
Gradient Descent: loss=0.09296286142527414, w0=2.4640577636290724, w1=0.08141877125054402
(227458,)
227458
(568238,)


In [53]:
#num_samples = len(test_data[0])
#txTE = np.c_[np.ones(num_samples), test_data[1]]


#rig_log
#print(txTE.shape, w1.shape)
#prediTE = implementations.sigmoid(txTE @ opt_wTR)
#predi_tTE = np.sign(prediTE - 0.5)
#print(predi_tTE.shape)
"""
ids_ = test_data[2]
y_pred = pred4V
name = "logistic_ridge.csv"
create_csv_submission(ids_, y_pred, name)
"""
 
 


#GD, and else

#prediTE = txTE @ w5
#predi_tTE = np.sign(prediTE - 0.5)
#print(predi_tTE.shape)


ids_ = test_data[2]
y_pred = predGD
name = "Gradient_Descent.csv"
create_csv_submission(ids_, y_pred, name)

In [42]:
test = 0

for i in range(len(pred4V)):
        if(pred4V[i] != np.array(test_data[0])[i]):
            test += 1 
        
print(test)

252670


In [45]:
print(252670/568238)

0.44465523249061134


In [46]:
#cross testing + lambda testing
""" Remarks for report: to find the best lambda, we manually tested them by narrowing our range.
    First, 1-10 (1), 0.1-1 (1), 1-2 (ev. faire du '0.1-2')
"""
#form K subgroups randomly
k_fold = 10 #to change when on better computer
seed = 0
k_indices = build_k_indices(y0, k_fold, seed)
#take a range of lambda
#lambdas = np.logspace(-5, 0, 15)
lambdas = np.linspace(0.1,1,10)
print(lambdas)
#initialize the variables
rmse_tr_tmp = []
rmse_te_tmp = []
w_tr_tmp = []
w_te_tmp = []
opt_rmse_te = np.Inf
opt_rmse_tr = np.Inf
opt_lambda = 0
opt_wTR = np.ones(len(y))
for lambda_ in lambdas:
    for k in range(k_fold):
        # Put the kth group in test
        te_indice = k_indices[k]
        tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
        tr_indice = tr_indice.reshape(-1)
        #create new x and y for test and training set
        y_te = y[te_indice]
        y_tr = y[tr_indice]
        x_te = tx0[te_indice]
        x_tr = tx0[tr_indice]
        print(lambda_)
        loss_tr, w_tr = implementations.logistic_regression(y_tr, x_tr, np.ones(len(x_tr[0])), 10000, lambda_)
        loss_te, w_te = implementations.logistic_regression(y_te, x_te, np.ones(len(x_te[0])), 10000, lambda_)
        rmse_tr_tmp.append(loss_tr)
        rmse_te_tmp.append(loss_te)
        w_tr_tmp.append(w_tr)
        w_te_tmp.append(w_te)
    rmse_tr = np.mean(rmse_tr_tmp, axis=0)
    rmse_te = np.mean(rmse_te_tmp, axis=0)
    wTR = np.mean(w_tr_tmp, axis=0)
    wTE = np.mean(w_te_tmp, axis=0)
    if rmse_te < opt_rmse_te:
        opt_rmse_te = rmse_te
        opt_rmse_tr = rmse_tr
        opt_lambda = lambda_
        opt_wTR = wTR


#return opt_rms_tr, opt_lambda, opt_wTR
print(opt_rmse_tr, opt_lambda, opt_wTR)

[0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
0.1
Current iteration=0, loss=nan


  if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:


Current iteration=100, loss=inf


  loss = -(y.T.dot(np.log(predict)) + (1.0 - y).T.dot(np.log(1.0 - predict))) / len(


Current iteration=200, loss=nan
Current iteration=300, loss=nan
Current iteration=400, loss=nan
Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700, loss=nan
Current iteration=800, loss=nan
Current iteration=900, loss=nan
Current iteration=1000, loss=nan
Current iteration=1100, loss=nan
Current iteration=1200, loss=nan
Current iteration=1300, loss=nan
Current iteration=1400, loss=nan
Current iteration=1500, loss=nan
Current iteration=1600, loss=nan
Current iteration=1700, loss=nan
Current iteration=1800, loss=nan
Current iteration=1900, loss=nan
Current iteration=2000, loss=nan
Current iteration=2100, loss=nan
Current iteration=2200, loss=nan
Current iteration=2300, loss=nan
Current iteration=2400, loss=nan
Current iteration=2500, loss=nan
Current iteration=2600, loss=nan
Current iteration=2700, loss=nan
Current iteration=2800, loss=nan
Current iteration=2900, loss=nan
Current iteration=3000, loss=nan
Current iteration=3100, loss=nan
Current iteration=

Current iteration=5100, loss=nan
Current iteration=5200, loss=nan
Current iteration=5300, loss=nan
Current iteration=5400, loss=nan
Current iteration=5500, loss=nan
Current iteration=5600, loss=nan
Current iteration=5700, loss=nan
Current iteration=5800, loss=nan
Current iteration=5900, loss=nan
Current iteration=6000, loss=nan
Current iteration=6100, loss=nan
Current iteration=6200, loss=nan
Current iteration=6300, loss=nan
Current iteration=6400, loss=nan
Current iteration=6500, loss=nan
Current iteration=6600, loss=nan
Current iteration=6700, loss=nan
Current iteration=6800, loss=nan
Current iteration=6900, loss=nan
Current iteration=7000, loss=nan
Current iteration=7100, loss=nan
Current iteration=7200, loss=nan
Current iteration=7300, loss=nan
Current iteration=7400, loss=nan
Current iteration=7500, loss=nan
Current iteration=7600, loss=nan
Current iteration=7700, loss=nan
Current iteration=7800, loss=nan
Current iteration=7900, loss=nan
Current iteration=8000, loss=nan
Current it

Current iteration=9900, loss=nan
Current iteration=0, loss=0.903451383839188
Current iteration=100, loss=-16.449595828952393
Current iteration=200, loss=-32.02795727295148
Current iteration=300, loss=-47.60631871695048
Current iteration=400, loss=nan
Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700, loss=nan
Current iteration=800, loss=nan
Current iteration=900, loss=nan
Current iteration=1000, loss=nan
Current iteration=1100, loss=nan
Current iteration=1200, loss=nan
Current iteration=1300, loss=nan
Current iteration=1400, loss=nan
Current iteration=1500, loss=nan
Current iteration=1600, loss=nan
Current iteration=1700, loss=nan
Current iteration=1800, loss=nan
Current iteration=1900, loss=nan
Current iteration=2000, loss=nan
Current iteration=2100, loss=nan
Current iteration=2200, loss=nan
Current iteration=2300, loss=nan
Current iteration=2400, loss=nan
Current iteration=2500, loss=nan
Current iteration=2600, loss=nan
Current iteration=2700, loss

Current iteration=4700, loss=nan
Current iteration=4800, loss=nan
Current iteration=4900, loss=nan
Current iteration=5000, loss=nan
Current iteration=5100, loss=nan
Current iteration=5200, loss=nan
Current iteration=5300, loss=nan
Current iteration=5400, loss=nan
Current iteration=5500, loss=nan
Current iteration=5600, loss=nan
Current iteration=5700, loss=nan
Current iteration=5800, loss=nan
Current iteration=5900, loss=nan
Current iteration=6000, loss=nan
Current iteration=6100, loss=nan
Current iteration=6200, loss=nan
Current iteration=6300, loss=nan
Current iteration=6400, loss=nan
Current iteration=6500, loss=nan
Current iteration=6600, loss=nan
Current iteration=6700, loss=nan
Current iteration=6800, loss=nan
Current iteration=6900, loss=nan
Current iteration=7000, loss=nan
Current iteration=7100, loss=nan
Current iteration=7200, loss=nan
Current iteration=7300, loss=nan
Current iteration=7400, loss=nan
Current iteration=7500, loss=nan
Current iteration=7600, loss=nan
Current it

Current iteration=9500, loss=nan
Current iteration=9600, loss=nan
Current iteration=9700, loss=nan
Current iteration=9800, loss=nan
Current iteration=9900, loss=nan
0.1
Current iteration=0, loss=nan
Current iteration=100, loss=inf
Current iteration=200, loss=nan
Current iteration=300, loss=nan
Current iteration=400, loss=nan
Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700, loss=nan
Current iteration=800, loss=nan
Current iteration=900, loss=nan
Current iteration=1000, loss=nan
Current iteration=1100, loss=nan
Current iteration=1200, loss=nan
Current iteration=1300, loss=nan
Current iteration=1400, loss=nan
Current iteration=1500, loss=nan
Current iteration=1600, loss=nan
Current iteration=1700, loss=nan
Current iteration=1800, loss=nan
Current iteration=1900, loss=nan
Current iteration=2000, loss=nan
Current iteration=2100, loss=nan
Current iteration=2200, loss=nan
Current iteration=2300, loss=nan
Current iteration=2400, loss=nan
Current iteration=

Current iteration=4400, loss=nan
Current iteration=4500, loss=nan
Current iteration=4600, loss=nan
Current iteration=4700, loss=nan
Current iteration=4800, loss=nan
Current iteration=4900, loss=nan
Current iteration=5000, loss=nan
Current iteration=5100, loss=nan
Current iteration=5200, loss=nan
Current iteration=5300, loss=nan
Current iteration=5400, loss=nan
Current iteration=5500, loss=nan
Current iteration=5600, loss=nan
Current iteration=5700, loss=nan
Current iteration=5800, loss=nan
Current iteration=5900, loss=nan
Current iteration=6000, loss=nan
Current iteration=6100, loss=nan
Current iteration=6200, loss=nan
Current iteration=6300, loss=nan
Current iteration=6400, loss=nan
Current iteration=6500, loss=nan
Current iteration=6600, loss=nan
Current iteration=6700, loss=nan
Current iteration=6800, loss=nan
Current iteration=6900, loss=nan
Current iteration=7000, loss=nan
Current iteration=7100, loss=nan
Current iteration=7200, loss=nan
Current iteration=7300, loss=nan
Current it

Current iteration=9200, loss=nan
Current iteration=9300, loss=nan
Current iteration=9400, loss=nan
Current iteration=9500, loss=nan
Current iteration=9600, loss=nan
Current iteration=9700, loss=nan
Current iteration=9800, loss=nan
Current iteration=9900, loss=nan
Current iteration=0, loss=inf
Current iteration=100, loss=-17.525342619944716
Current iteration=200, loss=-34.15662947401904
Current iteration=300, loss=-50.787916328093466
Current iteration=400, loss=-inf
Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700, loss=nan
Current iteration=800, loss=nan
Current iteration=900, loss=nan
Current iteration=1000, loss=nan
Current iteration=1100, loss=nan
Current iteration=1200, loss=nan
Current iteration=1300, loss=nan
Current iteration=1400, loss=nan
Current iteration=1500, loss=nan
Current iteration=1600, loss=nan
Current iteration=1700, loss=nan
Current iteration=1800, loss=nan
Current iteration=1900, loss=nan
Current iteration=2000, loss=nan
Current

Current iteration=4000, loss=nan
Current iteration=4100, loss=nan
Current iteration=4200, loss=nan
Current iteration=4300, loss=nan
Current iteration=4400, loss=nan
Current iteration=4500, loss=nan
Current iteration=4600, loss=nan
Current iteration=4700, loss=nan
Current iteration=4800, loss=nan
Current iteration=4900, loss=nan
Current iteration=5000, loss=nan
Current iteration=5100, loss=nan
Current iteration=5200, loss=nan
Current iteration=5300, loss=nan
Current iteration=5400, loss=nan
Current iteration=5500, loss=nan
Current iteration=5600, loss=nan
Current iteration=5700, loss=nan
Current iteration=5800, loss=nan
Current iteration=5900, loss=nan
Current iteration=6000, loss=nan
Current iteration=6100, loss=nan
Current iteration=6200, loss=nan
Current iteration=6300, loss=nan
Current iteration=6400, loss=nan
Current iteration=6500, loss=nan
Current iteration=6600, loss=nan
Current iteration=6700, loss=nan
Current iteration=6800, loss=nan
Current iteration=6900, loss=nan
Current it

Current iteration=8800, loss=nan
Current iteration=8900, loss=nan
Current iteration=9000, loss=nan
Current iteration=9100, loss=nan
Current iteration=9200, loss=nan
Current iteration=9300, loss=nan
Current iteration=9400, loss=nan
Current iteration=9500, loss=nan
Current iteration=9600, loss=nan
Current iteration=9700, loss=nan
Current iteration=9800, loss=nan
Current iteration=9900, loss=nan
0.2
Current iteration=0, loss=inf
Current iteration=100, loss=nan
Current iteration=200, loss=nan
Current iteration=300, loss=nan
Current iteration=400, loss=nan
Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700, loss=nan
Current iteration=800, loss=nan
Current iteration=900, loss=nan
Current iteration=1000, loss=nan
Current iteration=1100, loss=nan
Current iteration=1200, loss=nan
Current iteration=1300, loss=nan
Current iteration=1400, loss=nan
Current iteration=1500, loss=nan
Current iteration=1600, loss=nan
Current iteration=1700, loss=nan
Current iteration=

Current iteration=3700, loss=nan
Current iteration=3800, loss=nan
Current iteration=3900, loss=nan
Current iteration=4000, loss=nan
Current iteration=4100, loss=nan
Current iteration=4200, loss=nan
Current iteration=4300, loss=nan
Current iteration=4400, loss=nan
Current iteration=4500, loss=nan
Current iteration=4600, loss=nan
Current iteration=4700, loss=nan
Current iteration=4800, loss=nan
Current iteration=4900, loss=nan
Current iteration=5000, loss=nan
Current iteration=5100, loss=nan
Current iteration=5200, loss=nan
Current iteration=5300, loss=nan
Current iteration=5400, loss=nan
Current iteration=5500, loss=nan
Current iteration=5600, loss=nan
Current iteration=5700, loss=nan
Current iteration=5800, loss=nan
Current iteration=5900, loss=nan
Current iteration=6000, loss=nan
Current iteration=6100, loss=nan
Current iteration=6200, loss=nan
Current iteration=6300, loss=nan
Current iteration=6400, loss=nan
Current iteration=6500, loss=nan
Current iteration=6600, loss=nan
Current it

Current iteration=8600, loss=nan
Current iteration=8700, loss=nan
Current iteration=8800, loss=nan
Current iteration=8900, loss=nan
Current iteration=9000, loss=nan
Current iteration=9100, loss=nan
Current iteration=9200, loss=nan
Current iteration=9300, loss=nan
Current iteration=9400, loss=nan
Current iteration=9500, loss=nan
Current iteration=9600, loss=nan
Current iteration=9700, loss=nan
Current iteration=9800, loss=nan
Current iteration=9900, loss=nan
Current iteration=0, loss=-0.14519546297370445
Current iteration=100, loss=-32.2881619767954
Current iteration=200, loss=nan
Current iteration=300, loss=nan
Current iteration=400, loss=nan
Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700, loss=nan
Current iteration=800, loss=nan
Current iteration=900, loss=nan
Current iteration=1000, loss=nan
Current iteration=1100, loss=nan
Current iteration=1200, loss=nan
Current iteration=1300, loss=nan
Current iteration=1400, loss=nan
Current iteration=1500, 

Current iteration=3700, loss=nan
Current iteration=3800, loss=nan
Current iteration=3900, loss=nan
Current iteration=4000, loss=nan
Current iteration=4100, loss=nan
Current iteration=4200, loss=nan
Current iteration=4300, loss=nan
Current iteration=4400, loss=nan
Current iteration=4500, loss=nan
Current iteration=4600, loss=nan
Current iteration=4700, loss=nan
Current iteration=4800, loss=nan
Current iteration=4900, loss=nan
Current iteration=5000, loss=nan
Current iteration=5100, loss=nan
Current iteration=5200, loss=nan
Current iteration=5300, loss=nan
Current iteration=5400, loss=nan
Current iteration=5500, loss=nan
Current iteration=5600, loss=nan
Current iteration=5700, loss=nan
Current iteration=5800, loss=nan
Current iteration=5900, loss=nan
Current iteration=6000, loss=nan
Current iteration=6100, loss=nan
Current iteration=6200, loss=nan
Current iteration=6300, loss=nan
Current iteration=6400, loss=nan
Current iteration=6500, loss=nan
Current iteration=6600, loss=nan
Current it

Current iteration=8500, loss=nan
Current iteration=8600, loss=nan
Current iteration=8700, loss=nan
Current iteration=8800, loss=nan
Current iteration=8900, loss=nan
Current iteration=9000, loss=nan
Current iteration=9100, loss=nan
Current iteration=9200, loss=nan
Current iteration=9300, loss=nan
Current iteration=9400, loss=nan
Current iteration=9500, loss=nan
Current iteration=9600, loss=nan
Current iteration=9700, loss=nan
Current iteration=9800, loss=nan
Current iteration=9900, loss=nan
0.2
Current iteration=0, loss=inf
Current iteration=100, loss=nan
Current iteration=200, loss=nan
Current iteration=300, loss=nan
Current iteration=400, loss=nan
Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700, loss=nan
Current iteration=800, loss=nan
Current iteration=900, loss=nan
Current iteration=1000, loss=nan
Current iteration=1100, loss=nan
Current iteration=1200, loss=nan
Current iteration=1300, loss=nan
Current iteration=1400, loss=nan
Current iteration=

Current iteration=3400, loss=nan
Current iteration=3500, loss=nan
Current iteration=3600, loss=nan
Current iteration=3700, loss=nan
Current iteration=3800, loss=nan
Current iteration=3900, loss=nan
Current iteration=4000, loss=nan
Current iteration=4100, loss=nan
Current iteration=4200, loss=nan
Current iteration=4300, loss=nan
Current iteration=4400, loss=nan
Current iteration=4500, loss=nan
Current iteration=4600, loss=nan
Current iteration=4700, loss=nan
Current iteration=4800, loss=nan
Current iteration=4900, loss=nan
Current iteration=5000, loss=nan
Current iteration=5100, loss=nan
Current iteration=5200, loss=nan
Current iteration=5300, loss=nan
Current iteration=5400, loss=nan
Current iteration=5500, loss=nan
Current iteration=5600, loss=nan
Current iteration=5700, loss=nan
Current iteration=5800, loss=nan
Current iteration=5900, loss=nan
Current iteration=6000, loss=nan
Current iteration=6100, loss=nan
Current iteration=6200, loss=nan
Current iteration=6300, loss=nan
Current it

Current iteration=8300, loss=nan
Current iteration=8400, loss=nan
Current iteration=8500, loss=nan
Current iteration=8600, loss=nan
Current iteration=8700, loss=nan
Current iteration=8800, loss=nan
Current iteration=8900, loss=nan
Current iteration=9000, loss=nan
Current iteration=9100, loss=nan
Current iteration=9200, loss=nan
Current iteration=9300, loss=nan
Current iteration=9400, loss=nan
Current iteration=9500, loss=nan
Current iteration=9600, loss=nan
Current iteration=9700, loss=nan
Current iteration=9800, loss=nan
Current iteration=9900, loss=nan
Current iteration=0, loss=-0.16510668443841325
Current iteration=100, loss=-34.43389701301754
Current iteration=200, loss=-inf
Current iteration=300, loss=nan
Current iteration=400, loss=nan
Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700, loss=nan
Current iteration=800, loss=nan
Current iteration=900, loss=nan
Current iteration=1000, loss=nan
Current iteration=1100, loss=nan
Current iteration=1200

Current iteration=3300, loss=nan
Current iteration=3400, loss=nan
Current iteration=3500, loss=nan
Current iteration=3600, loss=nan
Current iteration=3700, loss=nan
Current iteration=3800, loss=nan
Current iteration=3900, loss=nan
Current iteration=4000, loss=nan
Current iteration=4100, loss=nan
Current iteration=4200, loss=nan
Current iteration=4300, loss=nan
Current iteration=4400, loss=nan
Current iteration=4500, loss=nan
Current iteration=4600, loss=nan
Current iteration=4700, loss=nan
Current iteration=4800, loss=nan
Current iteration=4900, loss=nan
Current iteration=5000, loss=nan
Current iteration=5100, loss=nan
Current iteration=5200, loss=nan
Current iteration=5300, loss=nan
Current iteration=5400, loss=nan
Current iteration=5500, loss=nan
Current iteration=5600, loss=nan
Current iteration=5700, loss=nan
Current iteration=5800, loss=nan
Current iteration=5900, loss=nan
Current iteration=6000, loss=nan
Current iteration=6100, loss=nan
Current iteration=6200, loss=nan
Current it

Current iteration=8100, loss=nan
Current iteration=8200, loss=nan
Current iteration=8300, loss=nan
Current iteration=8400, loss=nan
Current iteration=8500, loss=nan
Current iteration=8600, loss=nan
Current iteration=8700, loss=nan
Current iteration=8800, loss=nan
Current iteration=8900, loss=nan
Current iteration=9000, loss=nan
Current iteration=9100, loss=nan
Current iteration=9200, loss=nan
Current iteration=9300, loss=nan
Current iteration=9400, loss=nan
Current iteration=9500, loss=nan
Current iteration=9600, loss=nan
Current iteration=9700, loss=nan
Current iteration=9800, loss=nan
Current iteration=9900, loss=nan
0.30000000000000004
Current iteration=0, loss=inf
Current iteration=100, loss=nan
Current iteration=200, loss=nan
Current iteration=300, loss=nan
Current iteration=400, loss=nan
Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700, loss=nan
Current iteration=800, loss=nan
Current iteration=900, loss=nan
Current iteration=1000, loss=nan
Cu

Current iteration=2900, loss=nan
Current iteration=3000, loss=nan
Current iteration=3100, loss=nan
Current iteration=3200, loss=nan
Current iteration=3300, loss=nan
Current iteration=3400, loss=nan
Current iteration=3500, loss=nan
Current iteration=3600, loss=nan
Current iteration=3700, loss=nan
Current iteration=3800, loss=nan
Current iteration=3900, loss=nan
Current iteration=4000, loss=nan
Current iteration=4100, loss=nan
Current iteration=4200, loss=nan
Current iteration=4300, loss=nan
Current iteration=4400, loss=nan
Current iteration=4500, loss=nan
Current iteration=4600, loss=nan
Current iteration=4700, loss=nan
Current iteration=4800, loss=nan
Current iteration=4900, loss=nan
Current iteration=5000, loss=nan
Current iteration=5100, loss=nan
Current iteration=5200, loss=nan
Current iteration=5300, loss=nan
Current iteration=5400, loss=nan
Current iteration=5500, loss=nan
Current iteration=5600, loss=nan
Current iteration=5700, loss=nan
Current iteration=5800, loss=nan
Current it

Current iteration=7800, loss=nan
Current iteration=7900, loss=nan
Current iteration=8000, loss=nan
Current iteration=8100, loss=nan
Current iteration=8200, loss=nan
Current iteration=8300, loss=nan
Current iteration=8400, loss=nan
Current iteration=8500, loss=nan
Current iteration=8600, loss=nan
Current iteration=8700, loss=nan
Current iteration=8800, loss=nan
Current iteration=8900, loss=nan
Current iteration=9000, loss=nan
Current iteration=9100, loss=nan
Current iteration=9200, loss=nan
Current iteration=9300, loss=nan
Current iteration=9400, loss=nan
Current iteration=9500, loss=nan
Current iteration=9600, loss=nan
Current iteration=9700, loss=nan
Current iteration=9800, loss=nan
Current iteration=9900, loss=nan
Current iteration=0, loss=-0.9396225931601313
Current iteration=100, loss=-48.14850440738823
Current iteration=200, loss=nan
Current iteration=300, loss=nan
Current iteration=400, loss=nan
Current iteration=500, loss=nan
Current iteration=600, loss=nan
Current iteration=700

Current iteration=2800, loss=nan
Current iteration=2900, loss=nan
Current iteration=3000, loss=nan
Current iteration=3100, loss=nan
Current iteration=3200, loss=nan
Current iteration=3300, loss=nan
Current iteration=3400, loss=nan
Current iteration=3500, loss=nan
Current iteration=3600, loss=nan
Current iteration=3700, loss=nan
Current iteration=3800, loss=nan
Current iteration=3900, loss=nan
Current iteration=4000, loss=nan
Current iteration=4100, loss=nan
Current iteration=4200, loss=nan
Current iteration=4300, loss=nan
Current iteration=4400, loss=nan
Current iteration=4500, loss=nan
Current iteration=4600, loss=nan
Current iteration=4700, loss=nan
Current iteration=4800, loss=nan
Current iteration=4900, loss=nan
Current iteration=5000, loss=nan
Current iteration=5100, loss=nan
Current iteration=5200, loss=nan
Current iteration=5300, loss=nan
Current iteration=5400, loss=nan
Current iteration=5500, loss=nan
Current iteration=5600, loss=nan
Current iteration=5700, loss=nan
Current it

Current iteration=7600, loss=nan
Current iteration=7700, loss=nan
Current iteration=7800, loss=nan
Current iteration=7900, loss=nan
Current iteration=8000, loss=nan
Current iteration=8100, loss=nan
Current iteration=8200, loss=nan
Current iteration=8300, loss=nan
Current iteration=8400, loss=nan
Current iteration=8500, loss=nan
Current iteration=8600, loss=nan
Current iteration=8700, loss=nan
Current iteration=8800, loss=nan
Current iteration=8900, loss=nan
Current iteration=9000, loss=nan
Current iteration=9100, loss=nan
Current iteration=9200, loss=nan
Current iteration=9300, loss=nan
Current iteration=9400, loss=nan
Current iteration=9500, loss=nan
Current iteration=9600, loss=nan
Current iteration=9700, loss=nan
Current iteration=9800, loss=nan
Current iteration=9900, loss=nan
0.30000000000000004
Current iteration=0, loss=inf
Current iteration=100, loss=nan
Current iteration=200, loss=nan
Current iteration=300, loss=nan
Current iteration=400, loss=nan
Current iteration=500, loss=na

Current iteration=2400, loss=nan
Current iteration=2500, loss=nan
Current iteration=2600, loss=nan
Current iteration=2700, loss=nan
Current iteration=2800, loss=nan
Current iteration=2900, loss=nan
Current iteration=3000, loss=nan
Current iteration=3100, loss=nan
Current iteration=3200, loss=nan
Current iteration=3300, loss=nan
Current iteration=3400, loss=nan
Current iteration=3500, loss=nan
Current iteration=3600, loss=nan
Current iteration=3700, loss=nan
Current iteration=3800, loss=nan
Current iteration=3900, loss=nan
Current iteration=4000, loss=nan
Current iteration=4100, loss=nan
Current iteration=4200, loss=nan
Current iteration=4300, loss=nan
Current iteration=4400, loss=nan
Current iteration=4500, loss=nan
Current iteration=4600, loss=nan
Current iteration=4700, loss=nan
Current iteration=4800, loss=nan
Current iteration=4900, loss=nan
Current iteration=5000, loss=nan
Current iteration=5100, loss=nan
Current iteration=5200, loss=nan
Current iteration=5300, loss=nan
Current it

Current iteration=7200, loss=nan
Current iteration=7300, loss=nan
Current iteration=7400, loss=nan
Current iteration=7500, loss=nan
Current iteration=7600, loss=nan
Current iteration=7700, loss=nan
Current iteration=7800, loss=nan
Current iteration=7900, loss=nan
Current iteration=8000, loss=nan
Current iteration=8100, loss=nan
Current iteration=8200, loss=nan
Current iteration=8300, loss=nan
Current iteration=8400, loss=nan
Current iteration=8500, loss=nan
Current iteration=8600, loss=nan
Current iteration=8700, loss=nan
Current iteration=8800, loss=nan
Current iteration=8900, loss=nan
Current iteration=9000, loss=nan
Current iteration=9100, loss=nan
Current iteration=9200, loss=nan
Current iteration=9300, loss=nan
Current iteration=9400, loss=nan
Current iteration=9500, loss=nan
Current iteration=9600, loss=nan
Current iteration=9700, loss=nan
Current iteration=9800, loss=nan
Current iteration=9900, loss=nan
Current iteration=0, loss=-0.9911897553846525
Current iteration=100, loss=-5

KeyboardInterrupt: 

## Visualization of data

In [None]:
#plot all the features of x to see the distribution and potential overliers
def corr_plot():
    fig, axs = plt.subplots(6, 5, figsize=(16, 16), sharey='col', sharex='row')
    z = 0
    
    for i in range(6):
        for j in range(5):
            ax = axs[i, j]
            z += 1
            subplot(z)
    
    plt.setp(axs[-1, :], xlabel='X')
    plt.setp(axs[:, 0], ylabel='Y = {-1,1}')
    fig.tight_layout()
    plt.savefig('Features.png', dpi=300)


In [None]:
corr_plot()

In [None]:
feat = extract(train_data[1],22)
#feature with all int (22)

In [None]:
#test_data = load_csv_data('test.csv', sub_sample=True)

In [None]:
#feat = extract(test_data[1],22)#same for test

In [None]:
#compute the correlation coeffincients between y and each of the features of x
corr = []
for i in range(0,30):
    feat = extract(train_data_std1,i)
    #print(np.corrcoef(train_data[0],feat)[0,1])
    corr.append(np.corrcoef(train_data[0],feat)[0,1])
#print(corr)
#print(len(corr))

#get an array corresponding to the indices of each feature
def listFrom1toN(n):
    return list(range(0,n))
ind = listFrom1toN(30)
#print(ind)

In [None]:
#plot the correlation values betweeny and each feature against the ids of the features
plt.scatter(ind, corr, )
plt.axhline(0, c='r', ls=':')
plt.title("Correlations coefficient of the different features of x")
plt.ylabel("Correlation coefficients")
plt.xlabel("Features' indices")
plt.savefig('Corrplot.png', dpi=300, bbox_inches='tight')

*get the names of the headers* 

In [None]:
#get the headers from the train data
#see if can use csv library
from csv import reader
reader = csv.reader(open('train.csv'), delimiter = ",")
header = next(reader)
print(len(header))
print(header)

In [None]:
#get the headers from the test data
#see if can use csv library
reader = csv.reader(open('test.csv'), delimiter = ",")
header = next(reader)
print(len(header))
print(header)