In [2]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
from proj1_helpers import *
%load_ext autoreload
%autoreload 2

**Test of regular training functions**

In [112]:
nsamples = 1000
nfeatures = 20
seed = 20
std = 0.1
y,tx,w = generate_data(nsamples,nfeatures,seed,std)
loss = compute_loss(y,tx,w)

In [113]:
print(loss)
print(w)

0.005309625925735089
[[ 0.1762616 ]
 [ 0.79542746]
 [ 0.78306146]
 [ 0.63167495]
 [-0.92822083]
 [ 0.38351516]
 [-0.24263812]
 [ 0.03702189]
 [ 0.31590293]
 [-0.61229956]
 [-0.4553672 ]
 [ 0.43721187]
 [ 0.56600722]
 [ 0.70065528]
 [ 0.55048979]
 [-0.92667139]
 [-0.76661253]
 [ 0.5025614 ]
 [-0.52156357]
 [-0.49038797]
 [ 0.71525106]]


In [114]:
initial_w = np.zeros([nfeatures+1,1])
max_iters = 100000
gamma = 0.1

Least squares GD

In [115]:
w,loss = least_squares_GD(y,tx,initial_w,max_iters,gamma)

Terminated least_squares_GD after  489  iterations.


In [116]:
print(loss)
print(w)

0.00538535627653458
[[ 0.14808265]
 [ 0.76770622]
 [ 0.77628839]
 [ 0.63050625]
 [-0.91318033]
 [ 0.38260021]
 [-0.22952819]
 [ 0.05057628]
 [ 0.33253619]
 [-0.60653593]
 [-0.44422718]
 [ 0.42380871]
 [ 0.53860572]
 [ 0.71046953]
 [ 0.54684245]
 [-0.90622417]
 [-0.73882015]
 [ 0.48468173]
 [-0.50896463]
 [-0.46833936]
 [ 0.70347379]]


Least squares SGD

In [117]:
w,loss = least_squares_SGD(y,tx,initial_w,10000,gamma)

Terminated least_squares_SGD after  19  iterations.


In [118]:
print(loss)
print(w)

0.22971007121517392
[[ 0.12693671]
 [ 0.14580919]
 [ 0.18435011]
 [ 0.18020186]
 [-0.13755976]
 [ 0.09618968]
 [ 0.01036153]
 [ 0.0559057 ]
 [ 0.13258332]
 [-0.00986171]
 [-0.01613327]
 [ 0.06359952]
 [ 0.12922318]
 [ 0.14797021]
 [ 0.1791574 ]
 [-0.0605137 ]
 [-0.09135906]
 [ 0.12753356]
 [-0.05793838]
 [-0.02967219]
 [ 0.13916909]]


Least squares

In [119]:
w,loss = least_squares(y, tx)

In [120]:
print(loss)
print(w)

0.005208633401473408
[[ 0.20150237]
 [ 0.77864141]
 [ 0.7873774 ]
 [ 0.63871312]
 [-0.94142934]
 [ 0.38671696]
 [-0.22980625]
 [ 0.04835073]
 [ 0.31770175]
 [-0.62597692]
 [-0.4691441 ]
 [ 0.43757343]
 [ 0.54944065]
 [ 0.71450421]
 [ 0.55213022]
 [-0.93721272]
 [-0.75704205]
 [ 0.48227548]
 [-0.52786446]
 [-0.48925327]
 [ 0.71283803]]


Ridge regression

In [121]:
lambda_ = 0.0001

In [122]:
w,loss = ridge_regression(y, tx, lambda_)

In [123]:
print(loss)
print(w)

0.005210508424564729
[[ 0.20013947]
 [ 0.77695313]
 [ 0.78568382]
 [ 0.63736215]
 [-0.93871185]
 [ 0.38592071]
 [-0.22928267]
 [ 0.04840484]
 [ 0.31764339]
 [-0.62408123]
 [-0.46742257]
 [ 0.43642401]
 [ 0.54821894]
 [ 0.71315937]
 [ 0.55107625]
 [-0.93440663]
 [-0.75498797]
 [ 0.48153747]
 [-0.52622629]
 [-0.48756298]
 [ 0.71133883]]


**Test of Logistic Regression on actual data**

In [63]:
path_tr = "../data/train.csv"
yb_tr, data_tr, ids_tr = load_csv_data(path_tr,True)

In [69]:
data_tr0, mean_tr, std_tr = standardize(data_tr)
data_tr0 = add_constant(data_tr0)

In [70]:
nfeatures = data_tr0.shape[1]
initial_w = np.ones([nfeatures,1])/nfeatures

Since the features can have numeric values in whatever range, $exp(X^\top\,w)$ can be very large when chosing $w_{initial}=1$ (or any other finite value), resulting in $\sigma=1$. On the other hand chosing $w_{initial}=0$ results in $\sigma=0.5$.
Not sure though whether that actually has an influence on the stability of the method.
On a second thought, standardisation or normalisation would actually make it reasonable to use a finite $w_{initial}$. I was thinking maybe $w_{initial} = \tfrac{1}{\# features}$.

In [71]:
max_iters = 5000
w, loss = logistic_regression(yb_tr,data_tr0,initial_w,max_iters,0.2)

In [72]:
print(loss)
print(w)

0.2798
[[-2.52684145e+06]
 [ 7.22837053e+05]
 [-8.86942722e+05]
 [-8.91851537e+04]
 [ 2.38212051e+05]
 [ 3.00160112e+04]
 [ 1.58894333e+05]
 [ 2.67322358e+04]
 [ 3.33420963e+05]
 [-8.03067947e+04]
 [ 4.32986503e+04]
 [-3.14424400e+05]
 [ 5.73245164e+05]
 [ 2.91177055e+04]
 [ 3.60105593e+05]
 [ 4.92375251e+04]
 [-2.92017249e+04]
 [ 3.63261880e+04]
 [ 6.80401919e+04]
 [ 5.73310498e+04]
 [-1.42973964e+05]
 [-3.53037435e+03]
 [ 1.57540613e+03]
 [ 7.07173749e+03]
 [ 2.06721977e+05]
 [ 2.06028883e+05]
 [ 2.06406455e+05]
 [ 1.98739024e+04]
 [ 2.88896921e+04]
 [ 2.86823816e+04]
 [-3.23444273e+04]]


In [73]:
w = w/np.mean(w)
print(w)

[[ 1.16273904e+02]
 [-3.32617174e+01]
 [ 4.08131239e+01]
 [ 4.10390055e+00]
 [-1.09614496e+01]
 [-1.38120214e+00]
 [-7.31160414e+00]
 [-1.23009753e+00]
 [-1.53425365e+01]
 [ 3.69535831e+00]
 [-1.99240958e+00]
 [ 1.44683999e+01]
 [-2.63781700e+01]
 [-1.33986614e+00]
 [-1.65704434e+01]
 [-2.26568995e+00]
 [ 1.34373233e+00]
 [-1.67156816e+00]
 [-3.13090430e+00]
 [-2.63811763e+00]
 [ 6.57902024e+00]
 [ 1.62451985e-01]
 [-7.24931204e-02]
 [-3.25409625e-01]
 [-9.51241770e+00]
 [-9.48052464e+00]
 [-9.49789882e+00]
 [-9.14507804e-01]
 [-1.32937399e+00]
 [-1.31983449e+00]
 [ 1.48834540e+00]]


In [74]:
w, loss = reg_logistic_regression(yb_tr,data_tr0,1+4,initial_w,max_iters,0.2)

In [75]:
print(loss)
print(w)

0.6064
[[-667.99534162]
 [ 129.6766822 ]
 [-228.66895501]
 [ -14.94019797]
 [ -82.44496554]
 [-270.10833093]
 [-207.94622662]
 [-271.13099738]
 [ 143.39873142]
 [-125.98049701]
 [-168.71453466]
 [-183.56434577]
 [  85.82721938]
 [-270.38204073]
 [ 106.60059948]
 [  26.15895072]
 [ -16.03729886]
 [-112.25572516]
 [  30.03502361]
 [  16.39320674]
 [-128.22769493]
 [   4.9357043 ]
 [-165.3115051 ]
 [-238.13357031]
 [ -98.25480609]
 [ -91.23495215]
 [ -91.20886192]
 [-271.05084782]
 [-270.44408328]
 [-270.51287909]
 [-195.23694477]]


In [76]:
path_te = "../data/train.csv"
yb_te, data_te, ids_te = load_csv_data(path_te,True)
data_te0, mean_te, std_te = standardize(data_te)
data_te0 = add_constant(data_te0)

- log_reg working correctly?
- stochastic approach?
- iterations end conditions?
- preprocessing? (use all features?)
- ...?

TODO:
- bias_variance_decomposition
- cross_validation
- again test all methods
- prepare script run.py

TEST:

Required Functions

- logistic_regression
- reg_logistic_regression

Trainers

- my_least_squares_GD
- my_least_squares_SGD
- my_least_squares
- my_ridge_regression
- my_logistic_regression
- my_reg_logistic_regression
- my_stoch_logistic_regression

Utility functions for trainers

- compute_gradient
- compute_sigma
- batch_iter

Loss functions

- compute_loss

Preprocessing

- standardize
- split_data
- add_constant
- poly_expansion

Utility

- generate_data
- generate_bin_data
- column_array
- compute_y

In [35]:
path_tr = "../data/train.csv"
yb_tr, data_tr, ids_tr = load_csv_data(path_tr,False)

In [36]:
degree = 6
data_tr0 = poly_expansion(data_tr, degree, False, True)

In [37]:
print(data_tr0[1,:])

[ 1.60937000e+02  2.59007180e+04  4.16838385e+06  6.70847191e+08
  1.07964134e+11  1.73754239e+13  6.87680000e+01  4.72903782e+03
  3.25206473e+05  2.23637987e+07  1.53791371e+09  1.05759250e+11
  1.03235000e+02  1.06574652e+04  1.10022342e+06  1.13581565e+08
  1.17255929e+10  1.21049158e+12  4.81460000e+01  2.31803732e+03
  1.11604225e+05  5.37329700e+06  2.58702757e+08  1.24555030e+10
 -9.99000000e+02  9.98001000e+05 -9.97002999e+08  9.96005996e+11
 -9.95009990e+14  9.94014980e+17 -9.99000000e+02  9.98001000e+05
 -9.97002999e+08  9.96005996e+11 -9.95009990e+14  9.94014980e+17
 -9.99000000e+02  9.98001000e+05 -9.97002999e+08  9.96005996e+11
 -9.95009990e+14  9.94014980e+17  3.47300000e+00  1.20617290e+01
  4.18903848e+01  1.45485306e+02  5.05270469e+02  1.75480434e+03
  2.07800000e+00  4.31808400e+00  8.97297855e+00  1.86458494e+01
  3.87460751e+01  8.05143441e+01  1.25157000e+02  1.56642746e+04
  1.96049362e+06  2.45369500e+08  3.07097105e+10  3.84353524e+12
  8.79000000e-01  7.72641

In [38]:
data_tr0, mean_tr, std_tr = standardize(data_tr0)
data_tr0 = add_constant(data_tr0)
nfeatures = data_tr0.shape[1]
initial_w = np.ones([nfeatures,1])/nfeatures

In [41]:
l = 0
w_SGD, loss_SGD = my_ridge_regression(yb_tr, data_tr0, l, mode = "ls", max_iters=1000, gamma=0.2, batch_size=10, eps=1e-3)
#w_SGD, loss_SGD = my_least_squares_SGD(yb_tr, data_tr0, initial_w, 1000, 0.01, 4, l, 1e-5)
#w_SGD, loss_SGD = my_logistic_regression(yb_tr, data_tr0, initial_w, 2000, 0.02, 'log', l, eps=1e-05)
#w_SGD, loss_SGD = my_stoch_logistic_regression(yb_tr, data_tr0, initial_w, 2000, 0.02, 10, "log", l, eps=1e-5)

In [44]:
#print(loss_SGD)
print(compute_loss(yb_tr, data_tr0, w_SGD, "log", 0))
print(w_SGD)

0.446112
[[-4.96142491e+00]
 [ 2.01442436e+03]
 [-1.29574866e+04]
 [ 3.03002038e+04]
 [-8.67855697e+03]
 [-3.21804591e+04]
 [ 2.17095102e+04]
 [-2.36211392e+01]
 [ 5.24669380e+01]
 [-2.30302771e+01]
 [-1.78867945e+02]
 [ 3.41976759e+02]
 [-1.73947046e+02]
 [-1.83273432e+02]
 [ 8.33323253e+02]
 [-2.45208418e+03]
 [ 4.75164840e+03]
 [-4.54242167e+03]
 [ 1.61627459e+03]
 [ 5.91956505e+00]
 [-7.07049915e+00]
 [-5.70849252e+02]
 [ 1.44304372e+04]
 [-1.01639597e+05]
 [ 2.12756302e+05]
 [ 1.49123553e+05]
 [-1.18426273e+08]
 [ 3.13525891e+10]
 [-2.55463492e+12]
 [-1.57522705e+12]
 [-6.20174591e+12]
 [ 8.86175063e+01]
 [-7.97381258e+00]
 [-3.33188977e+02]
 [ 8.20805214e+02]
 [-9.47988802e+02]
 [ 3.88460480e+02]
 [-5.28823705e+03]
 [-1.45959090e+06]
 [ 2.06369072e+08]
 [ 2.25235242e+10]
 [-9.49338089e+11]
 [ 6.24268118e+12]
 [-7.73568116e+02]
 [ 4.35135420e+03]
 [-1.04333230e+04]
 [ 1.31360668e+04]
 [-8.58993820e+03]
 [ 2.30472079e+03]
 [-2.79824407e+00]
 [ 3.43183275e+01]
 [ 5.69219511e+02]
 [-

In [31]:
path_te = "../data/test.csv"
yb_te, data_te, ids_te = load_csv_data(path_te,False)

In [32]:
data_te0 = poly_expansion(data_te, degree, False, True)
data_te0, mean_te, std_te = standardize(data_te0, mean_tr, std_tr)
data_te0 = add_constant(data_te0)
yb_te = yb_te.reshape(len(yb_te),1)

In [33]:
#print(yb_te[:50])

In [34]:
y_pred = compute_y(data_te0, w_SGD)
print(y_pred)

[[-1]
 [-1]
 [-1]
 ...
 [ 1]
 [ 1]
 [-1]]


In [23]:
path_te = "Testnumber4.csv"
yte3, data_te, ids_te = load_csv_data(path_te,False)

In [24]:
loss_gl = compute_loss(yte3, data_te0, w_SGD, "log", lambda_=0)
print(loss_gl)

0.5914053618378212


In [15]:
create_csv_submission(ids_te, y_pred, "Testnumber4.csv")