In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
from proj1_helpers import *
%load_ext autoreload
%autoreload 2

**Test of regular training functions**

In [2]:
# Problem size, RNG seed and noise level handed to generate_data.
nsamples, nfeatures = 1000, 20
seed, std = 20, 0.1
# Build the synthetic data set; `w` is presumably the weight vector the data
# was generated from (TODO confirm), so `loss` acts as a baseline for the fits below.
y, tx, w = generate_data(nsamples, nfeatures, seed, std)
loss = compute_loss(y, tx, w)

In [3]:
# Show the baseline loss followed by the weights returned by generate_data.
print(loss, w, sep="\n")

0.005309625925735089
[[ 0.1762616 ]
 [ 0.79542746]
 [ 0.78306146]
 [ 0.63167495]
 [-0.92822083]
 [ 0.38351516]
 [-0.24263812]
 [ 0.03702189]
 [ 0.31590293]
 [-0.61229956]
 [-0.4553672 ]
 [ 0.43721187]
 [ 0.56600722]
 [ 0.70065528]
 [ 0.55048979]
 [-0.92667139]
 [-0.76661253]
 [ 0.5025614 ]
 [-0.52156357]
 [-0.49038797]
 [ 0.71525106]]


In [4]:
# Settings shared by the iterative trainers: an all-zero start with
# nfeatures + 1 rows (the extra row presumably belongs to an offset term),
# the iteration cap and the step size.
initial_w = np.zeros((nfeatures + 1, 1))
max_iters, gamma = 100000, 0.1

Least squares GD

In [5]:
# Fit with least_squares_GD from initial_w, using the shared iteration cap and step size.
w, loss = least_squares_GD(y, tx, initial_w, max_iters, gamma)

Terminated least_squares_GD after  589  iterations.


In [6]:
# Loss and weights produced by least_squares_GD.
print(loss, w, sep="\n")

0.005263113775547522
[[ 0.15483312]
 [ 0.77521245]
 [ 0.78409973]
 [ 0.63694343]
 [-0.92573112]
 [ 0.3872494 ]
 [-0.22861084]
 [ 0.05145808]
 [ 0.32834661]
 [-0.61536715]
 [-0.45421206]
 [ 0.43222627]
 [ 0.54526185]
 [ 0.71506445]
 [ 0.55147459]
 [-0.92010031]
 [-0.74650163]
 [ 0.4862108 ]
 [-0.51670025]
 [-0.47710491]
 [ 0.71034127]]


Least squares SGD

In [7]:
# Fit with least_squares_SGD; note that the iteration cap passed here is the
# literal 10000, not the max_iters variable used for least_squares_GD.
w, loss = least_squares_SGD(y, tx, initial_w, 10000, gamma)

Terminated least_squares_SGD after  599  iterations.


In [8]:
# Loss and weights produced by least_squares_SGD.
print(loss, w, sep="\n")

0.00594309918821907
[[ 0.18034311]
 [ 0.77059666]
 [ 0.78607342]
 [ 0.62818267]
 [-0.93924056]
 [ 0.38074666]
 [-0.21715287]
 [ 0.05141713]
 [ 0.33083564]
 [-0.6170471 ]
 [-0.44230045]
 [ 0.43335394]
 [ 0.54072466]
 [ 0.71553842]
 [ 0.56287622]
 [-0.92082503]
 [-0.73391364]
 [ 0.48949539]
 [-0.50896692]
 [-0.48236113]
 [ 0.71326105]]


Least squares

In [9]:
# Fit with least_squares, which takes only the targets and the design matrix.
w, loss = least_squares(y, tx)

In [10]:
# Loss and weights produced by least_squares.
print(loss, w, sep="\n")

0.005208633401473408
[[ 0.20150237]
 [ 0.77864141]
 [ 0.7873774 ]
 [ 0.63871312]
 [-0.94142934]
 [ 0.38671696]
 [-0.22980625]
 [ 0.04835073]
 [ 0.31770175]
 [-0.62597692]
 [-0.4691441 ]
 [ 0.43757343]
 [ 0.54944065]
 [ 0.71450421]
 [ 0.55213022]
 [-0.93721272]
 [-0.75704205]
 [ 0.48227548]
 [-0.52786446]
 [-0.48925327]
 [ 0.71283803]]


Ridge regression

In [11]:
# Penalty strength handed to ridge_regression in the next cell.
lambda_ = 1e-4

In [12]:
# Fit with ridge_regression using the penalty strength lambda_ set above.
w, loss = ridge_regression(y, tx, lambda_)

In [13]:
# Loss and weights produced by ridge_regression.
print(loss, w, sep="\n")

0.005210508424564729
[[ 0.20013947]
 [ 0.77695313]
 [ 0.78568382]
 [ 0.63736215]
 [-0.93871185]
 [ 0.38592071]
 [-0.22928267]
 [ 0.04840484]
 [ 0.31764339]
 [-0.62408123]
 [-0.46742257]
 [ 0.43642401]
 [ 0.54821894]
 [ 0.71315937]
 [ 0.55107625]
 [-0.93440663]
 [-0.75498797]
 [ 0.48153747]
 [-0.52622629]
 [-0.48756298]
 [ 0.71133883]]


**Test of Logistic Regression on actual data**

In [63]:
# Read labels, features and ids from the training CSV. The second positional
# argument is True here (presumably the loader's sub-sampling switch — TODO confirm).
path_tr = "../data/train.csv"
yb_tr, data_tr, ids_tr = load_csv_data(path_tr, True)

In [69]:
# Standardise the raw training features. standardize returns the transformed
# data plus two statistics, kept as mean_tr / std_tr so they can be reused on
# other data later.
data_tr0, mean_tr, std_tr = standardize(data_tr)
# Pass the result through add_constant (presumably adds a constant column for
# the offset term — TODO confirm against implementations.py).
data_tr0 = add_constant(data_tr0)

In [70]:
# One weight per column of the preprocessed design matrix, each starting at
# 1 / nfeatures (see the discussion in the note below).
nfeatures = data_tr0.shape[1]
initial_w = np.ones((nfeatures, 1)) / nfeatures

Since the features can take numeric values in an arbitrary range, $\exp(X^\top w)$ can become very large when choosing $w_{initial}=1$ (or any other nonzero value), resulting in $\sigma \approx 1$. On the other hand, choosing $w_{initial}=0$ results in $\sigma=0.5$.
I am not sure, though, whether that actually influences the stability of the method.
On second thought, standardisation or normalisation would actually make it reasonable to use a nonzero $w_{initial}$. I was thinking of maybe $w_{initial} = \tfrac{1}{\#\,\text{features}}$.

In [71]:
# Lower the iteration cap for the real data set and fit logistic_regression.
# The last positional argument is the literal 0.2, not the gamma variable
# defined for the least-squares tests.
max_iters = 5000
w, loss = logistic_regression(yb_tr, data_tr0, initial_w, max_iters, 0.2)

In [72]:
# Loss and weights produced by logistic_regression.
print(loss, w, sep="\n")

0.2798
[[-2.52684145e+06]
 [ 7.22837053e+05]
 [-8.86942722e+05]
 [-8.91851537e+04]
 [ 2.38212051e+05]
 [ 3.00160112e+04]
 [ 1.58894333e+05]
 [ 2.67322358e+04]
 [ 3.33420963e+05]
 [-8.03067947e+04]
 [ 4.32986503e+04]
 [-3.14424400e+05]
 [ 5.73245164e+05]
 [ 2.91177055e+04]
 [ 3.60105593e+05]
 [ 4.92375251e+04]
 [-2.92017249e+04]
 [ 3.63261880e+04]
 [ 6.80401919e+04]
 [ 5.73310498e+04]
 [-1.42973964e+05]
 [-3.53037435e+03]
 [ 1.57540613e+03]
 [ 7.07173749e+03]
 [ 2.06721977e+05]
 [ 2.06028883e+05]
 [ 2.06406455e+05]
 [ 1.98739024e+04]
 [ 2.88896921e+04]
 [ 2.86823816e+04]
 [-3.23444273e+04]]


In [73]:
# Rescale the fitted weights by their mean to make their relative sizes easier
# to read. The result is stored under a new name so the weights returned by
# logistic_regression stay available in `w` instead of being overwritten.
# Caveat: when the mean is negative (as for the fit printed above) this
# division flips the sign of every entry, so only magnitudes are comparable.
w_normalized = w / np.mean(w)
print(w_normalized)

[[ 1.16273904e+02]
 [-3.32617174e+01]
 [ 4.08131239e+01]
 [ 4.10390055e+00]
 [-1.09614496e+01]
 [-1.38120214e+00]
 [-7.31160414e+00]
 [-1.23009753e+00]
 [-1.53425365e+01]
 [ 3.69535831e+00]
 [-1.99240958e+00]
 [ 1.44683999e+01]
 [-2.63781700e+01]
 [-1.33986614e+00]
 [-1.65704434e+01]
 [-2.26568995e+00]
 [ 1.34373233e+00]
 [-1.67156816e+00]
 [-3.13090430e+00]
 [-2.63811763e+00]
 [ 6.57902024e+00]
 [ 1.62451985e-01]
 [-7.24931204e-02]
 [-3.25409625e-01]
 [-9.51241770e+00]
 [-9.48052464e+00]
 [-9.49789882e+00]
 [-9.14507804e-01]
 [-1.32937399e+00]
 [-1.31983449e+00]
 [ 1.48834540e+00]]


In [74]:
# Fit reg_logistic_regression on the standardised training data with the same
# start point, iteration cap and trailing 0.2 as the unregularised run.
# NOTE(review): the third positional argument (presumably the regularisation
# strength) is the expression 1+4, i.e. 5. This may be a typo for 1e-4 or
# 1e+4 — confirm the intended value.
w, loss = reg_logistic_regression(yb_tr,data_tr0,1+4,initial_w,max_iters,0.2)

In [75]:
# Loss and weights produced by reg_logistic_regression.
print(loss, w, sep="\n")

0.6064
[[-667.99534162]
 [ 129.6766822 ]
 [-228.66895501]
 [ -14.94019797]
 [ -82.44496554]
 [-270.10833093]
 [-207.94622662]
 [-271.13099738]
 [ 143.39873142]
 [-125.98049701]
 [-168.71453466]
 [-183.56434577]
 [  85.82721938]
 [-270.38204073]
 [ 106.60059948]
 [  26.15895072]
 [ -16.03729886]
 [-112.25572516]
 [  30.03502361]
 [  16.39320674]
 [-128.22769493]
 [   4.9357043 ]
 [-165.3115051 ]
 [-238.13357031]
 [ -98.25480609]
 [ -91.23495215]
 [ -91.20886192]
 [-271.05084782]
 [-270.44408328]
 [-270.51287909]
 [-195.23694477]]


In [76]:
# Load the test set. The original cell pointed path_te at train.csv, which
# re-loaded the training data under the *_te names.
path_te = "../data/test.csv"
yb_te, data_te, ids_te = load_csv_data(path_te, True)
# Apply the training statistics (mean_tr / std_tr) to the test features so both
# sets go through the same transformation; standardising the test set with its
# own statistics would feed the model differently scaled inputs.
data_te0, mean_te, std_te = standardize(data_te, mean_tr, std_tr)
data_te0 = add_constant(data_te0)

- log_reg working correctly?
- stochastic approach?
- iteration stopping conditions?
- preprocessing? (use all features?)
- ...?

TODO:
- bias_variance_decomposition
- cross_validation
- again test all methods
- prepare script run.py

TEST:

Required Functions

- logistic_regression
- reg_logistic_regression

Trainers

- my_least_squares_GD
- my_least_squares_SGD
- my_least_squares
- my_ridge_regression
- my_logistic_regression
- my_reg_logistic_regression
- my_stoch_logistic_regression

Utility functions for trainers

- compute_gradient
- compute_sigma
- batch_iter

Loss functions

- compute_loss

Preprocessing

- standardize
- split_data
- add_constant
- poly_expansion

Utility

- generate_data
- generate_bin_data
- column_array
- compute_y

In [44]:
# Re-read the training CSV, this time with False as the second positional
# argument (presumably disabling the loader's sub-sampling — TODO confirm).
path_tr = "../data/train.csv"
yb_tr, data_tr, ids_tr = load_csv_data(path_tr, False)

In [101]:
# Training-side preprocessing: poly_expansion up to `degree`, then standardize
# (keeping the returned statistics), then add_constant.
degree = 7
data_tr0, mean_tr, std_tr = standardize(poly_expansion(data_tr, degree, False, False))
data_tr0 = add_constant(data_tr0)
# One start weight per column of the expanded design matrix, each 1 / nfeatures.
nfeatures = data_tr0.shape[1]
initial_w = np.ones((nfeatures, 1)) / nfeatures

In [110]:
# Regularisation strength, set once and forwarded to the trainer below.
lambda_ = 0
# Active trainer: my_logistic_regression with the 'newton' option. The results
# keep the *_SGD names because the following cells read w_SGD / loss_SGD.
w_SGD, loss_SGD = my_logistic_regression(
    yb_tr, data_tr0, initial_w, 2000, 0.02, 'newton', lambda_=lambda_, eps=1e-05
)
# Alternative trainers, currently disabled:
#w_SGD, loss_SGD = my_least_squares_SGD(yb_tr, data_tr0, initial_w, max_iters=1000, gamma=0.01, batch_size=4, lambda_=0, eps=1e-5)
#w_SGD, loss_SGD = my_stoch_logistic_regression(yb_tr, data_tr0, initial_w, 2000, 0.02, 10, "newton", lambda_=0, eps=1e-5)

iteration:  0
iteration:  1


In [109]:
# Loss and weights returned by the trainer selected in the previous cell.
print(loss_SGD, w_SGD, sep="\n")

0.5811240000000001
[[-2.19981772e+48]
 [ 8.48159782e+47]
 [-1.33800157e+49]
 [ 7.81334072e+49]
 [-1.48655526e+50]
 [ 2.45484290e+49]
 [ 1.62921717e+50]
 [-1.02519301e+50]
 [-3.86430281e+46]
 [ 3.66485168e+47]
 [-2.45462593e+48]
 [ 1.09034371e+49]
 [-2.60394940e+49]
 [ 3.01192904e+49]
 [-1.32332807e+49]
 [ 1.30261708e+47]
 [-2.84341012e+47]
 [-9.75670152e+47]
 [ 1.08524454e+49]
 [-3.50296715e+49]
 [ 4.66826745e+49]
 [-2.19279710e+49]
 [-6.18656823e+46]
 [ 7.07187727e+47]
 [-1.40592445e+49]
 [ 1.83051717e+50]
 [-1.13853448e+51]
 [ 3.24273513e+51]
 [-3.37243712e+51]
 [ 2.23729653e+44]
 [-6.51113580e+44]
 [ 1.95453026e+45]
 [ 2.98297115e+45]
 [ 2.14007709e+45]
 [-5.19408869e+44]
 [ 3.46922577e+44]
 [-1.11306435e+34]
 [ 3.16302176e+34]
 [-1.26801930e+35]
 [ 2.90050571e+35]
 [-4.68516735e+35]
 [ 4.04465355e+35]
 [-1.43377655e+35]
 [-1.52782114e+35]
 [ 3.30561762e+37]
 [-1.72640237e+39]
 [ 1.60209183e+41]
 [ 1.34264388e+42]
 [-7.70468534e+44]
 [ 7.96688323e+44]
 [-9.57387029e+33]
 [ 7.0004994

In [92]:
# Read labels, features and ids from the test CSV, with False as the second
# positional argument (same setting as the full training load).
path_te = "../data/test.csv"
yb_te, data_te, ids_te = load_csv_data(path_te, False)

In [94]:
# Test-side preprocessing mirrors the training pipeline: same polynomial
# degree, and standardize is given the training statistics mean_tr / std_tr.
data_te0, mean_te, std_te = standardize(
    poly_expansion(data_te, degree, False, False), mean_tr, std_tr
)
data_te0 = add_constant(data_te0)
# Turn the label vector into a single-column 2-D array.
yb_te = yb_te.reshape(len(yb_te), 1)

In [95]:
#print(yb_te[:50])

In [97]:
# Predict labels for the preprocessed test features with the weights from the
# trainer cell (w_SGD); the printed output shows values of -1 and 1.
y_pred = compute_y(data_te0, w_SGD)
print(y_pred)

[[-1]
 [-1]
 [-1]
 ...
 [-1]
 [ 1]
 [-1]]


In [99]:
# NOTE(review): y_pred was itself computed from data_te0 and w_SGD in the
# previous cell, so this call scores the model against its own predictions and
# the printed 0.0 carries no information about generalisation. If real targets
# exist for this data (yb_te was reshaped to a column above, presumably for
# this purpose), they should be passed instead — confirm.
loss_gl = compute_loss(y_pred, data_te0, w_SGD, "log", lambda_=0)
print(loss_gl)

0.0


In [100]:
# Write the test ids and predicted labels to "Testnumber3.csv" via
# create_csv_submission (the file name is relative, so it presumably lands in
# the notebook's working directory).
create_csv_submission(ids_te, y_pred, "Testnumber3.csv")