In [165]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import csv

## Loading data

In [166]:
#load the data 
def load_csv_data(data_path, sub_sample=False):
    """Loads data and returns y (class labels), tX (features) and ids (event ids)"""
    y = np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    x = np.genfromtxt(data_path, delimiter=",", skip_header=1)
    ids = x[:, 0].astype(np.int)
    input_data = x[:, 2:]

    yb = np.ones(len(y))
    yb[np.where(y == "b")] = -1

    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    return yb, input_data, ids

def cleanning(data, y):
    toDelete = []
    mean = np.mean(data, axis=0)
    var = np.var(data, axis=0)
    
    for i in range(len(data)):
        for j in range(len(data[i])):
            if((data[i][j] < (mean[j] - 2 * var[j])) | (data[i][j] > (mean[j] + 2 * var[j]))):
                toDelete = toDelete + [i]
                break
                
    return np.delete(data, toDelete, axis=0), np.delete(y, toDelete, axis=0)

#standardize the data
def standardize(x):

    mean_x = np.mean(x, axis=0)
    x = x - mean_x
    
    std_x = np.std(x, axis=0)
    x = x / std_x
    return x, mean_x, std_x

#create a csv submission
def create_csv_submission(ids, y_pred, name):

    with open(name, "w") as csvfile:
        fieldnames = ["Id", "Prediction"]
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({"Id": int(r1), "Prediction": int(r2)})
            
#Cross-validation implementation

def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold.
    
    Args:
        y:      shape=(N,)
        k_fold: K in K-fold, i.e. the fold num
        seed:   the random seed

    Returns:
        A 2D array of shape=(k_fold, N/k_fold) that indicates the data indices for each fold
    """
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval] for k in range(k_fold)]
    return np.array(k_indices)



In [167]:
train_data = load_csv_data('train.csv', sub_sample=True)
test_data = load_csv_data('test.csv', sub_sample=True)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ids = x[:, 0].astype(np.int)


In [168]:
print(test_data[1].shape)

(11365, 30)


In [169]:
train_data[0]

array([ 1., -1., -1., ...,  1., -1.,  1.])

In [170]:
train_data[1]

array([[ 138.47 ,   51.655,   97.827, ...,    1.24 ,   -2.475,  113.497],
       [ 219.057,   72.461,  124.835, ..., -999.   , -999.   ,   50.396],
       [  90.801,   27.787,   65.373, ..., -999.   , -999.   ,   62.766],
       ...,
       [ 142.347,    7.389,   99.212, ..., -999.   , -999.   ,   97.068],
       [  78.162,   46.335,   60.136, ..., -999.   , -999.   ,   32.44 ],
       [ 130.042,    4.073,   67.819, ..., -999.   , -999.   ,   51.037]])

In [171]:
train_data[2]

array([100000, 100050, 100100, ..., 349850, 349900, 349950])

In [172]:
#train_data[4] is ut of range

In [173]:
train_data_std = train_data
# 1st column is 'y'
# 2nd column is 'X'
# 3rd column is 'ids'

In [174]:
train_data_cleanned, y_clean = cleanning(train_data[1], train_data[0])
train_data_std1, mean1, std1 = standardize(train_data_cleanned) #see to what it corresponds
#train_data_std2, mean2, std2 = standardize(train_data[2]) #see to what it corresponds

In [175]:

def build_model_data(height, weight):   
    
    y = weight
    x = height
    num_samples = len(y)
    tx = np.c_[np.ones(num_samples), x]
    return y, tx


def batch_iter(y, tx, batch_size, num_batches=1, shuffle=True):
    
    data_size = len(y)

    if shuffle:
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_y = y[shuffle_indices]
        shuffled_tx = tx[shuffle_indices]
    else:
        shuffled_y = y
        shuffled_tx = tx
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        if start_index != end_index:
            yield shuffled_y[start_index:end_index], shuffled_tx[start_index:end_index]

In [176]:
import implementations

_, tx = build_model_data(train_data_std1, train_data_std1)

print(tx.shape)

_, tc = build_model_data(test_data[1], test_data[1])

y = (y_clean + 1)/2

print(max(y))
print(min(y))
    
loss, w = implementations.logistic_regression(y, tx, np.ones(len(tx[0])), 10000, 0.2)
print()
loss1, w1 = implementations.reg_logistic_regression(y, tx, 1, np.ones(len(tx[0])), 10000, 0.2)
print()
loss2, w2 = implementations.mean_squared_error_gd(y, tx, np.ones(len(tx[0])), 10000, 0.02)
print()
loss3, w3 = implementations.mean_squared_error_sgd(y, tx, np.ones(len(tx[0])), 10000, 0.002)
print()
w4, mse = implementations.least_squares(y, tx)
print("ok")
w5, mse = implementations.ridge_regression(y, tx, 0.03)

#form K subgroups randomly
k_fold = 10
seed = 0
k_indices = build_k_indices(y, k_fold, seed)

#Pattern with method
rmse_tr_tmp = []
rmse_te_tmp = []
w_tr_tmp = []
w_te_tmp = []
for k in range(k_fold):
    # Put the kth group in test
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te = y[te_indice]
    y_tr = y[tr_indice]
    x_te = tx[te_indice]
    x_tr = tx[tr_indice]
    loss_tr, w_tr = implementations.logistic_regression(y_tr, x_tr, np.ones(len(x_tr[0])), 10000, 0.2)
    loss_te, w_te = implementations.logistic_regression(y_te, x_te, np.ones(len(x_te[0])), 10000, 0.2)
    rmse_tr_tmp.append(loss_tr)
    rmse_te_tmp.append(loss_te)
    w_tr_tmp.append(w_tr)
    w_te_tmp.append(w_te)
rmse_tr = np.mean(rmse_tr_tmp, axis=0)
rmse_te = np.mean(rmse_te_tmp, axis=0)
wTR = np.mean(w_tr_tmp, axis=0)
wTE = np.mean(w_te_tmp, axis=0)

    
    
predi = implementations.sigmoid(tc @ w)
predi_t = np.sign(predi - 0.5)

predi1 = implementations.sigmoid(tc @ w1)
predi_t1 = np.sign(predi1 - 0.5)

predi2 = tc @ w2
predi_t2 = np.sign(predi2 - 0.5)

predi3 = tc @ w3
predi_t3 = np.sign(predi3 - 0.5)

predi4 = tc @ w4
predi_t4 = np.sign(predi4 - 0.5)

predi5 = tc @ w5
predi_t5 = np.sign(predi5 - 0.5)

predi6 = tc @ wTR
predi_t6 = np.sign(predi6 - 0.5)

predi7 = tc @ wTE
predi_t7 = np.sign(predi7 - 0.5)


(3911, 31)
1.0
0.0
Current iteration=0, loss=nan
Current iteration=100, loss=0.5036003103607855
Current iteration=200, loss=0.4889633069053449
Current iteration=300, loss=0.4850916139478635
Current iteration=400, loss=0.4827336658520129
Current iteration=500, loss=0.4811377684126328
Current iteration=600, loss=0.48001677551827815
Current iteration=700, loss=0.47921565795333687
Current iteration=800, loss=0.47863779455377076
Current iteration=900, loss=0.4782183435486517
Current iteration=1000, loss=0.4779121400816231
Current iteration=1100, loss=0.47768723765267657
Current iteration=1200, loss=0.47752093388990363
Current iteration=1300, loss=0.4773970704520704
Current iteration=1400, loss=0.47730412216309287
Current iteration=1500, loss=0.4772338377440826
Current iteration=1600, loss=0.4771802791096098
Current iteration=1700, loss=0.47713914657638595
Current iteration=1800, loss=0.4771073061301039
Current iteration=1900, loss=0.47708245790981624
Current iteration=2000, loss=0.477062902

Current iteration=2700, loss=0.4497235913621385
Current iteration=2800, loss=0.4496549164311195
Current iteration=2900, loss=0.4495908450108459
Current iteration=3000, loss=0.44953095353284334
Current iteration=3100, loss=0.4494748716516191
Current iteration=3200, loss=0.4494222737082194
Current iteration=3300, loss=0.449372871776758
Current iteration=3400, loss=0.4493264099734415
Current iteration=3500, loss=0.4492826597771533
Current iteration=3600, loss=0.4492414161640041
Current iteration=3700, loss=0.4492024943995004
Current iteration=3800, loss=0.44916572736405586
Current iteration=3900, loss=0.4491309633126724
Current iteration=4000, loss=0.4490980639893545
Current iteration=4100, loss=0.4490669030324112
Current iteration=4200, loss=0.4490373646191646
Current iteration=4300, loss=0.449009342308427
Current iteration=4400, loss=0.44898273804697264
Current iteration=4500, loss=0.4489574613125181
Current iteration=4600, loss=0.4489334283707954
Current iteration=4700, loss=0.44891056

Current iteration=2100, loss=0.4214193090730815
Current iteration=2200, loss=0.4213445140905104
Current iteration=2300, loss=0.42127903922384685
Current iteration=2400, loss=0.4212214215165131
Current iteration=2500, loss=0.42117046166539246
Current iteration=2600, loss=0.42112517112941084
Current iteration=2700, loss=0.4210847309140635
Current iteration=2800, loss=0.42104845920275774
Current iteration=2900, loss=0.4210157857560295
Current iteration=3000, loss=0.42098623153388054
Current iteration=3100, loss=0.4209593923813953
Current iteration=3200, loss=0.4209349258983322
Current iteration=3300, loss=0.4209125408201076
Current iteration=3400, loss=0.42089198839146846
Current iteration=3500, loss=0.4208730553298463
Current iteration=3600, loss=0.4208555580631277
Current iteration=3700, loss=0.42083933799369927
Current iteration=3800, loss=0.42082425759235625
Current iteration=3900, loss=0.42081019716582097
Current iteration=4000, loss=0.4207970521729865
Current iteration=4100, loss=0.

Current iteration=3200, loss=0.4471570184419157
Current iteration=3300, loss=0.4471373932116335
Current iteration=3400, loss=0.447118048483513
Current iteration=3500, loss=0.4470989378992763
Current iteration=3600, loss=0.44708002552926746
Current iteration=3700, loss=0.4470612834990041
Current iteration=3800, loss=0.44704269016004344
Current iteration=3900, loss=0.44702422867902086
Current iteration=4000, loss=0.44700588594837926
Current iteration=4100, loss=0.44698765174484456
Current iteration=4200, loss=0.4469695180788943
Current iteration=4300, loss=0.44695147869159474
Current iteration=4400, loss=0.44693352866523356
Current iteration=4500, loss=0.44691566412188366
Current iteration=4600, loss=0.44689788198995206
Current iteration=4700, loss=0.44688017982332023
Current iteration=4800, loss=0.4468625556611875
Current iteration=4900, loss=0.4468450079194267
Current iteration=5000, loss=0.44682753530635033
Current iteration=5100, loss=0.4468101367573864
Current iteration=5200, loss=0

  return 1.0/(1.0 + np.exp(-z))


In [179]:
print(test_data[0].shape)
print(max(test_data[0]))
print(min(test_data[0]))

test = 0

for i in range(len(predi_t)):
    if(predi_t[i] != test_data[0][i]):
        test += 1

print("test 0 : " + str(1 - test/len(predi_t)))

test = 0

for i in range(len(predi_t1)):
    if(predi_t1[i] != test_data[0][i]):
        test += 1

print("test 1 : " + str(1 - test/len(predi_t1)))


test = 0

for i in range(len(predi_t2)):
    if(predi_t2[i] != test_data[0][i]):
        test += 1

print("test 2 : " + str(1 - test/len(predi_t2)))


test = 0

for i in range(len(predi_t3)):
    if(predi_t3[i] != test_data[0][i]):
        test += 1

print("test 3 : " + str(1 - test/len(predi_t3)))


test = 0

for i in range(len(predi_t4)):
    if(predi_t4[i] != test_data[0][i]):
        test += 1

print("test 4 : " + str(1 - test/len(predi_t4)))


test = 0

for i in range(len(predi_t5)):
    if(predi_t5[i] != test_data[0][i]):
        test += 1

print("test 5 : " + str(1 - test/len(predi_t5)))

test = 0

for i in range(len(predi_t6)):
    if(predi_t6[i] != test_data[0][i]):
        test += 1

print("test 6 : " + str(1 - test/len(predi_t6)))

test = 0

for i in range(len(predi_t7)):
    if(predi_t7[i] != test_data[0][i]):
        test += 1

print("test 7 : " + str(1 - test/len(predi_t7)))


(11365,)
1.0
1.0
test 0 : 0.2788385393752749
test 1 : 0.27989441267047954
test 2 : 0.2346678398592169
test 3 : 0.1782666080070392
test 4 : 0.2942366915970084
test 5 : 0.280686317641883
test 6 : 0.2791025076990761
test 7 : 0.28438187417509897
