In [1]:
import numpy as np
from data_loader import load_data, DATA_PATH_TEST, DATA_PATH_TRAIN, DATA_PATH_SAMPLE_SUBMISSION_TEST
import logistic_regression
import gradient_descent
import matplotlib.pyplot as plt
from label_predictor import predict_labels

In [2]:
# Load the training data. Later cells walk y, tx and ids_train in lockstep with
# zip(y, tx, ids_train), so each is presumably a sequence holding one entry per
# data subset -- TODO confirm against data_loader.load_data.
y, tx, ids_train = load_data(DATA_PATH_TRAIN)

In [61]:
def build_k_indices(y, number_of_subset, seed):
    """Shuffle the sample indices of ``y`` and cut them into equal-sized folds.

    Args:
        y: sequence whose length gives the number of samples.
        number_of_subset: number of folds to build.
        seed: value handed to ``np.random.seed`` before shuffling. Note that
            this reseeds NumPy's global random generator.

    Returns:
        ``np.array`` with one entry of shuffled indices per fold. Every fold
        holds ``int(len(y) / number_of_subset)`` indices, so samples left over
        after that division are not assigned to any fold.
    """
    n_samples = len(y)
    fold_size = int(n_samples / number_of_subset)
    # Seeding first makes the permutation (and so the folds) repeatable.
    np.random.seed(seed)
    shuffled = np.random.permutation(n_samples)
    folds = [
        shuffled[fold * fold_size:(fold + 1) * fold_size]
        for fold in range(number_of_subset)
    ]
    return np.array(folds)

In [37]:
def cross_validation(y, tx, k_indices, k, gamma, alpha):
    """Train on every fold except fold ``k`` and evaluate on fold ``k``.

    Args:
        y: labels, indexable by an array of row indices. Predicted -1 values
            are mapped to 0 before the comparison, so the labels are
            presumably 0/1 -- TODO confirm.
        tx: feature matrix, indexable by an array of row indices.
        k_indices: 2-D array of row indices, one row per fold.
        k: position of the fold held out for testing.
        gamma: forwarded as the third argument of
            ``logistic_regression.regularized_logistic_regression_gradient_descent``.
        alpha: forwarded as the fifth argument of that same function.

    Returns:
        Tuple ``(loss_train, loss_test, w, percent)`` where ``percent`` is the
        percentage of held-out samples whose predicted label equals the label.
    """
    held_out = k_indices[k]
    # Keep every fold row whose position differs from k, flattened to one index vector.
    keep_fold = np.arange(k_indices.shape[0]) != k
    training = k_indices[keep_fold].reshape(-1)

    tx_train, y_train = tx[training], y[training]
    tx_test, y_test = tx[held_out], y[held_out]

    # 50 is the fourth positional argument of the trainer
    # (presumably the number of iterations -- TODO confirm).
    w, loss_train = logistic_regression.regularized_logistic_regression_gradient_descent(
        y_train, tx_train, gamma, 50, alpha)
    loss_test = logistic_regression.calculate_loss(y_test, tx_test, w)

    # Predictions equal to -1 become 0 so they can be compared with y_test.
    predictions = [0 if label == -1 else label for label in predict_labels(w, tx_test)]
    n_test = y_test.shape[0]
    hits = sum(1 for i in range(n_test) if predictions[i] == y_test[i])
    percent = 100 * hits / n_test
    return loss_train, loss_test, w, percent

In [5]:
def build_poly(tx, degree):
    """Expand every column of ``tx`` into its polynomial powers.

    Each column of ``tx`` is passed to ``build_poly_one_column`` and the
    resulting blocks are placed side by side, in the original column order.
    Every block carries its own power-0 column, so the output contains one
    such column per input column.

    Args:
        tx: 2-D array whose columns are expanded one by one.
        degree: forwarded unchanged to ``build_poly_one_column``.

    Returns:
        The horizontally concatenated per-column blocks. When ``tx`` has no
        columns, an empty array of shape ``(tx.shape[0], 0)`` is returned.
    """
    # Build all blocks first and join them once: appending with np.c_ inside
    # the loop re-copies the whole accumulated matrix for every column.
    blocks = [build_poly_one_column(column, degree) for column in tx.T]
    if not blocks:
        # No feature columns: return an empty feature matrix instead of
        # failing on an unassigned accumulator.
        return np.empty((tx.shape[0], 0))
    return np.concatenate(blocks, axis=1)

In [6]:
def build_poly_one_column(x, degree):
    """Return the matrix whose column ``p`` holds ``x`` raised to the power ``p``.

    Args:
        x: array of values; ``x.shape[0]`` sets the number of rows and the
            powers are written into single columns, so it is presumably 1-D.
        degree: highest power; the result has ``degree + 1`` columns.

    Returns:
        Array of shape ``(x.shape[0], degree + 1)`` created with ``np.zeros``
        (so floating point), with column 0 holding ``x ** 0``.
    """
    powers = np.zeros((x.shape[0], degree + 1))
    # Each row of the transposed view aliases one column of `powers`,
    # so writing into it fills that column in place.
    for exponent, column in enumerate(powers.T):
        column[...] = np.power(x, exponent)
    return powers

In [62]:
def cross_validation_test(number_of_subset=4, seed=12, degree=2, gamma=0.5, alpha=0.5):
    """Print the mean cross-validated accuracy of every data subset.

    Reads the notebook-level ``y``, ``tx`` and ``ids_train`` and walks through
    them in lockstep. For each subset it builds the folds, expands the
    features with ``build_poly`` and runs ``cross_validation`` once per fold,
    then prints the mean of the returned accuracy percentages.

    Args:
        number_of_subset: number of folds passed to ``build_k_indices``.
        seed: seed passed to ``build_k_indices``.
        degree: degree passed to ``build_poly``.
        gamma: ``gamma`` argument passed to ``cross_validation``.
        alpha: ``alpha`` argument passed to ``cross_validation``.

    Returns:
        None. The result is only printed, one line per subset.
    """
    # Earlier experiment (disabled in the original cell): drop columns
    # [5, 12, 15, 18, 19, 20, 21, 23, 25, 27, 28, 29, 30] from tx with
    # np.delete(..., axis=1) before expanding.
    # ids_train only takes part in the zip; its values are not used here.
    for y_t, tx_t, _ in zip(y, tx, ids_train):
        subset_indices = build_k_indices(y_t, number_of_subset, seed)
        tx_train = build_poly(tx_t, degree)
        # cross_validation returns (loss_train, loss_test, w, percent);
        # only the accuracy percentage is kept.
        percents = [
            cross_validation(y_t, tx_train, subset_indices, fold, gamma, alpha)[3]
            for fold in range(number_of_subset)
        ]
        print(np.mean(percents))

# Run the evaluation; it prints one mean accuracy value per data subset.
cross_validation_test()


  loss_2 = np.matmul((1 - y).T,(np.log(1 - sigma)))
  loss_1 = -np.matmul(y.T,(np.log(sigma)))


94.63629402756509
89.1931216931217
89.46476964769649
92.95392953929539
73.89548436060065
63.62817947985138
66.13950742240218
65.77547863082576


In [65]:
def cross_validation_search_param(tx, y, degrees=None, alphas=None, gammas=None):
    """Grid-search the polynomial degree, alpha and gamma with 4-fold cross-validation.

    For every (degree, alpha, gamma) combination the features are expanded
    with ``build_poly`` and ``cross_validation`` is run once per fold. The
    combination with the highest mean accuracy percentage wins; ties keep the
    combination that was evaluated first.

    Args:
        tx: feature matrix handed to ``build_poly``.
        y: labels handed to ``build_k_indices`` and ``cross_validation``.
        degrees: iterable of degrees to try; ``None`` means ``np.arange(2, 4, 1)``.
        alphas: iterable of alpha values to try; ``None`` means
            ``np.arange(0.1, 1, 0.1)``.
        gammas: iterable of gamma values to try; ``None`` means
            ``np.arange(0.1, 1, 0.1)``.

    Returns:
        Tuple ``(bestW, bestG, bestA, bestD)``: weights, gamma, alpha and
        degree of the winning combination. ``bestW`` is the weight vector
        returned for the LAST fold of that combination, not an average over
        folds. If no combination scores above 0 the initial values
        ``([], 0, 0, 0)`` are returned.
    """
    # split data in k fold
    number_of_subset = 4
    subset_indices = build_k_indices(y, number_of_subset, 12)  # last number is the seed

    # Materialise the grids so each one can be iterated repeatedly in the nested loops.
    degrees = list(np.arange(2, 4, 1) if degrees is None else degrees)
    alphas = list(np.arange(0.1, 1, 0.1) if alphas is None else alphas)
    gammas = list(np.arange(0.1, 1, 0.1) if gammas is None else gammas)

    # Best parameters found so far
    bestW = []
    bestRatio = 0
    bestG = 0
    bestA = 0
    bestD = 0

    for d in degrees:
        # The expansion only depends on the degree, so it is built once per degree.
        tx_train = build_poly(tx, d)
        for a in alphas:
            for g in gammas:
                # accuracy percentage of each fold for this combination
                ratios = []
                for k in range(number_of_subset):
                    _, _, w, ratio = cross_validation(y, tx_train, subset_indices, k, g, a)
                    ratios.append(ratio)
                mean_ratio = np.mean(ratios)
                if mean_ratio > bestRatio:
                    # w is whatever the last fold produced for this combination.
                    bestW = w
                    bestG = g
                    bestA = a
                    bestD = d
                    bestRatio = mean_ratio
    print("bestRatio:", bestRatio)
    return bestW, bestG, bestA, bestD
# Run the hyper-parameter search on every data subset and print the winning
# weights, gamma, alpha and degree for each one.
counter = 0  # running subset number, used only for the "Set" label below
# id_test is unpacked from the zip but not used in the loop body.
for y_t, tx_t, id_test in zip(y, tx, ids_train):
    bestW, bestG, bestA, bestD = cross_validation_search_param(tx_t, y_t)
    print("Set ", counter)
    print("bestW ", bestW)
    print("bestG ", bestG)
    print("bestA ", bestA)
    print("bestD ", bestD)
    counter += 1

  loss_2 = np.matmul((1 - y).T,(np.log(1 - sigma)))
  loss_1 = -np.matmul(y.T,(np.log(sigma)))


bestRatio: 94.63629402756509
Set  0
bestW  [-0.14526555 -0.14526555 -0.14526555 -0.14526555  0.14302304  0.16658398
 -0.14526555  0.01919022 -0.66265613 -0.14526555 -0.00682256 -0.12060465
 -0.14526555 -0.04619443 -0.26854505 -0.14526555 -0.00682284 -0.12060495
 -0.14526555  0.42408973  0.15267577 -0.14526555 -0.34200832 -0.13782914
 -0.14526555 -0.07503385 -0.01331175 -0.14526555  0.48325103 -0.16683199
 -0.14526555  0.04058689 -0.01780975 -0.14526555 -0.01773919 -0.12911693
 -0.14526555  0.18004038 -0.26865787 -0.14526555  0.04508566 -0.17553439
 -0.14526555  0.0151609  -0.09603653 -0.14526555  0.17154488  0.24178848
 -0.14526555  0.0283974  -0.10461213 -0.14526555 -0.00823478 -0.15668107]
bestG  0.5
bestA  0.5
bestD  2
bestRatio: 91.17724867724867
Set  1
bestW  [-0.079317   -0.079317   -0.079317   -0.079317    0.11677488  0.00608304
 -0.079317    0.03323589 -0.26166997 -0.079317   -0.0749151   0.02893454
 -0.079317   -0.03111449 -0.16121121 -0.079317    0.13082433  0.06180063
 -0.07

  return loss_1 + loss_2 / len(sigma)


bestRatio: 79.70401691331924
Set  4
bestW  [-0.02220702 -0.02220702 -0.02220702 -0.02220702 -0.47787745  0.10079395
 -0.02220702 -0.05455501 -0.3491103  -0.02220702 -0.07005964  0.06729159
 -0.02220702  0.42913035 -0.35949092 -0.02220702 -0.07005959  0.06729169
 -0.02220702  0.1983064  -0.05207411 -0.02220702 -0.44129686  0.07112309
 -0.02220702  0.00098036  0.04009628 -0.02220702  0.41282302 -0.02283691
 -0.02220702  0.00330699 -0.10101111 -0.02220702  0.00354647 -0.00864553
 -0.02220702 -0.10740618  0.03496943 -0.02220702  0.02230704 -0.21011447
 -0.02220702  0.00417427 -0.02022429 -0.02220702 -0.33998975  0.08816527
 -0.02220702 -0.03234727  0.00352862 -0.02220702  0.1237931  -0.00608006]
bestG  0.2
bestA  0.8
bestD  2
bestRatio: 75.27579308373822
Set  5
bestW  [ 2.85635272e-03  2.85635272e-03  2.85635272e-03  2.85635272e-03
 -3.63831617e-01 -6.82291746e-02  2.85635272e-03  1.89748393e-01
 -3.67182458e-01  2.85635272e-03  9.97153883e-02  1.01679496e-01
  2.85635272e-03  3.25466273e-