In [5]:
#main notebook
import numpy as np
from numpy.linalg import inv
from numpy import linalg as la
from implementations import *


In [6]:
#load data
import datetime
from helpers import *
import csv




In [72]:
def load_csv_data(data_path, sub_sample=False, set_outliers_zero=False, skip_n_header=1, skip_n_footer=0, cross_validation=False):
    """Loads data and returns y (class labels), tX (features) and ids (event ids)"""
    y = np.genfromtxt(data_path, delimiter=",", skip_header=skip_n_header, skip_footer=skip_n_footer, dtype=str, usecols=1)
    x = np.genfromtxt(data_path, delimiter=",", skip_header=skip_n_header, skip_footer=skip_n_footer)
    ids = x[:, 0].astype(np.int)
    input_data = x[:, 2:]

    # convert class labels from strings to binary (-1,1)
    yb = np.ones(len(y))
    yb[np.where(y=='b')] = -1
    
    #remove outliers by setting the values to 0
    if set_outliers_zero:
        #input_data = np.array([[ 0 if data == -999 else data for data in line.split()] for line in input_data])
        for (x,y), data in np.ndenumerate(input_data):
            if data == -999. :
                input_data[x,y] = 0.
               
    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]
        
    if cross_validation:
        n=int(2*len(yb)/3)
        return yb[0:n], input_data[0:n,:], ids [0:n], yb[n:(len(yb)-1)], input_data[n:(len(yb)-1),:], ids[n:(len(yb)-1)]
    

    return yb, input_data, ids


In [73]:
y, datas, ids = load_csv_data("train.csv",set_outliers_zero=True,cross_validation=False)

y, datas, ids, np.shape(y), np.shape(datas)

(array([ 1., -1., -1., ...,  1., -1., -1.]),
 array([[138.47 ,  51.655,  97.827, ...,   1.24 ,  -2.475, 113.497],
        [160.937,  68.768, 103.235, ...,   0.   ,   0.   ,  46.226],
        [  0.   , 162.172, 125.953, ...,   0.   ,   0.   ,  44.251],
        ...,
        [105.457,  60.526,  75.839, ...,   0.   ,   0.   ,  41.992],
        [ 94.951,  19.362,  68.812, ...,   0.   ,   0.   ,   0.   ],
        [  0.   ,  72.756,  70.831, ...,   0.   ,   0.   ,   0.   ]]),
 array([100000, 100001, 100002, ..., 349997, 349998, 349999]),
 (250000,),
 (250000, 30))

In [75]:
y, datas, ids, y_cross, datas_cross, ids_cross = load_csv_data("train.csv",set_outliers_zero=True,cross_validation=True)

y, datas, ids, np.shape(y), np.shape(datas), np.shape(datas_cross)

(array([ 1., -1., -1., ..., -1., -1.,  1.]),
 array([[ 1.38470e+02,  5.16550e+01,  9.78270e+01, ...,  1.24000e+00,
         -2.47500e+00,  1.13497e+02],
        [ 1.60937e+02,  6.87680e+01,  1.03235e+02, ...,  0.00000e+00,
          0.00000e+00,  4.62260e+01],
        [ 0.00000e+00,  1.62172e+02,  1.25953e+02, ...,  0.00000e+00,
          0.00000e+00,  4.42510e+01],
        ...,
        [ 1.38441e+02,  1.01882e+02,  9.95410e+01, ...,  0.00000e+00,
          0.00000e+00,  4.01990e+01],
        [ 0.00000e+00,  7.57380e+01,  1.75200e+01, ...,  0.00000e+00,
          0.00000e+00,  0.00000e+00],
        [ 1.12179e+02,  7.40000e-02,  7.62010e+01, ..., -2.36600e+00,
          2.64700e+00,  1.95706e+02]]),
 array([100000, 100001, 100002, ..., 266663, 266664, 266665]),
 (166666,),
 (166666, 30),
 (83333, 30))

In [93]:
def standardize(x):
    """Standardize the original data set."""
    mean_x = np.mean(x)
    x = x - mean_x
    std_x = np.std(x)
    x = x / std_x
    return x 

def create_tx(datas):
    for i in range(np.shape(datas)[1]) :
        datas[:,i] = standardize(datas[:,i])
    tx = np.c_[np.ones(np.shape(datas)[0]), datas]
    return tx

def change_to_int(y_evaluated, adapted_mean=True) :
    y_predicted = np.array([])
    special_mean = 0
    
    if adapted_mean :
        sorted_y = np.sort(y_evaluated)
        special_mean = sorted_y[int(len(sorted_y)/2 - np.sum(y_evaluated)/2)]
    print(special_mean)
        
    y_predicted = [-1 if y < special_mean else 1 for y in y_evaluated]
    return y_predicted

In [94]:
def do_a_cross_test():
    #loads the datas
    y, datas, ids, y_cross, datas_cross, ids_cross = load_csv_data("train.csv",set_outliers_zero=True,cross_validation=True)
    
    #create the good datas
    tx = create_tx(datas)
    tx_cross = create_tx(datas_cross)
    
    #finds the w via gradient descent
    L_GD,w_GD =least_squares_GD(y,tx, np.zeros(np.shape(tx)[1]),500,0.1)
    #finds by ridge regression
    L_ridge,w_ridge = ridge_regression(y, tx, 0.00001)
    #calculates the evaluation
    y_evaluated_GD = tx_cross @ w_GD
    y_evaluated_ridge = tx_cross @ w_ridge
    #changes to int
    y_pred = change_to_int(y_evaluated_GD, adapted_mean=True)
    error = 1 - np.sum((y_pred - y_cross)**2)/4/len(y_pred)
    print("GD has error : ")
    print(error)
    y_pred = change_to_int(y_evaluated_ridge, adapted_mean=True)
    error = 1- np.sum((y_pred - y_cross)**2)/4/len(y_pred)
    print("Ridge has error : ")
    print(error)
    
    
    return 

In [None]:
do_a_cross_test()

In [43]:
tx = create_tx(datas)


In [54]:
#finds the w via gradient descent
L,w =least_squares_GD(y,tx, np.zeros(np.shape(tx)[1]),500,0.1)
L,w

(0.3391594959719766,
 array([-3.14664000e-01,  3.15953267e-02, -2.45244213e-01, -2.62096921e-01,
        -1.43108193e-02, -1.44794110e-03,  1.14574506e-01, -1.12736996e-02,
         2.66679643e-01, -9.85382220e-03, -3.22126849e-02, -1.83393557e-01,
         1.22711870e-01,  1.01311323e-01,  1.88672793e-01, -1.04261481e-03,
        -9.30947646e-04,  2.83213473e-01, -2.17058273e-04,  2.45449418e-03,
         9.72061660e-02,  8.90132649e-04, -6.20547331e-02,  2.37583361e-03,
         8.71736717e-02,  4.78036689e-04,  4.43481742e-05, -7.62297013e-03,
         1.37046817e-03, -1.86167019e-03, -1.44923744e-01]))

In [55]:
#finds by ridge regression
L,w = ridge_regression(y, tx, 0.00001)


In [25]:
#import the test datas
y_real, data_test, id_test = load_csv_data("test.csv",set_outliers_zero=True)
tx_test = create_tx(data_test)

In [26]:
y_evaluated = tx_test @ w

In [27]:
np.shape(datas), np.shape(data_test), np.shape(tx), np.shape(tx_test)

((250000, 30), (568238, 30), (250000, 31), (568238, 31))

In [28]:

sorted_y = np.sort(y_evaluated)
special_mean = sorted_y[int(len(sorted_y)/2 - np.sum(y_evaluated)/2)]
for y in np.nditer(y_evaluated) :
    y_predicted = np.append(y_predicted, -1 if y < special_mean else 1)
y_predicted

array([-1., -1., -1., ...,  1.,  1., -1.])

In [29]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    """
    with open(name, 'w') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            r2 = '-1'if r2==-1 else '1'
            writer.writerow({'Id':int(r1),'Prediction':r2})

In [30]:
create_csv_submission(id_test, y_predicted, "sample_submission_5.csv")

In [None]:
#test by deleting some parameters

In [None]:
#import the datas for creating the model
path_dataset = "train.csv"
data = np.genfromtxt(path_dataset, delimiter=",", skip_header=1,usecols= [3,4,5] )

In [None]:

np.shape(data)
print(data)
np.delete(data,(1),axis=0)
data[1]

In [None]:
sb = np.genfromtxt("train.csv", dtype = str, delimiter=",", skip_header=1, skip_footer=50000, usecols=1)
SB = np.array([])
for i in sb: 
    SB = np.append(SB,0 if i == 's' else 1)

In [None]:
num_samples = len(SB)
tx = np.c_[np.ones(num_samples), data]
tx

In [None]:
L,w =least_squares_GD(standardize(SB)[0],tx, [0,0,0,0],200,0.1)

In [None]:
L,w


In [None]:
data_test = np.genfromtxt("train.csv", delimiter=",", skip_header=200001, usecols= [3,4,5] )
data_test, mean_x, std_x = standardize(data_test)
np.shape(data_test)

In [None]:
sb_test = np.genfromtxt("train.csv", dtype = str, delimiter=",", skip_header=200001, usecols=1)
SB_test = np.array([])
for i in sb_test: 
    SB_test = np.append(SB_test,0 if i == 's' else 1)


In [None]:
num_samples = len(SB_test)
tx_test = np.c_[np.ones(num_samples), data_test]
tx_test

In [None]:
Y_test = tx_test @ w

In [None]:
res = np.array([])
mean = standardize(SB)[1]
for y in Y_test :
    res = np.append(res, 0 if y < mean else 1)

In [None]:
Y_test[0:20], res[0:20]

In [None]:
np.sum( (res- SB_test)**2) / len(SB_test)

In [35]:
y_predicted = np.array([])
sorted_y = np.sort(y_evaluated)
special_mean = sorted_y[int(len(sorted_y)/2 - np.sum(y_evaluated)/2)]

y_predicted = [-1 if y < special_mean else 1 for y in y_evaluated]
y_predicted

[-1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 -1,

In [32]:
dank = np.asarray([-1 if y < special_mean else 1 for y in y_evaluated])

In [33]:
print(dank)

[-1 -1 -1 ...  1  1 -1]


In [34]:
y_predicted


array([-1., -1., -1., ...,  1.,  1., -1.])