In [1]:
#main notebook
import numpy as np
from numpy.linalg import inv
from numpy import linalg as la
from implementations import *


In [3]:
#load data
import datetime
from helpers import *




In [4]:
def load_csv_data(data_path, sub_sample=False, remove_outliers=False, skip_n_header=1, skip_n_footer=0):
    """Load a CSV data set and return y (class labels), tX (features) and ids (event ids).

    The file is read with the event id in column 0, the class label in
    column 1 and the numeric features from column 2 onwards.

    Args:
        data_path: path of the CSV file to read.
        sub_sample: if True, keep only every 50th row.
        remove_outliers: if True, drop every row whose feature sum is negative.
        skip_n_header: number of leading lines to skip.
        skip_n_footer: number of trailing lines to skip.

    Returns:
        (yb, input_data, ids): labels encoded as -1.0 for 'b' and 1.0 for
        anything else, the 2-D feature matrix, and the integer event ids.
    """
    y = np.genfromtxt(data_path, delimiter=",", skip_header=skip_n_header, skip_footer=skip_n_footer, dtype=str, usecols=1)
    x = np.genfromtxt(data_path, delimiter=",", skip_header=skip_n_header, skip_footer=skip_n_footer)

    # genfromtxt collapses a file with a single data row to a lower rank;
    # restore the (rows, columns) layout the slicing below relies on.
    y = np.atleast_1d(y)
    x = np.atleast_2d(x)

    # np.int was removed from NumPy; the builtin int is the supported spelling.
    ids = x[:, 0].astype(int)
    input_data = x[:, 2:]

    # convert class labels from strings to binary (-1,1)
    yb = np.ones(len(y))
    yb[y == 'b'] = -1

    # remove outliers: one boolean mask over all rows (including row 0)
    # instead of deleting rows one at a time.
    if remove_outliers:
        keep = ~(np.sum(input_data, axis=1) < 0)
        yb = yb[keep]
        input_data = input_data[keep]
        ids = ids[keep]

    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    return yb, input_data, ids


In [6]:
# Load the full training set: labels in `y`, raw features in `datas`, event ids in `ids`.
y, datas, ids = load_csv_data("train.csv")


In [7]:
def standardize(x):
    """Standardize ``x`` to zero mean and unit standard deviation.

    The mean and standard deviation are taken over all elements of ``x``
    (no axis is given). A constant input has zero standard deviation; it is
    returned centered (all zeros) instead of being divided by zero, which
    would fill the result with NaN.

    Args:
        x: array of numeric values.

    Returns:
        The standardized array, same shape as ``x``.
    """
    centered = x - np.mean(x)
    std_x = np.std(centered)
    if std_x == 0:
        # Nothing to scale: every value equals the mean.
        return centered
    return centered / std_x

def create_tx(datas):
    """Build the design matrix: a column of ones followed by standardized features.

    Every column of ``datas`` is standardized independently (zero mean, unit
    standard deviation). The work is done on a float copy, so the caller's
    array is left untouched and the call can be repeated with the same result.
    Constant columns are only centered (they become all zeros) rather than
    being divided by a zero standard deviation.

    Args:
        datas: 2-D array of shape (samples, features).

    Returns:
        Array of shape (samples, features + 1) whose first column is all ones.
    """
    features = np.array(datas, dtype=float)  # np.array copies: never write back into the input
    centered = features - features.mean(axis=0)
    std = centered.std(axis=0)
    safe_std = np.where(std == 0, 1.0, std)  # avoid 0/0 on constant columns
    return np.c_[np.ones(features.shape[0]), centered / safe_std]

In [8]:
# Build the training design matrix: a bias column of ones plus the standardized features.
tx = create_tx(datas)

In [9]:
# Show the design matrix (the first column is the bias term of ones).
tx

array([[ 1.        ,  0.46141372,  0.06833197, ...,  1.5668    ,
         1.55858439,  0.4125105 ],
       [ 1.        ,  0.51670419,  0.55250482, ..., -0.63936657,
        -0.63936694, -0.27381996],
       [ 1.        , -2.33785898,  3.19515553, ..., -0.63936657,
        -0.63936694, -0.29396985],
       ...,
       [ 1.        ,  0.38016991,  0.31931645, ..., -0.63936657,
        -0.63936694, -0.31701723],
       [ 1.        ,  0.35431502, -0.84532397, ..., -0.63936657,
        -0.63936694, -0.74543941],
       [ 1.        , -2.33785898,  0.66533608, ..., -0.63936657,
        -0.63936694, -0.74543941]])

In [10]:
# Fit the weights w with gradient descent, starting from an all-zero weight vector.
# NOTE(review): 200 and 0.1 are presumably the iteration count and the step size —
# confirm against the least_squares_GD signature in implementations.py.
L,w =least_squares_GD(y,tx, np.zeros(np.shape(tx)[1]),200,0.1)
# Show the returned pair (presumably the final loss and the weights — confirm).
L,w

(0.340476032998648,
 array([-0.314664  ,  0.0324202 , -0.24756281, -0.23148437, -0.00947043,
        -0.02133883,  0.35881233, -0.02963645,  0.25019704, -0.00839   ,
        -0.00167906, -0.15538766,  0.11584467, -0.02346006,  0.19795288,
        -0.0007597 , -0.00122577,  0.24911607, -0.00086921,  0.0025014 ,
         0.10971792,  0.00103879, -0.07072691, -0.14648255,  0.03857581,
         0.0434163 ,  0.04343473, -0.02533575, -0.02425274, -0.02439544,
        -0.10332556]))

In [11]:
# Load the evaluation set and build its design matrix the same way as for training.
# NOTE(review): create_tx standardizes with the statistics of the array it is given,
# so the test features are scaled with test-set statistics rather than the training
# ones — confirm this is intended.
y_real, data_test, id_test = load_csv_data("test.csv")
tx_test = create_tx(data_test)

In [12]:
# Raw linear scores for the test set: design matrix times the fitted weights.
y_evaluated = tx_test @ w

In [13]:
# Turn the raw scores into -1/+1 predictions.
sorted_y = np.sort(y_evaluated)
# Cut-off heuristic: start from the median position and shift it by half the
# sum of the scores. The index is clamped so that an extreme sum can neither
# wrap around to the other end of the array (negative index) nor run past it.
threshold_index = int(len(sorted_y)/2 - np.sum(y_evaluated)/2)
threshold_index = min(max(threshold_index, 0), len(sorted_y) - 1)
special_mean = sorted_y[threshold_index]
# Vectorized thresholding: avoids growing an array element by element and,
# unlike a `for y in ...` loop, does not overwrite the training labels `y`.
y_predicted = np.where(np.ravel(y_evaluated) < special_mean, -1.0, 1.0)
y_predicted

array([-1., -1., -1., ...,  1.,  1., -1.])

In [None]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels; -1 is written as 'b', anything else as 's')
               name (string name of .csv output file to be created)
    """
    # Local import: the notebook only gets `csv` indirectly (if at all) through
    # a star import, so make the dependency explicit here.
    import csv

    # newline='' hands line-ending control to the csv module; without it some
    # platforms write an extra blank line after every row.
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for event_id, label in zip(ids, y_pred):
            writer.writerow({'Id': int(event_id), 'Prediction': 'b' if label == -1 else 's'})

In [195]:
# Write the test-set predictions to disk in the submission format.
create_csv_submission(id_test, y_predicted, "sample_submission.csv")

In [181]:
# Inspect the scores sorted in ascending order (the array the threshold is picked from).
sorted_y

array([-3.61096876, -3.00082722, -2.93134027, ...,  2.9798361 ,
        3.08742585,  4.78075635])

In [182]:
# Side-by-side sums of the raw scores, the thresholded predictions and the true labels,
# plus a rounded index offset (half the sample count plus the score sum).
np.sum(y_evaluated), np.sum(y_predicted), np.sum(y_real), np.rint(len(sorted_y)/2 + np.sum(y_evaluated))

(-15785.649028863096, -15785.0, -15675.0, 9215.0)

In [186]:
# Misclassification rate: with labels in {-1, 1}, ((pred - true) / 2) ** 2 is 1 for a
# wrong prediction and 0 for a correct one, so the mean is the fraction of errors.
np.sum(((y_predicted-y_real)/2)**2)/len(y_real)

0.253694926101478

In [90]:
# Load CSV columns 3, 4 and 5 of the training file (header row skipped) to build a smaller model.
path_dataset = "train.csv"
data = np.genfromtxt(path_dataset, delimiter=",", skip_header=1,usecols= [3,4,5] )

In [98]:

# Scratch inspection of the loaded columns.
np.shape(data)  # value is discarded: only the cell's last expression is displayed
print(data)
# NOTE: np.delete returns a new array; the result is not assigned, so `data` is unchanged.
np.delete(data,(1),axis=0)
data[1]  # still the original second row

[[ 51.655  97.827  27.98 ]
 [ 68.768 103.235  48.146]
 [162.172 125.953  35.635]
 ...
 [ 60.526  75.839  39.757]
 [ 19.362  68.812  13.504]
 [ 72.756  70.831   7.479]]


array([ 68.768, 103.235,  48.146])

In [31]:
# Read the label column of the training file, leaving out the last 50000 rows.
sb = np.genfromtxt("train.csv", dtype = str, delimiter=",", skip_header=1, skip_footer=50000, usecols=1)
# Encode 's' as 0 and every other label as 1 in a single vectorized step
# (appending to an array inside a loop copies the whole array on every iteration).
SB = np.where(sb == 's', 0.0, 1.0)

In [45]:
# Design matrix for the smaller model: a bias column of ones plus the columns of `data`.
# NOTE(review): this rebinds `tx`, replacing the full-feature design matrix built earlier.
# NOTE(review): `data` is loaded without skip_footer while SB leaves out the last 50000
# rows, and np.c_ needs matching row counts — confirm `data` was trimmed (and
# standardized) in a cell that is not part of this notebook any more.
num_samples = len(SB)
tx = np.c_[np.ones(num_samples), data]
tx

array([[ 1.        , -0.22239874,  0.70041702, -0.69557866],
       [ 1.        ,  0.1196299 ,  0.80850391, -0.2925313 ],
       [ 1.        ,  1.98644709,  1.26255676, -0.54258215],
       ...,
       [ 1.        , -0.01749734,  0.02125644,  0.03350815],
       [ 1.        ,  0.37365806, -0.15408495, -1.2491442 ],
       [ 1.        ,  0.13797749,  0.73781173, -0.67425308]])

In [60]:
# Fit the smaller model by gradient descent, starting from zero weights.
# NOTE(review): standardize(SB)[0] assumes standardize returns a tuple with the
# standardized array first; the standardize defined earlier in this notebook returns
# just the array, for which [0] selects its first element — confirm which is intended.
L,w =least_squares_GD(standardize(SB)[0],tx, [0,0,0,0],200,0.1)

In [61]:
# Show the result of the fit (presumably the final loss and the weights — confirm).
L,w


(0.43093491086929886,
 array([ 0.14320443,  0.47081878, -0.06982675, -0.08973428]))

In [48]:
# Hold-out split: skip the header plus the first 200000 rows and keep CSV columns 3, 4 and 5.
# NOTE(review): this rebinds `data_test`, and the three-way unpacking assumes standardize
# returns (x, mean, std); the standardize defined earlier in this notebook returns only
# the array — confirm which version is intended.
data_test = np.genfromtxt("train.csv", delimiter=",", skip_header=200001, usecols= [3,4,5] )
data_test, mean_x, std_x = standardize(data_test)
np.shape(data_test)

(50000, 3)

In [40]:
# Read the label column of the hold-out rows (header plus the first 200000 rows skipped).
sb_test = np.genfromtxt("train.csv", dtype = str, delimiter=",", skip_header=200001, usecols=1)
# Encode 's' as 0 and every other label as 1 in a single vectorized step
# (appending to an array inside a loop copies the whole array on every iteration).
SB_test = np.where(sb_test == 's', 0.0, 1.0)


In [49]:
# Hold-out design matrix: a bias column of ones plus the hold-out feature columns.
# NOTE(review): this rebinds `num_samples` and `tx_test` defined by earlier cells.
num_samples = len(SB_test)
tx_test = np.c_[np.ones(num_samples), data_test]
tx_test

array([[ 1.        ,  0.74100665,  0.44790861, -0.6610894 ],
       [ 1.        , -0.77799727, -0.32603467, -0.22190143],
       [ 1.        ,  0.05888105,  0.41926549, -0.46988904],
       ...,
       [ 1.        , -0.04405456,  0.26159855, -0.45861143],
       [ 1.        , -0.86570312,  0.12133704, -0.98263096],
       [ 1.        ,  0.20006073,  0.16163703, -1.10289217]])

In [63]:
# Raw linear scores for the hold-out rows: design matrix times the fitted weights.
Y_test = tx_test @ w

In [68]:
# NOTE(review): standardize(SB)[1] assumes standardize returns a tuple whose second
# item is the mean of SB; the standardize defined earlier in this notebook returns only
# the standardized array — confirm which version is intended.
mean = standardize(SB)[1]
# Scores below the threshold map to 0, all others to 1. Vectorized so the array is
# built in one step and the training labels `y` are not overwritten by a loop variable.
res = np.where(Y_test < mean, 0.0, 1.0)

In [69]:
# Peek at the first 20 raw scores next to their thresholded labels.
Y_test[0:20], res[0:20]

(array([ 0.52013066, -0.18041319,  0.18381595,  0.22603976,  0.3817129 ,
        -0.30583385,  0.06966408,  0.46115635,  0.15767803, -0.48368688,
        -0.15350885,  0.25030378,  0.13419517,  0.3884125 , -0.39979502,
        -0.00933673,  0.53627554,  0.21850201,  0.10662062, -0.38788442]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]))

In [70]:
# Mean squared difference between predicted and true labels; with 0/1 labels on both
# sides this is the fraction of misclassified hold-out rows.
np.sum( (res- SB_test)**2) / len(SB_test)

0.642