In [5]:
#main notebook
import numpy as np
from numpy.linalg import inv
from numpy import linalg as la
from implementations import *


In [6]:
#load data
import datetime
from helpers import *
import csv




In [38]:
def load_csv_data(data_path, sub_sample=False, set_outliers_zero=False, skip_n_header=1, skip_n_footer=0):
    """Loads data and returns y (class labels), tX (features) and ids (event ids)"""
    y = np.genfromtxt(data_path, delimiter=",", skip_header=skip_n_header, skip_footer=skip_n_footer, dtype=str, usecols=1)
    x = np.genfromtxt(data_path, delimiter=",", skip_header=skip_n_header, skip_footer=skip_n_footer)
    ids = x[:, 0].astype(np.int)
    input_data = x[:, 2:]

    # convert class labels from strings to binary (-1,1)
    yb = np.ones(len(y))
    yb[np.where(y=='b')] = -1
    
    #remove outliers by setting the values to 0
    if set_outliers_zero:
        input_data = np.array([[ 0 if data == -999 else data for data in line.split()] for line in input_data])
        #for (x,y), data in np.ndenumerate(input_data):
         #   if data == -999. :
          #      input_data[x,y] = 0.
               
    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    return yb, input_data, ids


In [39]:
y, datas, ids = load_csv_data("train.csv",set_outliers_zero=True)

y, datas, ids, datas[100], datas [10000]

AttributeError: 'numpy.ndarray' object has no attribute 'split'

In [21]:
def standardize(x):
    """Standardize the original data set."""
    mean_x = np.mean(x)
    x = x - mean_x
    std_x = np.std(x)
    x = x / std_x
    return x 

def create_tx(datas):
    for i in range(np.shape(datas)[1]) :
        datas[:,i] = standardize(datas[:,i])
    tx = np.c_[np.ones(np.shape(datas)[0]), datas]
    return tx

In [22]:
tx = create_tx(datas)


In [11]:
#finds the w via gradient descent
L,w =least_squares_GD(y,tx, np.zeros(np.shape(tx)[1]),600,0.1)
L,w

(0.3391140484651175,
 array([-3.14664000e-01,  3.16149177e-02, -2.45832305e-01, -2.63023327e-01,
        -1.67566637e-02, -4.57762063e-03,  1.15434388e-01, -1.22389391e-02,
         2.67158858e-01, -8.50151758e-03, -4.09019236e-02, -1.84348180e-01,
         1.22328662e-01,  1.01517792e-01,  1.90275712e-01, -1.04399302e-03,
        -9.32158998e-04,  2.86527327e-01, -2.29911478e-04,  2.45112272e-03,
         9.80728322e-02,  8.83736570e-04, -6.14744523e-02,  5.01273098e-03,
         9.94605213e-02,  4.81671984e-04,  3.84816513e-05, -3.98703076e-04,
         1.35538164e-03, -1.86970893e-03, -1.56293709e-01]))

In [23]:
def ridge_regression(y, tx, lambda_):
    lambda_prime = lambda_*len(y)
    w = inv(tx.T@tx + lambda_prime*np.eye(tx.shape[1]))@tx.T@y
    e=(y-tx@w)
    loss = (1/(2*len(y)))*np.mean(e**2)
    return loss,w

In [24]:
#finds by ridge regression
L,w = ridge_regression(y, tx, 0.00001)


In [25]:
#import the test datas
y_real, data_test, id_test = load_csv_data("test.csv",set_outliers_zero=True)
tx_test = create_tx(data_test)

In [26]:
y_evaluated = tx_test @ w

In [27]:
np.shape(datas), np.shape(data_test), np.shape(tx), np.shape(tx_test)

((250000, 30), (568238, 30), (250000, 31), (568238, 31))

In [28]:
y_predicted = np.array([])
sorted_y = np.sort(y_evaluated)
special_mean = sorted_y[int(len(sorted_y)/2 - np.sum(y_evaluated)/2)]
for y in np.nditer(y_evaluated) :
    y_predicted = np.append(y_predicted, -1 if y < special_mean else 1)
y_predicted

array([-1., -1., -1., ...,  1.,  1., -1.])

In [29]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    """
    with open(name, 'w') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            r2 = '-1'if r2==-1 else '1'
            writer.writerow({'Id':int(r1),'Prediction':r2})

In [30]:
create_csv_submission(id_test, y_predicted, "sample_submission_5.csv")

In [None]:
#test by deleting some parameters

In [None]:
#import the datas for creating the model
path_dataset = "train.csv"
data = np.genfromtxt(path_dataset, delimiter=",", skip_header=1,usecols= [3,4,5] )

In [None]:

np.shape(data)
print(data)
np.delete(data,(1),axis=0)
data[1]

In [None]:
sb = np.genfromtxt("train.csv", dtype = str, delimiter=",", skip_header=1, skip_footer=50000, usecols=1)
SB = np.array([])
for i in sb: 
    SB = np.append(SB,0 if i == 's' else 1)

In [None]:
num_samples = len(SB)
tx = np.c_[np.ones(num_samples), data]
tx

In [None]:
L,w =least_squares_GD(standardize(SB)[0],tx, [0,0,0,0],200,0.1)

In [None]:
L,w


In [None]:
data_test = np.genfromtxt("train.csv", delimiter=",", skip_header=200001, usecols= [3,4,5] )
data_test, mean_x, std_x = standardize(data_test)
np.shape(data_test)

In [None]:
sb_test = np.genfromtxt("train.csv", dtype = str, delimiter=",", skip_header=200001, usecols=1)
SB_test = np.array([])
for i in sb_test: 
    SB_test = np.append(SB_test,0 if i == 's' else 1)


In [None]:
num_samples = len(SB_test)
tx_test = np.c_[np.ones(num_samples), data_test]
tx_test

In [None]:
Y_test = tx_test @ w

In [None]:
res = np.array([])
mean = standardize(SB)[1]
for y in Y_test :
    res = np.append(res, 0 if y < mean else 1)

In [None]:
Y_test[0:20], res[0:20]

In [None]:
np.sum( (res- SB_test)**2) / len(SB_test)

In [35]:
y_predicted = np.array([])
sorted_y = np.sort(y_evaluated)
special_mean = sorted_y[int(len(sorted_y)/2 - np.sum(y_evaluated)/2)]

y_predicted = [-1 if y < special_mean else 1 for y in y_evaluated]
y_predicted

[-1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 1,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 -1,

In [32]:
dank = np.asarray([-1 if y < special_mean else 1 for y in y_evaluated])

In [33]:
print(dank)

[-1 -1 -1 ...  1  1 -1]


In [34]:
y_predicted


array([-1., -1., -1., ...,  1.,  1., -1.])