# Project 1 of the PCML course !

## Useful starting lines

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import datetime
%load_ext autoreload
%autoreload 2

## Import data

In [None]:
import helpers

# Read the training and test CSV files. load_data hands back three values per
# file, bound here as ids / x (features) / y (labels); the third value of the
# test file is discarded.
ids, x_tr, y_tr = helpers.load_data('data/train.csv')
ids_test, x_te, _ = helpers.load_data('data/test.csv')

### Concatenate the features

In [None]:
# Stack the training rows on top of the test rows (row-wise, axis 0) so that
# one array can be carried through the preprocessing cells.
x_tot = np.concatenate([x_tr, x_te], axis=0)

### Clean up missing data

The missing values are filled with the mean of the column.

In [None]:
import clean_data

# Clean the stacked feature matrix; the helper returns the cleaned matrix,
# y_tr and a `cols` value.
# NOTE(review): 0.3 is presumably a missing-value ratio threshold and `cols`
# the columns it affected — confirm in clean_data.py.
x_tot, y_tr, cols = clean_data.clean_data_by_mean(x_tot, y_tr, 0.3)

### Feature engineering

In [None]:
from build_polynomial import build_poly

# Build extra features from the transposed matrix (second argument 4,
# presumably the polynomial degree — confirm in build_polynomial.py) ...
x_tot_plus = build_poly(x_tot.T, 4)
# ... and append them as additional columns to the right of the existing ones.
x_tot = np.hstack((x_tot, x_tot_plus))

### Initialization

In [None]:
# Standardize the stacked feature matrix (the helper also returns two more
# values, bound here as mean_x and std_x)
x_stdize, mean_x, std_x = helpers.standardize(x_tot)
# Assemble the model inputs: the label vector and the matrix fed to the optimizers
y, tx_tot = helpers.build_model_data(x_stdize, y_tr)
# Recode the labels in place: every -1 becomes 0
y[np.equal(y, -1)] = 0

### Separation

In [None]:
# The first 250000 rows are kept as the training part, everything after them
# as the test part.
# NOTE(review): 250000 is hard-coded and must match the number of training
# rows that were stacked first — confirm against train.csv.
tx, tx_te = tx_tot[:250000, :], tx_tot[250000:, :]

print("Shape of testing set", tx_te.shape)
print("Shape of training set", tx.shape)

# Machine Learning !

### Algorithm parameters and initialization

In [None]:
from implementations import *
from costs import *
from gradients import *

#### Parameters

In [None]:
# Two iteration budgets: the regular one and a larger one (suffix _s)
max_iters, max_iters_s = 1000, 10000
# One gamma per loss function
gamma_MSE, gamma_MAE = 1e-5, 0.3
# All-zero starting weights, one entry per column of tx
initial_w = np.zeros(tx.shape[1])

## Least squares gradient descent

#### MSE

In [None]:
# Run the project's least_squares_GD helper from initial_w with max_iters and
# gamma_MSE; the returned pair is unpacked as (weights, loss).
gradient_w_MSE, gradient_loss_MSE = least_squares_GD(y, tx, initial_w, max_iters, gamma_MSE)

In [None]:
gradient_loss_MSE

In [None]:
gradient_w_MSE.shape

#### MAE

In [None]:
# Same experiment with the generic gradient_descent helper, which is handed
# the MAE loss and MAE gradient functions and uses gamma_MAE instead of gamma_MSE.
gradient_w_MAE, gradient_loss_MAE = gradient_descent(y, tx, initial_w, max_iters, gamma_MAE, compute_loss_MAE, compute_gradient_MAE)

In [None]:
gradient_loss_MAE

In [None]:
gradient_w_MAE

## Least squares stochastic gradient descent

#### MSE

In [None]:
# Stochastic variant for the MSE loss, using the larger budget max_iters_s.
# NOTE(review): the trailing 200 is presumably the mini-batch size — confirm
# against least_squares_SGD in implementations.py.
SGD_w_MSE, SGD_loss_MSE = least_squares_SGD(y, tx, initial_w, max_iters_s, gamma_MSE, 200)

In [None]:
SGD_loss_MSE

In [None]:
SGD_w_MSE

#### MAE

In [None]:
# Stochastic gradient descent driven by the MAE loss and gradient functions.
# NOTE(review): this run uses max_iters (1000) whereas the MSE stochastic run
# uses max_iters_s (10000), and no batch-size argument is passed — confirm
# that both differences are intended.
SGD_w_MAE, SGD_loss_MAE = stochastic_gradient_descent(y, tx, initial_w, max_iters, gamma_MAE, compute_loss_MAE, compute_gradient_MAE)

In [None]:
SGD_loss_MAE

In [None]:
SGD_w_MAE

## Least squares

#### MSE

In [None]:
# Call the project's least_squares helper with compute_loss_MSE as the loss
# function; the returned pair is unpacked as (weights, loss).
least_squares_w, least_squares_loss = least_squares(y, tx, compute_loss_MSE)

In [None]:
least_squares_loss

In [None]:
least_squares_w

#### MAE

In [None]:
# Same least_squares call, this time with compute_loss_MAE.
# NOTE: the names least_squares_w / least_squares_loss are reused, so the
# results of the MSE run are overwritten once this cell executes.
least_squares_w, least_squares_loss = least_squares(y, tx, compute_loss_MAE)

In [None]:
least_squares_loss

In [None]:
least_squares_w

## Ridge regressions

In [None]:
# Value handed to ridge_regression as its third argument (the ridge lambda).
lamb = 23

#### MSE

In [None]:
# Ridge regression with lamb, using compute_loss_MSE as the loss function;
# the returned pair is unpacked as (weights, loss).
ridge_reg_w, ridge_reg_loss = ridge_regression(y, tx, lamb, compute_loss_MSE)

In [None]:
ridge_reg_loss

In [None]:
ridge_reg_w

#### MAE

In [None]:
# Same ridge_regression call, this time with compute_loss_MAE.
# NOTE: ridge_reg_w / ridge_reg_loss are reused, so the results of the MSE
# run are overwritten once this cell executes.
ridge_reg_w, ridge_reg_loss = ridge_regression(y, tx, lamb, compute_loss_MAE)

In [None]:
ridge_reg_loss

In [None]:
ridge_reg_w

## Logistic regression

Note: the -1 values in y have already been set to 0 in the Initialization cell (`y[y==-1] = 0`), so y can be used here as is.

In [None]:
# gamma passed to logistic_regression as its last argument.
gamma = 1e-05

In [None]:
# Logistic regression from initial_w with max_iters and gamma; the returned
# pair is unpacked as (weights, loss). The -1 entries of y were replaced by 0
# in the Initialization cell.
logistic_reg_w, logistic_reg_loss = logistic_regression(y, tx, initial_w, max_iters, gamma)

In [None]:
logistic_reg_loss

In [None]:
logistic_reg_w

## Regularized logistic regression

TODO: implement compute_loss_RLOG and compute_gradient_RLOG (the names used by the code below) in costs.py and gradients.py

In [None]:
# Rebinds gamma (previously 1e-05 for the plain logistic regression) to the
# value used by the regularized logistic regression run.
gamma = 1.1e-03

In [None]:
def stoch_reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """Regularized logistic regression fitted by stochastic gradient descent.

    Thin wrapper: forwards its arguments to ``stochastic_gradient_descent``
    together with the RLOG loss and gradient functions, sets ``reg=True`` and
    passes ``lambda_`` through. Returns whatever
    ``stochastic_gradient_descent`` returns.
    """
    regularization = {"reg": True, "lambda_": lambda_}
    return stochastic_gradient_descent(
        y, tx, initial_w, max_iters, gamma,
        compute_loss_RLOG, compute_gradient_RLOG,
        **regularization
    )

In [None]:
# Run the stochastic regularized logistic regression with lambda_ = 0 and an
# iteration count of max_iters_s*100 (1,000,000 when max_iters_s is 10000).
# NOTE: logistic_reg_w / logistic_reg_loss are reused, so the results of the
# plain logistic_regression run are overwritten once this cell executes.
logistic_reg_w, logistic_reg_loss = stoch_reg_logistic_regression(y, tx, 0, initial_w, max_iters_s*100, gamma)

In [None]:
logistic_reg_loss

### Verification

In [None]:
# Predict labels on the training rows with the fitted weights.
yPred = helpers.predict_labels(logistic_reg_w, tx)

# The predictions are compared against labels in which 0 is recoded as -1.
# Build that recoding as a copy instead of flipping the shared `y` array in
# place and back: an exception or an interrupted cell between the two flips
# would otherwise leave `y` in the wrong encoding for every later cell.
y_signed = np.where(y == 0, -1, y)
# Fraction (between 0 and 1) of training samples whose prediction matches.
pred = np.count_nonzero(yPred == y_signed) / len(y)
print("percentage of good predicion in training set :", pred)

### Output to file

In [None]:
# Destination of the submission file
OUTPUT_PATH = 'submissions/out2.csv'
# Predict on the test rows, then write the predictions next to their ids
y_pred = helpers.predict_labels(logistic_reg_w, tx_te)
helpers.create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

### Cross validation test

In [None]:
from cross_validation_log import *

# Cross-validate the stochastic regularized logistic regression wrapper on
# the training data, scoring with compute_loss_RLOG.
# NOTE(review): the trailing positional arguments (4, 1, -10, 9) are not
# explained in this notebook — presumably fold count, seed and the lambda
# search range; confirm against cross_validation_log.cross_validation_demo.
cross_validation_demo(y, tx, stoch_reg_logistic_regression, 
                      compute_loss_RLOG, initial_w, max_iters_s, 
                      4, 1, -10, 9)


## Feature engineering tests

First, we inspect the distribution of each feature.

In [None]:
import matplotlib.pyplot as plt

### Distribution of feature

In [None]:
# Plot one normalized histogram per feature (one feature per row of x_pca.T).
num_bins = 100
# NOTE(review): x_pca is not defined in any cell visible in this notebook —
# make sure the cell that creates it has been run, or substitute the intended
# feature matrix.
for i, feature in enumerate(x_pca.T, start=1):
    print("feature", i)
    # `density=True` replaces the `normed` keyword, which was deprecated and
    # then removed from matplotlib's hist().
    n, bins, patches = plt.hist(feature, num_bins, density=True,
                                facecolor='green', alpha=0.5)
    plt.show()

We can see here that standardizing the data on all the features is a mistake, as most of them are not normally distributed.