In [1]:
# Useful starting lines
%matplotlib inline
import csv

import matplotlib.pyplot as plt
import numpy as np
%load_ext autoreload
%autoreload 2

## Load the data into feature matrix, class labels, and event ids:

In [2]:
# NOTE(review): the wildcard import hides where load_csv_data comes from, and
# later cells also use predict_labels, create_csv_submission and csv without
# importing them — presumably they all leak in through this line; confirm
# before replacing it with explicit imports.
from proj1_helpers import *
# Paths are relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv' 
# load_csv_data returns three values, unpacked here as (labels, features, ids).
y_trainRaw, tX_trainRaw, ids_train = load_csv_data(DATA_TRAIN_PATH)
DATA_TEST_PATH = '../data/test.csv' 
# y_testRaw is not referenced again in the visible cells.
y_testRaw, tX_testRaw, ids_test = load_csv_data(DATA_TEST_PATH)

In [3]:
def split_numerical_categorical(x,cat_cols):
    """Separate the categorical columns of a feature matrix from the rest.

    Args:
        x (numpy.array): 2-D feature matrix (rows are samples).
        cat_cols (list[int]): indices of the categorical columns.

    Returns:
        tuple: ``(x_num, x_cat)`` where ``x_num`` is ``x`` without the
        ``cat_cols`` columns and ``x_cat`` holds only those columns.
    """
    categorical_part = x[:, cat_cols]
    numerical_part = np.delete(x, cat_cols, axis=1)
    return numerical_part, categorical_part

def replace_undef_val_with_nan(x):
    """Return a copy of ``x`` in which every entry equal to -999.0 is NaN.

    Args:
        x (numpy.array): array using -999.0 as the "undefined" sentinel.

    Returns:
        numpy.array: new array, same shape as ``x``, sentinel replaced by NaN.
    """
    undefined_mask = x == -999.0
    return np.where(undefined_mask, np.nan, x)

def nan_standardize_fit(x):                                
    """Standardize each column using NaN-ignoring mean and standard deviation.

    Args:
        x (numpy.array): feature matrix, may contain NaN.

    Returns:
        tuple: ``(x_std, mean, std)`` — the standardized matrix (NaN entries
        stay NaN) and the per-column statistics used, so that they can be
        re-applied with ``nan_standardize_transform``.
    """
    col_mean, col_std = np.nanmean(x, axis=0), np.nanstd(x, axis=0)
    standardized = (x - col_mean) / col_std
    return standardized, col_mean, col_std

def nan_standardize_with_median_fit(x):                               
    """Robustly scale each column with its NaN-ignoring median and IQR.

    The quartiles are computed with ``np.nanquantile`` so that, like the
    median, they ignore NaN entries. A plain ``np.quantile`` returns NaN for
    any column containing a NaN, which would turn the whole scaled column
    into NaN.

    Args:
        x (numpy.array): feature matrix, may contain NaN.

    Returns:
        tuple: ``(x_scaled, median, iqr)`` where
        ``x_scaled = 2 * (x - median) / iqr`` (NaN entries stay NaN) and
        ``median`` / ``iqr`` are the per-column statistics, reusable with
        ``nan_standardize_with_median_transform``.
    """
    median = np.nanmedian(x, axis = 0)
    # Both quartiles in one call; the leading axis of the result indexes the quantile.
    q1, q3 = np.nanquantile(x, [0.25, 0.75], axis = 0)
    iqr = q3 - q1
    return 2*(x - median)/iqr , median, iqr

def nan_standardize_with_median_transform(x,median,iqr):                             
    """Apply a previously fitted median/IQR scaling to ``x``.

    Args:
        x (numpy.array): feature matrix to scale.
        median (numpy.array): per-column medians to subtract.
        iqr (numpy.array): per-column inter-quartile ranges to divide by.

    Returns:
        numpy.array: ``2 * (x - median) / iqr``.
    """
    centered = x - median
    return 2 * centered / iqr
    
def nan_standardize_transform(x,mean,std):
    """Apply a previously fitted mean/std standardization to ``x``.

    Args:
        x (numpy.array): feature matrix to standardize.
        mean (numpy.array): per-column means to subtract.
        std (numpy.array): per-column standard deviations to divide by.

    Returns:
        numpy.array: ``(x - mean) / std``.
    """
    centered = x - mean
    return centered / std

def relabel_y_non_negative(y):
    """Return a copy of ``y`` with every -1 label replaced by 0.

    Args:
        y (numpy.array): label array.

    Returns:
        numpy.array: relabelled copy; ``y`` itself is left untouched.
    """
    relabelled = y.copy()
    np.putmask(relabelled, relabelled == -1, 0)
    return relabelled
 
def relabel_y_negative(y):
    """Return a copy of ``y`` with every 0 label replaced by -1.

    Args:
        y (numpy.array): label array.

    Returns:
        numpy.array: relabelled copy; ``y`` itself is left untouched.
    """
    relabelled = y.copy()
    zero_mask = relabelled == 0
    relabelled[zero_mask] = -1
    return relabelled
        
def replace_nan_val_with_mean(x):
    """Impute NaN entries with the NaN-ignoring mean of their column.

    Args:
        x (numpy.array): 2-D feature matrix, may contain NaN.

    Returns:
        numpy.array: imputed copy of ``x``; ``x`` itself is not modified.
    """
    col_means = np.nanmean(x, axis=0)
    filled = x.copy()
    for j, col_mean in zip(range(x.shape[1]), col_means):
        column = filled[:, j]
        filled[:, j] = np.where(np.isnan(column), col_mean, column)
    return filled

def replace_nan_val_with_zero(x):
    """Impute NaN entries with 0, column by column.

    Args:
        x (numpy.array): 2-D feature matrix, may contain NaN.

    Returns:
        numpy.array: imputed copy of ``x``; ``x`` itself is not modified.
    """
    filled = x.copy()
    for j in range(x.shape[1]):
        column = filled[:, j]
        filled[:, j] = np.where(np.isnan(column), 0, column)
    return filled

def calculate_iqr(x):
    """Compute the per-column inter-quartile range of ``x``.

    Uses ``np.quantile``, which does not ignore NaN values.

    Args:
        x (numpy.array): feature matrix.

    Returns:
        tuple: ``(iqr, q1, q3)`` with the 25% and 75% quantiles along axis 0
        and their difference ``iqr = q3 - q1``.
    """
    lower_q, upper_q = (np.quantile(x, level, axis=0) for level in (0.25, 0.75))
    return upper_q - lower_q, lower_q, upper_q

def replace_iqr_outliers(x):
    """Clamp each column of ``x`` to ``[q1 - 1.5*iqr, q3 + 1.5*iqr]``.

    The quartiles and IQR are computed from ``x`` itself by ``calculate_iqr``.

    Args:
        x (numpy.array): feature matrix.

    Returns:
        numpy.array: copy of ``x`` where values above the upper bound are set
        to the upper bound and values below the lower bound to the lower bound.
    """
    spread, first_quartile, third_quartile = calculate_iqr(x)
    whisker = spread * 1.5
    high = third_quartile + whisker
    low = first_quartile - whisker
    capped_high = np.where(x > high, high, x)
    return np.where(capped_high < low, low, capped_high)

def replace_nan_val_with_median(x):
    """Impute NaN entries with the NaN-ignoring median of their column.

    Args:
        x (numpy.array): 2-D feature matrix, may contain NaN.

    Returns:
        numpy.array: imputed copy of ``x``; ``x`` itself is not modified.
    """
    col_medians = np.nanmedian(x, axis=0)
    filled = x.copy()
    for j, col_median in zip(range(x.shape[1]), col_medians):
        column = filled[:, j]
        filled[:, j] = np.where(np.isnan(column), col_median, column)
    return filled

def one_hot_encode(x, n_categories=None):
    """One-hot encode a single categorical column, dropping the last category.

    Category values are used directly as column indices (truncated to int),
    so they are expected to be the codes ``0 .. k-1``. The output has ``k - 1``
    columns; rows whose code is ``>= k - 1`` are encoded as all zeros, which
    makes the last category the reference level.

    The encoding is vectorized: the previous per-row loop relied on
    ``int()`` / truth-testing of one-element arrays, which NumPy deprecates.

    Args:
        x (numpy.array): categorical codes, shape ``(n,)`` or ``(n, 1)``.
        n_categories (int, optional): total number of categories ``k``. When
            omitted it is the number of distinct values found in ``x``. Pass
            it explicitly to get the same output width for different data
            sets (e.g. train and test) even if one lacks some category.

    Returns:
        numpy.array: float matrix of shape ``(n, k - 1)`` containing 0/1.

    Raises:
        ValueError: if ``x`` holds more than one value per row.
    """
    codes = np.ravel(x)
    if codes.shape[0] != np.shape(x)[0]:
        raise ValueError("one_hot_encode expects exactly one categorical value per row")
    if n_categories is None:
        n_categories = len(np.unique(codes))
    # Clamp at 0 so an empty input yields an (0, 0) matrix instead of an error.
    n_cols = max(int(n_categories) - 1, 0)
    ohe_x = np.zeros((codes.shape[0], n_cols))
    # Only rows whose code maps to a kept column receive a 1.
    encoded = codes < n_cols
    ohe_x[np.flatnonzero(encoded), codes[encoded].astype(np.int64)] = 1
    return ohe_x

def add_bias(x):
    """Prepend a column of ones (the bias/intercept term) to ``x``.

    Args:
        x (numpy.array): 2-D feature matrix of shape ``(n, d)``.

    Returns:
        numpy.array: matrix of shape ``(n, d + 1)`` whose first column is 1.
    """
    bias_column = np.ones((x.shape[0], 1))
    return np.concatenate((bias_column, x), axis=1)
   
def split_data(x, y, ratio, seed=1):
    """Shuffle the samples and split them into two parts.

    With ``ratio = 0.8``, 80% of the samples (rounded) go to the first part
    and the remainder to the second.

    Note: this seeds NumPy's global random generator via ``np.random.seed``.

    Args:
        x (numpy.array): features, indexed by sample along axis 0.
        y (numpy.array): labels, indexed by sample along axis 0.
        ratio (float): fraction of samples assigned to the first part.
        seed (int): seed for the shuffle.

    Returns:
        tuple: ``(x_first, y_first, x_second, y_second)``.
    """
    np.random.seed(seed)
    n_samples = y.shape[0]
    cut = round(n_samples * ratio)
    order = np.random.permutation(range(n_samples))
    first_idx, second_idx = order[:cut], order[cut:]
    return x[first_idx], y[first_idx], x[second_idx], y[second_idx]
    
    
def multiHistPlots(x,figsize = (15,15)):
    """Draw one normalized histogram per column of ``x`` on a near-square grid.

    The grid has ``ceil(sqrt(n))`` rows and either ``floor(sqrt(n))`` or
    ``ceil(sqrt(n))`` columns, whichever is the smallest that fits all ``n``
    columns. Panels beyond the last feature are left empty.

    ``squeeze=False`` makes ``plt.subplots`` always return a 2-D array of
    axes. Without it a 2x1 grid (``n == 2``) comes back as a 1-D array and
    indexing it as ``axes[row][col]`` fails.

    Args:
        x (numpy.array): 2-D feature matrix; one histogram per column.
        figsize (tuple): figure size forwarded to ``plt.subplots``.
    """
    n = x.shape[1]
    n_rows = int(np.ceil(np.sqrt(n)))
    n_cols = int(np.floor(np.sqrt(n)))

    if n_rows * n_cols < n:
        n_cols = n_rows

    fig, axes = plt.subplots(nrows = n_rows, ncols = n_cols, figsize = figsize, squeeze = False)

    # axes.flat walks the grid row by row, matching the feature order.
    for c, ax in zip(range(n), axes.flat):
        ax.hist(x[:,c], label = 'feature_{:d}'.format(c),density = True)
        ax.legend(loc = 'upper left')
        ax.set_ylabel('Probability')
        ax.set_xlabel('Value')
    plt.show()    

## Define the functions needed for logistic regression and regularized logistic regression

### Logistic Regression

In [4]:
def sigmoid(t):
    """Apply the logistic sigmoid ``1 / (1 + exp(-t))`` element-wise.

    Evaluated piecewise on ``exp(-|t|)`` so the exponential can never
    overflow: for ``t >= 0`` the usual form is used, for ``t < 0`` the
    algebraically equal ``exp(t) / (1 + exp(t))``. The direct formula
    overflows for large negative ``t``.

    Args:
        t (numpy.array or float): values to apply the sigmoid to.

    Returns:
        numpy.array or numpy scalar: sigmoid of ``t``, same shape as ``t``.
    """
    decay = np.exp(-np.abs(t))  # always in (0, 1]
    result = np.where(np.greater_equal(t, 0), 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # [()] turns a 0-d result back into a scalar and leaves n-d arrays as they are.
    return result[()]


def calculate_loss_log(y, tx, w):
    """Compute the logistic-regression negative log-likelihood.

    The loss is the sum over samples (not the mean) of
    ``-[y*log(s) + (1-y)*log(1-s)]`` with ``s = sigmoid(tx @ w)``. It is
    evaluated in the equivalent form ``log(1 + exp(z)) - y*z`` with
    ``z = tx @ w``, using ``np.logaddexp`` so that saturated predictions do
    not produce ``log(0)`` (inf/NaN).

    Args:
        y (numpy.array): target values in {0, 1}, one per row of ``tx``.
        tx (numpy.array): feature matrix, one sample per row.
        w (numpy.array): weights.

    Returns:
        numpy.array: the loss, squeezed to drop singleton dimensions.
    """
    z = tx.dot(w)
    # logaddexp(0, z) == log(1 + exp(z)), computed without overflow.
    loss = np.logaddexp(0, z).sum(axis=0) - y.T.dot(z)
    return np.squeeze(loss)

    
def calculate_gradient_log(y, tx, w):
    """Compute the gradient of the logistic-regression loss.

    Args:
        y (numpy.array): target values, one per row of ``tx``.
        tx (numpy.array): feature matrix, one sample per row.
        w (numpy.array): weights.

    Returns:
        numpy.array: ``tx.T @ (sigmoid(tx @ w) - y)``, summed over samples
        (not averaged).
    """
    residual = sigmoid(tx.dot(w)) - y
    return tx.T.dot(residual)


def learning_by_gradient_descent_log(y, tx, w, gamma):
    """Perform one gradient-descent step for logistic regression.

    The update builds a new weight array instead of using ``w -= ...``, so
    the array passed in by the caller is never modified.

    Args:
        y (numpy.array): target values, one per row of ``tx``.
        tx (numpy.array): feature matrix, one sample per row.
        w (numpy.array): current weights.
        gamma (float): step size.

    Returns:
        tuple: ``(loss, w)`` — the loss evaluated at the incoming weights
        (before the step) and the updated weights, in that order.
    """

    loss = calculate_loss_log(y, tx, w) 
    grad = calculate_gradient_log(y, tx, w)
    w = w - gamma * grad
    return loss, w


def learning_by_reg_gradient_descent_log(y, tx, w, gamma, lambda_=0):
    """Perform one gradient-descent step for L2-regularized logistic regression.

    The penalty added to the loss is ``lambda_ * w.T @ w`` and its gradient
    ``2 * lambda_ * w``; all weights are penalized. The update builds a new
    weight array instead of using ``w -= ...``, so the array passed in by
    the caller is never modified.

    Args:
        y (numpy.array): target values, one per row of ``tx``.
        tx (numpy.array): feature matrix, one sample per row.
        w (numpy.array): current weights.
        gamma (float): step size.
        lambda_ (float): regularization strength (0 disables the penalty).

    Returns:
        tuple: ``(loss, w)`` — the penalized loss evaluated at the incoming
        weights (before the step) and the updated weights, in that order.
    """

    loss = calculate_loss_log(y, tx, w) + lambda_ * np.squeeze(w.T.dot(w))
    grad = calculate_gradient_log(y, tx, w) + 2 * lambda_ * w
    w = w - gamma * grad
    return loss, w

def logistic_regression(y, tx, w_initial, max_iters, gamma):
    """Fit logistic regression with full-batch gradient descent.

    Args:
        y (numpy.array): target values, one per row of ``tx``.
        tx (numpy.array): feature matrix, one sample per row.
        w_initial (numpy.array): initial weights (left unmodified).
        max_iters (int): iteration budget, must be positive.
        gamma (float): step size.

    Returns:
        tuple: ``(w, loss, losses)`` — the final weights, the loss at those
        weights, and the list of losses logged every 50 iterations.

    Raises:
        AssertionError: if ``max_iters`` is not positive or the shapes of
            ``y``, ``tx`` and ``w_initial`` are inconsistent.
    """

    assert max_iters > 0, "max_iters should be a positive number"
    assert y.shape[0] == tx.shape[0], "y and tx should have the same number of entries (rows)"
    assert tx.shape[1] == w_initial.shape[0], "initial_w should be the same degree as tx"
    
    print_every = 50
    # Work on a private float copy: the update step may modify its weight
    # argument in place, which must not leak back into the caller's w_initial.
    w = np.array(w_initial, dtype=float)
    losses =[]
    # NOTE(review): the loop performs max_iters + 1 updates (n_iter runs from
    # 0 to max_iters inclusive), presumably so that iteration max_iters is
    # logged — confirm this is intended.
    for n_iter in range(max_iters+1):
        loss, w = learning_by_gradient_descent_log(y, tx, w, gamma)
        if (n_iter % print_every == 0):
            # Log the loss returned by this iteration's step (not an average).
            print('iteration\t', str(n_iter), loss)
            losses.append(loss)

            
    loss = calculate_loss_log(y, tx, w)
    
    return w, loss,losses

def reg_logistic_regression(y, tx, w_initial, max_iters, gamma,lambda_):
    """Fit L2-regularized logistic regression with full-batch gradient descent.

    Args:
        y (numpy.array): target values, one per row of ``tx``.
        tx (numpy.array): feature matrix, one sample per row.
        w_initial (numpy.array): initial weights (left unmodified).
        max_iters (int): iteration budget, must be positive.
        gamma (float): step size.
        lambda_ (float): regularization strength.

    Returns:
        tuple: ``(w, loss, losses)`` — the final weights, the loss at those
        weights computed WITHOUT the regularization penalty, and the list of
        penalized losses logged every 50 iterations.

    Raises:
        AssertionError: if ``max_iters`` is not positive or the shapes of
            ``y``, ``tx`` and ``w_initial`` are inconsistent.
    """

    assert max_iters > 0, "max_iters should be a positive number"
    assert y.shape[0] == tx.shape[0], "y and tx should have the same number of entries (rows)"
    assert tx.shape[1] == w_initial.shape[0], "initial_w should be the same degree as tx"
    
    print_every = 50
    # Work on a private float copy: the update step may modify its weight
    # argument in place, which must not leak back into the caller's w_initial.
    w = np.array(w_initial, dtype=float)
    losses =[]
    # NOTE(review): the loop performs max_iters + 1 updates (n_iter runs from
    # 0 to max_iters inclusive), presumably so that iteration max_iters is
    # logged — confirm this is intended.
    for n_iter in range(max_iters+1):
        loss, w = learning_by_reg_gradient_descent_log(y, tx, w, gamma,lambda_)
        if (n_iter % print_every == 0):
            # Log the penalized loss returned by this iteration's step (not an average).
            print('iteration\t', str(n_iter), loss)
            losses.append(loss)

            
    loss = calculate_loss_log(y, tx, w)
    
    return w, loss,losses

# Preprocessing

In [5]:
# Column 22 is treated as the only categorical feature.
cat_cols = [22]
full_x_train_num, full_x_train_cat = split_numerical_categorical(tX_trainRaw,cat_cols)
# Treat numerical values: mark -999 as NaN, standardize ignoring NaN, impute
# the remaining NaN with the column median, then clamp IQR outliers.
full_x_train_num_nan = replace_undef_val_with_nan(full_x_train_num)
# train_mean / train_std are kept so the test set can be scaled identically.
full_x_train_num_nan_std, train_mean, train_std = nan_standardize_fit(full_x_train_num_nan)
# full_x_train_num_valid_std = replace_nan_val_with_mean(full_x_train_num_nan_std)
full_x_train_num_valid_std = replace_nan_val_with_median(full_x_train_num_nan_std)
full_x_train_num_valid_std = replace_iqr_outliers(full_x_train_num_valid_std)
# Treat categorical values
full_x_train_ohe_cat = one_hot_encode(full_x_train_cat)
# Final layout: [bias | numerical features | one-hot categorical features]
full_x_train = np.hstack((add_bias(full_x_train_num_valid_std),full_x_train_ohe_cat))
# Treat labels: map -1 -> 0 for the logistic loss and make y a column vector.
# (The former second reshape(-1,1) was a no-op and has been dropped.)
full_y_train = relabel_y_non_negative(y_trainRaw).reshape(-1,1)

In [6]:
# Hold out 20% of the preprocessed training data for validation
# (split_data shuffles with its default seed of 1).
x_train, y_train, x_val, y_val = split_data(full_x_train,full_y_train,0.8)
# Map the validation labels back from 0 to -1; y_train keeps the 0/1 encoding.
# Presumably this matches the -1/1 output of predict_labels — TODO confirm.
y_val = relabel_y_negative(y_val)

In [None]:
# Train unregularized logistic regression on the 80% training split.
max_iters = 1000
# NOTE(review): the recorded output of this cell is not monotonically
# decreasing (the logged loss jumps between roughly 1.5e5 and 5.8e5), which
# suggests this step size is too large for the summed, non-averaged gradient
# — consider a smaller gamma.
gamma = 0.00005
# All weights (bias included) start at 1.
w_initial = np.ones((full_x_train.shape[1], 1))
weights, loss_tr,losses = logistic_regression(y_train, x_train, w_initial, max_iters, gamma)

iteration	 0 428857.59690654144
iteration	 50 580522.6095472391
iteration	 100 311131.3372013471
iteration	 150 466770.90033078357
iteration	 200 160856.445716183
iteration	 250 495457.7480082634
iteration	 300 209870.6693670733
iteration	 350 535390.0085533891
iteration	 400 153224.90874038823
iteration	 450 475430.2977494928
iteration	 500 248618.36353900243


In [33]:
# Score the model on the held-out validation split. y_val was mapped back to
# -1/1 when the split was created, presumably to match what predict_labels
# returns — TODO confirm in proj1_helpers.
y_pred = predict_labels(weights, x_val)
# NOTE(review): mid-notebook import; consider moving it to the first cell.
from sklearn.metrics import accuracy_score
accuracy_score(y_val,y_pred)

0.7058

In [27]:
# Apply the training preprocessing pipeline to the test features.
cat_cols = [22]
x_test_num, x_test_cat = split_numerical_categorical(tX_testRaw,cat_cols)
# Treat numerical values
x_test_num_nan = replace_undef_val_with_nan(x_test_num)
# Scale with the statistics fitted on the training set.
x_test_num_nan_std = nan_standardize_transform(x_test_num_nan,train_mean,train_std)
# NOTE(review): the commented alternative below refers to train_median, which
# is not defined in any visible cell.
# x_test_num_nan_std = nan_standardize_with_median_transform(x_test_num_nan,train_median,train_std)
# NOTE(review): the next two calls recompute the medians and the IQR bounds
# from the test data itself rather than reusing the training values.
x_test_num_valid_std = replace_nan_val_with_median(x_test_num_nan_std)
x_test_num_valid_std = replace_iqr_outliers(x_test_num_valid_std)
# x_test_num_valid_std = replace_nan_val_with_mean(x_test_num_nan_std)
# NOTE(review): the one-hot width is derived from the distinct values present
# in x_test_cat; it only matches the training matrix if both contain the same
# set of categories.
x_test_ohe_cat = one_hot_encode(x_test_cat)
# Same layout as training: [bias | numerical features | one-hot categorical]
x_test = np.hstack((add_bias(x_test_num_valid_std),x_test_ohe_cat))
# Treat labels


In [28]:
# Predict labels for the test set; this overwrites the validation y_pred.
y_pred = predict_labels(weights, x_test)
# Sanity check: one prediction per test row.
y_pred.shape

(568238, 1)

In [29]:
# Write the test predictions, keyed by event id, to the submission file.
OUTPUT_PATH = '../results/log_reg.csv' 
# y_pred[:] is a full slice, i.e. all predictions are passed through.
create_csv_submission(ids_test, y_pred[:], OUTPUT_PATH)

In [30]:
# Copy the submission file, dropping every row whose fields are all empty or
# whitespace. Both files are opened with newline='' as the csv module
# requires, and the handles are named src/dst so the builtin `input` is not
# shadowed for the rest of the notebook.
with open('../results/log_reg.csv', newline='') as src, open('../results/log_reg_cleanded.csv', 'w', newline='') as dst:
    writer = csv.writer(dst)
    for row in csv.reader(src):
        if any(field.strip() for field in row):
            writer.writerow(row)

In [10]:
# from sklearn import linear_model, preprocessing, metrics

# model = linear_model.SGDClassifier(max_iter=max_iters)
# model.fit(x_train,y_train)
# predict_y = model.predict(x_val)
# predict_y = relabel_y_negative(predict_y)
# from sklearn.metrics import accuracy_score
# accuracy_score(y_val,predict_y)

In [11]:
# max_iters = 1000
# gamma = 0.00001
# w_initial = np.zeros((x_train_prep.shape[1], 1))
# lambdas = np.logspace(-4, 0, 10)
# for i,lambda_ in enumerate(lambdas):
#         weights, loss_tr,losses = logistic_regression(y_train_prep, x_train_prep, w_initial, max_iters, gamma)
        

In [12]:
# lambda_range = np.arange(0.0, 1.6e-8, 2.e-9)
# (lambda_range)

In [13]:
# losses

In [14]:
# loss_tr

In [15]:
# y_pred = predict_labels(weights, x_val)
# y_pred.shape

In [16]:
# np.unique(y_val)

In [17]:
# np.unique(y_train)

In [18]:
# OUTPUT_PATH = '../results/log_reg.csv' 
# create_csv_submission(ids_test, y_pred[:], OUTPUT_PATH)

## Generate predictions and save output in csv format for submission: