In [None]:
# Useful starting lines
%matplotlib inline

import random
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
import time


%load_ext autoreload
%autoreload 2

In [2]:
from helpers_own import *

Load the data with the `load_csv_data` helper provided by the ML team (this takes a long time).

In [3]:
from helpers import load_csv_data

# Unpack the five values returned by load_csv_data.
# NOTE(review): sub_sample=False presumably loads the full dataset (hence the long runtime) — confirm in helpers.py.
x_train, x_test, y_train, train_ids, test_ids = load_csv_data("./dataset", sub_sample=False)

Here is a quick way to check that our data is not "clean": it contains a lot of NaN values.

In [None]:
def caracteristics(x, y):
    """Print a quick summary of missing values in ``x`` and label values in ``y``.

    Printed in order: the number of non-NaN and NaN entries of ``x``, then the
    number of entries of ``y`` equal to -1, 1 and 0, and the number of NaN
    entries of ``y``. Nothing is returned.
    """
    # Each count is wrapped in a lambda so it is only evaluated right before
    # its line is printed, one report line at a time.
    report = (
        ("Number of x features being not nan:", lambda: np.sum(~np.isnan(x))),
        ("Number of x features being nan:", lambda: np.sum(np.isnan(x))),
        ("Number of y -1s:", lambda: np.sum(y == -1)),
        ("Number of y 1s:", lambda: np.sum(y == 1)),
        ("Number of y 0s:", lambda: np.sum(y == 0)),
        ("Number of y nan:", lambda: np.sum(np.isnan(y))),
    )
    for label, compute_count in report:
        print(label, compute_count())

# Summary of the raw training data: the whole x_train array is passed, not only the "_BMI5" feature
caracteristics(x_train,y_train)

Now we are ready to extract a few features and replace the missing values with the mean. The selected features are taken from this article: https://medium.com/@alexteboul17/building-predictive-models-for-heart-disease-using-the-2015-behavioral-risk-factor-surveillance-b786368021ab

In [5]:
# Build the train / validation / test arrays with make_data (not defined here; presumably from helpers_own).
# NOTE(review): the capitalised X_train / Y_train returned here differ from the raw
# x_train / y_train passed in only by letter case — take care not to mix them up.
# NOTE(review): replace=True presumably enables the mean imputation mentioned in the text above — confirm.
X_train, Y_train, X_val, Y_val, X_test = make_data('./dataset/x_train.csv', './dataset/x_test.csv', x_train, x_test, y_train, replace=True)

In [None]:
# Sanity check: display the shape of every array returned by make_data
X_train.shape, Y_train.shape, X_val.shape, Y_val.shape, X_test.shape

In [None]:
# NaN and label counts of the training split returned by make_data
caracteristics(X_train, Y_train)

We now balance the data

In [None]:
# Alternative (disabled): undersampling only
#X_train_balanced, Y_train_balanced = undersampling(X_train, Y_train)

# Alternative (disabled): oversampling only
#X_train_balanced, Y_train_balanced = oversampling(X_train, Y_train)

# Option in use: undersampling and oversampling at the same time
# ratio_majority is the desired factor of reduction of majority samples for undersampling
# ratio_majority_to_minority is the desired ratio of majority to minority samples for oversampling
# NOTE(review): no random seed is set in the visible cells; if the resampling is random,
# results will vary between runs — confirm in helpers_own.
X_train_balanced, Y_train_balanced = undersampling_oversampling(X_train, Y_train, ratio_majority=1, ratio_majority_to_minority=2)

# Re-check the NaN and label counts after balancing
caracteristics(X_train_balanced, Y_train_balanced)


In [9]:
def logistic_regression_gradient_descent(y_train, x_train, y_val, x_val, lambda1, lambda2, gamma,
                                         max_iter=10000, threshold=1e-8, log_every=100):
    """Fit a regularised logistic regression by gradient descent with early stopping.

    A column of ones is prepended to ``x_train`` and ``x_val`` and the weights
    start at zero. Each iteration calls
    ``learning_by_gradient_descent_ridge_lasso`` for one update on the training
    data, then ``calculate_loss`` on the validation data.

    Args:
        y_train: training labels; ``y_train.shape[0]`` is used as the number of samples.
        x_train: training features (without bias column).
        y_val: validation labels; ``y_val.shape[0]`` is used as the number of samples.
        x_val: validation features (without bias column).
        lambda1: first penalty coefficient, passed unchanged to both helpers
            (presumably one of the lasso/ridge strengths — confirm which in helpers_own).
        lambda2: second penalty coefficient, passed unchanged to both helpers.
        gamma: passed unchanged to the update helper (presumably the step size).
        max_iter: maximum number of update steps (default 10000, the former hard-coded value).
        threshold: stop as soon as two consecutive validation losses differ by
            less than this amount (default 1e-8, the former hard-coded value).
        log_every: print the training loss every ``log_every`` iterations;
            ``0`` or ``None`` disables the progress output (default 100).

    Returns:
        Tuple ``(w, loss, losses, losses_val)``: the final weights (initialised
        with shape ``(n_features + 1, 1)``), the last training loss returned by
        the update helper (``None`` if no iteration was run), and the per-iteration
        lists of training and validation losses.
    """
    losses = []
    losses_val = []

    # Prepend the bias column to both design matrices.
    tx_train = np.c_[np.ones((y_train.shape[0], 1)), x_train]
    tx_val = np.c_[np.ones((y_val.shape[0], 1)), x_val]
    w = np.zeros((tx_train.shape[1], 1))

    # Defined up front so the return statement is valid even when max_iter == 0.
    loss = None

    # `n_iter` instead of `iter`, so the builtin is not shadowed.
    for n_iter in range(max_iter):
        # One gradient step on the training data, then score the new weights on validation.
        loss, w = learning_by_gradient_descent_ridge_lasso(y_train, tx_train, w, gamma, lambda1, lambda2)
        loss_val = calculate_loss(y_val, tx_val, w, lambda1=lambda1, lambda2=lambda2)

        if log_every and n_iter % log_every == 0:
            print("Current iteration={i}, loss={l}".format(i=n_iter, l=loss))

        losses.append(loss)
        losses_val.append(loss_val)

        # Early stopping is driven by the validation loss, not the training loss.
        if len(losses_val) > 1 and np.abs(losses_val[-1] - losses_val[-2]) < threshold:
            print('finished')
            break
    return w, loss, losses, losses_val

In [None]:
# Hyperparameters for the run below: gamma (presumably the step size) and the two
# penalty coefficients (both 0 here, presumably disabling regularisation — confirm in helpers_own).
gamma = 0.05
lambda1 = 0
lambda2 = 0

# Train on the balanced training set; the validation set drives the stopping criterion.
w, loss, losses, losses_val = logistic_regression_gradient_descent(Y_train_balanced, X_train_balanced, Y_val, X_val, lambda1, lambda2, gamma)
print(w)

In [None]:
# Plot the train and val losses recorded at each iteration.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(losses, label='train')
ax.plot(losses_val, label='val')
ax.set_xlabel('iteration')
ax.set_ylabel('loss')
ax.set_title('Training vs. validation loss')
ax.legend()
plt.show()


In [None]:
def prediction(tx_test, w, threshold=0.5):
    """Turn logistic-regression scores into hard 0/1 labels.

    Args:
        tx_test: design matrix, multiplied with ``w`` via ``np.dot``.
        w: weight vector.
        threshold: cut-off applied to ``sigmoid(tx_test @ w)``; values greater
            than or equal to it are labelled 1. Defaults to 0.5, the value
            that used to be hard-coded here.

    Returns:
        Integer array of 0/1 labels with the shape of ``np.dot(tx_test, w)``.

    NOTE(review): a later cell of this notebook defines ``prediction`` again
    with a 0.7 cut-off; once that cell has run, this definition is shadowed.
    ``sigmoid`` is not defined in this notebook (presumably from helpers_own).
    """
    probabilities = sigmoid(np.dot(tx_test, w))
    return (probabilities >= threshold).astype(int)
#Now we test the result: % of well classified data
def percentage_well_predicted(true_labels, predicted_labels):
    """Return the percentage (0-100) of labels that were predicted correctly.

    Both inputs are flattened before comparison. Without this, comparing a
    ``(N,)`` label vector with a ``(N, 1)`` prediction column broadcasts to an
    ``N x N`` matrix and yields a meaningless percentage, even though both
    inputs have the same ``len``.

    Args:
        true_labels: array-like of reference labels.
        predicted_labels: array-like of predicted labels with the same number
            of elements.

    Returns:
        Share of matching positions, in percent. For empty inputs the result
        is NaN (0/0), as before.

    Raises:
        ValueError: if the two inputs do not contain the same number of elements.
    """
    true_flat = np.asarray(true_labels).ravel()
    predicted_flat = np.asarray(predicted_labels).ravel()
    # Check that both vectors have the same number of elements
    if true_flat.size != predicted_flat.size:
        raise ValueError("The two vectors must have the same length.")
    # Number of correctly predicted points
    num_right = np.sum(true_flat == predicted_flat)
    # Percentage of correctly predicted points
    return (num_right / true_flat.size) * 100
# Evaluate on the validation set: prepend the column of ones so tx_val has one column per entry of w
tx_val = np.c_[np.ones((X_val.shape[0], 1)), X_val]
print(tx_val.shape)
print(w.shape)
# NOTE(review): despite its name, y_pred_test holds the predictions for the *validation* set
y_pred_test = prediction(tx_val, w)
# How many samples were predicted as 0 and as non-zero (i.e. 1)
zero_count = np.sum(y_pred_test == 0)
nonzero_count = np.sum(y_pred_test != 0)
print(zero_count)
print(nonzero_count)
# Validation accuracy in percent
print(percentage_well_predicted(Y_val, y_pred_test))

In [None]:
def f1(y_pred, y_true):
    """Compute the F1 score of 0/1 predictions against 0/1 reference labels.

    Args:
        y_pred: array of predicted labels (1 = positive, 0 = negative).
        y_true: array of reference labels, used as a mask to index ``y_pred``.

    Returns:
        The harmonic mean of precision and recall. When there is no true
        positive the score is defined as 0.0; precision, recall or their sum
        would otherwise be a 0/0 division and the result NaN.
    """
    tp = np.sum(y_pred[y_true == 1] == 1)
    # With tp == 0 every remaining branch ends in 0/0, so return the conventional 0.0.
    if tp == 0:
        return 0.0
    fp = np.sum(y_pred[y_true == 0] == 1)
    fn = np.sum(y_pred[y_true == 1] == 0)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    # Returned directly, so no local variable shadows the function name.
    return 2 * (precision * recall) / (precision + recall)

# F1 score on the validation labels (y_pred_test holds the validation-set predictions)
print(f1(y_pred_test, Y_val))

In [None]:
def confusion_matrix(y_pred, y_true):
    """Count the four confusion-matrix cells for 0/1 labels.

    Returns the tuple ``(tp, fp, fn, tn)``: true positives, false positives,
    false negatives and true negatives, in that order.
    """
    # Split the predictions by the reference class they belong to.
    pred_for_positives = y_pred[y_true == 1]
    pred_for_negatives = y_pred[y_true == 0]
    return (
        np.sum(pred_for_positives == 1),
        np.sum(pred_for_negatives == 1),
        np.sum(pred_for_positives == 0),
        np.sum(pred_for_negatives == 0),
    )

# Confusion-matrix counts for the validation predictions, printed in the order tp, fp, fn, tn
tp, fp, fn, tn = confusion_matrix(y_pred_test, Y_val)
print(tp)
print(fp)
print(fn)
print(tn)

In [None]:
# Distribution of the validation labels: unique values and how often each occurs
np.unique(Y_val, return_counts=True)

Now we use the learned weights `w` to predict on the test set.

In [295]:
def prediction(tx_test, w, threshold=0.7):
    """Turn logistic-regression scores into hard 0/1 labels for the test set.

    NOTE(review): this redefines (and shadows) the ``prediction`` function
    from the earlier validation cell, which used a 0.5 cut-off. The validation
    metrics printed above were therefore computed with a different cut-off
    than the one used here.

    Args:
        tx_test: design matrix, multiplied with ``w`` via ``np.dot``.
        w: weight vector.
        threshold: cut-off applied to ``sigmoid(tx_test @ w)``; values greater
            than or equal to it are labelled 1. Defaults to 0.7, the value
            that used to be hard-coded here.

    Returns:
        Integer array of 0/1 labels with the shape of ``np.dot(tx_test, w)``.
    """
    probabilities = sigmoid(np.dot(tx_test, w))
    return (probabilities >= threshold).astype(int)

In [None]:
# Prepend the column of ones to the test features, as was done for training
tx_test = np.c_[np.ones((X_test.shape[0], 1)), X_test]
print(tx_test)
print(w)
y_pred = prediction(tx_test, w)
print(y_pred)
# Count the non-zero (i.e. 1) and zero predictions
nonzero_count = np.sum(y_pred != 0)
zero_count = np.sum(y_pred == 0)
print(nonzero_count)
print(zero_count)
# Recode class 0 as -1 in place before the submission is written
# NOTE(review): presumably the submission format expects labels in {-1, 1} — confirm.
y_pred[y_pred == 0] = -1

In [297]:
from helpers import create_csv_submission
# Pass the test ids and recoded predictions to the submission helper under the name "Submission_7"
# NOTE(review): presumably this writes a CSV file; check in helpers.py whether an extension is appended.
create_csv_submission(test_ids, y_pred, "Submission_7")

In [None]:
# Distribution of the test-set predictions: unique values and how often each occurs
np.unique(y_pred, return_counts=True)