In [31]:
import numpy as np
from helpers import *
from implementations import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
data_path="data/dataset_to_release"
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(data_path)
print(x_train[:5])

[[5.3000000e+01 1.1000000e+01 1.1162015e+07 ...           nan
            nan 2.0000000e+00]
 [3.3000000e+01 1.2000000e+01 1.2152015e+07 ...           nan
            nan           nan]
 [2.0000000e+01 1.0000000e+01 1.0202015e+07 ... 1.0000000e+00
  2.0000000e+00 2.0000000e+00]
 [4.2000000e+01 6.0000000e+00 6.1820150e+06 ... 2.0000000e+00
  2.0000000e+00 2.0000000e+00]
 [2.4000000e+01 1.1000000e+01 1.1062015e+07 ... 9.0000000e+00
  9.0000000e+00 2.0000000e+00]]


In [19]:
x_train_p, x_test_p=apply_preprocessing(x_train, x_test, 
                                        correlation_tolerance=0.01, 
                                        outlier_coefficient=2.0, 
                                        polynomial_degree=1, 
                                        log_transform_columns=[])

In [20]:
np.random.seed(42)

In [None]:
def stratified_K_fold(y,k=5,seed=42):
    no_of_test_labels=np.floor(y.shape[0]/k)
    
    unique_labels=np.unique(y)
    label_ratio={};label_indexes={};label_count_p_fold={}

    picked_count=0
    for i,label in enumerate(unique_labels):
        label_ratio[label]=y[y==label].shape[0]/y.shape[0]
        label_indexes[label]=np.random.permutation(np.argwhere(y==label))
        if i<len(unique_labels)-1:
            label_count_p_fold[label]=np.floor(no_of_test_labels*label_ratio[label])
            picked_count+=label_count_p_fold[label]
        else:
            label_count_p_fold[label]=no_of_test_labels-picked_count

    folds=[]
    for i in range(k):
        index_arrs=[]
        for label in unique_labels:
            label_to_take=int(label_count_p_fold[label])
            #print(i*label_to_take,(i+1)*label_to_take)
            index_arrs.append(label_indexes[label][i*label_to_take:(i+1)*label_to_take])
    
        test_fold=np.random.permutation(np.concatenate(index_arrs))
        train_fold=np.random.permutation(np.setdiff1d(np.indices(y.shape),test_fold,assume_unique=True))
        #print(test_fold.shape,train_fold.shape)
        folds.append((train_fold,test_fold))
    
    return folds

def calculate_metrics(y_pred,y_true):
    accuracy=y_pred[y_pred==y_true].shape[0]/y_pred.shape[0]
    precision={};recall={}
    for label in np.unique(y_true):
        try:
            precision[label]=y_pred[(y_pred==label) & (y_true==label)].shape[0]/y_pred[y_pred==label].shape[0]
        except:
            continue
        try:
            recall[label]=y_pred[(y_pred==label) & (y_true==label)].shape[0]/y_true[y_true==label].shape[0]
        except:
            continue
    return accuracy,precision,recall

def calculate_confusion_mat(y_pred,y_true):
    unique_labels=np.unique(y_true)
    conf_matrix=np.zeros(len(unique_labels))
    for i,pred_label in enumerate(unique_labels):
        for j,true_label in enumerate(unique_labels):
            conf_matrix[i,j]=y_pred[y_pred==pred_label & y_true==true_label].shape[0]
    return conf_matrix

In [None]:
folds=stratified_K_fold(y_train)

In [17]:
for i,(train_fold,val_fold) in enumerate(folds):
    print("Fold",i+1)
    x_train_fold=x_train_p[train_fold]
    x_val_fold=x_train_p[val_fold]
    
    y_train_fold=y_train[train_fold]
    y_val_fold=y_train[val_fold]

    initial_w=np.zeros((x_train_fold.shape[1],))
    #print(initial_w.shape)
    
    w,loss=reg_logistic_regression(y_train_fold,x_train_fold,lambda_=0.1 ,initial_w=initial_w, max_iters=100, gamma=0.3)
    print("Train loss:",loss)
    
    y_pred = sigmoid(x_val_fold.dot(w))
    y_pred[y_pred>1/2]=1;y_pred[y_pred<1/2]=-1
    accuracy,precision,recall=*calculate_metrics(y_val_fold,y_pred),

    print("Val loss:",calculate_logistic_loss(y_val_fold,x_val_fold,w))
    print("Accuracy:",accuracy,"Precision:",precision,"Recall:",recall)
    print("##########################################")

Fold 1
Train loss: -1089.3914679818747
Val loss: -1089.39144787967
Accuracy: 0.911697929205967 Precision: {-1.0: 1.0} Recall: {-1.0: 0.911697929205967}
##########################################
Fold 2
Train loss: -1089.3914476447048
Val loss: -1089.3914783740236
Accuracy: 0.911697929205967 Precision: {-1.0: 1.0} Recall: {-1.0: 0.911697929205967}
##########################################
Fold 3
Train loss: -1089.3914680838668
Val loss: -1089.3914477073436
Accuracy: 0.911697929205967 Precision: {-1.0: 1.0} Recall: {-1.0: 0.911697929205967}
##########################################
Fold 4
Train loss: -1089.3914545247999
Val loss: -1089.391468060546
Accuracy: 0.911697929205967 Precision: {-1.0: 1.0} Recall: {-1.0: 0.911697929205967}
##########################################
Fold 5
Train loss: -1089.3914614905955
Val loss: -1089.3914576181778
Accuracy: 0.911697929205967 Precision: {-1.0: 1.0} Recall: {-1.0: 0.911697929205967}
##########################################


In [36]:
for i,(train_fold,val_fold) in enumerate(folds):
    print("Fold",i+1)
    x_train_fold=x_train_p[train_fold]
    x_val_fold=x_train_p[val_fold]
    
    y_train_fold=y_train[train_fold]
    y_val_fold=y_train[val_fold]

    initial_w=np.zeros((x_train_fold.shape[1],))
    #print(initial_w.shape)
    w,loss=reg_weighted_logistic_regression_balanced(y_train_fold,x_train_fold,lambda_=0.1 ,initial_w=initial_w, max_iters=100, gamma=0.3,class_weights={-1:0.3,1:0.7})
    print("Train loss:",loss)
    y_pred = sigmoid(x_val_fold.dot(w))
    y_pred[y_pred>1/2]=1;y_pred[y_pred<1/2]=-1
    accuracy,precision,recall=calculate_metrics(y_val_fold,y_pred)
    total_samples = len(y_val_fold)
    w1 = total_samples / np.sum(y_val_fold == 1)
    w2 = total_samples / np.sum(y_val_fold == -1)
    print("Val loss:",calculate_weighted_logistic_loss(y_val_fold,x_val_fold,w,w1,w2))
    print("Accuracy:",accuracy,"Precision:",precision,"Recall:",recall)
    print("##########################################")



Fold 1
11.324762726488352
1.0968545260061506
Train loss: 6.110846838744346e+16
Val loss: 69.07755278982135
Accuracy: 0.08830207079403295 Precision: {1.0: 1.0} Recall: {1.0: 0.08830207079403295}
##########################################
Fold 2
11.324762726488352
1.0968545260061506
Train loss: 6.110846749387251e+16
Val loss: 69.07755278982137
Accuracy: 0.08830207079403295 Precision: {1.0: 1.0} Recall: {1.0: 0.08830207079403295}
##########################################
Fold 3
11.324762726488352
1.0968545260061506
Train loss: 6.110846845613867e+16
Val loss: 69.07755278982138
Accuracy: 0.08830207079403295 Precision: {1.0: 1.0} Recall: {1.0: 0.08830207079403295}
##########################################
Fold 4
11.324762726488352
1.0968545260061506
Train loss: 6.110846796620435e+16
Val loss: 69.07755278982137
Accuracy: 0.08830207079403295 Precision: {1.0: 1.0} Recall: {1.0: 0.08830207079403295}
##########################################
Fold 5
11.324762726488352
1.0968545260061506
Train l