In [38]:
import numpy as np
from helpers import *
from implementations import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [39]:
data_path="data/dataset_to_release"
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(data_path)
print(x_train[:5])

[[5.3000000e+01 1.1000000e+01 1.1162015e+07 ...           nan
            nan 2.0000000e+00]
 [3.3000000e+01 1.2000000e+01 1.2152015e+07 ...           nan
            nan           nan]
 [2.0000000e+01 1.0000000e+01 1.0202015e+07 ... 1.0000000e+00
  2.0000000e+00 2.0000000e+00]
 [4.2000000e+01 6.0000000e+00 6.1820150e+06 ... 2.0000000e+00
  2.0000000e+00 2.0000000e+00]
 [2.4000000e+01 1.1000000e+01 1.1062015e+07 ... 9.0000000e+00
  9.0000000e+00 2.0000000e+00]]


In [None]:
x_train_p, x_test_p=apply_preprocessing(x_train, x_test, 
                                        correlation_tolerance=0.01, 
                                        outlier_coefficient=2.0, 
                                        polynomial_degree=1, 
                                        log_transform_columns=[])

In [50]:
np.random.seed(42)

In [52]:
def stratified_K_fold(y,k=5,seed=42):
    no_of_test_labels=np.floor(y.shape[0]/k)
    
    unique_labels=np.unique(y)
    label_ratio={};label_indexes={};label_count_p_fold={}

    picked_count=0
    for i,label in enumerate(unique_labels):
        label_ratio[label]=y[y==label].shape[0]/y.shape[0]
        label_indexes[label]=np.random.permutation(np.argwhere(y==label))
        if i<len(unique_labels)-1:
            label_count_p_fold[label]=np.floor(no_of_test_labels*label_ratio[label])
            picked_count+=label_count_p_fold[label]
        else:
            label_count_p_fold[label]=no_of_test_labels-picked_count

    folds=[]
    for i in range(k):
        index_arrs=[]
        for label in unique_labels:
            label_to_take=int(label_count_p_fold[label])
            #print(i*label_to_take,(i+1)*label_to_take)
            index_arrs.append(label_indexes[label][i*label_to_take:(i+1)*label_to_take].flatten())
    
        test_fold=np.random.permutation(np.concatenate(index_arrs))
        train_fold=np.random.permutation(np.setdiff1d(np.indices(y.shape),test_fold,assume_unique=True))
        #print(test_fold.shape,train_fold.shape)
        folds.append((train_fold,test_fold))
    
    return folds

def calculate_metrics(y_pred,y_true):
    accuracy=y_pred[y_pred==y_true].shape[0]/y_pred.shape[0]
    precision={};recall={}
    for label in np.unique(y_true):
        try:
            precision[label]=y_pred[(y_pred==label) & (y_true==label)].shape[0]/y_pred[y_pred==label].shape[0]
        except:
            continue
        try:
            recall[label]=y_pred[(y_pred==label) & (y_true==label)].shape[0]/y_true[y_true==label].shape[0]
        except:
            continue
    return accuracy,precision,recall

def calculate_confusion_mat(y_pred,y_true):
    unique_labels=np.unique(y_true)
    conf_matrix=np.zeros(len(unique_labels))
    for i,pred_label in enumerate(unique_labels):
        for j,true_label in enumerate(unique_labels):
            conf_matrix[i,j]=y_pred[y_pred==pred_label & y_true==true_label].shape[0]
    return conf_matrix

In [53]:
folds=stratified_K_fold(y_train)

In [61]:
from sklearn.linear_model import LogisticRegression
log_reg_classifier=LogisticRegression(penalty="l2",class_weight="balanced",C=0.1)

In [62]:
for i,(train_fold,val_fold) in enumerate(folds):
    print("Fold",i+1)
    x_train_fold=x_train_p[train_fold]
    x_val_fold=x_train_p[val_fold]
    
    y_train_fold=y_train[train_fold]
    y_val_fold=y_train[val_fold]

    log_reg_classifier.fit(x_train_fold,y_train_fold)
    #print(x_train_fold.shape)
    #print(y_val_fold.shape)
    y_pred=log_reg_classifier.predict(x_val_fold)
    accuracy,precision,recall=*calculate_metrics(y_val_fold,y_pred),

    #print("Val loss:",calculate_logistic_loss(y_val_fold,x_val_fold,w))
    print("Accuracy:",accuracy,"Precision:",precision,"Recall:",recall)
    print("##########################################")

Fold 1
Accuracy: 0.5092111478507322 Precision: {-1: 0.5101617863350716, 1: 0.4993960310612597} Recall: {-1: 0.9132086761406133, 1: 0.08987019439786349}
##########################################
Fold 2
Accuracy: 0.5127310405778109 Precision: {-1: 0.514189731247493, 1: 0.4976704055220017} Recall: {-1: 0.9135586174129944, 1: 0.09026321554880912}
##########################################
Fold 3
Accuracy: 0.5078549987048013 Precision: {-1: 0.5098609439764674, 1: 0.48714408973252804} Recall: {-1: 0.9112252822749268, 1: 0.08780988522193536}
##########################################
Fold 4
Accuracy: 0.5135081597513219 Precision: {-1: 0.5149084102152695, 1: 0.4990509059534081} Recall: {-1: 0.9138856752988639, 1: 0.09061285875422985}
##########################################
Fold 5
Accuracy: 0.5105520593658098 Precision: {-1: 0.5107634710522797, 1: 0.5083692838654013} Recall: {-1: 0.9147235774791224, 1: 0.09143956794338569}
##########################################


In [None]:
for i,(train_fold,val_fold) in enumerate(folds):
    print("Fold",i+1)
    x_train_fold=x_train_p[train_fold]
    x_val_fold=x_train_p[val_fold]
    
    y_train_fold=y_train[train_fold]
    y_val_fold=y_train[val_fold]

    initial_w=np.zeros((x_train_fold.shape[1],))
    #print(initial_w.shape)
    
    w,loss=reg_logistic_regression(y_train_fold,x_train_fold,lambda_=0.1 ,initial_w=initial_w, max_iters=100, gamma=0.3)
    print("Train loss:",loss)
    
    y_pred = sigmoid(x_val_fold.dot(w))
    y_pred[y_pred>1/2]=1;y_pred[y_pred<1/2]=-1
    accuracy,precision,recall=*calculate_metrics(y_val_fold,y_pred),

    print("Val loss:",calculate_logistic_loss(y_val_fold,x_val_fold,w))
    print("Accuracy:",accuracy,"Precision:",precision,"Recall:",recall)
    print("##########################################")

In [43]:
for i,(train_fold,val_fold) in enumerate(folds):
    print("Fold",i+1)
    x_train_fold=x_train_p[train_fold]
    x_val_fold=x_train_p[val_fold]
    
    y_train_fold=y_train[train_fold]
    y_val_fold=y_train[val_fold]

    initial_w=np.zeros((x_train_fold.shape[1],))
    #print(initial_w.shape)
    w,loss=reg_weighted_logistic_regression_balanced(y_train_fold,x_train_fold,lambda_=0.1 ,initial_w=initial_w, max_iters=100, gamma=0.3,class_weights={-1:1,1:1})
    print("Train loss:",loss)
    y_pred = sigmoid(x_val_fold.dot(w))
    y_pred[y_pred>1/2]=1;y_pred[y_pred<=1/2]=-1
    accuracy,precision,recall=calculate_metrics(y_val_fold,y_pred)
    total_samples = len(y_val_fold)
    w1 = total_samples / np.sum(y_val_fold == 1)
    w2 = total_samples / np.sum(y_val_fold == -1)
    print("Val loss:",calculate_weighted_logistic_loss(y_val_fold,x_val_fold,w,w1,w2))
    print("Accuracy:",accuracy,"Precision:",precision,"Recall:",recall)
    print("##########################################")



Fold 1
1
1
Train loss: 3009347390860701.0
Val loss: 69.07755278982135
Accuracy: 0.08830207079403295 Precision: {1.0: 1.0} Recall: {1.0: 0.08830207079403295}
##########################################
Fold 2
1
1
Train loss: 3009347346853776.0
Val loss: 69.07755278982137
Accuracy: 0.08830207079403295 Precision: {1.0: 1.0} Recall: {1.0: 0.08830207079403295}
##########################################
Fold 3
1
1
Train loss: 3009347394243102.0
Val loss: 69.07755278982138
Accuracy: 0.08830207079403295 Precision: {1.0: 1.0} Recall: {1.0: 0.08830207079403295}
##########################################
Fold 4
1
1
Train loss: 3009347370113477.0
Val loss: 69.07755278982137
Accuracy: 0.08830207079403295 Precision: {1.0: 1.0} Recall: {1.0: 0.08830207079403295}
##########################################
Fold 5
1
1
Train loss: 3009347384193966.0
Val loss: 69.07755278982137
Accuracy: 0.08830207079403295 Precision: {1.0: 1.0} Recall: {1.0: 0.08830207079403295}
##########################################
