In [108]:
import numpy as np
from helpers import *
from implementations import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
data_path="data/dataset_to_release"
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(data_path)
print(x_train[:5])

[[5.3000000e+01 1.1000000e+01 1.1162015e+07 ...           nan
            nan 2.0000000e+00]
 [3.3000000e+01 1.2000000e+01 1.2152015e+07 ...           nan
            nan           nan]
 [2.0000000e+01 1.0000000e+01 1.0202015e+07 ... 1.0000000e+00
  2.0000000e+00 2.0000000e+00]
 [4.2000000e+01 6.0000000e+00 6.1820150e+06 ... 2.0000000e+00
  2.0000000e+00 2.0000000e+00]
 [2.4000000e+01 1.1000000e+01 1.1062015e+07 ... 9.0000000e+00
  9.0000000e+00 2.0000000e+00]]


In [25]:
x_train_p, x_test_p=apply_preprocessing(x_train, x_test, 
                                        correlation_tolerance=0.01, 
                                        outlier_coefficient=2.0, 
                                        polynomial_degree=1, 
                                        log_transform_columns=[])

In [26]:
y_train=(y_train+1)/2

In [139]:
def stratified_K_fold(y,k=5,seed=42):
    no_of_test_labels=np.floor(y.shape[0]/k)
    
    unique_labels=np.unique(y)
    label_ratio={};label_indexes={};label_count_p_fold={}

    picked_count=0
    for i,label in enumerate(unique_labels):
        label_ratio[label]=y[y==label].shape[0]/y.shape[0]
        label_indexes[label]=np.random.permutation(np.argwhere(y==label))
        if i<len(unique_labels)-1:
            label_count_p_fold[label]=np.floor(no_of_test_labels*label_ratio[label])
            picked_count+=label_count_p_fold[label]
        else:
            label_count_p_fold[label]=no_of_test_labels-picked_count

    folds=[]
    for i in range(k):
        index_arrs=[]
        for label in unique_labels:
            label_to_take=int(label_count_p_fold[label])
            #print(i*label_to_take,(i+1)*label_to_take)
            index_arrs.append(label_indexes[label][i*label_to_take:(i+1)*label_to_take].flatten())
    
        test_fold=np.random.permutation(np.concatenate(index_arrs))
        train_fold=np.random.permutation(np.setdiff1d(np.indices(y.shape),test_fold,assume_unique=True))
        #print(test_fold.shape,train_fold.shape)
        folds.append((train_fold,test_fold))
    
    return folds

def calculate_metrics(y_pred,y_true):
    accuracy=y_pred[y_pred==y_true].shape[0]/y_pred.shape[0]
    precision={};recall={}
    for label in np.unique(y_true):
        try:
            precision[label]=y_pred[(y_pred==label) & (y_true==label)].shape[0]/y_pred[y_pred==label].shape[0]
        except:
            continue
        try:
            recall[label]=y_pred[(y_pred==label) & (y_true==label)].shape[0]/y_true[y_true==label].shape[0]
        except:
            continue
    return accuracy,precision,recall

def calculate_confusion_mat(y_pred,y_true):
    unique_labels=np.unique(y_true)
    conf_matrix=np.zeros(len(unique_labels))
    for i,pred_label in enumerate(unique_labels):
        for j,true_label in enumerate(unique_labels):
            conf_matrix[i,j]=y_pred[y_pred==pred_label & y_true==true_label].shape[0]
    return conf_matrix

def cross_validate(fit_function,loss_function,folds):
    for i,(train_fold,val_fold) in enumerate(folds):
        print("Fold",i+1)
        x_train_fold=x_train_p[train_fold]
        x_val_fold=x_train_p[val_fold]
        
        y_train_fold=y_train[train_fold]
        y_val_fold=y_train[val_fold]

        initial_w=np.zeros((x_train_fold.shape[1],))

        x_train_norm,mean,std=standardize(x_train_fold)
        x_val_norm,_,_=standardize(x_val_fold,mean,std)

        w_0=x_train_norm.shape[0]/(2*y_train_fold[y_train_fold==0].shape[0])
        w_1=x_train_norm.shape[0]/(2*y_train_fold[y_train_fold==1].shape[0])

        w_0=1
        w_1=1

        print(w_0,w_1)
        threshold=(1/3)
        w,loss=fit_function(y_train_fold,x_train_norm,lambda_=0.3 ,initial_w=initial_w, max_iters=100, gamma=0.04,class_weights=(w_0,w_1))
        print("Train loss:",loss)

        y_pred = sigmoid(x_train_norm.dot(w))
        y_pred[y_pred>threshold]=1;y_pred[y_pred<=threshold]=0
        
        accuracy,precision,recall=calculate_metrics(y_train_fold,y_pred)

        print("Train --- Accuracy:",accuracy,"Precision:",precision,"Recall:",recall)

        print("Val loss:",loss_function(y_val_fold,x_val_fold,w,w_0,w_1))

        y_pred = sigmoid(x_val_norm.dot(w))
        y_pred[y_pred>threshold]=1;y_pred[y_pred<=threshold]=0
        
        accuracy,precision,recall=calculate_metrics(y_val_fold,y_pred)
        
        print("Val   --- Accuracy:",accuracy,"Precision:",precision,"Recall:",recall)
        print("##########################################")


In [141]:
from catboost import CatBoostClassifier

train_fold,val_fold=folds[0]
cat_X_train=x_train[train_fold]
cat_X_val=x_train[val_fold]

cat_y_train=y_train[train_fold]
cat_y_val=y_train[val_fold]

catboost_model=CatBoostClassifier()

catboost_model.fit(cat_X_train,cat_y_train)

cat_y_pred=catboost_model.predict(cat_X_val)

accuracy,precision,recall=calculate_metrics(cat_y_val,cat_y_pred)

Learning rate set to 0.11115
0:	learn: 0.5636216	total: 61.1ms	remaining: 1m 1s
1:	learn: 0.4707792	total: 123ms	remaining: 1m 1s
2:	learn: 0.4030100	total: 188ms	remaining: 1m 2s
3:	learn: 0.3580540	total: 257ms	remaining: 1m 4s
4:	learn: 0.3256890	total: 314ms	remaining: 1m 2s
5:	learn: 0.2997189	total: 379ms	remaining: 1m 2s
6:	learn: 0.2856705	total: 429ms	remaining: 1m
7:	learn: 0.2708346	total: 500ms	remaining: 1m 2s
8:	learn: 0.2592043	total: 572ms	remaining: 1m 2s
9:	learn: 0.2522804	total: 632ms	remaining: 1m 2s
10:	learn: 0.2470636	total: 684ms	remaining: 1m 1s
11:	learn: 0.2417275	total: 754ms	remaining: 1m 2s
12:	learn: 0.2371905	total: 830ms	remaining: 1m 2s
13:	learn: 0.2346706	total: 886ms	remaining: 1m 2s
14:	learn: 0.2319980	total: 952ms	remaining: 1m 2s
15:	learn: 0.2304141	total: 1.01s	remaining: 1m 2s
16:	learn: 0.2282842	total: 1.09s	remaining: 1m 3s
17:	learn: 0.2269463	total: 1.17s	remaining: 1m 3s
18:	learn: 0.2254584	total: 1.23s	remaining: 1m 3s
19:	learn: 0.2

In [142]:
print(accuracy,precision,recall)

0.9162539808310604 {0.0: 0.9893033828051878, 1.0: 0.16203623813632442} {0.0: 0.9241818636022983, 1.0: 0.5946801773274224}


In [143]:
catboost_model.feature_importances_

array([8.72098311e-02, 1.48642088e-01, 7.37599377e-01, 9.38413321e-03,
       3.84436345e-01, 0.00000000e+00, 1.52935886e-02, 4.41055493e-01,
       8.88530077e-02, 0.00000000e+00, 0.00000000e+00, 4.06179075e-04,
       0.00000000e+00, 4.05685193e-02, 6.79519469e-04, 5.61326623e-02,
       6.97749719e-02, 9.92032346e-02, 0.00000000e+00, 0.00000000e+00,
       6.35502076e-03, 0.00000000e+00, 0.00000000e+00, 4.56274420e-02,
       5.39040280e-03, 1.33824260e-01, 6.39417743e+00, 4.20345057e-01,
       2.73540416e-01, 2.51333515e-01, 1.53789925e-02, 7.94642324e-01,
       3.70007742e-01, 2.56354624e-01, 1.20804498e+00, 2.77280871e+00,
       4.46781328e-02, 1.26315742e+00, 1.38234918e-01, 2.53297458e+00,
       1.57183691e-02, 6.09885567e-02, 2.00751199e-01, 3.32040590e-01,
       1.80759767e+00, 2.50659556e-01, 2.64622385e-01, 6.94477003e-01,
       3.73726655e-01, 1.58798545e+00, 3.84211454e+00, 8.05876249e-01,
       2.40737571e-01, 1.85506357e-01, 3.63984677e-02, 8.87286581e-02,
      

In [144]:
import pandas as pd

data=pd.read_csv("data/dataset_to_release/x_train.csv")
data.columns[np.argsort(catboost_model.feature_importances_)]

Index(['VIINSUR2', '_FRUITEX', '_VEG23', '_FRT16', '_VEGLT1', '_FRTLT1',
       'VICTRCT4', '_FRTRESP', 'VEGEDA1_', '_VEGETEX',
       ...
       'TOLDHI2', 'BPHIGH4', 'MAXVO2_', '_HCVU651', '_LLCPWT', 'ASINHALR',
       'DIABAGE2', '_CHOLCHK', '_AGE65YR', 'HHADULT'],
      dtype='object', length=321)

In [147]:
feature_df=pd.DataFrame(data={"Feature Name":list(data.columns[np.argsort(catboost_model.feature_importances_)]),"Importance":list(np.sort(catboost_model.feature_importances_))})

In [150]:
feature_df.sort_values("Importance",ascending=False).to_csv("feature_importances.csv")

In [151]:
catboost_model.get_cat_feature_indices()

[]

In [140]:
folds=stratified_K_fold(y_train)

In [137]:
cross_validate(reg_weighted_logistic_regression_balanced,calculate_weighted_logistic_loss,folds)

Fold 1
1 1
1 1
[0.29861996247977496, 0.2986199624680588, 0.2986199624564828, 0.2986199624450458, 0.2986199624337462]
Train loss: 0.2986199624337462
Train --- Accuracy: 0.911697929205967 Precision: {0.0: 1.0} Recall: {0.0: 0.911697929205967}
Val loss: 0.2986199813124202
Val   --- Accuracy: 0.911697929205967 Precision: {0.0: 1.0} Recall: {0.0: 0.911697929205967}
##########################################
Fold 2
1 1
1 1
[0.29861996120515166, 0.29861996118891837, 0.29861996117288, 0.298619961157034, 0.2986199611413783]
Train loss: 0.2986199611413783
Train --- Accuracy: 0.911697929205967 Precision: {0.0: 1.0} Recall: {0.0: 0.911697929205967}
Val loss: 0.2986199842982743
Val   --- Accuracy: 0.911697929205967 Precision: {0.0: 1.0} Recall: {0.0: 0.911697929205967}
##########################################
Fold 3
1 1
1 1
[0.298619956845221, 0.2986199568190576, 0.2986199567932082, 0.2986199567676692, 0.2986199567424365]
Train loss: 0.2986199567424365
Train --- Accuracy: 0.911697929205967 Precis

In [6]:
for i,(train_fold,val_fold) in enumerate(folds):
    print("Fold",i+1)
    x_train_fold=x_train_p[train_fold]
    x_val_fold=x_train_p[val_fold]
    
    y_train_fold=y_train[train_fold]
    y_val_fold=y_train[val_fold]

    initial_w=np.zeros((x_train_fold.shape[1],))
    #print(initial_w.shape)
    w,loss=reg_logistic_regression(y_train_fold,x_train_fold,lambda_=0.1 ,initial_w=initial_w, max_iters=100, gamma=0.3)
    print("Train loss:",loss)
    y_pred = sigmoid(x_val_fold.dot(w))
    y_pred[y_pred>1/2]=1;y_pred[y_pred<1/2]=-1
    accuracy,precision,recall=calculate_metrics(y_val_fold,y_pred)
    print("Val loss:",calculate_logistic_loss(y_val_fold,x_val_fold,w))
    print("Accuracy:",accuracy,"Precision:",precision,"Recall:",recall)
    print("##########################################")

NameError: name 'folds' is not defined

In [20]:
for i,(train_fold,val_fold) in enumerate(folds):
    print("Fold",i+1)
    x_train_fold=x_train_p[train_fold]
    x_val_fold=x_train_p[val_fold]
    
    y_train_fold=y_train[train_fold]
    y_val_fold=y_train[val_fold]

    initial_w=np.zeros((x_train_fold.shape[1],))
    #print(initial_w.shape)
    w,loss=reg_logistic_regression_w_clw(y_train_fold,x_train_fold,lambda_=0.1 ,initial_w=initial_w, max_iters=100, gamma=0.3)
    print("Train loss:",loss)
    y_pred = sigmoid(x_val_fold.dot(w))
    y_pred[y_pred>1/2]=1;y_pred[y_pred<1/2]=-1
    accuracy,precision,recall=calculate_metrics(y_val_fold,y_pred)
    print("Val loss:",calculate_logistic_loss(y_val_fold,x_val_fold,w))
    print("Accuracy:",accuracy,"Precision:",precision,"Recall:",recall)
    print("##########################################")



Fold 1


ValueError: 'list' argument must have no negative elements