In [9]:
from sklearn.ensemble import *
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from pyHSICLasso import HSICLasso
import numpy as np
import lightgbm as lgb

In [2]:
# Load the training labels and features from the working directory.
labels = pd.read_csv('train_labels.csv')
# Replace the 'thal' column with integer codes; pd.factorize numbers the
# distinct values in order of first appearance in this file.
values = pd.read_csv('train_values.csv').assign(
    thal=lambda frame: pd.factorize(frame['thal'])[0]
)
# One row per patient: features joined to their label on the shared id.
df = pd.merge(values, labels, on='patient_id')

In [3]:
# Positional (unshuffled) hold-out split: the first 70% of rows are used for
# training, the remaining rows for validation.
training_split = int(0.7 * len(df))

# Target column, cut at the split point.
y_train, y_test = (
    df['heart_disease_present'].iloc[:training_split],
    df['heart_disease_present'].iloc[training_split:],
)

# Feature matrix: every column except the target and the row identifier.
# errors='ignore' keeps the "skip if absent" behaviour of a membership filter.
x_train = df.drop(columns=['heart_disease_present', 'patient_id'], errors='ignore').iloc[:training_split]
x_test = df.drop(columns=['heart_disease_present', 'patient_id'], errors='ignore').iloc[training_split:]

In [5]:
# Feature ranking with HSIC Lasso on the training split only.
lasso = HSICLasso()
# Name the features after the frame whose values are actually passed in
# (x_train), rather than borrowing the column index from x_test.
lasso.input(x_train.values, y_train.values, featname=x_train.columns)
# NOTE(review): first argument is presumably the number of features to keep and
# M / B the permutation and block-size settings -- confirm against pyHSICLasso docs.
lasso.classification(10, M=1, B=0)
# Names of the selected features (drawn from `featname`).
# NOTE(review): `features` is not consumed by the model cell visible below.
features = lasso.get_features()

Block HSIC Lasso B = 125.
M set to 1.
Using Gaussian kernel for the features, Delta kernel for the outcomes.


In [33]:
# Model inputs: every column except the target and the row identifier.
feature_columns = [column for column in df.columns
                   if column not in ['heart_disease_present', 'patient_id']]

# LightGBM datasets: training split, validation split (binned like training),
# and the full labelled frame.
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)
lgb_total = lgb.Dataset(df[feature_columns], df['heart_disease_present'])

test = pd.read_csv('test_values.csv')
# pd.factorize numbers categories by order of first appearance, so factorizing
# the test file on its own can give a 'thal' value a different integer than it
# received in training. Recover the training order from the training file and
# encode the test column with that same mapping (unseen/missing values -> -1,
# matching factorize's sentinel).
thal_categories = pd.factorize(
    pd.read_csv('train_values.csv', usecols=['thal'])['thal']
)[1]
test['thal'] = pd.Categorical(test['thal'], categories=thal_categories).codes.astype('int64')

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # A list (not a set) so the metric order is the same on every run.
    'metric': ['l2', 'l1'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of lgb.train was removed in LightGBM 4.0,
# while the callback is available in both older and newer releases.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=5)])

# Predicted probability of the positive class for each test row, using the
# iteration that early stopping selected.
test_pred = gbm.predict(test[feature_columns], num_iteration=gbm.best_iteration)

[1]	valid_0's l2: 0.251397	valid_0's l1: 0.494641
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.244355	valid_0's l1: 0.487951
[3]	valid_0's l2: 0.237778	valid_0's l1: 0.481136
[4]	valid_0's l2: 0.231991	valid_0's l1: 0.475045
[5]	valid_0's l2: 0.226678	valid_0's l1: 0.46895
[6]	valid_0's l2: 0.22059	valid_0's l1: 0.462193
[7]	valid_0's l2: 0.21503	valid_0's l1: 0.455439
[8]	valid_0's l2: 0.211407	valid_0's l1: 0.450314
[9]	valid_0's l2: 0.206861	valid_0's l1: 0.44459
[10]	valid_0's l2: 0.204095	valid_0's l1: 0.440359
[11]	valid_0's l2: 0.200562	valid_0's l1: 0.434842
[12]	valid_0's l2: 0.19762	valid_0's l1: 0.429648
[13]	valid_0's l2: 0.195201	valid_0's l1: 0.424757
[14]	valid_0's l2: 0.193243	valid_0's l1: 0.420149
[15]	valid_0's l2: 0.191554	valid_0's l1: 0.416131
[16]	valid_0's l2: 0.188872	valid_0's l1: 0.412963
[17]	valid_0's l2: 0.186421	valid_0's l1: 0.40996
[18]	valid_0's l2: 0.185753	valid_0's l1: 0.407357
[19]	valid_0's l2: 0.183825	valid_0'

array([0.30080574, 0.52412717, 0.35214542, 0.47589079, 0.83910531,
       0.32741522, 0.51900182, 0.36131963, 0.58828769, 0.43805277,
       0.51277064, 0.16778308, 0.67645629, 0.56454221, 0.49671646,
       0.32917336, 0.30103879, 0.36634967, 0.30862888, 0.36057734,
       0.418366  , 0.57370956, 0.46133764, 0.34206608, 0.17337547,
       0.8461997 , 0.11896423, 0.60809122, 0.811139  , 0.34038058,
       0.4318411 , 0.17538242, 0.59507979, 0.56207706, 0.55721603,
       0.46775783, 0.66656219, 0.64821487, 0.14140243, 0.40748866,
       0.51946486, 0.42612387, 0.38385525, 0.38262035, 0.47472269,
       0.41193212, 0.49675838, 0.60602864, 0.58587139, 0.77884685,
       0.78569756, 0.3637869 , 0.52549546, 0.51189048, 0.27446155,
       0.33864002, 0.34206321, 0.49647606, 0.64588977, 0.76305507,
       0.35838305, 0.47080271, 0.54433544, 0.85961413, 0.45413089,
       0.80827812, 0.80123932, 0.28071669, 0.5187812 , 0.77680003,
       0.46993248, 0.42873395, 0.32891788, 0.52091441, 0.45266

In [None]:
# Attach the predictions as a single, normally named column. Indexing with
# `test['heart_disease_present', 'patient_id']` would instead create one column
# whose label is the tuple ('heart_disease_present', 'patient_id').
test['heart_disease_present'] = test_pred

# Two-column view pairing each row id with its predicted probability.
# NOTE(review): assumes test_values.csv carries a 'patient_id' column like the
# training file does -- confirm.
submission = test[['patient_id', 'heart_disease_present']]
submission.head()