In [46]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, f1_score

# Raw competition tables. The test file is later read for a `target`
# column (it is used as validation labels), so it must include one.
df_train_origin, df_test_origin = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test_leak.csv')
)

# Pre-computed vector features; joined to the raw tables on `id` downstream.
df_train_vecs, df_test_vecs = (
    pd.read_csv(csv_path)
    for csv_path in ('../fact/train_vecs.csv', '../fact/test_vecs.csv')
)

In [47]:
# Attach the vector features to each raw table via the shared `id` key.
# No `how=` is given, so this is pandas' default inner join: ids missing
# from either side are dropped from the merged frame.
df_train = df_train_origin.merge(df_train_vecs, on='id')
df_test = df_test_origin.merge(df_test_vecs, on='id')

In [48]:
def get_c_with_prefix(train, prefix):
    """Return the column labels of ``train`` that start with ``prefix``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose column labels are inspected.
    prefix : str
        Leading text a column name must have to be selected.

    Returns
    -------
    list
        Matching column labels, in their original column order.

    Notes
    -----
    Non-string labels (for example integer column positions) are skipped
    instead of raising, so the helper is safe on frames with mixed label
    types.
    """
    return [
        column
        for column in train.columns.tolist()
        if isinstance(column, str) and column.startswith(prefix)
    ]

# Feature columns are the ones whose name begins with 'vecs'.
c_vecs = get_c_with_prefix(df_train, 'vecs')

# The labelled test frame doubles as the validation split here.
X_train = df_train[c_vecs]
X_valid = df_test[c_vecs]
y_train = df_train['target'].values
y_valid = df_test['target'].values

# The validation Dataset points at the training Dataset via `reference`,
# as LightGBM expects for validation data.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_valid = lgb.Dataset(data=X_valid, label=y_valid, reference=lgb_train)

def lgb_f1_score(y_hat, data):
    """Custom LightGBM evaluation function reporting an F1 score.

    Parameters
    ----------
    y_hat : array-like
        Raw model outputs for the evaluated dataset; rounded to the
        nearest integer before scoring.
    data : object exposing ``get_label()``
        Dataset being evaluated (a ``lightgbm.Dataset`` when called by
        ``lgb.train``).

    Returns
    -------
    tuple
        ``('f1', score, True)`` -- metric name, metric value, and a flag
        saying that higher values are better. ``score`` is the first
        entry of the per-class F1 array returned by ``average=None``.
        NOTE(review): with 0/1 labels that first entry is presumably the
        F1 of class 0 -- confirm this is the class the task scores.
    """
    labels = data.get_label()
    # f1_score needs hard class labels, not probabilities.
    hard_predictions = np.round(y_hat)
    per_class_f1 = f1_score(labels, hard_predictions, average=None)
    return 'f1', per_class_f1[0], True

# LightGBM training configuration for a binary classifier.
# The long float values look like the output of a hyper-parameter search
# (NOTE(review): provenance not visible here -- confirm).
lgbm_params = dict(
    objective='binary',
    metric='binary_logloss',           # built-in metric logged next to the custom F1
    verbosity=4,                       # values above 1 request debug-level logging
    boosting_type='gbdt',
    learning_rate=0.1,
    lambda_l1=3.642434329823594,       # L1 regularisation
    lambda_l2=1.0401748765492007e-08,  # L2 regularisation
    num_leaves=172,
    feature_fraction=0.8251431673667773,
    bagging_fraction=0.9755605959841563,
    bagging_freq=2,
    min_child_samples=5,
    random_state=68,                   # fixed seed for reproducibility
)

# Train for a fixed 300 boosting rounds; no early stopping is configured,
# so every round is fitted. Each round is evaluated on the validation
# Dataset with both the built-in metric and the custom F1 function.
# NOTE(review): `verbose_eval` is deprecated in LightGBM 3.3 and removed in
# 4.0 -- replace it with `callbacks=[lgb.log_evaluation()]` when upgrading.
model = lgb.train(
    params=lgbm_params,
    train_set=lgb_train,
    num_boost_round=300,
    valid_sets=lgb_valid,
    feval=lgb_f1_score,
    verbose_eval=True,
)

# Score the validation features with the trained booster.
# NOTE(review): the train call sets no early stopping, so `best_iteration`
# is presumably unset and all trees are used -- confirm.
y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

# Turn the predicted scores into boolean class labels at a 0.5 threshold.
y_pred_cls = np.greater_equal(y_pred, 0.5)

# Report the first entry of the per-class F1 array, matching the custom
# evaluation metric used during training.
per_class_f1 = f1_score(y_valid, y_pred_cls, average=None)
print(per_class_f1[0])

[1]	valid_0's binary_logloss: 0.648545	valid_0's f1: 0.726386
[2]	valid_0's binary_logloss: 0.619589	valid_0's f1: 0.812233
[3]	valid_0's binary_logloss: 0.594569	valid_0's f1: 0.822656
[4]	valid_0's binary_logloss: 0.574037	valid_0's f1: 0.828888
[5]	valid_0's binary_logloss: 0.556387	valid_0's f1: 0.833413
[6]	valid_0's binary_logloss: 0.542317	valid_0's f1: 0.834412
[7]	valid_0's binary_logloss: 0.528954	valid_0's f1: 0.834057
[8]	valid_0's binary_logloss: 0.516656	valid_0's f1: 0.8339
[9]	valid_0's binary_logloss: 0.506989	valid_0's f1: 0.834146
[10]	valid_0's binary_logloss: 0.498213	valid_0's f1: 0.836845
[11]	valid_0's binary_logloss: 0.490351	valid_0's f1: 0.83738
[12]	valid_0's binary_logloss: 0.483655	valid_0's f1: 0.838137
[13]	valid_0's binary_logloss: 0.477269	valid_0's f1: 0.837117
[14]	valid_0's binary_logloss: 0.472346	valid_0's f1: 0.836714
[15]	valid_0's binary_logloss: 0.467249	valid_0's f1: 0.838119
[16]	valid_0's binary_logloss: 0.463339	valid_0's f1: 0.838454
[17]

In [49]:
# Write the submission file: one row per merged test id with its predicted
# class as 0/1. `index=False` is the documented boolean for omitting the
# row index column (the previous `index=None` only worked because None is
# falsy).
pd.DataFrame({'id': df_test.id, 'target': y_pred_cls.astype(int)}).to_csv('../output/submit.csv', index=False)