In [94]:
from lightgbm import LGBMClassifier
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, precision_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt

In [95]:
# data = pd.read_csv('data/train_music.csv')

In [96]:
data = pd.read_pickle('data/train.pkl')

In [112]:
global_config = {
    'folds': 5
}

In [99]:
def prepare_folds(data):
    kfold = StratifiedKFold(n_splits=global_config['folds'], random_state=42)
    folds_data = []
    for ids in kfold.split(data.id.values, data.target.values):
        train, test = ids
        folds_data.append((data.iloc[train], data.iloc[test]))
        
    return folds_data

In [114]:
def train_lgbm(train, val, config):
    train_x = train.drop(columns=['target', 'id']).fillna(-9999)
    train_y = train.target
    
    test_x = val.drop(columns=['target', 'id']).fillna(-9999)
    test_y = val.target
    
    cols = list(train_x.columns)
    
    xgtrain = lgb.Dataset(train_x, label=train_y)
    
    xgvalid = lgb.Dataset(test_x, label=test_y)
    
    
    clf = lgb.train(config, 
                     xgtrain, 
                     valid_sets=[xgtrain, xgvalid], 
                     valid_names=['train','valid'],
                     early_stopping_rounds=500,
                     num_boost_round=1880,
                     verbose_eval=50)

    n_estimators = clf.best_iteration
    predicted = clf.predict(test_x)
    score = roc_auc_score(test_y, predicted)
    prec = precision_score(test_y, (predicted > 0.5).astype(int))
    f1 = f1_score(test_y, (predicted > 0.5).astype(int))
    print("Confusion matrix:")
    print(confusion_matrix(test_y, (predicted > 0.5).astype(int)))
    print("Report:")
    print(classification_report(test_y, (predicted > 0.5).astype(int)))
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = cols
    fold_importance_df["importance"] = clf.feature_importance()

    return {'score': score, 'model': clf, 'prec': prec, 'f1': f1, 'fold_importance': fold_importance_df}

In [101]:
def eval_test(models):
#     test_lb = pd.read_csv('data/test_music.csv')
    test_lb = pd.read_pickle('data/test.pkl')
    test_lb = test_lb.drop(columns=['id'])
    predictions = []
    for model in models:
        predict = model.predict(test_lb)
        predictions.append(predict)
    return np.mean(predictions, axis=0)

In [102]:
def prepare_submission(predictions, score="Unknown"):
    sub = pd.read_csv('data/sample_submission_music.csv')
    sub.prediction = predictions
    sub.to_csv('s-%s.csv' % score, index=False)

In [103]:
def train_folds(folds, config):
    models = []
    auc = []
    f1 = []
    importances = []
    for n, fold in enumerate(folds):
        train_f, val_f = fold
        print("Training on %s" % str(train_f.shape))
        result = train_lgbm(train_f, val_f, config)
        
        importance = result['fold_importance']
        importance['fold'] = n
        importances.append(importance)
        
        models.append(result['model'])
        auc.append(result['score'])
        f1.append(result['f1'])
        print("Fold %s: %.4f, F1: %.4f, Precision: %.4f" % (n, result['score'], result['f1'], result['prec']))
    return models, {
        'auc': np.mean(auc),
        'importances': pd.concat(importances, sort=False),
        'f1': np.mean(f1)
    }

In [113]:
folds = prepare_folds(data)

In [119]:
models, result = train_folds(folds, config)

Training on (55999, 566)
Training until validation scores don't improve for 500 rounds.
[50]	train's auc: 0.851839	valid's auc: 0.81121
[100]	train's auc: 0.871398	valid's auc: 0.817691
[150]	train's auc: 0.885137	valid's auc: 0.820584
[200]	train's auc: 0.895614	valid's auc: 0.82129
[250]	train's auc: 0.904877	valid's auc: 0.822568
[300]	train's auc: 0.913546	valid's auc: 0.822419
[350]	train's auc: 0.921012	valid's auc: 0.821925
[400]	train's auc: 0.928083	valid's auc: 0.821716
[450]	train's auc: 0.934027	valid's auc: 0.821712
[500]	train's auc: 0.938964	valid's auc: 0.821656
[550]	train's auc: 0.944575	valid's auc: 0.820737
[600]	train's auc: 0.949691	valid's auc: 0.820396


KeyboardInterrupt: 

In [111]:
print("AUC: %.4f, F1: %.4f" % (result['auc'], result['f1']))

AUC: 0.8338, F1: 0.3633


In [120]:

config = {
'num_iterations': 5000,
'learning_rate': 0.1,
'boosting_type': 'gbdt',
'objective': 'binary',
'metric':'auc',
'num_leaves': 12,
'max_depth': 4,
'min_data_in_leaf': 500, 
# 'reg_alpha': 5,  # L1 regularization term on weights
# 'reg_lambda': 5,
'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
'scale_pos_weight': 9 # because training data is unbalanced 
}

In [19]:
importance = result['importances'].groupby(['feature'])\
                    .agg({'importance': 'mean'})\
                    .sort_values(by="importance",ascending=False)

In [42]:
list(importance.head(50).index)

['lt',
 'balance_sum',
 'content_count_m1',
 'content_count_m3',
 'data_type_2_m1',
 'days_exp',
 'count_app_4',
 'data_type_3_m1',
 'os_category_is_my_vf',
 'vol_app_7',
 'data_type_1_m1',
 'count_sms_source_4',
 'vol_app_4',
 'count_url_category_2',
 'count_app_5',
 'service_1_count',
 'short_out_calls_part_m3',
 'content_count_m2',
 'all_cost_m1',
 'data_type_2_m3',
 'data_type_2_m2',
 'all_count_m1',
 'vol_app_5',
 'paym_last_days',
 'sms_in_count_m1',
 'count_act_type_1',
 'vol_app_1',
 'short_in_calls_part_m1',
 'is_my_vf_service_P_flag_m1',
 'sms_in_count_m3',
 'manufacturer_category_is_my_vf',
 'income_brnd_cont_m1',
 'short_out_calls_part_m1',
 'paym_sum_m1',
 'act_days_count_m3',
 'all_cost_m3',
 'data_type_3_m2',
 'paym_el_sum_m3',
 'count_url_category_10',
 'voice_omo_out_day_rest_cost_m3',
 'count_act_type_7',
 'short_in_calls_part_m3',
 'manufacturer_category_service_P_flag_m3',
 'voice_onnet_in_day_work_dur_m3',
 'paym_sum_m3',
 'count_app_1',
 'voice_in_uniq_count_m1',


In [24]:
test_target = eval_test(models)

In [25]:
prepare_submission(test_target, "AUC_%.4f_F1_%.4f" % (result['auc'], result['f1']))

In [None]:
# def objective(params):
#     params = {
#         'num_iterations': 1000,
#         'learning_rate': 0.1,
#         'boosting_type': 'gbdt',
#         'objective': 'binary',
#         'metric':'auc',
#         'num_leaves': int(params['num_leaves']),
#         'max_depth': int(params['max_depth']),
#         'reg_alpha': int(params['reg_alpha']),  # L1 regularization term on weights
#         'reg_lambda': int(params['reg_lambda']),
#         'min_child_samples': int(params['min_child_samples']),  # Minimum number of data need in a child(min_data_in_leaf)
#         'max_bin': 100,  # Number of bucketed bin for feature values
#         'subsample': float(params['subsample']),  # Subsample ratio of the training instance.
#         'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
#         'colsample_bytree': float(params['colsample_bytree']),  # Subsample ratio of columns when constructing each tree.
#         'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
#         'scale_pos_weight': 9 # because training data is unbalanced 
#     }
#     models, result = train_folds(folds, params)
#     return -result['auc']
    

In [115]:
# space = {
#     'num_leaves': hp.quniform('num_leaves', 4, 24, 1),
#     'max_depth': hp.quniform('max_depth', 2, 8, 1),
#     'reg_alpha': hp.quniform('reg_alpha', 1, 100, 2),
#     'reg_lambda': hp.quniform('reg_lambda', 1, 100, 2),
#     'min_child_samples': hp.quniform('min_child_samples', 100, 5000, 100),
#     'subsample': hp.quniform('subsample', 0.4, 1, 0.1),
#     'colsample_bytree': hp.quniform('colsample_bytree', 0.4, 1, 0.1),
    
# }

In [116]:
# %%time
# best = fmin(fn=objective,
#             space=space,
#             algo=tpe.suggest,
#             max_evals=100)