In [134]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, precision_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

In [135]:
# data = pd.read_csv('data/train_music.csv')

In [136]:
data = pd.read_pickle('data/train.pkl')

In [137]:
global_config = {
    'folds': 5
}

In [154]:
data = data.drop(columns=zero_importance)

In [138]:
def prepare_folds(data):
    kfold = StratifiedKFold(n_splits=global_config['folds'], random_state=42)
    folds_data = []
    for ids in kfold.split(data.id.values, data.target.values):
        train, test = ids
        folds_data.append((data.iloc[train], data.iloc[test]))
        
    return folds_data

In [139]:
def train_lgbm(train, val, config):
    train_x = train.drop(columns=['target', 'id']).fillna(-9999)
    train_y = train.target
    
    test_x = val.drop(columns=['target', 'id']).fillna(-9999)
    test_y = val.target
    
    cols = list(train_x.columns)
    
    xgtrain = lgb.Dataset(train_x, label=train_y)
    
    xgvalid = lgb.Dataset(test_x, label=test_y)
    
    
    clf = lgb.train(config, 
                     xgtrain, 
                     valid_sets=[xgtrain, xgvalid], 
                     valid_names=['train','valid'],
                     early_stopping_rounds=500,
                     verbose_eval=50)

    n_estimators = clf.best_iteration
    predicted = clf.predict(test_x)
    score = roc_auc_score(test_y, predicted)
    prec = precision_score(test_y, (predicted > 0.5).astype(int))
    f1 = f1_score(test_y, (predicted > 0.5).astype(int))
    print("Confusion matrix:")
    print(confusion_matrix(test_y, (predicted > 0.5).astype(int)))
    print("Report:")
    print(classification_report(test_y, (predicted > 0.5).astype(int)))
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = cols
    fold_importance_df["importance"] = clf.feature_importance()

    return {'score': score, 'model': clf, 'prec': prec, 'f1': f1, 'fold_importance': fold_importance_df}

In [140]:
def eval_test(models, test_lb):
#     test_lb = pd.read_csv('data/test_music.csv')
    predictions = []
    for model in models:
        predict = model.predict(test_lb)
        predictions.append(predict)
    return np.mean(predictions, axis=0)

In [141]:
def prepare_submission(predictions, score="Unknown"):
    sub = pd.read_csv('data/sample_submission_music.csv')
    sub.prediction = predictions
    sub.to_csv('s-%s.csv' % score, index=False)

In [142]:
def train_folds(folds, config):
    models = []
    auc = []
    f1 = []
    importances = []
    for n, fold in enumerate(folds):
        train_f, val_f = fold
        print("Training on %s" % str(train_f.shape))
        result = train_lgbm(train_f, val_f, config)
        
        importance = result['fold_importance']
        importance['fold'] = n
        importances.append(importance)
        
        models.append(result['model'])
        auc.append(result['score'])
        f1.append(result['f1'])
        print("Fold %s: %.4f, F1: %.4f, Precision: %.4f" % (n, result['score'], result['f1'], result['prec']))
    return models, {
        'auc': np.mean(auc),
        'importances': pd.concat(importances, sort=False),
        'f1': np.mean(f1)
    }

In [155]:
folds = prepare_folds(data)

In [None]:
models, result = train_folds(folds, config)

Training on (55999, 1261)




Training until validation scores don't improve for 500 rounds.
[50]	train's auc: 0.854652	valid's auc: 0.810328
[100]	train's auc: 0.876075	valid's auc: 0.817332
[150]	train's auc: 0.890041	valid's auc: 0.819237
[200]	train's auc: 0.90213	valid's auc: 0.820569
[250]	train's auc: 0.913083	valid's auc: 0.82122
[300]	train's auc: 0.922776	valid's auc: 0.821185
[350]	train's auc: 0.931209	valid's auc: 0.820988
[400]	train's auc: 0.938127	valid's auc: 0.820653
[450]	train's auc: 0.94456	valid's auc: 0.820758
[500]	train's auc: 0.950456	valid's auc: 0.820257
[550]	train's auc: 0.955402	valid's auc: 0.820127
[600]	train's auc: 0.959822	valid's auc: 0.81952
[650]	train's auc: 0.964029	valid's auc: 0.818913
[700]	train's auc: 0.967855	valid's auc: 0.81951
[750]	train's auc: 0.971241	valid's auc: 0.819081
Early stopping, best iteration is:
[267]	train's auc: 0.917054	valid's auc: 0.821827
Confusion matrix:
[[10975  1965]
 [  419   642]]
Report:
              precision    recall  f1-score   suppo



Training until validation scores don't improve for 500 rounds.
[50]	train's auc: 0.850451	valid's auc: 0.831886
[100]	train's auc: 0.872999	valid's auc: 0.837681
[150]	train's auc: 0.889934	valid's auc: 0.838641
[200]	train's auc: 0.901673	valid's auc: 0.839386
[250]	train's auc: 0.912794	valid's auc: 0.838832
[300]	train's auc: 0.922156	valid's auc: 0.838596
[350]	train's auc: 0.931106	valid's auc: 0.8381
[400]	train's auc: 0.939458	valid's auc: 0.837331
[450]	train's auc: 0.945001	valid's auc: 0.836169
[500]	train's auc: 0.951165	valid's auc: 0.835398
[550]	train's auc: 0.956849	valid's auc: 0.834783
[600]	train's auc: 0.961462	valid's auc: 0.833704
[650]	train's auc: 0.965944	valid's auc: 0.832961
Early stopping, best iteration is:
[179]	train's auc: 0.896617	valid's auc: 0.839755
Confusion matrix:
[[10925  2015]
 [  375   685]]
Report:
              precision    recall  f1-score   support

         0.0       0.97      0.84      0.90     12940
         1.0       0.25      0.65      



Training until validation scores don't improve for 500 rounds.
[50]	train's auc: 0.853015	valid's auc: 0.827099
[100]	train's auc: 0.874154	valid's auc: 0.832316
[150]	train's auc: 0.888937	valid's auc: 0.834599
[200]	train's auc: 0.900893	valid's auc: 0.835351
[250]	train's auc: 0.911796	valid's auc: 0.83526
[300]	train's auc: 0.921371	valid's auc: 0.835575


In [146]:
print("AUC: %.4f, F1: %.4f" % (result['auc'], result['f1']))

AUC: 0.8327, F1: 0.3645


In [144]:

config = {
'num_iterations': 5000,
'learning_rate': 0.1,
'boosting_type': 'gbdt',
'objective': 'binary',
'metric':'auc',
'num_leaves': 12,
'max_depth': 4,
'min_data_in_leaf': 500, 
'reg_alpha': 5,  # L1 regularization term on weights
'reg_lambda': 50,
'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
'scale_pos_weight': 9 # because training data is unbalanced 
}

In [148]:
importance = result['importances'].groupby(['feature'])\
                    .agg({'importance': 'mean'})\
                    .sort_values(by="importance",ascending=False)

In [149]:
zero_importance = importance[importance.importance == 0].index

In [150]:
zero_importance

Index(['voice_roam_in_cost_m3', 'voice_roam_in_cost_m2',
       'sim_count_inact_days_count', 'voice_roam_in_count_m1',
       'voice_roam_in_count_m2', 'voice_roam_in_count_m3',
       'voice_roam_in_dur_m1', 'voice_roam_in_dur_m2', 'voice_roam_in_dur_m3',
       'voice_roam_out_cost_m1',
       ...
       'data_type_3_m3_div_by_com_num_cost_m2',
       'voice_omo_in_night_rest_count_m2', 'voice_omo_in_night_rest_count_m1',
       'data_type_3_m3_div_by_short_out_calls_part_m1',
       'data_type_3_m3_div_by_short_out_calls_part_m2',
       'days_exp_div_by_com_num_cost_m2', 'days_exp_div_by_vol_app_1',
       'voice_omo_in_day_work_count_m2', 'days_exp_div_by_vol_app_5',
       'abon_cost_m1'],
      dtype='object', name='feature', length=386)

In [151]:
test_lb = pd.read_pickle('data/test.pkl')
cols = list(data.columns)
cols.remove("target")
cols.remove("id")
test_lb = test_lb[cols]


In [152]:
test_target = eval_test(models, test_lb)

In [153]:
prepare_submission(test_target, "AUC_%.4f_F1_%.4f" % (result['auc'], result['f1']))

In [None]:
# def objective(params):
#     params = {
#         'num_iterations': 1000,
#         'learning_rate': 0.1,
#         'boosting_type': 'gbdt',
#         'objective': 'binary',
#         'metric':'auc',
#         'num_leaves': int(params['num_leaves']),
#         'max_depth': int(params['max_depth']),
#         'reg_alpha': int(params['reg_alpha']),  # L1 regularization term on weights
#         'reg_lambda': int(params['reg_lambda']),
#         'min_child_samples': int(params['min_child_samples']),  # Minimum number of data need in a child(min_data_in_leaf)
#         'max_bin': 100,  # Number of bucketed bin for feature values
#         'subsample': float(params['subsample']),  # Subsample ratio of the training instance.
#         'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
#         'colsample_bytree': float(params['colsample_bytree']),  # Subsample ratio of columns when constructing each tree.
#         'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
#         'scale_pos_weight': 9 # because training data is unbalanced 
#     }
#     models, result = train_folds(folds, params)
#     return -result['auc']
    

In [115]:
# space = {
#     'num_leaves': hp.quniform('num_leaves', 4, 24, 1),
#     'max_depth': hp.quniform('max_depth', 2, 8, 1),
#     'reg_alpha': hp.quniform('reg_alpha', 1, 100, 2),
#     'reg_lambda': hp.quniform('reg_lambda', 1, 100, 2),
#     'min_child_samples': hp.quniform('min_child_samples', 100, 5000, 100),
#     'subsample': hp.quniform('subsample', 0.4, 1, 0.1),
#     'colsample_bytree': hp.quniform('colsample_bytree', 0.4, 1, 0.1),
    
# }

In [116]:
# %%time
# best = fmin(fn=objective,
#             space=space,
#             algo=tpe.suggest,
#             max_evals=100)