In [1]:
import pandas as pd
import datetime

import seaborn as sns
# Apply seaborn's plot styling with fonts scaled by 1.3.
sns.set(font_scale=1.3)

import catboost as cb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold


import numpy as np
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import f1_score, classification_report, confusion_matrix
import pickle

import warnings

from lightgbm import LGBMClassifier

# NOTE(review): this silences *every* Python warning for the rest of the
# session, including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

  import pandas.util.testing as tm


In [2]:
# Shared random seed: passed as random_state to the LGBMClassifier instances
# and to the StratifiedKFold splitter further down.
SEED = 1

In [98]:
# Per-class weights keyed by target label; handed to LGBMClassifier via its
# class_weight argument in the cells below.
# NOTE(review): how these values were derived is not shown in this notebook.
class_weight = {
    'card2card_transfer': 0.6398475182669389,
    'card_recharge': 0.4502000735911028,
    'chat': 0.3815013547758437,
    'credit_info': 0.46101125727352393,
    'invest': 0.9321761812255462,
    'main_screen': 0.982994897893273,
    'mobile_recharge': 0.641212783884727,
    'own_transfer': 0.6317591535575899,
    'phone_money_transfer': 0.4600662111029351,
    'statement': 0.6951657522904898,
}

In [4]:
# Load the pickled training frame and the hold-out frame (test_df is used as
# the eval_set for every fit below).
# NOTE(review): absolute, machine-specific paths; read_pickle must only be
# used on trusted files because unpickling can execute arbitrary code.
train_df_less = pd.read_pickle('/data/edmitrie/alfa_boosters/train_df_less_ready.pickle')
test_df = pd.read_pickle('/data/edmitrie/alfa_boosters/test_df.pickle')

In [101]:
# Load the frame that receives the final predictions and is exported to CSV
# at the end of the notebook.
# NOTE(review): absolute, machine-specific path; only unpickle trusted files.
submit_df = pd.read_pickle('/data/edmitrie/alfa_boosters/submit_df.pickle')

In [5]:
# Load the pickled attribute-name collections from the working directory:
# `cat_atrs` (categorical attributes) and `atrs` (all model attributes).
# Context managers make sure the file handles are closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('cat_atrs', 'rb') as cat_atrs_file:
    cat_atrs = pickle.load(cat_atrs_file)

with open('atrs', 'rb') as atrs_file:
    atrs = pickle.load(atrs_file)

## Применение lgbm

In [6]:
def evalerror(y_true, y_pred):
    """Custom LightGBM evaluation metric: macro-averaged F1 for multiclass.

    Args:
        y_true: 1-D array of true class indices, one entry per sample
            (must be non-empty).
        y_pred: class scores, either a flat 1-D array laid out class by class
            (all samples of class 0, then all samples of class 1, ...) or an
            array already shaped ``(n_samples, n_classes)``.

    Returns:
        Tuple ``('f1_score', value, True)`` — metric name, metric value and a
        flag telling LightGBM that higher values are better.

    Raises:
        ValueError: if a flat ``y_pred`` cannot be split evenly into one row
            per sample.
    """
    n_samples = y_true.shape[0]
    scores = np.asarray(y_pred)

    if scores.ndim == 1:
        # The class count is inferred from the array size instead of being
        # hard-coded to 10. Column-major order undoes the class-by-class
        # layout of the flat score vector.
        scores = scores.reshape((n_samples, -1), order='F')

    preds = scores.argmax(axis=1)
    f_score = f1_score(y_true, preds, average='macro')

    return 'f1_score', f_score, True

In [102]:
# Cast every categorical attribute of the submission frame to pandas'
# 'category' dtype. The equivalent casts for the train / hold-out frames are
# kept below but disabled.
# NOTE(review): presumably those frames were already cast before being
# pickled — confirm, the dtypes should match the ones used for training.
for cat_atr in cat_atrs:
#     train_df_less[cat_atr] = train_df_less[cat_atr].astype('category')
#     test_df[cat_atr] = test_df[cat_atr].astype('category')
#     train_df[cat_atr] = train_df[cat_atr].astype('category')
    submit_df[cat_atr] = submit_df[cat_atr].astype('category')

In [8]:
# Baseline multiclass model (no class weights); its feature importances are
# used later to select the attribute subset.
lgbm = LGBMClassifier(
    objective='multiclass',
    reg_alpha=12,
    max_depth=10,
    min_split_gain=0.7,
    subsample=0.7,
    n_estimators=320,
    n_jobs=60,
    random_state=SEED,
)

In [9]:
# Attribute names that contain the substring 'bigger', in their original order.
cols_bigger = list(filter(lambda name: 'bigger' in name, atrs))

In [10]:
# Store every 'bigger' column as float32 in both the training and the
# hold-out frame (the full train_df is not touched here).
for col in cols_bigger:
    for frame in (train_df_less, test_df):
        frame[col] = frame[col].astype(np.float32)

In [11]:
# Drop attributes whose name contains 'proba' unless it also contains 'prev';
# every other attribute is kept.
atrs = [
    name
    for name in atrs
    if not ('proba' in name and 'prev' not in name)
]

In [12]:
# Replace the literal string 'nan' with 0 in the previous-timezone columns of
# the training frame.
for tz_col in ('timezone_num_prev_2', 'timezone_num_prev_1'):
    train_df_less.loc[train_df_less[tz_col] == 'nan', tz_col] = 0

In [13]:
# Convert the previous-timezone columns of the training frame to integers.
for tz_col in ('timezone_num_prev_2', 'timezone_num_prev_1'):
    train_df_less[tz_col] = train_df_less[tz_col].astype(int)

In [14]:
# Fit the baseline model on all selected attributes; the hold-out frame drives
# early stopping and the macro-F1 metric is logged every 20 iterations.
lgbm.fit(
    train_df_less[atrs],
    train_df_less['multi_class_target'],
    eval_set=(test_df[atrs], test_df['multi_class_target']),
    eval_metric=evalerror,
    verbose=20,
    early_stopping_rounds=100,
)

Training until validation scores don't improve for 100 rounds
[20]	valid_0's multi_logloss: 1.45094	valid_0's f1_score: 0.40638
[40]	valid_0's multi_logloss: 1.43375	valid_0's f1_score: 0.40599
[60]	valid_0's multi_logloss: 1.42871	valid_0's f1_score: 0.407418
[80]	valid_0's multi_logloss: 1.42688	valid_0's f1_score: 0.408683
[100]	valid_0's multi_logloss: 1.42505	valid_0's f1_score: 0.409115
[120]	valid_0's multi_logloss: 1.42398	valid_0's f1_score: 0.410432
[140]	valid_0's multi_logloss: 1.42311	valid_0's f1_score: 0.409683
[160]	valid_0's multi_logloss: 1.4231	valid_0's f1_score: 0.410029
[180]	valid_0's multi_logloss: 1.42312	valid_0's f1_score: 0.41006
[200]	valid_0's multi_logloss: 1.42251	valid_0's f1_score: 0.409579
[220]	valid_0's multi_logloss: 1.42221	valid_0's f1_score: 0.409973
[240]	valid_0's multi_logloss: 1.4225	valid_0's f1_score: 0.410764
[260]	valid_0's multi_logloss: 1.42219	valid_0's f1_score: 0.410813
[280]	valid_0's multi_logloss: 1.42183	valid_0's f1_score: 0.41

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=10,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.7,
               n_estimators=320, n_jobs=60, num_leaves=31,
               objective='multiclass', random_state=1, reg_alpha=12,
               reg_lambda=0.0, silent=True, subsample=0.7,
               subsample_for_bin=200000, subsample_freq=0)

In [15]:
def get_new_features(df, threshold=0.99):
    """Return the leading features whose cumulative importance share exceeds ``threshold``.

    Args:
        df: frame with a feature-name column ``'f'`` and an importance column
            ``'im'``; rows are expected to be ordered from most to least
            important.
        threshold: fraction of the total importance that the selected prefix
            must strictly exceed (defaults to the previously hard-coded 0.99).

    Returns:
        List with the shortest prefix of ``df['f']`` whose summed importance
        divided by the total is greater than ``threshold``. If the total
        importance is zero or no prefix exceeds the threshold, every feature
        is returned; an empty frame yields an empty list.
    """
    if df.shape[0] == 0:
        return []

    importances = df['im'].values.astype(float)
    total = importances.sum()
    if total <= 0:
        # No importance information to rank by: keep everything.
        return df['f'].values.tolist()

    # One pass instead of re-summing the prefix on every iteration.
    cumulative_share = np.cumsum(importances) / total
    exceeding = np.flatnonzero(cumulative_share > threshold)

    # Keep up to and including the first row that pushes the share over the
    # threshold; previously the last feature was dropped when only the full
    # set reached it.
    n_keep = int(exceeding[0]) + 1 if exceeding.size else df.shape[0]
    return df['f'].iloc[:n_keep].values.tolist()

In [16]:
# Table of (feature name, importance) pairs from the fitted baseline model,
# ordered from the most to the least important feature.
importance_pairs = zip(lgbm.feature_name_, lgbm.feature_importances_)
f_imp = (
    pd.DataFrame(importance_pairs, columns=['f', 'im'])
    .sort_values(by='im', ascending=False)
)

In [17]:
# Reduced attribute list: the top-ranked features selected by
# get_new_features from the importance table (0.99 cumulative-share cut-off).
new_atrs = get_new_features(f_imp)

In [99]:
# Same configuration as the baseline model, now with the per-class weights.
# Note that this rebinds the name `lgbm`.
lgbm = LGBMClassifier(
    objective='multiclass',
    reg_alpha=12,
    max_depth=10,
    min_split_gain=0.7,
    subsample=0.7,
    n_estimators=320,
    n_jobs=60,
    random_state=SEED,
    class_weight=class_weight,
)

In [100]:
# Fit the class-weighted model on the reduced attribute list; the hold-out
# frame drives early stopping and the metric is logged every 5 iterations.
lgbm.fit(
    train_df_less[new_atrs],
    train_df_less['multi_class_target'],
    eval_set=(test_df[new_atrs], test_df['multi_class_target']),
    eval_metric=evalerror,
    verbose=5,
    early_stopping_rounds=100,
)

Training until validation scores don't improve for 100 rounds
[5]	valid_0's multi_logloss: 1.53466	valid_0's f1_score: 0.352629
[10]	valid_0's multi_logloss: 1.43285	valid_0's f1_score: 0.390188
[15]	valid_0's multi_logloss: 1.39226	valid_0's f1_score: 0.399601
[20]	valid_0's multi_logloss: 1.37346	valid_0's f1_score: 0.405104
[25]	valid_0's multi_logloss: 1.36429	valid_0's f1_score: 0.407788
[30]	valid_0's multi_logloss: 1.35893	valid_0's f1_score: 0.409097
[35]	valid_0's multi_logloss: 1.35745	valid_0's f1_score: 0.410357
[40]	valid_0's multi_logloss: 1.35549	valid_0's f1_score: 0.411354
[45]	valid_0's multi_logloss: 1.35358	valid_0's f1_score: 0.411935
[50]	valid_0's multi_logloss: 1.3523	valid_0's f1_score: 0.412762
[55]	valid_0's multi_logloss: 1.35132	valid_0's f1_score: 0.413338
[60]	valid_0's multi_logloss: 1.35075	valid_0's f1_score: 0.414003
[65]	valid_0's multi_logloss: 1.35024	valid_0's f1_score: 0.413719
[70]	valid_0's multi_logloss: 1.34942	valid_0's f1_score: 0.414253
[7

LGBMClassifier(boosting_type='gbdt',
               class_weight={'card2card_transfer': 0.6398475182669389,
                             'card_recharge': 0.4502000735911028,
                             'chat': 0.3815013547758437,
                             'credit_info': 0.46101125727352393,
                             'invest': 0.9321761812255462,
                             'main_screen': 0.982994897893273,
                             'mobile_recharge': 0.641212783884727,
                             'own_transfer': 0.6317591535575899,
                             'phone_money_transfer': 0.4600662111029351,
                             'statement': 0.6951657522904898},
               colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
               max_depth=10, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.7, n_estimators=320, n_jobs=60, num_leaves=31,
               objective='multiclass', random_state=1, reg_alpha=12,
          

In [109]:
def blending_lgbm(model, metric, X, y, X_eval, y_eval):
    """Train one LightGBM classifier per stratified fold and score each on a fixed hold-out.

    ``X`` / ``y`` are split into 5 shuffled, stratified folds (seeded with the
    notebook-level ``SEED``). For each split a fresh ``LGBMClassifier`` is
    fitted on the training part of the fold only; ``(X_eval, y_eval)`` serves
    both as the early-stopping set and as the data the reported score is
    computed on. The held-out part of each fold is not used.

    NOTE(review): ``model`` is accepted but never used — every fold builds its
    own classifier from the hyper-parameters hard-coded below and the
    notebook-level ``class_weight``.

    Args:
        model: unused (see note above).
        metric: callable invoked as ``metric(y_eval, predictions, average='macro')``.
        X: training features, indexable with ``.iloc``.
        y: training target, indexable with ``.iloc``.
        X_eval: hold-out features.
        y_eval: hold-out target.

    Returns:
        Tuple ``(models, metrics)``: the fitted per-fold classifiers and their
        hold-out scores, in fold order. Each score is also printed.
    """
    splitter = StratifiedKFold(5, shuffle=True, random_state=SEED)
    fold_models, fold_scores = [], []

    for fit_idx, _ in splitter.split(X, y):
        fold_model = LGBMClassifier(
            objective='multiclass',
            class_weight=class_weight,
            reg_alpha=12,
            max_depth=10,
            min_split_gain=0.7,
            subsample=0.7,
            n_estimators=250,
            n_jobs=60,
            random_state=SEED,
        )
        fold_model.fit(
            X.iloc[fit_idx],
            y.iloc[fit_idx],
            eval_set=(X_eval, y_eval),
            eval_metric=evalerror,
            early_stopping_rounds=100,
        )
        fold_models.append(fold_model)

        score = metric(y_eval, fold_model.predict(X_eval), average='macro')
        fold_scores.append(score)
        print(fold_scores[-1])

    return fold_models, fold_scores

In [111]:
# Train the per-fold models on the reduced attribute list and score each one
# with macro F1 on the hold-out frame.
models, metrics = blending_lgbm(
    lgbm,
    f1_score,
    train_df_less[new_atrs],
    train_df_less['multi_class_target'],
    test_df[new_atrs],
    test_df['multi_class_target'],
)

[1]	valid_0's multi_logloss: 1.76398	valid_0's f1_score: 0.0895974
Training until validation scores don't improve for 100 rounds
[2]	valid_0's multi_logloss: 1.67971	valid_0's f1_score: 0.188234
[3]	valid_0's multi_logloss: 1.61869	valid_0's f1_score: 0.275309
[4]	valid_0's multi_logloss: 1.57248	valid_0's f1_score: 0.326217
[5]	valid_0's multi_logloss: 1.5361	valid_0's f1_score: 0.348786
[6]	valid_0's multi_logloss: 1.50662	valid_0's f1_score: 0.363021
[7]	valid_0's multi_logloss: 1.48293	valid_0's f1_score: 0.372218
[8]	valid_0's multi_logloss: 1.46342	valid_0's f1_score: 0.379172
[9]	valid_0's multi_logloss: 1.44771	valid_0's f1_score: 0.384937
[10]	valid_0's multi_logloss: 1.43443	valid_0's f1_score: 0.389058
[11]	valid_0's multi_logloss: 1.4236	valid_0's f1_score: 0.390965
[12]	valid_0's multi_logloss: 1.41443	valid_0's f1_score: 0.393226
[13]	valid_0's multi_logloss: 1.40657	valid_0's f1_score: 0.395367
[14]	valid_0's multi_logloss: 1.39962	valid_0's f1_score: 0.39805
[15]	valid_

In [106]:
# submit_df['prediction'] = lgbm.predict(submit_df[new_atrs])

In [108]:
# submit_df.reset_index()[['client_pin', 'prediction']].to_csv('another_submit.csv', index=False)

In [112]:
# Start the ensemble with the first fold model's class probabilities for the
# submission rows; the other fold models are added to this array next.
predicts = models[0].predict_proba(submit_df[new_atrs])

In [113]:
# Add the class probabilities of every remaining fold model to the running
# sum. Iterating over models[1:] follows however many models were trained
# instead of assuming exactly five.
# NOTE: `predicts` is updated in place, so re-running this cell without
# re-running the previous one adds the probabilities a second time.
for fold_model in models[1:]:
    predicts += fold_model.predict_proba(submit_df[new_atrs])

(79268, 10)
(79268, 10)
(79268, 10)
(79268, 10)


In [114]:
# Pick, per row, the class with the highest summed probability and map the
# column index back to its label via the first model's classes_.
best_class_idx = predicts.argmax(axis=-1)
submit_df['prediction'] = models[0].classes_[best_class_idx]

In [115]:
# Write the client id and its predicted class to the submission file.
submission = submit_df.reset_index()[['client_pin', 'prediction']]
submission.to_csv('ensemble.csv', index=False)