In [1]:
import os
import gc
import time
import pickle
from pathlib import Path
import numpy as np
import pandas as ps

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sbn
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
data_folder = Path('..') / 'data'

# Read the preprocessed train / test frames back from their pickles.
# NOTE(review): pickle.load can execute arbitrary code from the file, so these
# files are assumed to come from this project's own preprocessing — confirm.
with (data_folder / 'processed' / 'train_12.pkl').open('rb') as train_file:
    train_df = pickle.load(train_file)

with (data_folder / 'processed' / 'test_12.pkl').open('rb') as test_file:
    test_df = pickle.load(test_file)

# CSV variant of the same load, currently switched off:
# test_df = ps.read_csv(data_folder / 'processed' / 'test.csv')
# train_df = ps.read_csv(data_folder / 'processed' / 'train.csv')

# Quick sanity check on what was loaded.
print('Train shapes:', train_df.shape)
print('Test shapes:', test_df.shape)

Train shapes: (354851, 210)
Test shapes: (202614, 209)


In [3]:
# train_df.columns.tolist()

In [4]:
# Name of the label column.
target = 'CancelFlag'

# Column-name PREFIXES that must not be fed to the model: any column whose
# name starts with one of these entries is excluded below.
drop_features = [
    'OrderID',
    'Date_day',
    'Date_year',
    'OrderDate_day',
    'OrderDate_year',
    target,
]
# Currently NOT dropped (entries left switched off in earlier experiments):
#   'Date_month', 'OrderDate_month'
# Low-importance candidates that were tried for removal and then disabled:
#   'MaterialsCnt|min', 'MaterialsCnt|mean', 'MaterialsCnt|max', 'MaterialsCnt|std',
#   'StartInterval_16|mean', 'HasPreviousOrder', 'GroupID_48', 'GroupID_44',
#   'Cluster_25', 'Cluster_26', 'Cluster_27', 'Cluster_28', 'Cluster_29'

# str.startswith accepts a tuple of prefixes, which is equivalent to testing
# each prefix in turn.
features2use = [
    column
    for column in train_df.columns
    if not column.startswith(tuple(drop_features))
]

# No columns are declared categorical at the moment. Candidates considered
# earlier: 'ChannelID', 'OrderID', 'MaterialID', 'GroupID', 'Cluster',
# 'DeliveryType' (plus the interval / weekday / date-part columns).
categorical_features = []

In [5]:
# Number of columns that survived the prefix-based drop list.
len(features2use)

208

In [6]:
# Add a placeholder target column to the test frame; make_predictions later
# overwrites it with the fold-averaged predicted probabilities.
test_df[target] = 0
# Uncomment to iterate quickly on a small slice of the training data:
# train_df = train_df.iloc[:10_000]

In [7]:
def make_predictions(folds, train_dframe, test_dframe, model_params, 
                     n_estimators=50_000, n_jobs=12, 
                     verbose=10_000, early_stopping_rounds=200):
    """Train one XGBoost classifier per CV fold and average the test predictions.

    Relies on the notebook-level globals ``features2use`` (model input columns)
    and ``target`` (label column name).

    Parameters
    ----------
    folds : splitter exposing ``n_splits`` and ``split(X, y)``
        Yields positional train / validation indices for each fold.
    train_dframe : DataFrame
        Training data containing ``features2use`` and ``target``.
    test_dframe : DataFrame
        Data to score. NOTE: its ``target`` column is overwritten in place
        with the fold-averaged predicted probability.
    model_params : dict
        Keyword arguments forwarded to ``xgb.XGBClassifier``.
    n_estimators : int
        Upper bound on boosting rounds (early stopping may end training sooner).
    n_jobs : int
        Number of threads handed to XGBoost.
    verbose : int or bool
        Passed to ``fit`` to control how often evaluation results are logged.
    early_stopping_rounds : int
        Passed to ``fit``; training stops when the evaluation metric has not
        improved for this many rounds.

    Returns
    -------
    tuple
        ``(test_dframe, oof_preds, feature_importance)`` where ``oof_preds`` is
        an ``(n_train, 1)`` array of out-of-fold probabilities and
        ``feature_importance`` holds the importance averaged over folds.
    """
    X, y = train_dframe[features2use], train_dframe[target]
    P = test_dframe[features2use]
    num_folds = folds.n_splits

    oof_preds = np.zeros((len(train_dframe), 1))
    predictions = np.zeros((len(test_dframe), 1))

    feature_importance = ps.DataFrame.from_dict({
        'feature': features2use,
        'importance': np.zeros(len(features2use))
    })
    scores = []

    for fold_, (train_idx, valid_idx) in enumerate(folds.split(X, y.values)):
        print()
        print(f'[{time.ctime()}] Fold: {fold_ + 1}/{num_folds}', flush=True)

        # The splitter returns POSITIONAL indices, so both the features and the
        # labels must be sliced with .iloc; plain y[idx] is label-based and is
        # only correct when the frame happens to have a default RangeIndex.
        train_x, train_y = X.iloc[train_idx, :], y.iloc[train_idx]
        valid_x, valid_y = X.iloc[valid_idx, :], y.iloc[valid_idx]

        print('Train & val shapes -', len(train_x), len(valid_y), flush=True)

        # Class re-weighting (scale_pos_weight) was tried earlier and is
        # deliberately left at the library default here.
        model = xgb.XGBClassifier(
            **model_params,
            n_estimators=n_estimators,
            n_jobs=n_jobs,
        )
        model.fit(
            train_x, train_y,
            eval_set=[(train_x, train_y), (valid_x, valid_y)],
            early_stopping_rounds=early_stopping_rounds,
            verbose=verbose,
        )
        # Accumulate the per-fold share so the final column is the fold average.
        feature_importance['importance'] += model.feature_importances_ / num_folds

        valid_preds = model.predict_proba(valid_x)[:, 1]
        oof_preds[valid_idx] = valid_preds.reshape(-1, 1)

        # store fold scores
        scores.append(roc_auc_score(valid_y.values, valid_preds))

        test_preds = model.predict_proba(P)[:, 1]
        predictions += test_preds.reshape(-1, 1)

    # Flatten so the column is assigned from a plain 1-D array.
    test_dframe[target] = (predictions / num_folds).ravel()

    print()
    print(f'Folds score - {np.mean(scores):.6f} +- {np.std(scores):.6f}', flush=True)
    print()

    return test_dframe, oof_preds, feature_importance

In [8]:
# Settings forwarded as keyword arguments to xgb.XGBClassifier.
# Only XGBoost parameter names are used: the former 'boosting_type' and
# 'bagging_seed' entries are LightGBM names that XGBoost does not consume;
# 'booster' is the XGBoost spelling and seeding is covered by 'random_state'.
params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.01,
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'verbosity': 0,
    'random_state': 47,
}
# LightGBM-style values from an earlier tuning run, kept for reference only:
#   num_leaves=491, min_child_weight=0.03454472573214212,
#   feature_fraction=0.3797454081646243, bagging_fraction=0.4181193142567742,
#   min_data_in_leaf=106, max_depth=1_000,
#   reg_alpha=0.3899927210061127, reg_lambda=0.6485237330340494

# random_state only has an effect when shuffle=True, and recent scikit-learn
# raises ValueError if it is set while shuffle is False. The folds here are
# unshuffled, so the seed is dropped and the splits stay exactly as before.
# To shuffle reproducibly instead, use shuffle=True, random_state=2019.
folds = StratifiedKFold(n_splits=7, shuffle=False)
# folds = KFold(n_splits=7, shuffle=False)

In [9]:
%%time

# Cross-validated training run: returns the scored test frame, the
# out-of-fold predictions and the fold-averaged feature importances.
test_df, oof_preds, fi = make_predictions(
    folds=folds,
    train_dframe=train_df,
    test_dframe=test_df,
    model_params=params,
    n_jobs=os.cpu_count(),  # use every available core
    verbose=500,
    early_stopping_rounds=500,
)


[Mon Nov 18 17:59:51 2019] Fold: 1/7
Train & val shapes - 304157 50694
[0]	validation_0-auc:0.641411	validation_1-auc:0.614396
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 500 rounds.
[500]	validation_0-auc:0.804036	validation_1-auc:0.76251
[1000]	validation_0-auc:0.850117	validation_1-auc:0.814354
[1500]	validation_0-auc:0.863827	validation_1-auc:0.830799
[2000]	validation_0-auc:0.872726	validation_1-auc:0.843999
[2500]	validation_0-auc:0.878964	validation_1-auc:0.85422
[3000]	validation_0-auc:0.883514	validation_1-auc:0.861055
[3500]	validation_0-auc:0.887118	validation_1-auc:0.865541
[4000]	validation_0-auc:0.890246	validation_1-auc:0.869211
[4500]	validation_0-auc:0.892896	validation_1-auc:0.871918
[5000]	validation_0-auc:0.895248	validation_1-auc:0.873825
[5500]	validation_0-auc:0.89727	validation_1-auc:0.87514
[6000]	validation_0-auc:0.899134	validation_1-auc:0.876945
[6500]	vali

In [10]:
# Order the importance table from most to least important feature.
fi = fi.sort_values(by='importance', ascending=False)

In [11]:
# Rows whose fold-averaged importance is exactly zero.
fi[fi['importance'] == 0]

Unnamed: 0,feature,importance
22,DiffWithPrevNumMaterialsByMonth|mean,0.0
21,PrevNumMaterialsByMonth|mean,0.0
86,Cluster_25,0.0
24,PrevNumGroupMaterialsByDay|mean,0.0
20,NumMaterialsByMonth|mean,0.0
87,Cluster_26,0.0
88,Cluster_27,0.0
89,Cluster_28,0.0
90,Cluster_29,0.0
23,NumGroupMaterialsByDay|mean,0.0


In [12]:
# Same zero-importance selection, reduced to a plain list of feature names
# (handy for pasting into drop_features).
fi[fi['importance'] == 0]['feature'].tolist()

['DiffWithPrevNumMaterialsByMonth|mean',
 'PrevNumMaterialsByMonth|mean',
 'Cluster_25',
 'PrevNumGroupMaterialsByDay|mean',
 'NumMaterialsByMonth|mean',
 'Cluster_26',
 'Cluster_27',
 'Cluster_28',
 'Cluster_29',
 'NumGroupMaterialsByDay|mean',
 'GroupID_44',
 'DiffWithPrevNumGroupMaterialsByDay|mean',
 'DiffWithPrevNumUniqueMaterialsByOrder|mean',
 'MaterialsCnt|std',
 'MaterialsCnt|mean',
 'MaterialsCnt|max',
 'NumGroupMaterialsByMonth|mean',
 'PrevNumGroupMaterialsByMonth|mean',
 'DiffWithPrevNumGroupMaterialsByMonth|mean',
 'NumUniqueMaterialsByOrder|mean',
 'MaterialsCnt|min',
 'MaterialsCnt',
 'HasPreviousOrder_y|mean',
 'PrevNumUniqueMaterialsByOrder|mean',
 'HasPreviousOrder_x|mean',
 'StartInterval_16']

In [None]:
# Horizontal bar chart with one bar per feature; the figure is made tall
# (12 x 25) because every feature in `fi` gets its own row on the y axis.
plt.figure(figsize=(12, 25))
# Trailing ';' suppresses the Axes repr in the cell output.
sbn.barplot(x='importance', y='feature', data=fi);

In [None]:
# Persist the importance table next to the notebook (relative path, so it
# lands in the current working directory); the row index is not written.
fi.to_csv('xgb_feature_importance.csv', index=False)

In [None]:
t = test_df.copy()

# Collapse the row-level predictions to one value per order by averaging,
# then rename the columns to the 'ID' / 'Score' pair used in the output file.
res = (
    t.groupby('OrderID')['CancelFlag']
    .mean()
    .reset_index()
    .rename(columns={'OrderID': 'ID', 'CancelFlag': 'Score'})
)

# Every ID is turned into a string with one trailing space.
# NOTE(review): presumably required by the expected submission format — confirm.
res['ID'] = res['ID'].map(lambda item: f'{item} ')

print(res.shape)
res.head()

In [None]:
# Load data/empty.csv and show its size and first rows.
# NOTE(review): presumably the blank submission template used to compare
# against `res` — confirm.
empty_df = ps.read_csv(data_folder / 'empty.csv')
print(empty_df.shape)
empty_df.head()

In [None]:
# Write the per-order scores to the submission folder (path is relative to
# the notebook's working directory); the row index is not written.
res.to_csv('../submission/xgb_dummy_12.csv', index=False)