In [1]:
import pickle
from pathlib import Path
import numpy as np
import pandas as ps
import category_encoders as ce
import datetime
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sbn
%matplotlib inline

import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

# Raise pandas' display limits so long/wide frames are rendered without truncation.
ps.set_option('display.max_rows', 500)
ps.set_option('display.max_columns', 500)
ps.set_option('display.width', 1000)

In [2]:
# Input CSVs are read from ../data (relative to the notebook's working directory).
data_folder = Path('..') / 'data'
# NOTE(review): the DeliveryType values displayed further down render as mojibake
# under latin-1, which suggests the files may really be cp1251-encoded — confirm.
# The mapping keys in map_all_the_stuff() are written against the latin-1 decoding,
# so changing the encoding here requires updating those keys as well.
test_df = ps.read_csv(data_folder / 'test.csv', encoding='latin-1')
# Placeholder target for the test frame (presumably absent from test.csv — confirm),
# so train and test expose the same columns to the shared feature functions.
test_df['CancelFlag'] = 0
train_df = ps.read_csv(data_folder / 'train.csv', encoding='latin-1')

# Report (rows, columns) of both frames.
print('Train shapes:', train_df.shape)
print('Test shapes:', test_df.shape)

Train shapes: (9023184, 14)
Test shapes: (5032740, 14)


In [3]:
# Show the first 10 training rows whose CancelFlag equals 1.
train_df[train_df['CancelFlag'] == 1].head(10)

Unnamed: 0,Interval,Date,OrderDate,ClientID,ChannelID,OrderID,MaterialID,GroupID,Cluster,CancelFlag,OrderCnt,DeliveryType,prepay,count_edit
28,10-18.,17/10/2018,15/10/2018,93411902,2,90102104012,3294844.0,36.0,,1,1.0,Îáû÷íàÿ äîñòàâêà,0,1
29,10-18.,17/10/2018,15/10/2018,93411902,2,90102104012,3309718.0,30.0,,1,1.0,Îáû÷íàÿ äîñòàâêà,0,1
30,10-18.,17/10/2018,15/10/2018,93411902,2,90102104012,3043258.0,63.0,,1,2.0,Îáû÷íàÿ äîñòàâêà,0,1
31,10-18.,17/10/2018,15/10/2018,93411902,2,90102104012,3040464.0,17.0,,1,1.0,Îáû÷íàÿ äîñòàâêà,0,1
32,10-18.,17/10/2018,15/10/2018,93411902,2,90102104012,3349229.0,12.0,,1,1.0,Îáû÷íàÿ äîñòàâêà,0,1
33,10-18.,17/10/2018,15/10/2018,93411902,2,90102104012,3349205.0,12.0,,1,1.0,Îáû÷íàÿ äîñòàâêà,0,1
34,10-18.,17/10/2018,15/10/2018,93411902,2,90102104012,2012526.0,36.0,,1,1.0,Îáû÷íàÿ äîñòàâêà,0,1
35,10-18.,17/10/2018,15/10/2018,93411902,2,90102104012,3349914.0,18.0,,1,1.0,Îáû÷íàÿ äîñòàâêà,0,1
36,10-18.,17/10/2018,15/10/2018,93411902,2,90102104012,3350718.0,61.0,,1,1.0,Îáû÷íàÿ äîñòàâêà,0,1
37,10-18.,17/10/2018,15/10/2018,93411902,2,90102104012,3049446.0,36.0,,1,1.0,Îáû÷íàÿ äîñòàâêà,0,1


In [4]:
# Dates (formatted dd/mm/YYYY) that is_holiday() reports as holidays.
holidays_set = {
    # January
    '01/01/2018', '02/01/2018', '03/01/2018', '04/01/2018',
    '05/01/2018', '06/01/2018', '07/01/2018',
    # February - March
    '23/02/2018', '08/03/2018', '09/03/2018',
    # April - June
    '30/04/2018', '01/05/2018', '09/05/2018', '11/06/2018', '12/06/2018',
    # November
    '04/11/2018', '05/11/2018', '25/11/2018',
}


def is_holiday(d: str) -> bool:
    """Return True when the date string *d* is listed in ``holidays_set``.

    The comparison is an exact string match, so *d* must use the same
    dd/mm/YYYY formatting as the entries of ``holidays_set``.
    """
    listed = d in holidays_set
    return listed

In [5]:
def map_all_the_stuff(d: ps.DataFrame) -> ps.DataFrame:
    """Derive row-level features on *d* and return it.

    The frame is modified in place (columns are overwritten / appended) and the
    same object is returned.  Steps, in order:

    * encode ``DeliveryType`` as 0/1 via a fixed mapping (values that are not
      keys of the mapping become NaN) and fill missing ``Cluster``,
      ``OrderCnt``, ``GroupID`` and ``MaterialID`` values;
    * split ``Interval`` (strings such as ``'10-18.'``) into integer
      ``StartInterval`` / ``EndInterval``;
    * for ``Date`` and ``OrderDate``: flag holidays from the raw string, parse
      the column as ``%d/%m/%Y`` and extract month and weekday
      (day-of-month and year extraction is currently disabled);
    * compute the ``Date - OrderDate`` gap and its length in days;
    * cast ``MaterialID``, ``GroupID`` and ``OrderCnt`` to compact integer types.

    ``Interval``, ``Date``, ``OrderDate`` and ``DatesGap`` are kept on the frame.
    """
    # --- categorical mapping and missing-value defaults -----------------
    delivery_codes = {'Îáû÷íàÿ äîñòàâêà': 0, 'Äîñòàâêà Äåíü â Äåíü': 1}
    d['DeliveryType'] = d['DeliveryType'].map(delivery_codes)
    d['Cluster'] = d['Cluster'].fillna('MISSING')
    for zero_filled in ('OrderCnt', 'GroupID', 'MaterialID'):
        d[zero_filled] = d[zero_filled].fillna(0)

    # --- delivery interval bounds ---------------------------------------
    def _interval_start(raw):
        # Text before the first '-'.
        return int(raw.rsplit('-')[0])

    def _interval_end(raw):
        # Second '-'-separated piece without its last character.
        return int(raw.rsplit('-')[1][:-1])

    d['StartInterval'] = d['Interval'].apply(_interval_start)
    d['EndInterval'] = d['Interval'].apply(_interval_end)

    # --- calendar features ----------------------------------------------
    for date_col in ('Date', 'OrderDate'):
        # The holiday lookup works on the raw string, so it must run before
        # the column is converted to datetime.
        d[date_col + '_is_holiday'] = d[date_col].apply(is_holiday)
        d[date_col] = ps.to_datetime(d[date_col], format='%d/%m/%Y')
        d[date_col + '_month'] = d[date_col].dt.month
        d[date_col + '_weekday'] = d[date_col].dt.weekday

    d['DatesGap'] = d['Date'] - d['OrderDate']
    d['DatesGap_days'] = d['DatesGap'].dt.days

    # --- compact integer dtypes -----------------------------------------
    compact_types = (
        ('MaterialID', np.int32),
        ('GroupID', np.int16),
        ('OrderCnt', np.int16),
    )
    for name, dtype in compact_types:
        d[name] = d[name].astype(dtype)

    return d


def client_orders_cnt(d: ps.DataFrame) -> ps.DataFrame:
    """Attach per-client order statistics to every row of *d*.

    Adds two columns, looked up by ``ClientID``:

    * ``ClientOrderCnt`` - number of distinct ``OrderID`` values of the client
      within *d*;
    * ``HasPreviousOrder`` - True when that number is greater than 1.

    Returns a new frame (left merge); *d* itself is not modified.
    """
    per_client = (
        d.groupby('ClientID')['OrderID']
        .nunique()
        .rename('ClientOrderCnt')
        .reset_index()
    )
    per_client['HasPreviousOrder'] = per_client['ClientOrderCnt'].gt(1)
    return d.merge(per_client, on='ClientID', how='left')


def order_days_delay(d: ps.DataFrame) -> ps.DataFrame:
    """Attach ``OrderDatesDaysDelay``: days since the client's previous order.

    One ``OrderDate`` is taken per (``ClientID``, ``OrderID``) pair (the first
    one encountered).  Within each client the orders are then arranged
    chronologically and the delay is the number of whole days between an
    order's date and the date of the order placed just before it.

    The delay is 0 both for a client's earliest order (no predecessor) and for
    an order placed on the same day as its predecessor.

    Parameters
    ----------
    d : DataFrame with ``ClientID``, ``OrderID`` and a datetime ``OrderDate``.

    Returns
    -------
    A new frame: *d* left-merged with the integer ``OrderDatesDaysDelay``
    column on (``ClientID``, ``OrderID``); row order of *d* is preserved.
    """
    keys = ['ClientID', 'OrderID']
    order_dates = (
        d.groupby(keys)
        .agg({'OrderDate': 'first'})
        .reset_index()
        # groupby() leaves the rows ordered by (ClientID, OrderID).  Order them
        # by date explicitly so that "previous order" means the chronologically
        # previous one even when OrderID values are not monotonic in time;
        # OrderID is only a tie-breaker for same-day orders.
        .sort_values(['ClientID', 'OrderDate', 'OrderID'])
    )
    previous_date = order_dates.groupby('ClientID')['OrderDate'].shift(1)
    order_dates['OrderDatesDaysDelay'] = (
        (order_dates['OrderDate'] - previous_date)
        .dt.days
        .fillna(0)  # earliest order of a client has no predecessor
        .astype(int)
    )
    return d.merge(
        order_dates[keys + ['OrderDatesDaysDelay']],
        on=keys,
        how='left',
    )


def orders_in_month(d: ps.DataFrame) -> ps.DataFrame:
    """Attach ``MonthlyOrderCnt`` to every row of *d*.

    ``MonthlyOrderCnt`` is the number of rows of *d* sharing the row's
    (``ClientID``, ``Date_month``) pair - i.e. it counts rows, not distinct
    orders.  Rows left without a match by the merge get 0.

    Returns a new frame (left merge); *d* itself is not modified.
    """
    keys = ['ClientID', 'Date_month']
    monthly = d.groupby(keys)['Date'].size().reset_index(name='MonthlyOrderCnt')
    enriched = d.merge(monthly, on=keys, how='left')
    enriched['MonthlyOrderCnt'] = enriched['MonthlyOrderCnt'].fillna(0)
    return enriched

In [6]:
%%time

# Derive the row-level features on both frames (map_all_the_stuff edits them in place
# and returns the same object).
train_df = map_all_the_stuff(train_df)
test_df = map_all_the_stuff(test_df)

CPU times: user 1min 16s, sys: 1.54 s, total: 1min 18s
Wall time: 51.5 s


In [7]:
%%time

# Add MonthlyOrderCnt; computed separately for train and test.
train_df = orders_in_month(train_df)
test_df = orders_in_month(test_df)

CPU times: user 12.2 s, sys: 3.09 s, total: 15.3 s
Wall time: 5.38 s


In [8]:
%%time

# Add ClientOrderCnt / HasPreviousOrder; computed separately for train and test.
train_df = client_orders_cnt(train_df)
test_df = client_orders_cnt(test_df)

CPU times: user 15.1 s, sys: 3.42 s, total: 18.5 s
Wall time: 10.5 s


In [9]:
%%time

# Add OrderDatesDaysDelay; computed separately for train and test.
train_df = order_days_delay(train_df)
test_df = order_days_delay(test_df)

CPU times: user 14.1 s, sys: 3.8 s, total: 17.9 s
Wall time: 6.89 s


In [10]:
# Remove the raw interval/date columns and the timedelta gap now that the derived
# features exist. In-place: re-running this cell on the same frames raises KeyError.
train_df.drop(columns=['Interval', 'Date', 'OrderDate', 'DatesGap'], inplace=True)
test_df.drop(columns=['Interval', 'Date', 'OrderDate', 'DatesGap'], inplace=True)

In [11]:
# Preview the row-level feature frame.
train_df.head()

Unnamed: 0,ClientID,ChannelID,OrderID,MaterialID,GroupID,Cluster,CancelFlag,OrderCnt,DeliveryType,prepay,count_edit,StartInterval,EndInterval,Date_is_holiday,Date_month,Date_weekday,OrderDate_is_holiday,OrderDate_month,OrderDate_weekday,DatesGap_days,MonthlyOrderCnt,ClientOrderCnt,HasPreviousOrder,OrderDatesDaysDelay
0,93808186,2,90102063002,3328810,61,MISSING,0,1,0,0,1,14,16,False,10,2,False,10,1,1,117,35,True,0
1,93808186,2,90102063002,3281258,30,MISSING,0,2,0,0,1,14,16,False,10,2,False,10,1,1,117,35,True,0
2,93808186,2,90102063002,3210734,10,MISSING,0,1,0,0,1,14,16,False,10,2,False,10,1,1,117,35,True,0
3,93808186,2,90102063002,3328848,61,MISSING,0,2,0,0,1,14,16,False,10,2,False,10,1,1,117,35,True,0
4,94112406,2,90102091007,3347801,17,MISSING,0,10,0,0,1,12,14,False,10,6,False,10,4,2,5,7,True,0


In [12]:
# %%time
# cat_cols = ['Cluster']

# for c in cat_cols:
#     le = LabelEncoder()
#     le.fit(np.concatenate([train_df[c], test_df[c]]))
#     train_df[c] = le.transform(train_df[c])
#     test_df[c] = le.transform(test_df[c])

## One Hot Encoding

In [13]:
%%time

# Categorical columns to one-hot encode.
cols2ohe = [
    'ChannelID', 'Cluster', 'GroupID', 
    'StartInterval', 'EndInterval', 
    'Date_month', 'OrderDate_month',
    'Date_weekday', 'OrderDate_weekday'
]
one_hot_encoder = ce.OneHotEncoder(verbose=1, cols=cols2ohe)
# OrderID is carried along (it is not in `cols`) so the encoded rows can be
# grouped per order afterwards.
used_in_ohe_cols = ['OrderID'] + cols2ohe


# Keep each distinct (OrderID, categorical values) combination once.
train_ohe = train_df[used_in_ohe_cols].drop_duplicates()
test_ohe = test_df[used_in_ohe_cols].drop_duplicates()

# Fit on train and test together so categories from both frames are known to the encoder.
one_hot_encoder.fit(ps.concat([
    train_ohe, 
    test_ohe,
]))

CPU times: user 1min 7s, sys: 42.5 s, total: 1min 50s
Wall time: 28.1 s


OneHotEncoder(cols=['ChannelID', 'Cluster', 'GroupID', 'StartInterval',
                    'EndInterval', 'Date_month', 'OrderDate_month',
                    'Date_weekday', 'OrderDate_weekday'],
              drop_invariant=False, handle_missing='value',
              handle_unknown='value', return_df=True, use_cat_names=False,
              verbose=1)

In [14]:
%%time

# Replace the de-duplicated training rows by their one-hot encoding
# (train_ohe is overwritten, so this cell is not re-runnable on its own output).
train_ohe = one_hot_encoder.transform(train_ohe)
print(train_ohe.shape)
train_ohe.head()

(3563213, 152)
CPU times: user 37 s, sys: 26.1 s, total: 1min 3s
Wall time: 14.6 s


Unnamed: 0,OrderID,ChannelID_1,ChannelID_2,ChannelID_3,ChannelID_4,ChannelID_5,Cluster_1,Cluster_2,Cluster_3,Cluster_4,Cluster_5,Cluster_6,Cluster_7,Cluster_8,Cluster_9,Cluster_10,Cluster_11,Cluster_12,Cluster_13,Cluster_14,Cluster_15,Cluster_16,Cluster_17,Cluster_18,Cluster_19,Cluster_20,Cluster_21,Cluster_22,Cluster_23,Cluster_24,Cluster_25,Cluster_26,Cluster_27,Cluster_28,Cluster_29,GroupID_1,GroupID_2,GroupID_3,GroupID_4,GroupID_5,GroupID_6,GroupID_7,GroupID_8,GroupID_9,GroupID_10,GroupID_11,GroupID_12,GroupID_13,GroupID_14,GroupID_15,GroupID_16,GroupID_17,GroupID_18,GroupID_19,GroupID_20,GroupID_21,GroupID_22,GroupID_23,GroupID_24,GroupID_25,GroupID_26,GroupID_27,GroupID_28,GroupID_29,GroupID_30,GroupID_31,GroupID_32,GroupID_33,GroupID_34,GroupID_35,GroupID_36,GroupID_37,GroupID_38,GroupID_39,GroupID_40,GroupID_41,GroupID_42,GroupID_43,GroupID_44,GroupID_45,GroupID_46,GroupID_47,GroupID_48,GroupID_49,StartInterval_1,StartInterval_2,StartInterval_3,StartInterval_4,StartInterval_5,StartInterval_6,StartInterval_7,StartInterval_8,StartInterval_9,StartInterval_10,StartInterval_11,StartInterval_12,StartInterval_13,StartInterval_14,StartInterval_15,StartInterval_16,EndInterval_1,EndInterval_2,EndInterval_3,EndInterval_4,EndInterval_5,EndInterval_6,EndInterval_7,EndInterval_8,EndInterval_9,EndInterval_10,EndInterval_11,EndInterval_12,EndInterval_13,EndInterval_14,Date_month_1,Date_month_2,Date_month_3,Date_month_4,Date_month_5,Date_month_6,Date_month_7,Date_month_8,Date_month_9,Date_month_10,Date_month_11,Date_month_12,OrderDate_month_1,OrderDate_month_2,OrderDate_month_3,OrderDate_month_4,OrderDate_month_5,OrderDate_month_6,OrderDate_month_7,OrderDate_month_8,OrderDate_month_9,OrderDate_month_10,OrderDate_month_11,OrderDate_month_12,Date_weekday_1,Date_weekday_2,Date_weekday_3,Date_weekday_4,Date_weekday_5,Date_weekday_6,Date_weekday_7,OrderDate_weekday_1,OrderDate_weekday_2,OrderDate_weekday_3,OrderDate_weekday_4,OrderDate_weekday_5,OrderDate_weekday_6,OrderDate_week
day_7
0,90102063002,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
1,90102063002,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
2,90102063002,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
4,90102091007,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
5,90102091007,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0


In [15]:
# Collapse to one row per order by summing the indicator columns: each value is the
# number of de-duplicated rows of that order carrying the category.
# NOTE(review): indicators of order-level attributes (e.g. ChannelID_*) therefore
# scale with the number of distinct rows per order rather than being 0/1 — confirm
# this is intended.
train_ohe = train_ohe.groupby('OrderID').sum().reset_index()
print(train_ohe.shape)
train_ohe.head()

(354851, 152)


Unnamed: 0,OrderID,ChannelID_1,ChannelID_2,ChannelID_3,ChannelID_4,ChannelID_5,Cluster_1,Cluster_2,Cluster_3,Cluster_4,Cluster_5,Cluster_6,Cluster_7,Cluster_8,Cluster_9,Cluster_10,Cluster_11,Cluster_12,Cluster_13,Cluster_14,Cluster_15,Cluster_16,Cluster_17,Cluster_18,Cluster_19,Cluster_20,Cluster_21,Cluster_22,Cluster_23,Cluster_24,Cluster_25,Cluster_26,Cluster_27,Cluster_28,Cluster_29,GroupID_1,GroupID_2,GroupID_3,GroupID_4,GroupID_5,GroupID_6,GroupID_7,GroupID_8,GroupID_9,GroupID_10,GroupID_11,GroupID_12,GroupID_13,GroupID_14,GroupID_15,GroupID_16,GroupID_17,GroupID_18,GroupID_19,GroupID_20,GroupID_21,GroupID_22,GroupID_23,GroupID_24,GroupID_25,GroupID_26,GroupID_27,GroupID_28,GroupID_29,GroupID_30,GroupID_31,GroupID_32,GroupID_33,GroupID_34,GroupID_35,GroupID_36,GroupID_37,GroupID_38,GroupID_39,GroupID_40,GroupID_41,GroupID_42,GroupID_43,GroupID_44,GroupID_45,GroupID_46,GroupID_47,GroupID_48,GroupID_49,StartInterval_1,StartInterval_2,StartInterval_3,StartInterval_4,StartInterval_5,StartInterval_6,StartInterval_7,StartInterval_8,StartInterval_9,StartInterval_10,StartInterval_11,StartInterval_12,StartInterval_13,StartInterval_14,StartInterval_15,StartInterval_16,EndInterval_1,EndInterval_2,EndInterval_3,EndInterval_4,EndInterval_5,EndInterval_6,EndInterval_7,EndInterval_8,EndInterval_9,EndInterval_10,EndInterval_11,EndInterval_12,EndInterval_13,EndInterval_14,Date_month_1,Date_month_2,Date_month_3,Date_month_4,Date_month_5,Date_month_6,Date_month_7,Date_month_8,Date_month_9,Date_month_10,Date_month_11,Date_month_12,OrderDate_month_1,OrderDate_month_2,OrderDate_month_3,OrderDate_month_4,OrderDate_month_5,OrderDate_month_6,OrderDate_month_7,OrderDate_month_8,OrderDate_month_9,OrderDate_month_10,OrderDate_month_11,OrderDate_month_12,Date_weekday_1,Date_weekday_2,Date_weekday_3,Date_weekday_4,Date_weekday_5,Date_weekday_6,Date_weekday_7,OrderDate_weekday_1,OrderDate_weekday_2,OrderDate_weekday_3,OrderDate_weekday_4,OrderDate_weekday_5,OrderDate_weekday_6,OrderDate_week
day_7
0,90102063002,3,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,3,0,0,0,0,0,0
1,90102091007,4,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,4,0,0,0,0,0
2,90102092000,7,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,7,0,0,0,0,0
3,90102103017,4,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,4,0,0,0,0,0
4,90102104012,9,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,9,0,0,0,0


In [16]:
%%time

# Replace the de-duplicated test rows by their one-hot encoding, using the encoder
# fitted on train + test.
test_ohe = one_hot_encoder.transform(test_ohe)
print(test_ohe.shape)
test_ohe.head()

(2003850, 152)
CPU times: user 24.3 s, sys: 14.5 s, total: 38.8 s
Wall time: 8.24 s


Unnamed: 0,OrderID,ChannelID_1,ChannelID_2,ChannelID_3,ChannelID_4,ChannelID_5,Cluster_1,Cluster_2,Cluster_3,Cluster_4,Cluster_5,Cluster_6,Cluster_7,Cluster_8,Cluster_9,Cluster_10,Cluster_11,Cluster_12,Cluster_13,Cluster_14,Cluster_15,Cluster_16,Cluster_17,Cluster_18,Cluster_19,Cluster_20,Cluster_21,Cluster_22,Cluster_23,Cluster_24,Cluster_25,Cluster_26,Cluster_27,Cluster_28,Cluster_29,GroupID_1,GroupID_2,GroupID_3,GroupID_4,GroupID_5,GroupID_6,GroupID_7,GroupID_8,GroupID_9,GroupID_10,GroupID_11,GroupID_12,GroupID_13,GroupID_14,GroupID_15,GroupID_16,GroupID_17,GroupID_18,GroupID_19,GroupID_20,GroupID_21,GroupID_22,GroupID_23,GroupID_24,GroupID_25,GroupID_26,GroupID_27,GroupID_28,GroupID_29,GroupID_30,GroupID_31,GroupID_32,GroupID_33,GroupID_34,GroupID_35,GroupID_36,GroupID_37,GroupID_38,GroupID_39,GroupID_40,GroupID_41,GroupID_42,GroupID_43,GroupID_44,GroupID_45,GroupID_46,GroupID_47,GroupID_48,GroupID_49,StartInterval_1,StartInterval_2,StartInterval_3,StartInterval_4,StartInterval_5,StartInterval_6,StartInterval_7,StartInterval_8,StartInterval_9,StartInterval_10,StartInterval_11,StartInterval_12,StartInterval_13,StartInterval_14,StartInterval_15,StartInterval_16,EndInterval_1,EndInterval_2,EndInterval_3,EndInterval_4,EndInterval_5,EndInterval_6,EndInterval_7,EndInterval_8,EndInterval_9,EndInterval_10,EndInterval_11,EndInterval_12,EndInterval_13,EndInterval_14,Date_month_1,Date_month_2,Date_month_3,Date_month_4,Date_month_5,Date_month_6,Date_month_7,Date_month_8,Date_month_9,Date_month_10,Date_month_11,Date_month_12,OrderDate_month_1,OrderDate_month_2,OrderDate_month_3,OrderDate_month_4,OrderDate_month_5,OrderDate_month_6,OrderDate_month_7,OrderDate_month_8,OrderDate_month_9,OrderDate_month_10,OrderDate_month_11,OrderDate_month_12,Date_weekday_1,Date_weekday_2,Date_weekday_3,Date_weekday_4,Date_weekday_5,Date_weekday_6,Date_weekday_7,OrderDate_weekday_1,OrderDate_weekday_2,OrderDate_weekday_3,OrderDate_weekday_4,OrderDate_weekday_5,OrderDate_weekday_6,OrderDate_week
day_7
0,90102211131,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
1,90102211131,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
2,90102211131,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
3,90102211133,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
4,90102211133,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0


In [17]:
# Collapse to one row per order by summing the indicator columns (same treatment
# as the training frame).
test_ohe = test_ohe.groupby('OrderID').sum().reset_index()
print(test_ohe.shape)
test_ohe.head()

(202614, 152)


Unnamed: 0,OrderID,ChannelID_1,ChannelID_2,ChannelID_3,ChannelID_4,ChannelID_5,Cluster_1,Cluster_2,Cluster_3,Cluster_4,Cluster_5,Cluster_6,Cluster_7,Cluster_8,Cluster_9,Cluster_10,Cluster_11,Cluster_12,Cluster_13,Cluster_14,Cluster_15,Cluster_16,Cluster_17,Cluster_18,Cluster_19,Cluster_20,Cluster_21,Cluster_22,Cluster_23,Cluster_24,Cluster_25,Cluster_26,Cluster_27,Cluster_28,Cluster_29,GroupID_1,GroupID_2,GroupID_3,GroupID_4,GroupID_5,GroupID_6,GroupID_7,GroupID_8,GroupID_9,GroupID_10,GroupID_11,GroupID_12,GroupID_13,GroupID_14,GroupID_15,GroupID_16,GroupID_17,GroupID_18,GroupID_19,GroupID_20,GroupID_21,GroupID_22,GroupID_23,GroupID_24,GroupID_25,GroupID_26,GroupID_27,GroupID_28,GroupID_29,GroupID_30,GroupID_31,GroupID_32,GroupID_33,GroupID_34,GroupID_35,GroupID_36,GroupID_37,GroupID_38,GroupID_39,GroupID_40,GroupID_41,GroupID_42,GroupID_43,GroupID_44,GroupID_45,GroupID_46,GroupID_47,GroupID_48,GroupID_49,StartInterval_1,StartInterval_2,StartInterval_3,StartInterval_4,StartInterval_5,StartInterval_6,StartInterval_7,StartInterval_8,StartInterval_9,StartInterval_10,StartInterval_11,StartInterval_12,StartInterval_13,StartInterval_14,StartInterval_15,StartInterval_16,EndInterval_1,EndInterval_2,EndInterval_3,EndInterval_4,EndInterval_5,EndInterval_6,EndInterval_7,EndInterval_8,EndInterval_9,EndInterval_10,EndInterval_11,EndInterval_12,EndInterval_13,EndInterval_14,Date_month_1,Date_month_2,Date_month_3,Date_month_4,Date_month_5,Date_month_6,Date_month_7,Date_month_8,Date_month_9,Date_month_10,Date_month_11,Date_month_12,OrderDate_month_1,OrderDate_month_2,OrderDate_month_3,OrderDate_month_4,OrderDate_month_5,OrderDate_month_6,OrderDate_month_7,OrderDate_month_8,OrderDate_month_9,OrderDate_month_10,OrderDate_month_11,OrderDate_month_12,Date_weekday_1,Date_weekday_2,Date_weekday_3,Date_weekday_4,Date_weekday_5,Date_weekday_6,Date_weekday_7,OrderDate_weekday_1,OrderDate_weekday_2,OrderDate_weekday_3,OrderDate_weekday_4,OrderDate_weekday_5,OrderDate_weekday_6,OrderDate_week
day_7
0,90102211131,3,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,3,0,0,0,0,0
1,90102211133,9,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,9,0,0,0,0,0
2,90102216055,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
3,90102216081,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
4,90102216084,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0


In [18]:
# Constant 1 per row; make_aggs() assigns it sum/min/max/mean/std, so its per-order
# sum is the number of rows of the order.
train_df['MaterialsCnt'] = 1
test_df['MaterialsCnt'] = 1

In [19]:
def make_aggs(cols) -> dict:
    """Build the ``groupby(...).agg`` specification for the given column names.

    Each name in *cols* is mapped by prefix:

    * ``ChannelID*``, ``GroupID*``, ``Cluster*`` -> ``'sum'``
    * ``OrderCnt*``, ``MaterialsCnt*`` -> ``['sum', 'min', 'max', 'mean', 'std']``
    * anything else -> ``'mean'``

    Returns a dict keyed by column name, in the order the names were given.
    """
    summed_prefixes = ('ChannelID', 'GroupID', 'Cluster')
    described_prefixes = ('OrderCnt', 'MaterialsCnt')
    described_stats = ('sum', 'min', 'max', 'mean', 'std')

    def _pick(column):
        if column.startswith(summed_prefixes):
            return 'sum'
        if column.startswith(described_prefixes):
            # A fresh list per column, so entries never share one object.
            return list(described_stats)
        return 'mean'

    return {column: _pick(column) for column in cols}

In [20]:
# Drop the identifiers and the columns handled by the one-hot encoder, then
# aggregate the remaining row-level columns to one row per order.
train_df = train_df.drop(columns=['ClientID', 'MaterialID'] + cols2ohe)
aggs = make_aggs(train_df.columns)
# make_aggs() also assigns an aggregation to 'OrderID' itself; that aggregated column
# is dropped so reset_index() can restore OrderID from the group key.
train_df = train_df.groupby('OrderID').agg(aggs).drop(columns=['OrderID']).reset_index()

def col_names_mapper(cols):
    """Join the parts of a column label with '|', blanking any part equal to 'sum'.

    *cols* is an iterable of strings (e.g. one multi-level column label).
    ``('OrderCnt', 'mean')`` becomes ``'OrderCnt|mean'`` while
    ``('OrderCnt', 'sum')`` becomes ``'OrderCnt|'``.
    """
    parts = []
    for level in cols:
        parts.append(level if level != 'sum' else '')
    return '|'.join(parts)

# Flatten the aggregated column labels to single strings ('<column>|<stat>');
# 'sum' is blanked by col_names_mapper and the dangling '|' stripped, so summed
# columns keep their plain name.
train_df.columns = train_df.columns.map(col_names_mapper).str.strip('|')
# CancelFlag was aggregated with 'mean'; give the target its plain name back.
train_df = train_df.rename(columns={'CancelFlag|mean': 'CancelFlag'})

# Replace missing standard deviations by 0 (presumably the single-row orders,
# for which the sample std is undefined).
for col in train_df.columns:
    if col.endswith('|std'):
        train_df[col] = train_df[col].fillna(0)

print(train_df.shape)
train_df.head()

(354851, 22)


Unnamed: 0,OrderID,CancelFlag,OrderCnt,OrderCnt|min,OrderCnt|max,OrderCnt|mean,OrderCnt|std,DeliveryType|mean,prepay|mean,count_edit|mean,Date_is_holiday|mean,OrderDate_is_holiday|mean,DatesGap_days|mean,MonthlyOrderCnt|mean,ClientOrderCnt|mean,HasPreviousOrder|mean,OrderDatesDaysDelay|mean,MaterialsCnt,MaterialsCnt|min,MaterialsCnt|max,MaterialsCnt|mean,MaterialsCnt|std
0,90102063002,0,6,1,2,1.5,0.57735,0,0,1,False,False,1,117,35,True,0,4,1,1,1,0.0
1,90102091007,0,28,1,10,5.6,3.361547,0,0,1,False,False,2,5,7,True,0,5,1,1,1,0.0
2,90102092000,0,16,1,3,1.6,0.843274,1,0,1,False,False,0,36,36,True,0,10,1,1,1,0.0
3,90102103017,0,14,1,3,1.555556,0.881917,0,0,1,False,False,0,36,36,True,7,9,1,1,1,0.0
4,90102104012,1,29,1,3,1.115385,0.431455,0,0,1,False,False,2,133,18,True,0,26,1,1,1,0.0


In [21]:
# Attach the per-order one-hot counts to the per-order aggregates.
train_df = train_df.merge(train_ohe, on='OrderID', how='left')
print(train_df.shape)
train_df.head()

(354851, 173)


Unnamed: 0,OrderID,CancelFlag,OrderCnt,OrderCnt|min,OrderCnt|max,OrderCnt|mean,OrderCnt|std,DeliveryType|mean,prepay|mean,count_edit|mean,Date_is_holiday|mean,OrderDate_is_holiday|mean,DatesGap_days|mean,MonthlyOrderCnt|mean,ClientOrderCnt|mean,HasPreviousOrder|mean,OrderDatesDaysDelay|mean,MaterialsCnt,MaterialsCnt|min,MaterialsCnt|max,MaterialsCnt|mean,MaterialsCnt|std,ChannelID_1,ChannelID_2,ChannelID_3,ChannelID_4,ChannelID_5,Cluster_1,Cluster_2,Cluster_3,Cluster_4,Cluster_5,Cluster_6,Cluster_7,Cluster_8,Cluster_9,Cluster_10,Cluster_11,Cluster_12,Cluster_13,Cluster_14,Cluster_15,Cluster_16,Cluster_17,Cluster_18,Cluster_19,Cluster_20,Cluster_21,Cluster_22,Cluster_23,Cluster_24,Cluster_25,Cluster_26,Cluster_27,Cluster_28,Cluster_29,GroupID_1,GroupID_2,GroupID_3,GroupID_4,GroupID_5,GroupID_6,GroupID_7,GroupID_8,GroupID_9,GroupID_10,GroupID_11,GroupID_12,GroupID_13,GroupID_14,GroupID_15,GroupID_16,GroupID_17,GroupID_18,GroupID_19,GroupID_20,GroupID_21,GroupID_22,GroupID_23,GroupID_24,GroupID_25,GroupID_26,GroupID_27,GroupID_28,GroupID_29,GroupID_30,GroupID_31,GroupID_32,GroupID_33,GroupID_34,GroupID_35,GroupID_36,GroupID_37,GroupID_38,GroupID_39,GroupID_40,GroupID_41,GroupID_42,GroupID_43,GroupID_44,GroupID_45,GroupID_46,GroupID_47,GroupID_48,GroupID_49,StartInterval_1,StartInterval_2,StartInterval_3,StartInterval_4,StartInterval_5,StartInterval_6,StartInterval_7,StartInterval_8,StartInterval_9,StartInterval_10,StartInterval_11,StartInterval_12,StartInterval_13,StartInterval_14,StartInterval_15,StartInterval_16,EndInterval_1,EndInterval_2,EndInterval_3,EndInterval_4,EndInterval_5,EndInterval_6,EndInterval_7,EndInterval_8,EndInterval_9,EndInterval_10,EndInterval_11,EndInterval_12,EndInterval_13,EndInterval_14,Date_month_1,Date_month_2,Date_month_3,Date_month_4,Date_month_5,Date_month_6,Date_month_7,Date_month_8,Date_month_9,Date_month_10,Date_month_11,Date_month_12,OrderDate_month_1,OrderDate_month_2,OrderDate_month_3,OrderDate_month_4,OrderDate_month_5,OrderDate_mon
th_6,OrderDate_month_7,OrderDate_month_8,OrderDate_month_9,OrderDate_month_10,OrderDate_month_11,OrderDate_month_12,Date_weekday_1,Date_weekday_2,Date_weekday_3,Date_weekday_4,Date_weekday_5,Date_weekday_6,Date_weekday_7,OrderDate_weekday_1,OrderDate_weekday_2,OrderDate_weekday_3,OrderDate_weekday_4,OrderDate_weekday_5,OrderDate_weekday_6,OrderDate_weekday_7
0,90102063002,0,6,1,2,1.5,0.57735,0,0,1,False,False,1,117,35,True,0,4,1,1,1,0.0,3,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,3,0,0,0,0,0,0
1,90102091007,0,28,1,10,5.6,3.361547,0,0,1,False,False,2,5,7,True,0,5,1,1,1,0.0,4,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,4,0,0,0,0,0
2,90102092000,0,16,1,3,1.6,0.843274,1,0,1,False,False,0,36,36,True,0,10,1,1,1,0.0,7,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,7,0,0,0,0,0
3,90102103017,0,14,1,3,1.555556,0.881917,0,0,1,False,False,0,36,36,True,7,9,1,1,1,0.0,4,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,4,0,0,0,0,0
4,90102104012,1,29,1,3,1.115385,0.431455,0,0,1,False,False,2,133,18,True,0,26,1,1,1,0.0,9,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,9,0,0,0,0


In [22]:
# Same per-order aggregation as for the training frame; the placeholder CancelFlag
# column is dropped as well, so the test frame carries no target.
test_df = test_df.drop(columns=['ClientID', 'MaterialID', 'CancelFlag'] + cols2ohe)
aggs = make_aggs(test_df.columns)
# The aggregated 'OrderID' column is dropped so reset_index() can restore OrderID
# from the group key.
test_df = test_df.groupby('OrderID').agg(aggs).drop(columns=['OrderID']).reset_index()

# Flatten the aggregated column labels to '<column>|<stat>' strings (plain name for sums).
test_df.columns = test_df.columns.map(col_names_mapper).str.strip('|')

# Replace missing standard deviations by 0 (presumably the single-row orders).
for col in test_df.columns:
    if col.endswith('|std'):
        test_df[col] = test_df[col].fillna(0)

print(test_df.shape)
test_df.head()

(202614, 21)


Unnamed: 0,OrderID,OrderCnt,OrderCnt|min,OrderCnt|max,OrderCnt|mean,OrderCnt|std,DeliveryType|mean,prepay|mean,count_edit|mean,Date_is_holiday|mean,OrderDate_is_holiday|mean,DatesGap_days|mean,MonthlyOrderCnt|mean,ClientOrderCnt|mean,HasPreviousOrder|mean,OrderDatesDaysDelay|mean,MaterialsCnt,MaterialsCnt|min,MaterialsCnt|max,MaterialsCnt|mean,MaterialsCnt|std
0,90102211131,4,1,2,1.333333,0.57735,1,0,1,False,False,0,62,26,True,0,3,1,1,1,0.0
1,90102211133,44,1,10,1.913043,1.998023,0,0,1,False,False,1,73,12,True,0,23,1,1,1,0.0
2,90102216055,5,5,5,5.0,0.0,0,0,1,False,False,1,8,24,True,0,1,1,1,1,0.0
3,90102216081,5,5,5,5.0,0.0,0,0,1,False,False,1,8,24,True,1,1,1,1,1,0.0
4,90102216084,12,1,5,2.0,1.549193,0,0,1,False,False,2,6,5,True,0,6,1,1,1,0.0


In [23]:
# Attach the per-order one-hot counts to the per-order aggregates.
test_df = test_df.merge(test_ohe, on='OrderID', how='left')
print(test_df.shape)
test_df.head()

(202614, 172)


Unnamed: 0,OrderID,OrderCnt,OrderCnt|min,OrderCnt|max,OrderCnt|mean,OrderCnt|std,DeliveryType|mean,prepay|mean,count_edit|mean,Date_is_holiday|mean,OrderDate_is_holiday|mean,DatesGap_days|mean,MonthlyOrderCnt|mean,ClientOrderCnt|mean,HasPreviousOrder|mean,OrderDatesDaysDelay|mean,MaterialsCnt,MaterialsCnt|min,MaterialsCnt|max,MaterialsCnt|mean,MaterialsCnt|std,ChannelID_1,ChannelID_2,ChannelID_3,ChannelID_4,ChannelID_5,Cluster_1,Cluster_2,Cluster_3,Cluster_4,Cluster_5,Cluster_6,Cluster_7,Cluster_8,Cluster_9,Cluster_10,Cluster_11,Cluster_12,Cluster_13,Cluster_14,Cluster_15,Cluster_16,Cluster_17,Cluster_18,Cluster_19,Cluster_20,Cluster_21,Cluster_22,Cluster_23,Cluster_24,Cluster_25,Cluster_26,Cluster_27,Cluster_28,Cluster_29,GroupID_1,GroupID_2,GroupID_3,GroupID_4,GroupID_5,GroupID_6,GroupID_7,GroupID_8,GroupID_9,GroupID_10,GroupID_11,GroupID_12,GroupID_13,GroupID_14,GroupID_15,GroupID_16,GroupID_17,GroupID_18,GroupID_19,GroupID_20,GroupID_21,GroupID_22,GroupID_23,GroupID_24,GroupID_25,GroupID_26,GroupID_27,GroupID_28,GroupID_29,GroupID_30,GroupID_31,GroupID_32,GroupID_33,GroupID_34,GroupID_35,GroupID_36,GroupID_37,GroupID_38,GroupID_39,GroupID_40,GroupID_41,GroupID_42,GroupID_43,GroupID_44,GroupID_45,GroupID_46,GroupID_47,GroupID_48,GroupID_49,StartInterval_1,StartInterval_2,StartInterval_3,StartInterval_4,StartInterval_5,StartInterval_6,StartInterval_7,StartInterval_8,StartInterval_9,StartInterval_10,StartInterval_11,StartInterval_12,StartInterval_13,StartInterval_14,StartInterval_15,StartInterval_16,EndInterval_1,EndInterval_2,EndInterval_3,EndInterval_4,EndInterval_5,EndInterval_6,EndInterval_7,EndInterval_8,EndInterval_9,EndInterval_10,EndInterval_11,EndInterval_12,EndInterval_13,EndInterval_14,Date_month_1,Date_month_2,Date_month_3,Date_month_4,Date_month_5,Date_month_6,Date_month_7,Date_month_8,Date_month_9,Date_month_10,Date_month_11,Date_month_12,OrderDate_month_1,OrderDate_month_2,OrderDate_month_3,OrderDate_month_4,OrderDate_month_5,OrderDate_month_6,OrderD
ate_month_7,OrderDate_month_8,OrderDate_month_9,OrderDate_month_10,OrderDate_month_11,OrderDate_month_12,Date_weekday_1,Date_weekday_2,Date_weekday_3,Date_weekday_4,Date_weekday_5,Date_weekday_6,Date_weekday_7,OrderDate_weekday_1,OrderDate_weekday_2,OrderDate_weekday_3,OrderDate_weekday_4,OrderDate_weekday_5,OrderDate_weekday_6,OrderDate_weekday_7
0,90102211131,4,1,2,1.333333,0.57735,1,0,1,False,False,0,62,26,True,0,3,1,1,1,0.0,3,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,3,0,0,0,0,0
1,90102211133,44,1,10,1.913043,1.998023,0,0,1,False,False,1,73,12,True,0,23,1,1,1,0.0,9,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,9,0,0,0,0,0
2,90102216055,5,5,5,5.0,0.0,0,0,1,False,False,1,8,24,True,0,1,1,1,1,0.0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
3,90102216081,5,5,5,5.0,0.0,0,0,1,False,False,1,8,24,True,1,1,1,1,1,0.0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
4,90102216084,12,1,5,2.0,1.549193,0,0,1,False,False,2,6,5,True,0,6,1,1,1,0.0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0


## Target encoding

In [24]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import clone
from sklearn.model_selection import check_cv, KFold
from category_encoders import CatBoostEncoder


class TargetEncoderCV(BaseEstimator, TransformerMixin):
    """Cross-validated target encoder built on ``CatBoostEncoder``.

    ``fit_transform`` fits one ``CatBoostEncoder`` per CV split on the split's
    training rows and uses it to encode only that split's validation rows.
    ``transform`` encodes new data with every fitted fold encoder and returns
    the average of their outputs.

    Parameters
    ----------
    cv : int, cross-validation splitter or iterable of splits
        Passed through ``sklearn.model_selection.check_cv``.
    **cbe_params
        Extra keyword arguments forwarded to ``CatBoostEncoder``.

    Attributes
    ----------
    cbe_ : list
        Fitted fold encoders, one per split, in split order. Set by
        ``fit_transform``.
    """

    def __init__(self, cv, **cbe_params):
        self.cv = cv
        self.cbe_params = cbe_params

    @property
    def _n_splits(self):
        """Fold count reported by the resolved splitter's ``n_splits``.

        Kept for backward compatibility; ``transform`` no longer relies on it
        because not every splitter exposes an ``n_splits`` attribute.
        """
        return check_cv(self.cv).n_splits

    def fit(self, X: ps.DataFrame, y: ps.Series) -> 'TargetEncoderCV':
        """Fit the fold encoders (via ``fit_transform``) and return ``self``."""
        self.fit_transform(X, y)
        return self

    def fit_transform(self, X: ps.DataFrame, y: ps.Series) -> ps.DataFrame:
        """Fit the fold encoders and return the out-of-fold encoding of ``X``.

        Parameters
        ----------
        X : DataFrame
            Columns to encode; every column is treated as categorical.
        y : Series
            Target values, positionally aligned with ``X``.

        Returns
        -------
        DataFrame
            Float frame with the columns and index of ``X``. Rows that never
            appear in a validation split keep the initial value ``0.0``.
        """
        self.cbe_ = []
        cv = check_cv(self.cv)

        # Unfitted template; a fresh clone is fitted for every split.
        cbe = CatBoostEncoder(
            cols=X.columns.tolist(),
            return_df=False,
            **self.cbe_params
        )

        # Allocate from the shape only, so the frame itself is never
        # converted to a (possibly object-dtype) array.
        X_transformed = np.zeros(X.shape, dtype=np.float64)
        for train_idx, valid_idx in cv.split(X, y):
            fold_encoder = clone(cbe).fit(X.iloc[train_idx], y.iloc[train_idx])
            self.cbe_.append(fold_encoder)
            X_transformed[valid_idx] = fold_encoder.transform(X.iloc[valid_idx])

        # Keep the caller's index so columns re-attached afterwards align.
        return ps.DataFrame(X_transformed, columns=X.columns, index=X.index)

    def transform(self, X: ps.DataFrame) -> ps.DataFrame:
        """Encode ``X`` with every fold encoder and average the results.

        Must be called after ``fit_transform`` (or ``fit``), which creates
        ``cbe_``.

        Returns
        -------
        DataFrame
            Float frame with the columns and index of ``X``.
        """
        # Average over the encoders that were actually fitted instead of
        # re-deriving the fold count from ``cv``.
        n_encoders = len(self.cbe_)
        X_transformed = np.zeros(X.shape, dtype=np.float64)
        for cbe in self.cbe_:
            X_transformed += cbe.transform(X) / n_encoders
        return ps.DataFrame(X_transformed, columns=X.columns, index=X.index)

In [25]:
# te_encoder = TargetEncoderCV(KFold(n_splits=5, random_state=2019))

In [26]:
# train_id = train_df['OrderID']
# train_target = train_df['CancelFlag']
# train_features = train_df.drop(columns=['OrderID', 'CancelFlag'])

# train_features = te_encoder.fit_transform(train_features, train_target)
# train_features['OrderID'] = train_id
# train_features['CancelFlag'] = train_target
# train_features.head()

In [27]:
# test_id = test_df['OrderID']
# test_features = test_df.drop(columns=['OrderID'])
# test_df = te_encoder.transform(test_features)
# test_df['OrderID'] = test_id

# test_df.head()

## Dump dataframes

In [28]:
# Pickle the training frame. The output folder is created first so this cell
# also runs when the 'processed' directory does not exist yet.
train_pkl_path = data_folder / 'processed' / 'train_6.pkl'
train_pkl_path.parent.mkdir(parents=True, exist_ok=True)
with train_pkl_path.open('wb') as f:
    pickle.dump(train_df, f)
# train_df.to_csv(data_folder / 'processed' / 'train.csv', index=False)

In [29]:
# Pickle the test frame. The output folder is created first so this cell
# also runs when the 'processed' directory does not exist yet.
test_pkl_path = data_folder / 'processed' / 'test_6.pkl'
test_pkl_path.parent.mkdir(parents=True, exist_ok=True)
with test_pkl_path.open('wb') as f:
    pickle.dump(test_df, f)
# test_df.to_csv(data_folder / 'processed' / 'test.csv', index=False)

In [30]:
# Count the rows of train_df for each distinct CancelFlag value.
train_df.loc[:, 'CancelFlag'].value_counts()

0    331075
1     23776
Name: CancelFlag, dtype: int64

In [19]:
# Display every column name of train_df as a plain Python list.
list(train_df.columns)

['OrderID',
 'ChannelID_1',
 'ChannelID_2',
 'ChannelID_3',
 'ChannelID_4',
 'ChannelID_5',
 'GroupID_1',
 'GroupID_2',
 'GroupID_3',
 'GroupID_4',
 'GroupID_5',
 'GroupID_6',
 'GroupID_7',
 'GroupID_8',
 'GroupID_9',
 'GroupID_10',
 'GroupID_11',
 'GroupID_12',
 'GroupID_13',
 'GroupID_14',
 'GroupID_15',
 'GroupID_16',
 'GroupID_17',
 'GroupID_18',
 'GroupID_19',
 'GroupID_20',
 'GroupID_21',
 'GroupID_22',
 'GroupID_23',
 'GroupID_24',
 'GroupID_25',
 'GroupID_26',
 'GroupID_27',
 'GroupID_28',
 'GroupID_29',
 'GroupID_30',
 'GroupID_31',
 'GroupID_32',
 'GroupID_33',
 'GroupID_34',
 'GroupID_35',
 'GroupID_36',
 'GroupID_37',
 'GroupID_38',
 'GroupID_39',
 'GroupID_40',
 'GroupID_41',
 'GroupID_42',
 'GroupID_43',
 'GroupID_44',
 'GroupID_45',
 'GroupID_46',
 'GroupID_47',
 'GroupID_48',
 'GroupID_49',
 'Cluster_1',
 'Cluster_2',
 'Cluster_3',
 'Cluster_4',
 'Cluster_5',
 'Cluster_6',
 'Cluster_7',
 'Cluster_8',
 'Cluster_9',
 'Cluster_10',
 'Cluster_11',
 'Cluster_12',
 'Cluster_1