In [None]:
import pandas as pd
from datetime import date, datetime
from dateutil.relativedelta import relativedelta
import re
import matplotlib.pyplot as plt
import numpy as np
from sklearn import model_selection, metrics, ensemble
from sklearn.model_selection import StratifiedKFold
import math
import collections
import time
import pickle

# Read Data

### Data Arrays

In [None]:
def _load_pickle(path):
    """Unpickle and return the single object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed even if
    unpickling fails.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
member_np = _load_pickle("./pickle/member_np.pickle")
member_idx2hdr = _load_pickle("./pickle/member_idx2hdr.pickle")
member_hdr2idx = _load_pickle("./pickle/member_hdr2idx.pickle")
order_np = _load_pickle("./pickle/order_np.pickle")
order_idx2hdr = _load_pickle("./pickle/order_idx2hdr.pickle")
order_hdr2idx = _load_pickle("./pickle/order_hdr2idx.pickle")
# Alternative input kept from the original cell (the unfiltered behavior dump):
# behavior_np = pickle.load(open("./pickle/behavior_np.pickle", "rb"))
behavior_np = _load_pickle("./pickle/behavior_new_np.pickle")
behavior_idx2hdr = _load_pickle("./pickle/behavior_idx2hdr.pickle")
behavior_hdr2idx = _load_pickle("./pickle/behavior_hdr2idx.pickle")

# Keep only behavior rows whose OnlineMemberId does not stringify to 'nan',
# then write the filtered array back to the file it was loaded from.
print (time.asctime( time.localtime(time.time()) ))
behavior_new_np = behavior_np[(behavior_np[:,behavior_hdr2idx['OnlineMemberId']].astype(str))!='nan']
print (time.asctime( time.localtime(time.time()) ))
with open("./pickle/behavior_new_np.pickle", "wb") as handle:
    pickle.dump(behavior_new_np, handle)

# Functions

### Member Features

In [None]:
def insert_member_feature(member_np, member_hdr2idx):
    """Build the per-member feature matrix.

    Parameters
    ----------
    member_np : 2-D array
        One row per member. The columns 'UUID', 'Gender', 'Birthday',
        'OnlineMemberId', 'RegisterSourceTypeDef', 'IsAppInstalled' and
        'MemberCardLevel' are read through ``member_hdr2idx``.
    member_hdr2idx : dict
        Maps a column name to its column index in ``member_np``.

    Returns
    -------
    out_np : 2-D array
        One row per member, columns in the order given by ``hdr``.
    hdr : 1-D array of str
        'UUID', 'Gender', 'Age', 'IsOnlineMember', one
        'RegisterSourceTypeDef_<value>' column per distinct string value
        (sorted), 'IsAppInstalled', 'MemberCardLevel'.

    Encodings
    ---------
    Gender          0 = missing, 1 = 'Male', 2 = 'Female', other values kept.
    IsOnlineMember  1 = OnlineMemberId present, 2 = missing.
    IsAppInstalled  0 = missing, 1 = True, 2 = False, other values kept.
    """
    def column(name):
        return member_np[:, member_hdr2idx[name]]

    def fill_missing(values):
        # NaN/None become 0 so that they can be compared like any other value.
        return np.where(pd.isnull(values), 0, values)

    names = ['UUID']
    columns = [column('UUID')]

    # Gender
    gender = fill_missing(column('Gender'))
    gender = np.where(gender == 'Female', 2, gender)
    gender = np.where(gender == 'Male', 1, gender)
    names.append('Gender')
    columns.append(gender)

    # Age: 0 for a missing birthday, for birthdays containing the sentinel
    # "1427", and for years <= 1900.
    this_year = date.today().year
    sentinel = re.compile(".*1427.*")
    ages = []
    for birthday in column('Birthday'):
        if pd.isnull(birthday):
            # A missing birthday used to crash re.match; treat it as unknown.
            age = 0
        elif sentinel.match(birthday):
            age = 0
        else:
            year = int(birthday[:4], 10)
            if year <= 1900:
                age = 0
            elif year < 1911:
                # NOTE(review): only years 1901-1910 reach this branch and the
                # result is negative; kept as in the original. Confirm the
                # intended handling of ROC-calendar years.
                age = this_year - year - 1911
            else:
                age = this_year - pd.to_datetime(birthday).year
        ages.append(age)
    names.append('Age')
    columns.append(np.array(ages))

    # IsOnlineMember
    names.append('IsOnlineMember')
    columns.append(np.where(fill_missing(column('OnlineMemberId')) != 0, 1, 2))

    # RegisterSource: one indicator column per distinct string value. The
    # values are sorted so the column order is the same on every call instead
    # of following set iteration order.
    source = fill_missing(column('RegisterSourceTypeDef'))
    source_types = sorted(
        value for value in set(column('RegisterSourceTypeDef')) if isinstance(value, str)
    )
    for source_type in source_types:
        names.append('RegisterSourceTypeDef_' + source_type)
        columns.append(np.where(source == source_type, 1, 0))

    # IsAppInstalled: the missing mask is applied last because a filled-in 0
    # compares equal to False and would otherwise be recoded to 2.
    app_raw = column('IsAppInstalled')
    app = np.where(app_raw == False, 2, app_raw)  # noqa: E712 (element-wise compare)
    app = np.where(app_raw == True, 1, app)       # noqa: E712
    app = np.where(pd.isnull(app_raw), 0, app)
    names.append('IsAppInstalled')
    columns.append(app)

    # MemberCardLevel is passed through unchanged.
    names.append('MemberCardLevel')
    columns.append(column('MemberCardLevel'))

    out_np = np.concatenate([np.asarray(col).reshape(-1, 1) for col in columns], axis=1)
    hdr = np.array(names)
    return out_np, hdr

### Order Features

In [None]:
def _preorder_category_counts(order_np, order_hdr2idx, column, categories, member_uuid):
    """Return, for each category of ``column``, the per-member order counts."""
    uuid_col = order_hdr2idx['UUID']
    values = order_np[:, order_hdr2idx[column]]
    counts = []
    for category in categories:
        per_member = collections.Counter(order_np[values == category, uuid_col])
        counts.append([per_member[uuid] for uuid in member_uuid])
    return counts


def insert_preorder_feature(order_np, order_hdr2idx, member_uuid):
    """Build per-member features from the orders in ``order_np``.

    Parameters
    ----------
    order_np : 2-D array
        One row per order. The columns 'UUID', 'Status', 'TotalSalesAmount',
        'TradesDate', 'ChannelType', 'PaymentType' and 'ShippingType' are read
        through ``order_hdr2idx``. 'TradesDate' values must support
        subtraction yielding an object with a ``.days`` attribute.
    order_hdr2idx : dict
        Maps a column name to its column index in ``order_np``.
    member_uuid : iterable
        Member ids; the output has one row per entry, in this order.

    Returns
    -------
    out_np : 2-D array
        One row per member, columns in the order given by ``hdr``.
    hdr : 1-D array of str
        'PreSalesAmt', 'PreBuyTimes', 'ShopPeriod_Mean/Max/Min/Mediam', then
        one count column per known Status, ChannelType, PaymentType and
        ShippingType value.

    Notes
    -----
    PreSalesAmt, PreBuyTimes and ShopPeriod_* only use orders whose Status is
    'New' or 'Finish'; the *Cnt_* columns count every order in ``order_np``.
    """
    names = []
    columns = []

    # One pass over the New/Finish orders collects amount, order count and
    # trade dates per member.
    status = order_np[:, order_hdr2idx['Status']]
    order_fin_np = order_np[(status == 'New') | (status == 'Finish')]
    buy_amt = dict()
    buy_times = collections.Counter()
    shop_dates = dict()
    for uuid, amount, traded in zip(order_fin_np[:, order_hdr2idx['UUID']],
                                    order_fin_np[:, order_hdr2idx['TotalSalesAmount']],
                                    order_fin_np[:, order_hdr2idx['TradesDate']]):
        buy_amt[uuid] = buy_amt.get(uuid, 0) + amount
        buy_times[uuid] += 1
        shop_dates.setdefault(uuid, []).append(traded)

    # PreSalesAmt and PreBuyTimes
    names += ['PreSalesAmt', 'PreBuyTimes']
    columns.append([buy_amt.get(uuid, 0) for uuid in member_uuid])
    columns.append([buy_times[uuid] for uuid in member_uuid])

    # ShopPeriod: statistics of the non-zero day gaps between consecutive
    # purchases; all four are 0 for members with no such gap.
    gap_mean, gap_max, gap_min, gap_median = [], [], [], []
    for uuid in member_uuid:
        ordered = sorted(shop_dates.get(uuid, []))
        gaps = [(later - earlier).days for earlier, later in zip(ordered, ordered[1:])]
        gaps = np.array([gap for gap in gaps if gap != 0])
        if len(gaps) == 0:
            gap_mean.append(0)
            gap_max.append(0)
            gap_min.append(0)
            gap_median.append(0)
        else:
            gap_mean.append(gaps.mean())
            gap_max.append(gaps.max())
            gap_min.append(gaps.min())
            gap_median.append(np.median(gaps))
    # 'Mediam' is kept as spelled because the header is a runtime key.
    names += ['ShopPeriod_Mean', 'ShopPeriod_Max', 'ShopPeriod_Min', 'ShopPeriod_Mediam']
    columns += [gap_mean, gap_max, gap_min, gap_median]

    # Per-category order counts; category lists are fixed so the columns do
    # not depend on which values occur in this window.
    category_specs = (
        ('Status', 'PreStatusCnt_',
         ['Cancel', 'Fail', 'Finish', 'New', 'Overdue', 'Return', 'Shipping']),
        ('ChannelType', 'PreChannelTypeCnt_',
         ['Mall', 'OfficialECom', 'Pos']),
        ('PaymentType', 'PrePaymentTypeCnt_',
         ['ATM', 'Cash', 'CreditCardOnce', 'Family', 'LinePay', 'SevenEleven']),
        ('ShippingType', 'PreShippingTypeCnt_',
         ['Family', 'FamilyPickup', 'Home', 'LocationPickup', 'SevenEleven',
          'SevenElevenPickup', 'Store']),
    )
    for column, prefix, categories in category_specs:
        names += [prefix + category for category in categories]
        columns += _preorder_category_counts(
            order_np, order_hdr2idx, column, categories, member_uuid)

    out_np = np.column_stack(columns)
    hdr = np.array(names)
    return out_np, hdr

### Behavior Features

In [None]:
def insert_behavior_feature(behavior_np, behavior_hdr2idx, member_uuid, uuid2omid,
                            max_gap_seconds=36000):
    """Build per-member features from the behavior log ``behavior_np``.

    Parameters
    ----------
    behavior_np : 2-D array
        One row per logged event. The columns 'OnlineMemberId',
        'SessionNumber', 'BehaviorType', 'TrafficSourceCategory', 'SourceType',
        'OperationSystem' and 'HitDateTime' are read through
        ``behavior_hdr2idx``. 'HitDateTime' values must support subtraction
        yielding an object with ``total_seconds()``.
    behavior_hdr2idx : dict
        Maps a column name to its column index in ``behavior_np``.
    member_uuid : iterable
        Member ids; the output has one row per entry, in this order.
    uuid2omid : dict
        Maps each member id to its online member id. Every entry of
        ``member_uuid`` must be a key (``KeyError`` otherwise).
    max_gap_seconds : number, optional
        Gaps between consecutive hits at or above this many seconds are not
        counted in the OnlineTime_* statistics. Default 36000 (10 hours).

    Returns
    -------
    out_np : 2-D array
        One row per member, columns in the order given by ``hdr``.
    hdr : 1-D array of str
        'HitTimes', 'SessionNumberCnt', 'BehaviorTypeCnt_*' (6),
        'TrafficSourceCategoryCnt_*' (9), 'SourceTypeCnt_*' (2),
        'OperationSystemCnt_*' (7), 'OnlineTime_Mean/Max/Min/Mediam'.

    Notes
    -----
    Fixes relative to the previous implementation (column count unchanged):
    * traffic-source values are matched against 'TrafficSourceCategory' and OS
      values against 'OperationSystem' (the two columns were swapped);
    * the BehaviorTypeCnt_* and TrafficSourceCategoryCnt_* columns use their
      own category lists (the lists were swapped, which mislabelled the
      columns and dropped three traffic-source counters);
    * SourceTypeCnt_APP is filled (a misspelt variable left a stale value);
    * gaps use the full elapsed time rather than ``timedelta.seconds``, which
      ignores whole days;
    * rows with a null OnlineMemberId are skipped for every feature, as they
      already were for HitTimes.
    """
    omid_col = behavior_hdr2idx['OnlineMemberId']
    rows = behavior_np[~pd.isnull(behavior_np[:, omid_col])]
    omids = rows[:, omid_col]
    member_omid = [uuid2omid[uuid] for uuid in member_uuid]

    names = []
    columns = []

    # HitTimes: number of logged events per member.
    hit_counter = collections.Counter(list(omids))
    names.append('HitTimes')
    columns.append([hit_counter[omid] for omid in member_omid])

    # SessionNumberCnt: sum of SessionNumber per member; missing values add 0.
    session_total = dict()
    for omid, session in zip(omids, rows[:, behavior_hdr2idx['SessionNumber']]):
        if not pd.isnull(session):
            session_total[omid] = session_total.get(omid, 0) + session
    names.append('SessionNumberCnt')
    columns.append([session_total.get(omid, 0) for omid in member_omid])

    # Per-category event counts. Category lists are fixed so the columns do
    # not depend on which values occur in this window.
    category_specs = (
        ('BehaviorType', 'BehaviorTypeCnt_',
         ['Cart', 'Fav', 'Purchase', 'Search', 'ViewSalePage', 'ViewSalePageCategory']),
        ('TrafficSourceCategory', 'TrafficSourceCategoryCnt_',
         ['Direct', 'Email', 'Facebook', 'GoogleCpc',
          'GoogleOrganic', 'Instagram', 'Line', 'LineShopping', 'Others']),
        ('SourceType', 'SourceTypeCnt_',
         ['APP', 'WEB']),
        ('OperationSystem', 'OperationSystemCnt_',
         ['Android', 'Chrome OS', 'Intel Mac OS X', 'Linux', 'Ubuntu', 'Windows', 'iOS']),
    )
    for column, prefix, categories in category_specs:
        # Keyed by (online member id, category value).
        pair_counter = collections.Counter(zip(omids, rows[:, behavior_hdr2idx[column]]))
        for category in categories:
            names.append(prefix + category)
            columns.append([pair_counter[(omid, category)] for omid in member_omid])

    # OnlineTime: statistics of the non-zero gaps (in seconds) between
    # consecutive hits that are shorter than max_gap_seconds.
    hit_times = dict()
    for omid, hit in zip(omids, rows[:, behavior_hdr2idx['HitDateTime']]):
        hit_times.setdefault(omid, []).append(hit)
    gap_mean, gap_max, gap_min, gap_median = [], [], [], []
    for omid in member_omid:
        ordered = sorted(hit_times.get(omid, []))
        gaps = [(later - earlier).total_seconds()
                for earlier, later in zip(ordered, ordered[1:])]
        gaps = np.array([gap for gap in gaps if gap != 0 and gap < max_gap_seconds])
        if len(gaps) == 0:
            gap_mean.append(0)
            gap_max.append(0)
            gap_min.append(0)
            gap_median.append(0)
        else:
            gap_mean.append(gaps.mean())
            gap_max.append(gaps.max())
            gap_min.append(gaps.min())
            gap_median.append(np.median(gaps))
    # 'Mediam' is kept as spelled because the header is a runtime key.
    names += ['OnlineTime_Mean', 'OnlineTime_Max', 'OnlineTime_Min', 'OnlineTime_Mediam']
    columns += [gap_mean, gap_max, gap_min, gap_median]

    out_np = np.column_stack(columns)
    hdr = np.array(names)
    return out_np, hdr

### Others

In [None]:
def is_uuid_buy(buy_uuid, member_uuid):
    """Flag, for each entry of ``member_uuid``, whether it occurs in ``buy_uuid``.

    Returns a list of bools aligned with ``member_uuid``.
    """
    buyers = set(buy_uuid)
    return [uuid in buyers for uuid in member_uuid]

In [None]:
def filter_zeros(feature_array, tag_list, first_feature_col=17):
    """Drop rows whose columns from ``first_feature_col`` onward are all zero.

    Parameters
    ----------
    feature_array : 2-D array
        Feature matrix; columns from ``first_feature_col`` onward must be
        convertible to float.
    tag_list : sequence
        One label per row of ``feature_array``.
    first_feature_col : int, optional
        Index of the first column taken into account by the all-zero check.
        Default 17, the offset the notebook has always used.

    Returns
    -------
    features : 2-D array
        The rows of ``feature_array`` that are kept.
    tags : 1-D array
        The matching entries of ``tag_list``.

    Notes
    -----
    The check now runs on ``feature_array`` itself. It previously read the
    notebook-level ``train_np``, so calling this on any other matrix selected
    rows from the wrong data.
    """
    tag_list = np.array(tag_list)
    checked = feature_array[:, first_feature_col:].astype(float)
    # Same tolerance as np.allclose against zeros; a boolean mask also works
    # when no row survives.
    keep = ~np.isclose(checked, 0.0).all(axis=1)
    tags = tag_list[keep]
    features = feature_array[keep, :]
    print ('Org len={}'.format(len(tag_list)), end=' ')
    print ('Fltd len={}'.format(len(tags)))
    return features, tags

In [None]:
def trunc_len(train_np, is_buy_uuid):
    """Rebalance the training set by truncating the no-buy class.

    Keeps every buy row and only the first ``n_buy // 4`` no-buy rows (in
    their original order).

    Parameters
    ----------
    train_np : 2-D array
        Feature matrix, one row per sample.
    is_buy_uuid : sequence of bool
        Label per row of ``train_np`` (True = buy).

    Returns
    -------
    train_new_np : 2-D array
        The kept no-buy rows followed by all buy rows.
    is_buy_new_np : 1-D bool array
        Labels aligned with ``train_new_np``.
    """
    labels = np.asarray(is_buy_uuid, dtype=bool)
    buy_idx = np.flatnonzero(labels)
    nobuy_idx = np.flatnonzero(~labels)[:len(buy_idx) // 4]
    train_new_np = np.concatenate([train_np[nobuy_idx, :], train_np[buy_idx, :]], axis=0)
    # Explicit bool dtype: np.array([]) is float64, which used to turn the
    # labels into floats whenever one of the two slices was empty.
    is_buy_new_np = np.concatenate([np.zeros(len(nobuy_idx), dtype=bool),
                                    np.ones(len(buy_idx), dtype=bool)])
    print('Train Len={}'.format(len(is_buy_new_np)))
    return train_new_np, is_buy_new_np

# Parameters

In [None]:
# Preview candidate windows: starting from the latest trade date, step back
# three months at a time and print each end date together with the dates
# three and six months before it.
_trade_dates = sorted(order_np[:,order_hdr2idx['TradesDate']])
order_beg, order_end = _trade_dates[0], _trade_dates[-1]
quarter = relativedelta(months=3)
for n in range(11):
    order_end = pd.to_datetime(order_end) - quarter
    train_beg = order_end - quarter
    ptrain_beg = train_beg - quarter
    print('{}\t{}\t{}'.format(order_end,train_beg,ptrain_beg))

### Train

In [None]:
# Number of training windows to build.
train_set_num = 3
# Members are kept from this registration date onward.
order_strt_date = '2016-1-1' # fixed order
# Start of the most recent label window; earlier windows are derived from it.
train_strt_date = '2019-1-1'
# Length, in months, of each label window.
train_mon = 1
# Length, in months, of the feature ("pre") window that ends at the label window start.
train_pre_mon = 3
train_strt_date_list = [train_strt_date]
# Each additional window starts train_mon+2 months before the previous one.
for n in range(train_set_num-1):
    train_strt_date = pd.to_datetime(train_strt_date) - relativedelta(months=train_mon+2)
    train_strt_date = train_strt_date.strftime('%Y-%m-%d')
    train_strt_date_list.append(train_strt_date)

# For every label-window start, derive the label-window end and the feature
# window [start - train_pre_mon months, start). All dates are '%Y-%m-%d' strings.
train_end_date_list = list()
train_pre_strt_date_list = list()
train_pre_end_date_list = list()
for n in range(len(train_strt_date_list)):
    train_strt_date = train_strt_date_list[n]
    train_end_date = pd.to_datetime(train_strt_date) + relativedelta(months=train_mon)
    train_end_date = train_end_date.strftime('%Y-%m-%d')
    train_pre_strt_date = pd.to_datetime(train_strt_date) - relativedelta(months=train_pre_mon)
    train_pre_strt_date = train_pre_strt_date.strftime('%Y-%m-%d')
    # The feature window ends exactly where the label window begins.
    train_pre_end_date = train_strt_date
    print('Training: Start {}, End {}'.format(train_strt_date, train_end_date), end=' ')
    print('Training: Pre Start {}, End {}'.format(train_pre_strt_date, train_pre_end_date))
    train_end_date_list.append(train_end_date)
    train_pre_strt_date_list.append(train_pre_strt_date)
    train_pre_end_date_list.append(train_pre_end_date)

### Test

In [None]:
# Length, in months, of the test label window and of its feature ("pre") window.
test_mon = 1
test_pre_mon = 3
# The test label window starts two months after the end of the most recent
# training label window (index 0 of train_end_date_list).
test_strt_date = pd.to_datetime(train_end_date_list[0]) + relativedelta(months=2)
test_strt_date = test_strt_date.strftime('%Y-%m-%d')
test_end_date = pd.to_datetime(test_strt_date) + relativedelta(months=test_mon)
test_end_date = test_end_date.strftime('%Y-%m-%d')
# Feature window: [test start - test_pre_mon months, test start).
test_pre_strt_date = pd.to_datetime(test_strt_date) - relativedelta(months=test_pre_mon)
test_pre_strt_date = test_pre_strt_date.strftime('%Y-%m-%d')
test_pre_end_date = test_strt_date
print('Testing: Start {}, End {}'.format(test_strt_date, test_end_date))
print('Testing: Pre Start {}, End {}'.format(test_pre_strt_date, test_pre_end_date))

# Training

## Train 1

### Features

In [None]:
def train_set(member_np, member_hdr2idx, order_np, order_hdr2idx, behavior_np, behavior_hdr2idx, 
             order_strt_date, train_pre_strt_date, train_pre_end_date,
             train_strt_date, train_end_date):
    """Build the feature matrix and buy/no-buy labels for one training window.

    Parameters
    ----------
    member_np, order_np, behavior_np : 2-D arrays
        Raw member, order and behavior tables. Their 'RegisterDate',
        'TradesDate' and 'HitDateTime' columns are converted with
        ``pd.to_datetime`` IN PLACE.
    member_hdr2idx, order_hdr2idx, behavior_hdr2idx : dict
        Column name -> column index for the matching table.
    order_strt_date : str ('%Y-%m-%d')
        Members registered on or after this date are considered.
    train_pre_strt_date, train_pre_end_date : str ('%Y-%m-%d')
        Half-open feature window [start, end) applied to orders and behavior;
        members must have registered before ``train_pre_end_date``.
    train_strt_date, train_end_date : str ('%Y-%m-%d')
        Half-open label window [start, end).

    Returns
    -------
    train_np : 2-D array
        Member, pre-order and behavior features side by side, one row per
        selected member.
    train_hdr2idx : dict
        Column name -> column index in ``train_np``.
    is_buy_uuid : list of bool
        True when the member has a 'New' or 'Finish' order in the label window.
    """
    def in_window(values, start, end):
        # Half-open interval test [start, end) on a column of datetimes.
        return ((values >= datetime.strptime(start, '%Y-%m-%d')) &
                (values <  datetime.strptime(end, '%Y-%m-%d')))

    ##### Member Info

    register_col = member_hdr2idx['RegisterDate']
    member_np[:,register_col] = pd.to_datetime(member_np[:,register_col])
    member_sub_np = member_np[in_window(member_np[:,register_col], order_strt_date, train_pre_end_date)]
    print (time.asctime( time.localtime(time.time()) ))
    print ('\tMember Start Date {}'.format(order_strt_date), end=' ')
    print ('End Date {}'.format(train_pre_end_date))
    print ('\tTraining Pre Start Date {}'.format(train_pre_strt_date), end=' ')
    print ('End Date {}'.format(train_pre_end_date))

    member_new_np, member_new_hdr = insert_member_feature(member_sub_np, member_hdr2idx)
    print ('\tMember number {}'.format(len(member_new_np)),end=' ')

    ##### Previous Order Info

    trades_col = order_hdr2idx['TradesDate']
    order_np[:,trades_col] = pd.to_datetime(order_np[:,trades_col])
    order_sub_np = order_np[in_window(order_np[:,trades_col], train_pre_strt_date, train_pre_end_date)]
    print ('Order number {}'.format(len(order_sub_np)),end=' ')
    member_uuid = member_sub_np[:,member_hdr2idx['UUID']]
    preorder_new_np, preorder_new_hdr = insert_preorder_feature(order_sub_np, order_hdr2idx, member_uuid)

    ##### Behavior Info

    hit_col = behavior_hdr2idx['HitDateTime']
    behavior_np[:,hit_col] = pd.to_datetime(behavior_np[:,hit_col])
    behavior_sub_np = behavior_np[in_window(behavior_np[:,hit_col], train_pre_strt_date, train_pre_end_date)]
    print ('Behavior number {}'.format(len(behavior_sub_np)))

    # The mapping is built from ALL members, not only the selected ones.
    uuid2omid = dict(zip(member_np[:,member_hdr2idx['UUID']],
                         member_np[:,member_hdr2idx['OnlineMemberId']]))
    behavior_new_np, behavior_new_hdr = insert_behavior_feature(
        behavior_sub_np, behavior_hdr2idx, member_uuid, uuid2omid)

    ##### Training Info

    train_np = np.append(member_new_np, preorder_new_np, axis=1)
    train_np = np.append(train_np, behavior_new_np, axis=1)

    train_hdr = np.append(member_new_hdr, preorder_new_hdr, axis=0)
    train_hdr = np.append(train_hdr, behavior_new_hdr, axis=0)
    train_hdr2idx = dict((hdr, idx) for idx, hdr in enumerate(train_hdr))
    print('\tThere are {} features'.format(len(train_hdr2idx)))

    ### Tags

    status = order_np[:,order_hdr2idx['Status']]
    order_sub_np = order_np[(status=='New') | (status=='Finish')]

    order_sub_np[:,trades_col] = pd.to_datetime(order_sub_np[:,trades_col])
    order_train_np = order_sub_np[in_window(order_sub_np[:,trades_col], train_strt_date, train_end_date)]
    print ('\tTraining Start Date {}'.format(train_strt_date), end=' ')
    print ('End Date {}'.format(train_end_date))

    buy_uuid = order_train_np[:,order_hdr2idx['UUID']]
    ttl_uuid = train_np[:,train_hdr2idx['UUID']]
    is_buy_uuid = is_uuid_buy(buy_uuid, ttl_uuid)

    # Count both classes explicitly: indexing np.unique counts by position
    # raised IndexError (or mislabelled the classes) when only one class
    # was present.
    total = len(is_buy_uuid)
    buy_count = int(np.count_nonzero(is_buy_uuid))
    nobuy_count = total - buy_count
    nobuy_share = nobuy_count / total if total else 0.0
    buy_share = buy_count / total if total else 0.0
    print('\tnobuy={} ({:.2f}), buy={} ({:.2f})'.format(nobuy_count, nobuy_share,
                                                      buy_count, buy_share))

    print (time.asctime( time.localtime(time.time()) ))
    return train_np, train_hdr2idx, is_buy_uuid

In [None]:
# Build every training window, drop its all-zero rows and pool the windows.
# Column 0 (the UUID) is stripped before pooling.
window_features = list()
window_labels = list()
for n in range(train_set_num):
    print('Traning {}/{}'.format(n+1,train_set_num))
    train_np, train_hdr2idx, is_buy_uuid = train_set(
        member_np, member_hdr2idx, order_np, order_hdr2idx, behavior_np, behavior_hdr2idx, 
        order_strt_date, train_pre_strt_date_list[n], train_pre_end_date_list[n],
        train_strt_date_list[n], train_end_date_list[n])
    train_np,is_buy_uuid = filter_zeros(train_np,is_buy_uuid)
    window_features.append(train_np[:,1:])
    window_labels.append(np.array(is_buy_uuid))
train_new_np = np.concatenate(window_features, axis=0)
is_buy_new_np = np.concatenate(window_labels, axis=0)

### Random Forest

In [None]:
# Alternative kept from earlier experiments: train on all pooled rows as-is.
# trans_x = train_new_np
# y = is_buy_new_np
# Rebalance the pooled training rows with trunc_len before fitting.
trans_x, y = trunc_len(train_new_np, is_buy_new_np)

In [None]:
# for n in range(69):
#     found = pd.DataFrame(trans_x[:,n]).isnull().values.any()
#     print('{} {}'.format(n,found))

In [None]:
# 3-fold stratified cross-validation of a random forest on the rebalanced data.
# shuffle=True is required for random_state to take effect; current
# scikit-learn raises ValueError when random_state is set with shuffle=False.
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
print(kf)
# Seeded so that repeated runs of the notebook give the same forest.
forest = ensemble.RandomForestClassifier(n_estimators = 100, random_state=0)
target_name = ['NoBuy', 'Buy']
for train_index, test_index in kf.split(trans_x,y):
    X_train, X_test = trans_x[train_index], trans_x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    forest_fit = forest.fit(X_train, y_train)
    test_y_predicted = forest.predict(X_test)
    print(metrics.classification_report(y_test, test_y_predicted, target_names=target_name))
    accuracy = metrics.accuracy_score(y_test, test_y_predicted)
    print('Accuracy=',accuracy)
# NOTE: after the loop `forest` holds the fit from the LAST fold only, i.e. it
# was trained on two thirds of trans_x.

In [None]:
# Print the current local time as a progress marker.
print (time.asctime( time.localtime(time.time()) ))

# Test

### Features

##### Member Info

In [None]:
# Convert RegisterDate in place, then keep members registered in
# [order_strt_date, test_pre_end_date).
member_np[:,member_hdr2idx['RegisterDate']] = pd.to_datetime(member_np[:,member_hdr2idx['RegisterDate']])
member_sub_np = member_np[
    (member_np[:,member_hdr2idx['RegisterDate']] >= datetime.strptime(order_strt_date, '%Y-%m-%d')) &
    (member_np[:,member_hdr2idx['RegisterDate']] <  datetime.strptime(test_pre_end_date, '%Y-%m-%d'))
]

In [None]:
# Member features for the test population.
member_test_np, member_test_hdr = insert_member_feature(member_sub_np,member_hdr2idx)

##### Previous Order Info

In [None]:
# Convert TradesDate in place, then keep orders traded in the test feature
# window [test_pre_strt_date, test_pre_end_date).
order_np[:,order_hdr2idx['TradesDate']] = pd.to_datetime(order_np[:,order_hdr2idx['TradesDate']])
order_sub_np = order_np[
    (order_np[:,order_hdr2idx['TradesDate']] >= datetime.strptime(test_pre_strt_date, '%Y-%m-%d')) &
    (order_np[:,order_hdr2idx['TradesDate']] <  datetime.strptime(test_pre_end_date, '%Y-%m-%d'))
]

In [None]:
# Pre-order features for the test members, aligned with member_uuid.
member_uuid = member_sub_np[:,member_hdr2idx['UUID']]
preorder_test_np, preorder_test_hdr = insert_preorder_feature(order_sub_np, order_hdr2idx, member_uuid)

In [None]:
# len(preorder_test_hdr)

##### Behavior Info

In [None]:
# Convert HitDateTime in place, then keep behavior rows logged in the test
# feature window [test_pre_strt_date, test_pre_end_date).
behavior_np[:,behavior_hdr2idx['HitDateTime']] = pd.to_datetime(behavior_np[:,behavior_hdr2idx['HitDateTime']])
behavior_sub_np = behavior_np[
    (behavior_np[:,behavior_hdr2idx['HitDateTime']] >= datetime.strptime(test_pre_strt_date, '%Y-%m-%d')) &
    (behavior_np[:,behavior_hdr2idx['HitDateTime']] <  datetime.strptime(test_pre_end_date, '%Y-%m-%d'))
]

In [None]:
# Map every member UUID (all members, not only the test subset) to its
# OnlineMemberId, then build the behavior features for the test members.
ttl_uuid = member_np[:,member_hdr2idx['UUID']]
uuid2omid = dict()
for n, uuid in enumerate(ttl_uuid):
    uuid2omid[uuid] = member_np[n,member_hdr2idx['OnlineMemberId']]
behavior_test_np, behavior_test_hdr = insert_behavior_feature(
    behavior_sub_np, behavior_hdr2idx, member_uuid, uuid2omid)

##### Test Info

In [None]:
# Place member, pre-order and behavior features side by side (one row per member).
test_np = np.append(member_test_np, preorder_test_np, axis=1)
test_np = np.append(test_np, behavior_test_np, axis=1)
# test_np

In [None]:
# Concatenate the three header arrays in the same order as the feature
# columns and index them by name.
test_hdr = np.append(member_test_hdr, preorder_test_hdr, axis=0)
test_hdr = np.append(test_hdr, behavior_test_hdr, axis=0)
test_hdr2idx = dict((hdr, idx) for idx, hdr in enumerate(test_hdr))
# test_hdr2idx
print('There are {} features'.format(len(test_hdr2idx)))

In [None]:
# Print the current local time as a progress marker.
print (time.asctime( time.localtime(time.time()) ))

### Tags

In [None]:
# Only orders with Status 'New' or 'Finish' count as purchases for labelling.
order_sub_np = order_np[(order_np[:,order_hdr2idx['Status']]=='New') | (order_np[:,order_hdr2idx['Status']]=='Finish')]

In [None]:
# Keep the purchases traded in the test label window [test_strt_date, test_end_date).
order_sub_np[:,order_hdr2idx['TradesDate']] = pd.to_datetime(order_sub_np[:,order_hdr2idx['TradesDate']])
order_test_np = order_sub_np[
    (order_sub_np[:,order_hdr2idx['TradesDate']] >= datetime.strptime(test_strt_date, '%Y-%m-%d')) &
    (order_sub_np[:,order_hdr2idx['TradesDate']] <  datetime.strptime(test_end_date, '%Y-%m-%d'))
]

In [None]:
# Label each test member: True when its UUID appears among the purchases
# in the test label window.
buy_uuid = order_test_np[:,order_hdr2idx['UUID']]
ttl_uuid = test_np[:,test_hdr2idx['UUID']]
is_buy_uuid = is_uuid_buy(buy_uuid, ttl_uuid)

In [None]:
# Apply the same row filter as in training; test_np and is_buy_uuid are
# replaced by their filtered versions.
test_np,is_buy_uuid = filter_zeros(test_np,is_buy_uuid)

In [None]:
# Share of each class among the test labels. The shares are looked up by
# label value, so a missing class prints 0.00 instead of raising IndexError
# (or being reported under the wrong name).
unique, counts = np.unique(is_buy_uuid, return_counts=True)
label_share = dict(zip(unique.tolist(), (counts/counts.sum()).tolist()))
print('nobuy={:.2f}, buy={:.2f}'.format(label_share.get(False, 0.0), label_share.get(True, 0.0)))

In [None]:
# Print the current local time as a progress marker.
print (time.asctime( time.localtime(time.time()) ))

### Random Forest Predict

In [None]:
# Predict on the test features; column 0 (the UUID) is excluded, as in training.
actual_y = is_buy_uuid
predict_y = forest.predict(test_np[:,1:])

In [None]:
# Accuracy and per-class report on the held-out test window.
accuracy = metrics.accuracy_score(actual_y, predict_y)
target_name = ['NoBuy', 'Buy']
print(metrics.classification_report(actual_y, predict_y, target_names=target_name))
print('Tested accuracy = {}'.format(accuracy))

In [None]:
# Print the current local time as a progress marker.
print (time.asctime( time.localtime(time.time()) ))

In [None]:
# Share of each class among the predictions. The shares are looked up by
# label value, so a model that predicts a single class prints 0.00 for the
# other one instead of raising IndexError.
unique, counts = np.unique(predict_y, return_counts=True)
predicted_share = dict(zip(unique.tolist(), (counts/counts.sum()).tolist()))
print('nobuy={:.2f}, buy={:.2f}'.format(predicted_share.get(False, 0.0), predicted_share.get(True, 0.0)))