In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# load train data
uid_train = pd.read_csv('../data/train/uid_train.txt',sep='\t',header=None,names=('uid','label'))
voice_train = pd.read_csv('../data/train/voice_train.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out'),dtype={'start_time':str,'end_time':str})
sms_train = pd.read_csv('../data/train/sms_train.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','in_out'),dtype={'start_time':str})
wa_train = pd.read_csv('../data/train/wa_train.txt',sep='\t',header=None,names=('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date'),dtype={'date':str})

In [3]:
# load test data
voice_test = pd.read_csv('../data/testB/voice_test_b.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out'),dtype={'start_time':str,'end_time':str})
sms_test = pd.read_csv('../data/testB/sms_test_b.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','in_out'),dtype={'start_time':str})
wa_test = pd.read_csv('../data/testB/wa_test_b.txt',sep='\t',header=None,names=('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date'),dtype={'date':str})

In [4]:
uid_test = pd.DataFrame({'uid':pd.unique(wa_test['uid'])})
uid_test.to_csv('./uid_test_b.txt',index=None)

In [5]:
# reset
voice = pd.concat([voice_train,voice_test],axis=0)
sms = pd.concat([sms_train,sms_test],axis=0)
wa = pd.concat([wa_train,wa_test],axis=0)

**voice records**

In [6]:
voice_cnt = voice.groupby(['uid'])['in_out'].agg({'voice_count':'count'}).reset_index()
voice_unique_num_cnt = voice.groupby(['uid'])['opp_num'].agg({'voice_opp_num_unique_count': lambda x: len(pd.unique(x))}).reset_index()
voice_unique_head_cnt = voice.groupby(['uid'])['opp_head'].agg({'voice_opp_head_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [7]:
voice_onehot_len_cnt = voice.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('voice_opp_len_').reset_index().fillna(0)
voice_onehot_type_cnt = voice.groupby(['uid','call_type'])['uid'].count().unstack().add_prefix('voice_call_type_').reset_index().fillna(0)
voice_onehot_inout_cnt = voice.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('voice_in_out_').reset_index().fillna(0)

voice_onehot_head_cnt = voice.groupby(['uid','opp_head'])['uid'].count().unstack().add_prefix('voice_opp_head_').reset_index().fillna(0)


In [8]:
voice['start_day'] = (voice.start_time.astype(int) / 1000000).astype(int)
voice['start_hour'] = (voice.start_time.astype(int) % 1000000 / 10000).astype(int)
voice['start_minute'] = (voice.start_time.astype(int) % 10000 / 100).astype(int)
voice['start_second'] = voice.start_time.astype(int) % 100

voice['end_day'] = (voice.end_time.astype(int) / 1000000).astype(int)
voice['end_hour'] = (voice.end_time.astype(int) % 1000000 / 10000).astype(int)
voice['end_minute'] = (voice.end_time.astype(int) % 10000 / 100).astype(int)
voice['end_second'] = voice.end_time.astype(int) % 100

voice['voice_dura'] = (voice.end_day-voice.start_day)*24*60*60+(voice.end_hour-voice.start_hour)*60*60+(voice.end_minute-voice.start_minute)*60+(voice.end_second-voice.start_second)

In [9]:
voice_dura_val = voice.groupby(['uid'])['voice_dura'].agg(['std','max','min','median','mean','sum']).add_prefix('voice_dura_').reset_index()
voice_onehot_hour_cnt = voice.groupby(['uid','start_hour'])['uid'].count().unstack().add_prefix('voice_start_hour_').reset_index().fillna(0)


**sms records**

In [10]:
sms_cnt = sms.groupby(['uid'])['in_out'].agg({'sms_count':'count'}).reset_index()
sms_unique_num_cnt = sms.groupby(['uid'])['opp_num'].agg({'sms_opp_num_unique_count': lambda x: len(pd.unique(x))}).reset_index()
sms_unique_head_cnt = sms.groupby(['uid'])['opp_head'].agg({'sms_opp_head_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [11]:
sms_onehot_len_cnt = sms.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('sms_opp_len_').reset_index().fillna(0)
sms_onehot_inout_cnt = sms.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('sms_in_out_').reset_index().fillna(0)

sms_onehot_head_cnt = sms.groupby(['uid','opp_head'])['uid'].count().unstack().add_prefix('sms_opp_head_').reset_index().fillna(0)


In [12]:
sms['start_day'] = (sms.start_time.astype(int) / 1000000).astype(int)
sms['start_hour'] = (sms.start_time.astype(int) % 1000000 / 10000).astype(int)
sms['start_minute'] = (sms.start_time.astype(int) % 10000 / 100).astype(int)
sms['start_second'] = sms.start_time.astype(int) % 100

sms_onehot_hour_cnt = sms.groupby(['uid','start_hour'])['uid'].count().unstack().add_prefix('sms_start_hour_').reset_index().fillna(0)


**wa records** 

In [13]:
wa_unique_name_cnt = wa.groupby(['uid'])['wa_name'].agg({'wa_name_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [14]:
wa_onehot_type_cnt = wa.groupby(['uid','wa_type'])['uid'].count().unstack().add_prefix('wa_type_').reset_index().fillna(0)


In [15]:
wa_visit_cnt_val = wa.groupby(['uid'])['visit_cnt'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_visit_cnt_').reset_index()
wa_visit_dura_val = wa.groupby(['uid'])['visit_dura'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_visit_dura_').reset_index()
wa_up_flow_val = wa.groupby(['uid'])['up_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_up_flow_').reset_index()
wa_down_flow_val = wa.groupby(['uid'])['down_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_down_flow_').reset_index()


In [16]:
wa['wa_mean_dura_per_visit'] = wa.visit_dura / wa.visit_cnt
wa_dura_per_visit_val = wa.groupby(['uid'])['wa_mean_dura_per_visit'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_dura_per_visit_').reset_index()


In [17]:
# for i in range(45):
#     up_flow = wa[wa.date == str(i+1)].groupby(['uid'])['up_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('up_in_date'+str(i+1)+'_').reset_index().fillna(0)
#     down_flow = wa[wa.date == str(i+1)].groupby(['uid'])['down_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('down_in_date'+str(i+1)+'_').reset_index().fillna(0)
#     wa_dura_per_visit_val = pd.merge(wa_dura_per_visit_val, up_flow, how='left', on='uid')
#     wa_dura_per_visit_val = pd.merge(wa_dura_per_visit_val, down_flow, how='left', on='uid')


In [18]:
wa_dura_per_visit_val = wa_dura_per_visit_val.fillna(0)
wa_dura_per_visit_val.head(5)

Unnamed: 0,uid,wa_dura_per_visit_std,wa_dura_per_visit_max,wa_dura_per_visit_min,wa_dura_per_visit_median,wa_dura_per_visit_mean,wa_dura_per_visit_sum
0,u0001,1143.680091,9150.0,0.0,1130.0,1236.558231,1428225.0
1,u0002,1478.449461,5112.0,0.0,2084.285714,1880.592515,54537.18
2,u0003,1146.996555,8092.0,0.0,956.0,1176.050697,1372451.0
3,u0004,1387.195407,9553.0,0.0,1065.631628,1303.624458,2190089.0
4,u0005,1340.392078,8223.0,0.0,990.0,1250.471406,801552.2


In [31]:
feature = [
    voice_cnt,
    voice_unique_num_cnt,
#     voice_unique_head_cnt,
    voice_onehot_len_cnt,
    voice_onehot_type_cnt,
    voice_onehot_inout_cnt,
    voice_dura_val,
    voice_onehot_head_cnt,
    sms_cnt,
    sms_unique_num_cnt,
#     sms_unique_head_cnt,
    sms_onehot_len_cnt,
    sms_onehot_inout_cnt,
    sms_onehot_head_cnt,
    sms_onehot_hour_cnt,
    wa_unique_name_cnt,
    wa_onehot_type_cnt,
#     wa_visit_cnt_val,
#     wa_visit_dura_val,
    wa_up_flow_val,
    wa_down_flow_val,
    wa_dura_per_visit_val
]


In [32]:
train_feature = uid_train
for feat in feature:
    train_feature=pd.merge(train_feature,feat,how='left',on='uid')

test_feature = uid_test
for feat in feature:
    test_feature=pd.merge(test_feature,feat,how='left',on='uid')

In [33]:
train_feature.to_csv('./train_feature.csv',index=None)
test_feature.to_csv('./test_feature.csv',index=None)

In [34]:
### MODEL ###

In [35]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [36]:
train = pd.read_csv('./train_feature.csv')
test = pd.read_csv('./test_feature.csv')

In [37]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [88]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 20,
    'learning_rate': 0.08,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [89]:
def evalMetric(preds,dtrain):
    label = dtrain.get_label()
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    auc = metrics.roc_auc_score(pre.label,pre.preds)
    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)
    f1 = metrics.f1_score(pre.label,pre.preds)
    res = 0.6*auc +0.4*f1
    return 'res',res,True

In [90]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

[5]	cv_agg's res: 0.808475 + 0.0132303
[10]	cv_agg's res: 0.819046 + 0.0148332
[15]	cv_agg's res: 0.829907 + 0.0118797
[20]	cv_agg's res: 0.836626 + 0.011884
[25]	cv_agg's res: 0.847139 + 0.0146399
[30]	cv_agg's res: 0.851104 + 0.0149044
[35]	cv_agg's res: 0.85332 + 0.014996
[40]	cv_agg's res: 0.854511 + 0.0161842
[45]	cv_agg's res: 0.856843 + 0.0133363
[50]	cv_agg's res: 0.85613 + 0.0138307
[55]	cv_agg's res: 0.856007 + 0.012654
[60]	cv_agg's res: 0.857458 + 0.0159703
[65]	cv_agg's res: 0.858655 + 0.0161691
[70]	cv_agg's res: 0.858298 + 0.0163782
[75]	cv_agg's res: 0.857969 + 0.0166321
[80]	cv_agg's res: 0.857395 + 0.016469
[85]	cv_agg's res: 0.859555 + 0.0154037
[90]	cv_agg's res: 0.859478 + 0.0140894
[95]	cv_agg's res: 0.85918 + 0.0161168
[100]	cv_agg's res: 0.859555 + 0.0167923
[105]	cv_agg's res: 0.861488 + 0.0151219
[110]	cv_agg's res: 0.859833 + 0.0147681
[115]	cv_agg's res: 0.860005 + 0.0125929
[120]	cv_agg's res: 0.861426 + 0.0121369
[125]	cv_agg's res: 0.862043 + 0.0141678
[1

{'res-mean': [0.77188690574471386,
  0.79628063208271993,
  0.80392318539773999,
  0.80687344322902133,
  0.80847509706902032,
  0.80872453268557054,
  0.80883902767157856,
  0.81185097806607442,
  0.81813037908199304,
  0.81904589111461668,
  0.82121327505849973,
  0.82383903798902047,
  0.82566397391291035,
  0.82940857380067268,
  0.82990670307497127,
  0.83038458610845323,
  0.8319166094124536,
  0.83389947870506231,
  0.83620455590569864,
  0.83662588903703083,
  0.83935038963762842,
  0.84159626287620837,
  0.84103395288633598,
  0.84432040363650607,
  0.84713903256518608,
  0.84847395273673831,
  0.85050438430508668,
  0.84982695051105306,
  0.85120608678610965,
  0.85110372681279411,
  0.84974333948728897,
  0.85173241343830297,
  0.85201086420012773,
  0.85170512412218724,
  0.85331968638899303,
  0.85252068211653231,
  0.85333285463508701,
  0.85519511957658267,
  0.85413076584817238,
  0.8545113707764026,
  0.85661039581949472,
  0.85707848572573353,
  0.85645826990086016,
 

In [91]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

[5]	training's res: 0.855432
[10]	training's res: 0.871162
[15]	training's res: 0.877523
[20]	training's res: 0.884232
[25]	training's res: 0.898438
[30]	training's res: 0.908075
[35]	training's res: 0.916105
[40]	training's res: 0.922881
[45]	training's res: 0.928143
[50]	training's res: 0.934697
[55]	training's res: 0.939251
[60]	training's res: 0.944557
[65]	training's res: 0.9519
[70]	training's res: 0.95884
[75]	training's res: 0.962513
[80]	training's res: 0.966766
[85]	training's res: 0.97117
[90]	training's res: 0.973961
[95]	training's res: 0.976867
[100]	training's res: 0.979666
[105]	training's res: 0.981784
[110]	training's res: 0.985039
[115]	training's res: 0.986347
[120]	training's res: 0.988255
[125]	training's res: 0.990382
[130]	training's res: 0.991567
[135]	training's res: 0.992549
[140]	training's res: 0.994364
[145]	training's res: 0.996392
[150]	training's res: 0.996628
[155]	training's res: 0.997089
[160]	training's res: 0.997313
[165]	training's res: 0.997543
[

In [92]:
pred = model.predict(test.drop(['uid'],axis=1))
res = pd.DataFrame({'uid':test.uid,'label':pred})
res = res.sort_values(by='label',ascending=False)
res.label = res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

res.to_csv('./result.csv',index=False,header=False,sep=',',columns=['uid','label'])