In [149]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [150]:
# load train data
uid_train = pd.read_csv('../data/train/uid_train.txt',sep='\t',header=None,names=('uid','label'))
voice_train = pd.read_csv('../data/train/voice_train.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out'),dtype={'start_time':str,'end_time':str})
sms_train = pd.read_csv('../data/train/sms_train.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','in_out'),dtype={'start_time':str})
wa_train = pd.read_csv('../data/train/wa_train.txt',sep='\t',header=None,names=('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date'),dtype={'date':str})

In [151]:
# load test data
voice_test = pd.read_csv('../data/testB/voice_test_b.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out'),dtype={'start_time':str,'end_time':str})
sms_test = pd.read_csv('../data/testB/sms_test_b.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','in_out'),dtype={'start_time':str})
wa_test = pd.read_csv('../data/testB/wa_test_b.txt',sep='\t',header=None,names=('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date'),dtype={'date':str})

In [152]:
uid_test = pd.DataFrame({'uid':pd.unique(wa_test['uid'])})
uid_test.to_csv('./uid_test_b.txt',index=None)

In [153]:
# reset
voice = pd.concat([voice_train,voice_test],axis=0)
sms = pd.concat([sms_train,sms_test],axis=0)
wa = pd.concat([wa_train,wa_test],axis=0)

**voice records**

In [154]:
voice_cnt = voice.groupby(['uid'])['in_out'].agg({'voice_count':'count'}).reset_index()
voice_unique_num_cnt = voice.groupby(['uid'])['opp_num'].agg({'voice_opp_num_unique_count': lambda x: len(pd.unique(x))}).reset_index()
# voice_unique_head_cnt = voice.groupby(['uid'])['opp_head'].agg({'voice_opp_head_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [155]:
voice_onehot_len_cnt = voice.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('voice_opp_len_').reset_index().fillna(0)
voice_onehot_type_cnt = voice.groupby(['uid','call_type'])['uid'].count().unstack().add_prefix('voice_call_type_').reset_index().fillna(0)
voice_onehot_inout_cnt = voice.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('voice_in_out_').reset_index().fillna(0)

voice_onehot_head_cnt = voice.groupby(['uid','opp_head'])['uid'].count().unstack().add_prefix('voice_opp_head_').reset_index().fillna(0)


In [156]:
voice['start_day'] = (voice.start_time.astype(int) / 1000000).astype(int)
voice['start_hour'] = (voice.start_time.astype(int) % 1000000 / 10000).astype(int)
voice['start_minute'] = (voice.start_time.astype(int) % 10000 / 100).astype(int)
voice['start_second'] = voice.start_time.astype(int) % 100

voice['end_day'] = (voice.end_time.astype(int) / 1000000).astype(int)
voice['end_hour'] = (voice.end_time.astype(int) % 1000000 / 10000).astype(int)
voice['end_minute'] = (voice.end_time.astype(int) % 10000 / 100).astype(int)
voice['end_second'] = voice.end_time.astype(int) % 100

voice['voice_dura'] = (voice.end_day-voice.start_day)*24*60*60+(voice.end_hour-voice.start_hour)*60*60+(voice.end_minute-voice.start_minute)*60+(voice.end_second-voice.start_second)

In [157]:
voice_dura_val = voice.groupby(['uid'])['voice_dura'].agg(['std','max','min','median','mean','sum']).add_prefix('voice_dura_').reset_index()
voice_onehot_hour_cnt = voice.groupby(['uid','start_hour'])['uid'].count().unstack().add_prefix('voice_start_hour_').reset_index().fillna(0)


**sms records**

In [158]:
sms_cnt = sms.groupby(['uid'])['in_out'].agg({'sms_count':'count'}).reset_index()
sms_unique_num_cnt = sms.groupby(['uid'])['opp_num'].agg({'sms_opp_num_unique_count': lambda x: len(pd.unique(x))}).reset_index()
# sms_unique_head_cnt = sms.groupby(['uid'])['opp_head'].agg({'sms_opp_head_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [159]:
sms_onehot_len_cnt = sms.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('sms_opp_len_').reset_index().fillna(0)
sms_onehot_inout_cnt = sms.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('sms_in_out_').reset_index().fillna(0)

sms_onehot_head_cnt = sms.groupby(['uid','opp_head'])['uid'].count().unstack().add_prefix('sms_opp_head_').reset_index().fillna(0)


In [160]:
sms['start_day'] = (sms.start_time.astype(int) / 1000000).astype(int)
sms['start_hour'] = (sms.start_time.astype(int) % 1000000 / 10000).astype(int)
sms['start_minute'] = (sms.start_time.astype(int) % 10000 / 100).astype(int)
sms['start_second'] = sms.start_time.astype(int) % 100

sms_onehot_hour_cnt = sms.groupby(['uid','start_hour'])['uid'].count().unstack().add_prefix('sms_start_hour_').reset_index().fillna(0)


**wa records** 

In [161]:
wa_unique_name_cnt = wa.groupby(['uid'])['wa_name'].agg({'wa_name_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [162]:
wa_onehot_type_cnt = wa.groupby(['uid','wa_type'])['uid'].count().unstack().add_prefix('wa_type_').reset_index().fillna(0)


In [163]:
# wa_visit_cnt_val = wa.groupby(['uid'])['visit_cnt'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_visit_cnt_').reset_index()
# wa_visit_dura_val = wa.groupby(['uid'])['visit_dura'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_visit_dura_').reset_index()
# wa_up_flow_val = wa.groupby(['uid'])['up_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_up_flow_').reset_index()
# wa_down_flow_val = wa.groupby(['uid'])['down_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_down_flow_').reset_index()


In [164]:
wa['wa_mean_dura_per_visit'] = wa.visit_dura / wa.visit_cnt
wa_dura_per_visit_val = wa.groupby(['uid'])['wa_mean_dura_per_visit'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_dura_per_visit_').reset_index()


In [165]:
for i in range(45):
    up_flow = wa[wa.date == str(i+1)].groupby(['uid'])['up_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('up_in_date'+str(i+1)+'_').reset_index().fillna(0)
    down_flow = wa[wa.date == str(i+1)].groupby(['uid'])['down_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('down_in_date'+str(i+1)+'_').reset_index().fillna(0)
    wa_dura_per_visit_val = pd.merge(wa_dura_per_visit_val, up_flow, how='left', on='uid')
    wa_dura_per_visit_val = pd.merge(wa_dura_per_visit_val, down_flow, how='left', on='uid')


Unnamed: 0,uid,wa_dura_per_visit_std,wa_dura_per_visit_max,wa_dura_per_visit_min,wa_dura_per_visit_median,wa_dura_per_visit_mean,wa_dura_per_visit_sum,up_in_date1_std,up_in_date1_max,up_in_date1_min,...,up_in_date45_min,up_in_date45_median,up_in_date45_mean,up_in_date45_sum,down_in_date45_std,down_in_date45_max,down_in_date45_min,down_in_date45_median,down_in_date45_mean,down_in_date45_sum
0,u0001,1143.680091,9150.000000,0.0,1130.000000,1236.558231,1.428225e+06,0.0,0.0,0.0,...,52.0,17877.5,2.253695e+05,8564041.0,1.240819e+06,6.273218e+06,40.0,42964.5,4.584999e+05,1.742300e+07
1,u0002,1478.449461,5112.000000,0.0,2084.285714,1880.592515,5.453718e+04,0.0,0.0,0.0,...,0.0,0.0,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00
2,u0003,1146.996555,8092.000000,0.0,956.000000,1176.050697,1.372451e+06,0.0,0.0,0.0,...,60.0,23884.0,4.398688e+05,25072520.0,6.162219e+07,4.570219e+08,120.0,70849.0,1.267444e+07,7.224432e+08
3,u0004,1387.195407,9553.000000,0.0,1065.631628,1303.624458,2.190089e+06,0.0,0.0,0.0,...,452.0,9644.0,7.722000e+04,2471040.0,4.922232e+05,2.655040e+06,319.0,16378.0,1.897105e+05,6.070737e+06
4,u0005,1340.392078,8223.000000,0.0,990.000000,1250.471406,8.015522e+05,0.0,0.0,0.0,...,0.0,2023.5,5.088625e+03,81418.0,1.890868e+05,7.609150e+05,240.0,3070.0,5.690719e+04,9.105150e+05
5,u0006,1282.338173,7145.500000,0.0,699.333333,1079.816067,5.237108e+05,0.0,0.0,0.0,...,0.0,0.0,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00
6,u0007,1169.570567,8149.000000,0.0,904.588235,1163.879750,1.531666e+06,0.0,0.0,0.0,...,77.0,9091.0,2.543332e+04,635833.0,9.289837e+05,4.661579e+06,0.0,15501.0,2.229975e+05,5.574938e+06
7,u0008,1278.574532,9370.000000,0.0,920.173913,1184.599394,1.265152e+06,0.0,0.0,0.0,...,160.0,4162.0,8.500883e+04,2465256.0,3.910391e+06,2.093690e+07,40.0,15180.0,8.836665e+05,2.562633e+07
8,u0009,1298.202756,9453.000000,0.0,994.250000,1236.003449,1.320052e+06,0.0,0.0,0.0,...,200.0,4098.5,9.369100e+03,93691.0,1.054180e+05,3.423690e+05,0.0,6532.0,4.458790e+04,4.458790e+05
9,u0010,1082.201412,4251.000000,0.0,617.062500,1058.292467,9.207144e+04,0.0,0.0,0.0,...,0.0,0.0,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0.000000e+00,0.000000e+00


In [166]:
wa_dura_per_visit_val.head(5)

Unnamed: 0,uid,wa_dura_per_visit_std,wa_dura_per_visit_max,wa_dura_per_visit_min,wa_dura_per_visit_median,wa_dura_per_visit_mean,wa_dura_per_visit_sum,up_in_date1_std,up_in_date1_max,up_in_date1_min,...,up_in_date45_min,up_in_date45_median,up_in_date45_mean,up_in_date45_sum,down_in_date45_std,down_in_date45_max,down_in_date45_min,down_in_date45_median,down_in_date45_mean,down_in_date45_sum
0,u0001,1143.680091,9150.0,0.0,1130.0,1236.558231,1428225.0,,,,...,52.0,17877.5,225369.5,8564041.0,1240819.0,6273218.0,40.0,42964.5,458499.9,17422996.0
1,u0002,1478.449461,5112.0,0.0,2084.285714,1880.592515,54537.18,,,,...,,,,,,,,,,
2,u0003,1146.996555,8092.0,0.0,956.0,1176.050697,1372451.0,,,,...,60.0,23884.0,439868.77193,25072520.0,61622190.0,457021859.0,120.0,70849.0,12674440.0,722443226.0
3,u0004,1387.195407,9553.0,0.0,1065.631628,1303.624458,2190089.0,,,,...,452.0,9644.0,77220.0,2471040.0,492223.2,2655040.0,319.0,16378.0,189710.5,6070737.0
4,u0005,1340.392078,8223.0,0.0,990.0,1250.471406,801552.2,,,,...,0.0,2023.5,5088.625,81418.0,189086.8,760915.0,240.0,3070.0,56907.19,910515.0


In [121]:
feature = [
    voice_cnt,
    voice_unique_num_cnt,
#     voice_unique_head_cnt,
    voice_onehot_len_cnt,
    voice_onehot_type_cnt,
    voice_onehot_inout_cnt,
    voice_dura_val,
    voice_onehot_head_cnt,
    sms_cnt,
    sms_unique_num_cnt,
#     sms_unique_head_cnt,
    sms_onehot_len_cnt,
    sms_onehot_inout_cnt,
    sms_onehot_head_cnt,
    sms_onehot_hour_cnt,
    wa_unique_name_cnt,
    wa_onehot_type_cnt,
#     wa_visit_cnt_val,
#     wa_visit_dura_val,
#     wa_up_flow_val,
#     wa_down_flow_val,
    wa_dura_per_visit_val
]

In [122]:
train_feature = uid_train
for feat in feature:
    train_feature=pd.merge(train_feature,feat,how='left',on='uid')

test_feature = uid_test
for feat in feature:
    test_feature=pd.merge(test_feature,feat,how='left',on='uid')

In [123]:
train_feature.to_csv('./train_feature.csv',index=None)
test_feature.to_csv('./test_feature.csv',index=None)

In [124]:
### MODEL ###

In [125]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [126]:
train = pd.read_csv('./train_feature.csv')
test = pd.read_csv('./test_feature.csv')

In [127]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [128]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 64,
    'learning_rate': 0.08,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [129]:
def evalMetric(preds,dtrain):
    label = dtrain.get_label()
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    auc = metrics.roc_auc_score(pre.label,pre.preds)
    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)
    f1 = metrics.f1_score(pre.label,pre.preds)
    res = 0.6*auc +0.4*f1
    return 'res',res,True

In [130]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

[5]	cv_agg's res: 0.811315 + 0.00919801
[10]	cv_agg's res: 0.827486 + 0.00560019
[15]	cv_agg's res: 0.829986 + 0.00771688
[20]	cv_agg's res: 0.837229 + 0.00691382
[25]	cv_agg's res: 0.840113 + 0.0105309
[30]	cv_agg's res: 0.845457 + 0.00854639
[35]	cv_agg's res: 0.845353 + 0.00854043
[40]	cv_agg's res: 0.848856 + 0.00859427
[45]	cv_agg's res: 0.848814 + 0.00883844
[50]	cv_agg's res: 0.849415 + 0.00974274
[55]	cv_agg's res: 0.849807 + 0.00940455
[60]	cv_agg's res: 0.851563 + 0.011016
[65]	cv_agg's res: 0.852914 + 0.0113552
[70]	cv_agg's res: 0.855268 + 0.0114264
[75]	cv_agg's res: 0.856037 + 0.0095037
[80]	cv_agg's res: 0.856871 + 0.00949773
[85]	cv_agg's res: 0.856897 + 0.0119637
[90]	cv_agg's res: 0.857543 + 0.00940182
[95]	cv_agg's res: 0.858121 + 0.0101455
[100]	cv_agg's res: 0.858784 + 0.0111978
[105]	cv_agg's res: 0.85801 + 0.0115143
[110]	cv_agg's res: 0.856845 + 0.0119702
[115]	cv_agg's res: 0.857788 + 0.0102344
[120]	cv_agg's res: 0.857766 + 0.00997759
[125]	cv_agg's res: 0.858

{'res-mean': [0.76930977408635337,
  0.79091496972838649,
  0.80025674885593123,
  0.80445126165613556,
  0.81131502340717532,
  0.82062642968324206,
  0.82092187611744138,
  0.82506277996820498,
  0.82628684588402823,
  0.82748578769681347,
  0.83111828676201593,
  0.83105451384650364,
  0.83168778405913846,
  0.83275797926183603,
  0.82998612497222923,
  0.83301178411726484,
  0.83123155998720211,
  0.83233953791853887,
  0.83627892139902971,
  0.83722850476516097,
  0.83628662003109644,
  0.83696100754834413,
  0.83935251518883847,
  0.83955320617256524,
  0.84011309267389456,
  0.84104879338488947,
  0.84156252017316613,
  0.84265404053542925,
  0.84424069065390794,
  0.84545707235229994,
  0.84512083547802652,
  0.84681302114799573,
  0.84467765786368754,
  0.84713423865908011,
  0.84535294015939366,
  0.8487874851274877,
  0.84801509045703527,
  0.8485652533810174,
  0.84847274506536297,
  0.84885586074455677,
  0.84805966257307241,
  0.84763754827792293,
  0.84703387932757257,
 

In [131]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

[5]	training's res: 0.918196
[10]	training's res: 0.939537
[15]	training's res: 0.95131
[20]	training's res: 0.958369
[25]	training's res: 0.970264
[30]	training's res: 0.977524
[35]	training's res: 0.983404
[40]	training's res: 0.989563
[45]	training's res: 0.993506
[50]	training's res: 0.996348
[55]	training's res: 0.998869
[60]	training's res: 0.999107
[65]	training's res: 0.999555
[70]	training's res: 0.999778
[75]	training's res: 1
[80]	training's res: 1
[85]	training's res: 1
[90]	training's res: 1
[95]	training's res: 1
[100]	training's res: 1
[105]	training's res: 1
[110]	training's res: 1
[115]	training's res: 1
[120]	training's res: 1
[125]	training's res: 1
[130]	training's res: 1
[135]	training's res: 1
[140]	training's res: 1
[145]	training's res: 1
[150]	training's res: 1
[155]	training's res: 1
[160]	training's res: 1
[165]	training's res: 1
[170]	training's res: 1
[175]	training's res: 1
[180]	training's res: 1
[185]	training's res: 1
[190]	training's res: 1
[195]	train

In [132]:
pred = model.predict(test.drop(['uid'],axis=1))
res = pd.DataFrame({'uid':test.uid,'label':pred})
res = res.sort_values(by='label',ascending=False)
res.label = res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

res.to_csv('./result.csv',index=False,header=False,sep=',',columns=['uid','label'])