In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# load train data
uid_train = pd.read_csv('../data/train/uid_train.txt',sep='\t',header=None,names=('uid','label'))
voice_train = pd.read_csv('../data/train/voice_train.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out'),dtype={'start_time':str,'end_time':str})
sms_train = pd.read_csv('../data/train/sms_train.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','in_out'),dtype={'start_time':str})
wa_train = pd.read_csv('../data/train/wa_train.txt',sep='\t',header=None,names=('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date'),dtype={'date':str})

In [3]:
# load test data
voice_test = pd.read_csv('../data/testB/voice_test_b.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out'),dtype={'start_time':str,'end_time':str})
sms_test = pd.read_csv('../data/testB/sms_test_b.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','in_out'),dtype={'start_time':str})
wa_test = pd.read_csv('../data/testB/wa_test_b.txt',sep='\t',header=None,names=('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date'),dtype={'date':str})

In [4]:
uid_test = pd.DataFrame({'uid':pd.unique(wa_test['uid'])})
uid_test.to_csv('./uid_test_b.txt',index=None)

In [5]:
# reset
voice = pd.concat([voice_train,voice_test],axis=0)
sms = pd.concat([sms_train,sms_test],axis=0)
wa = pd.concat([wa_train,wa_test],axis=0)

**voice records**

In [6]:
voice_cnt = voice.groupby(['uid'])['in_out'].agg({'voice_count':'count'}).reset_index()
voice_unique_num_cnt = voice.groupby(['uid'])['opp_num'].agg({'voice_opp_num_unique_count': lambda x: len(pd.unique(x))}).reset_index()
# voice_unique_head_cnt = voice.groupby(['uid'])['opp_head'].agg({'voice_opp_head_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [7]:
voice_onehot_len_cnt = voice.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('voice_opp_len_').reset_index().fillna(0)
voice_onehot_type_cnt = voice.groupby(['uid','call_type'])['uid'].count().unstack().add_prefix('voice_call_type_').reset_index().fillna(0)
voice_onehot_inout_cnt = voice.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('voice_in_out_').reset_index().fillna(0)

voice_onehot_head_cnt = voice.groupby(['uid','opp_head'])['uid'].count().unstack().add_prefix('voice_opp_head_').reset_index().fillna(0)


In [8]:
voice['start_day'] = (voice.start_time.astype(int) / 1000000).astype(int)
voice['start_hour'] = (voice.start_time.astype(int) % 1000000 / 10000).astype(int)
voice['start_minute'] = (voice.start_time.astype(int) % 10000 / 100).astype(int)
voice['start_second'] = voice.start_time.astype(int) % 100

voice['end_day'] = (voice.end_time.astype(int) / 1000000).astype(int)
voice['end_hour'] = (voice.end_time.astype(int) % 1000000 / 10000).astype(int)
voice['end_minute'] = (voice.end_time.astype(int) % 10000 / 100).astype(int)
voice['end_second'] = voice.end_time.astype(int) % 100

voice['voice_dura'] = (voice.end_day-voice.start_day)*24*60*60+(voice.end_hour-voice.start_hour)*60*60+(voice.end_minute-voice.start_minute)*60+(voice.end_second-voice.start_second)

In [9]:
voice_dura_val = voice.groupby(['uid'])['voice_dura'].agg(['std','max','min','median','mean','sum']).add_prefix('voice_dura_').reset_index()
voice_onehot_hour_cnt = voice.groupby(['uid','start_hour'])['uid'].count().unstack().add_prefix('voice_start_hour_').reset_index().fillna(0)


**sms records**

In [11]:
sms_cnt = sms.groupby(['uid'])['in_out'].agg({'sms_count':'count'}).reset_index()
sms_unique_num_cnt = sms.groupby(['uid'])['opp_num'].agg({'sms_opp_num_unique_count': lambda x: len(pd.unique(x))}).reset_index()
# sms_unique_head_cnt = sms.groupby(['uid'])['opp_head'].agg({'sms_opp_head_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [12]:
sms_onehot_len_cnt = sms.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('sms_opp_len_').reset_index().fillna(0)
sms_onehot_inout_cnt = sms.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('sms_in_out_').reset_index().fillna(0)

sms_onehot_head_cnt = sms.groupby(['uid','opp_head'])['uid'].count().unstack().add_prefix('sms_opp_head_').reset_index().fillna(0)


In [13]:
sms['start_day'] = (sms.start_time.astype(int) / 1000000).astype(int)
sms['start_hour'] = (sms.start_time.astype(int) % 1000000 / 10000).astype(int)
sms['start_minute'] = (sms.start_time.astype(int) % 10000 / 100).astype(int)
sms['start_second'] = sms.start_time.astype(int) % 100

sms_onehot_hour_cnt = sms.groupby(['uid','start_hour'])['uid'].count().unstack().add_prefix('sms_start_hour_').reset_index().fillna(0)


**wa records** 

In [33]:
wa_unique_name_cnt = wa.groupby(['uid'])['wa_name'].agg({'wa_name_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [34]:
wa_onehot_type_cnt = wa.groupby(['uid','wa_type'])['uid'].count().unstack().add_prefix('wa_type_').reset_index().fillna(0)


In [36]:
wa_visit_cnt_val = wa.groupby(['uid'])['visit_cnt'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_visit_cnt_').reset_index()
wa_visit_dura_val = wa.groupby(['uid'])['visit_dura'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_visit_dura_').reset_index()
wa_up_flow_val = wa.groupby(['uid'])['up_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_up_flow_').reset_index()
wa_down_flow_val = wa.groupby(['uid'])['down_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_down_flow_').reset_index()


In [None]:
wa['wa_mean_dura_per_visit'] = wa.visit_dura / wa.visit_cnt

In [76]:
feature = [
    voice_cnt,
    voice_unique_num_cnt,
    voice_unique_head_cnt,
    voice_onehot_len_cnt,
    voice_onehot_type_cnt,
    voice_onehot_inout_cnt,
    voice_dura_val,
    sms_cnt,
    sms_unique_num_cnt,
    sms_unique_head_cnt,
    sms_onehot_len_cnt,
    sms_onehot_inout_cnt,
    wa_unique_name_cnt,
    wa_onehot_type_cnt,
    wa_visit_cnt_val,
    wa_visit_dura_val,
    wa_up_flow_val,
    wa_down_flow_val
]

In [77]:
train_feature = uid_train
for feat in feature:
    train_feature=pd.merge(train_feature,feat,how='left',on='uid')

test_feature = uid_test
for feat in feature:
    test_feature=pd.merge(test_feature,feat,how='left',on='uid')

In [78]:
train_feature.to_csv('./train_feature.csv',index=None)
test_feature.to_csv('./test_feature.csv',index=None)

In [79]:
### MODEL ###

In [80]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [81]:
train = pd.read_csv('./train_feature.csv')
test = pd.read_csv('./test_feature.csv')

In [82]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [87]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 20,
    'learning_rate': 0.08,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [84]:
def evalMetric(preds,dtrain):
    label = dtrain.get_label()
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    auc = metrics.roc_auc_score(pre.label,pre.preds)
    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)
    f1 = metrics.f1_score(pre.label,pre.preds)
    res = 0.6*auc +0.4*f1
    return 'res',res,True

In [88]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

[5]	cv_agg's res: 0.73231 + 0.00747884
[10]	cv_agg's res: 0.745173 + 0.00490622
[15]	cv_agg's res: 0.754866 + 0.00764463
[20]	cv_agg's res: 0.758771 + 0.0145163
[25]	cv_agg's res: 0.761768 + 0.00961862
[30]	cv_agg's res: 0.764256 + 0.00733368
[35]	cv_agg's res: 0.765637 + 0.0105603
[40]	cv_agg's res: 0.765877 + 0.0114144
[45]	cv_agg's res: 0.768032 + 0.0122545
[50]	cv_agg's res: 0.769798 + 0.0130991
[55]	cv_agg's res: 0.768481 + 0.0138534
[60]	cv_agg's res: 0.768026 + 0.0126506
[65]	cv_agg's res: 0.767461 + 0.0128214
[70]	cv_agg's res: 0.772113 + 0.0135535
[75]	cv_agg's res: 0.771964 + 0.0154801
[80]	cv_agg's res: 0.769938 + 0.0157634
[85]	cv_agg's res: 0.769979 + 0.013697
[90]	cv_agg's res: 0.7689 + 0.0149098
[95]	cv_agg's res: 0.768318 + 0.0161838
[100]	cv_agg's res: 0.769678 + 0.0166503
[105]	cv_agg's res: 0.768798 + 0.016363
[110]	cv_agg's res: 0.769123 + 0.0145957
[115]	cv_agg's res: 0.770736 + 0.0113486
[120]	cv_agg's res: 0.769576 + 0.0114143
[125]	cv_agg's res: 0.768425 + 0.012

{'res-mean': [0.69233437687815247,
  0.71695884205636917,
  0.73211731882101672,
  0.72465235011757645,
  0.73230983744009137,
  0.72675657819556327,
  0.73939068166402799,
  0.74095069814535142,
  0.74188797144032981,
  0.7451729592308548,
  0.75162044933991023,
  0.75142291151626006,
  0.75299422281830586,
  0.75580602278033249,
  0.75486569659258729,
  0.75678085672051731,
  0.75793798613456831,
  0.76084361189474381,
  0.75970675638293284,
  0.75877087918291386,
  0.75656554444630053,
  0.75839260776024064,
  0.76214368889139428,
  0.75997230322993026,
  0.76176828821564901,
  0.76126752950805077,
  0.76116112809682546,
  0.76357142606838579,
  0.76372241035458677,
  0.7642562783254383,
  0.76488141063020887,
  0.76361485172754717,
  0.76592381376569529,
  0.76682478851164537,
  0.76563696622042487,
  0.7667025471096709,
  0.76663595911646631,
  0.76637632260987765,
  0.76588525391693663,
  0.76587651989917482,
  0.76654952699073464,
  0.76722349710233695,
  0.76705962122818949,
  

In [86]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

[5]	training's res: 0.88917
[10]	training's res: 0.911595
[15]	training's res: 0.923948
[20]	training's res: 0.9356
[25]	training's res: 0.946514
[30]	training's res: 0.959568
[35]	training's res: 0.968393
[40]	training's res: 0.975463
[45]	training's res: 0.98213
[50]	training's res: 0.987562
[55]	training's res: 0.992838
[60]	training's res: 0.995221
[65]	training's res: 0.997067
[70]	training's res: 0.99798
[75]	training's res: 0.999107
[80]	training's res: 0.999555
[85]	training's res: 1
[90]	training's res: 1
[95]	training's res: 1
[100]	training's res: 1
[105]	training's res: 1
[110]	training's res: 1
[115]	training's res: 1
[120]	training's res: 1
[125]	training's res: 1
[130]	training's res: 1
[135]	training's res: 1
[140]	training's res: 1
[145]	training's res: 1
[150]	training's res: 1
[155]	training's res: 1
[160]	training's res: 1
[165]	training's res: 1
[170]	training's res: 1
[175]	training's res: 1
[180]	training's res: 1
[185]	training's res: 1
[190]	training's res: 1
[

In [None]:
pred = model.predict(test.drop(['uid'],axis=1))
res = pd.DataFrame({'uid':test.uid,'label':pred})
res = res.sort_values(by='label',ascending=False)
res.label = res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

res.to_csv('./result.csv',index=False,header=False,sep=',',columns=['uid','label'])