In [11]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [12]:
# load train data
uid_train = pd.read_csv('../data/train/uid_train.txt',sep='\t',header=None,names=('uid','label'))
voice_train = pd.read_csv('../data/train/voice_train.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out'),dtype={'start_time':str,'end_time':str})
sms_train = pd.read_csv('../data/train/sms_train.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','in_out'),dtype={'start_time':str})
wa_train = pd.read_csv('../data/train/wa_train.txt',sep='\t',header=None,names=('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date'),dtype={'date':str})

In [13]:
# load test data
voice_test = pd.read_csv('../data/testB/voice_test_b.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out'),dtype={'start_time':str,'end_time':str})
sms_test = pd.read_csv('../data/testB/sms_test_b.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','in_out'),dtype={'start_time':str})
wa_test = pd.read_csv('../data/testB/wa_test_b.txt',sep='\t',header=None,names=('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date'),dtype={'date':str})

In [14]:
uid_test = pd.DataFrame({'uid':pd.unique(wa_test['uid'])})
uid_test.to_csv('./uid_test_b.txt',index=None)

In [15]:
# reset
voice = pd.concat([voice_train,voice_test],axis=0)
sms = pd.concat([sms_train,sms_test],axis=0)
wa = pd.concat([wa_train,wa_test],axis=0)

**voice records**

In [21]:
voice_cnt = voice.groupby(['uid'])['in_out'].agg({'voice_count':'count'}).reset_index()
voice_unique_num_cnt = voice.groupby(['uid'])['opp_num'].agg({'voice_opp_num_unique_count': lambda x: len(pd.unique(x))}).reset_index()
voice_unique_head_cnt = voice.groupby(['uid'])['opp_head'].agg({'voice_opp_head_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [25]:
voice_onehot_len_cnt = voice.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('voice_opp_len_').reset_index().fillna(0)
voice_onehot_type_cnt = voice.groupby(['uid','call_type'])['uid'].count().unstack().add_prefix('voice_call_type_').reset_index().fillna(0)
voice_onehot_inout_cnt = voice.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('voice_in_out_').reset_index().fillna(0)


In [72]:
voice['start_day'] = (voice.start_time.astype(int) / 1000000).astype(int)
voice['start_hour'] = (voice.start_time.astype(int) % 1000000 / 10000).astype(int)
voice['start_minute'] = (voice.start_time.astype(int) % 10000 / 100).astype(int)
voice['start_second'] = voice.start_time.astype(int) % 100

voice['end_day'] = (voice.end_time.astype(int) / 1000000).astype(int)
voice['end_hour'] = (voice.end_time.astype(int) % 1000000 / 10000).astype(int)
voice['end_minute'] = (voice.end_time.astype(int) % 10000 / 100).astype(int)
voice['end_second'] = voice.end_time.astype(int) % 100

voice['voice_dura'] = (voice.end_day-voice.start_day)*24*60*60+(voice.end_hour-voice.start_hour)*60*60+(voice.end_minute-voice.start_minute)*60+(voice.end_second-voice.start_second)

In [74]:
voice_dura_val = voice.groupby(['uid'])['voice_dura'].agg(['std','max','min','median','mean','sum']).add_prefix('voice_dura_').reset_index()

**sms records**

In [29]:
sms_cnt = sms.groupby(['uid'])['in_out'].agg({'sms_count':'count'}).reset_index()
sms_unique_num_cnt = sms.groupby(['uid'])['opp_num'].agg({'sms_opp_num_unique_count': lambda x: len(pd.unique(x))}).reset_index()
sms_unique_head_cnt = sms.groupby(['uid'])['opp_head'].agg({'sms_opp_head_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [31]:
sms_onehot_len_cnt = sms.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('sms_opp_len_').reset_index().fillna(0)
sms_onehot_inout_cnt = sms.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('sms_in_out_').reset_index().fillna(0)


**wa records** 

In [33]:
wa_unique_name_cnt = wa.groupby(['uid'])['wa_name'].agg({'wa_name_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [34]:
wa_onehot_type_cnt = wa.groupby(['uid','wa_type'])['uid'].count().unstack().add_prefix('wa_type_').reset_index().fillna(0)


In [36]:
wa_visit_cnt_val = wa.groupby(['uid'])['visit_cnt'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_visit_cnt_').reset_index()
wa_visit_dura_val = wa.groupby(['uid'])['visit_dura'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_visit_dura_').reset_index()
wa_up_flow_val = wa.groupby(['uid'])['up_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_up_flow_').reset_index()
wa_down_flow_val = wa.groupby(['uid'])['down_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_down_flow_').reset_index()


In [76]:
feature = [
    voice_cnt,
    voice_unique_num_cnt,
    voice_unique_head_cnt,
    voice_onehot_len_cnt,
    voice_onehot_type_cnt,
    voice_onehot_inout_cnt,
    voice_dura_val,
    sms_cnt,
    sms_unique_num_cnt,
    sms_unique_head_cnt,
    sms_onehot_len_cnt,
    sms_onehot_inout_cnt,
    wa_unique_name_cnt,
    wa_onehot_type_cnt,
    wa_visit_cnt_val,
    wa_visit_dura_val,
    wa_up_flow_val,
    wa_down_flow_val
]

In [77]:
train_feature = uid_train
for feat in feature:
    train_feature=pd.merge(train_feature,feat,how='left',on='uid')

test_feature = uid_test
for feat in feature:
    test_feature=pd.merge(test_feature,feat,how='left',on='uid')

In [78]:
train_feature.to_csv('./train_feature.csv',index=None)
test_feature.to_csv('./test_feature.csv',index=None)

In [79]:
### MODEL ###

In [80]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [81]:
train = pd.read_csv('./train_feature.csv')
test = pd.read_csv('./test_feature.csv')

In [82]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [83]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 64,
    'learning_rate': 0.08,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [84]:
def evalMetric(preds,dtrain):
    label = dtrain.get_label()
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    auc = metrics.roc_auc_score(pre.label,pre.preds)
    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)
    f1 = metrics.f1_score(pre.label,pre.preds)
    res = 0.6*auc +0.4*f1
    return 'res',res,True

In [85]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

[5]	cv_agg's res: 0.734155 + 0.00912526
[10]	cv_agg's res: 0.746915 + 0.00302043
[15]	cv_agg's res: 0.75074 + 0.00821751
[20]	cv_agg's res: 0.754514 + 0.00549259
[25]	cv_agg's res: 0.758803 + 0.00688168
[30]	cv_agg's res: 0.761377 + 0.00734881
[35]	cv_agg's res: 0.762939 + 0.0102068
[40]	cv_agg's res: 0.765305 + 0.0121328
[45]	cv_agg's res: 0.762427 + 0.0126351
[50]	cv_agg's res: 0.761179 + 0.0114532
[55]	cv_agg's res: 0.761144 + 0.0158645
[60]	cv_agg's res: 0.759265 + 0.0171671
[65]	cv_agg's res: 0.760501 + 0.015942
[70]	cv_agg's res: 0.759073 + 0.0140772
[75]	cv_agg's res: 0.761155 + 0.01235
[80]	cv_agg's res: 0.762326 + 0.0132621
[85]	cv_agg's res: 0.760881 + 0.0107098
[90]	cv_agg's res: 0.759623 + 0.0114119
[95]	cv_agg's res: 0.763286 + 0.012936
[100]	cv_agg's res: 0.762609 + 0.0132619
[105]	cv_agg's res: 0.761315 + 0.0130801
[110]	cv_agg's res: 0.760917 + 0.0130759
[115]	cv_agg's res: 0.763206 + 0.0110764
[120]	cv_agg's res: 0.761519 + 0.0103018
[125]	cv_agg's res: 0.76422 + 0.011

{'res-mean': [0.66383105713976154,
  0.71658133062251339,
  0.72406558344523608,
  0.73108288678505451,
  0.73415513001927002,
  0.72957256074411647,
  0.73343488414375291,
  0.73911249778493726,
  0.74334960434711184,
  0.74691526825295684,
  0.75220133387932042,
  0.75205326198603439,
  0.75265997157983355,
  0.75246369220481224,
  0.75073950918659771,
  0.75346896092023075,
  0.75451345384434487,
  0.75374647608189405,
  0.75434461245449047,
  0.7545140659898254,
  0.75559960136854987,
  0.75729432738685354,
  0.75836242930302278,
  0.75812070623308692,
  0.75880316748920495,
  0.76038582888149664,
  0.76123958103601785,
  0.76190585855315085,
  0.76493417101351613,
  0.76137655781873137,
  0.76218199246605778,
  0.76217179647469058,
  0.76451933665777572,
  0.76514806767523347,
  0.7629385756588708,
  0.76318353532881522,
  0.76217919982204163,
  0.76400894633552829,
  0.76344841221210524,
  0.76530484844595748,
  0.76389403675423961,
  0.76402004781623478,
  0.76178436135662986,
 

In [86]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

[5]	training's res: 0.88917
[10]	training's res: 0.911595
[15]	training's res: 0.923948
[20]	training's res: 0.9356
[25]	training's res: 0.946514
[30]	training's res: 0.959568
[35]	training's res: 0.968393
[40]	training's res: 0.975463
[45]	training's res: 0.98213
[50]	training's res: 0.987562
[55]	training's res: 0.992838
[60]	training's res: 0.995221
[65]	training's res: 0.997067
[70]	training's res: 0.99798
[75]	training's res: 0.999107
[80]	training's res: 0.999555
[85]	training's res: 1
[90]	training's res: 1
[95]	training's res: 1
[100]	training's res: 1
[105]	training's res: 1
[110]	training's res: 1
[115]	training's res: 1
[120]	training's res: 1
[125]	training's res: 1
[130]	training's res: 1
[135]	training's res: 1
[140]	training's res: 1
[145]	training's res: 1
[150]	training's res: 1
[155]	training's res: 1
[160]	training's res: 1
[165]	training's res: 1
[170]	training's res: 1
[175]	training's res: 1
[180]	training's res: 1
[185]	training's res: 1
[190]	training's res: 1
[

In [None]:
pred = model.predict(test.drop(['uid'],axis=1))
res = pd.DataFrame({'uid':test.uid,'label':pred})
res = res.sort_values(by='label',ascending=False)
res.label = res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

res.to_csv('./result.csv',index=False,header=False,sep=',',columns=['uid','label'])