In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# load train data
uid_train = pd.read_csv('../data/train/uid_train.txt',sep='\t',header=None,names=('uid','label'))
voice_train = pd.read_csv('../data/train/voice_train.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out'),dtype={'start_time':str,'end_time':str})
sms_train = pd.read_csv('../data/train/sms_train.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','in_out'),dtype={'start_time':str})
wa_train = pd.read_csv('../data/train/wa_train.txt',sep='\t',header=None,names=('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date'),dtype={'date':str})

In [3]:
# load test data
voice_test = pd.read_csv('../data/testB/voice_test_b.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','end_time','call_type','in_out'),dtype={'start_time':str,'end_time':str})
sms_test = pd.read_csv('../data/testB/sms_test_b.txt',sep='\t',header=None,names=('uid','opp_num','opp_head','opp_len','start_time','in_out'),dtype={'start_time':str})
wa_test = pd.read_csv('../data/testB/wa_test_b.txt',sep='\t',header=None,names=('uid','wa_name','visit_cnt','visit_dura','up_flow','down_flow','wa_type','date'),dtype={'date':str})

In [4]:
uid_test = pd.DataFrame({'uid':pd.unique(wa_test['uid'])})
uid_test.to_csv('./uid_test_b.txt',index=None)

In [234]:
# reset
voice = pd.concat([voice_train,voice_test],axis=0)
sms = pd.concat([sms_train,sms_test],axis=0)
wa = pd.concat([wa_train,wa_test],axis=0)

**voice records**

In [6]:
voice_cnt = voice.groupby(['uid'])['in_out'].agg({'voice_count':'count'}).reset_index()
voice_unique_num_cnt = voice.groupby(['uid'])['opp_num'].agg({'voice_opp_num_unique_count': lambda x: len(pd.unique(x))}).reset_index()
voice_unique_head_cnt = voice.groupby(['uid'])['opp_head'].agg({'voice_opp_head_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [7]:
voice_onehot_len_cnt = voice.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('voice_opp_len_').reset_index().fillna(0)
voice_onehot_type_cnt = voice.groupby(['uid','call_type'])['uid'].count().unstack().add_prefix('voice_call_type_').reset_index().fillna(0)
voice_onehot_inout_cnt = voice.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('voice_in_out_').reset_index().fillna(0)

voice_onehot_head_cnt = voice.groupby(['uid','opp_head'])['uid'].count().unstack().add_prefix('voice_opp_head_').reset_index().fillna(0)


In [8]:
voice['start_day'] = (voice.start_time.astype(int) / 1000000).astype(int)
voice['start_hour'] = (voice.start_time.astype(int) % 1000000 / 10000).astype(int)
voice['start_minute'] = (voice.start_time.astype(int) % 10000 / 100).astype(int)
voice['start_second'] = voice.start_time.astype(int) % 100

voice['end_day'] = (voice.end_time.astype(int) / 1000000).astype(int)
voice['end_hour'] = (voice.end_time.astype(int) % 1000000 / 10000).astype(int)
voice['end_minute'] = (voice.end_time.astype(int) % 10000 / 100).astype(int)
voice['end_second'] = voice.end_time.astype(int) % 100

voice['voice_dura'] = (voice.end_day-voice.start_day)*24*60*60+(voice.end_hour-voice.start_hour)*60*60+(voice.end_minute-voice.start_minute)*60+(voice.end_second-voice.start_second)

In [9]:
voice_dura_val = voice.groupby(['uid'])['voice_dura'].agg(['std','max','min','median','mean','sum']).add_prefix('voice_dura_').reset_index()
voice_onehot_hour_cnt = voice.groupby(['uid','start_hour'])['uid'].count().unstack().add_prefix('voice_start_hour_').reset_index().fillna(0)


**sms records**

In [10]:
sms_cnt = sms.groupby(['uid'])['in_out'].agg({'sms_count':'count'}).reset_index()
sms_unique_num_cnt = sms.groupby(['uid'])['opp_num'].agg({'sms_opp_num_unique_count': lambda x: len(pd.unique(x))}).reset_index()
sms_unique_head_cnt = sms.groupby(['uid'])['opp_head'].agg({'sms_opp_head_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [11]:
sms_onehot_len_cnt = sms.groupby(['uid','opp_len'])['uid'].count().unstack().add_prefix('sms_opp_len_').reset_index().fillna(0)
sms_onehot_inout_cnt = sms.groupby(['uid','in_out'])['uid'].count().unstack().add_prefix('sms_in_out_').reset_index().fillna(0)

sms_onehot_head_cnt = sms.groupby(['uid','opp_head'])['uid'].count().unstack().add_prefix('sms_opp_head_').reset_index().fillna(0)


In [12]:
sms['start_day'] = (sms.start_time.astype(int) / 1000000).astype(int)
sms['start_hour'] = (sms.start_time.astype(int) % 1000000 / 10000).astype(int)
sms['start_minute'] = (sms.start_time.astype(int) % 10000 / 100).astype(int)
sms['start_second'] = sms.start_time.astype(int) % 100

sms_onehot_hour_cnt = sms.groupby(['uid','start_hour'])['uid'].count().unstack().add_prefix('sms_start_hour_').reset_index().fillna(0)


**wa records** 

In [235]:
# wa = wa[wa['visit_dura'] != 0]
wa_unique_name_cnt = wa.groupby(['uid'])['wa_name'].agg({'wa_name_unique_count': lambda x: len(pd.unique(x))}).reset_index()


In [236]:
wa_onehot_type_cnt = wa.groupby(['uid','wa_type'])['uid'].count().unstack().add_prefix('wa_type_').reset_index().fillna(0)


In [237]:
wa_visit_cnt_val = wa.groupby(['uid'])['visit_cnt'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_visit_cnt_').reset_index()
wa_visit_dura_val = wa.groupby(['uid'])['visit_dura'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_visit_dura_').reset_index()
wa_up_flow_val = wa.groupby(['uid'])['up_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_up_flow_').reset_index()
wa_down_flow_val = wa.groupby(['uid'])['down_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_down_flow_').reset_index()


In [238]:
wa['wa_mean_dura_per_visit'] = wa.visit_dura / wa.visit_cnt
wa_dura_per_visit_val = wa.groupby(['uid'])['wa_mean_dura_per_visit'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_dura_per_visit_').reset_index()


In [239]:
# for i in range(45):
#     up_flow = wa[wa.date == str(i+1)].groupby(['uid'])['up_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('up_in_date'+str(i+1)+'_').reset_index().fillna(0)
#     down_flow = wa[wa.date == str(i+1)].groupby(['uid'])['down_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('down_in_date'+str(i+1)+'_').reset_index().fillna(0)
#     wa_dura_per_visit_val = pd.merge(wa_dura_per_visit_val, up_flow, how='left', on='uid')
#     wa_dura_per_visit_val = pd.merge(wa_dura_per_visit_val, down_flow, how='left', on='uid')

wa['wa_mean_up_flow_per_visit'] = wa.up_flow / wa.visit_cnt
wa_up_flow_per_visit_val = wa.groupby(['uid'])['wa_mean_up_flow_per_visit'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_up_flow_per_visit_').reset_index()
wa['wa_mean_down_flow_per_visit'] = wa.down_flow / wa.visit_cnt
wa_down_flow_per_visit_val = wa.groupby(['uid'])['wa_mean_down_flow_per_visit'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_down_flow_per_visit_').reset_index()


In [240]:
wa['wa_up_flow_log'] = np.log(wa.up_flow)
wa['wa_down_flow_log'] = np.log(wa.down_flow)
wa_up_flow_log_val = wa.groupby(['uid'])['wa_up_flow_log'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_up_flow_log_').reset_index()
wa_down_flow_log_val = wa.groupby(['uid'])['wa_down_flow_log'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_down_flow_log_').reset_index()

wa_onehot_name_cnt = wa.groupby(['uid','wa_name'])['uid'].count().unstack().add_prefix('wa_name_').reset_index().fillna(0)

In [241]:
wa_dura_per_visit_val = wa_dura_per_visit_val.fillna(0)
wa_dura_per_visit_val.head(5)

Unnamed: 0,uid,wa_dura_per_visit_std,wa_dura_per_visit_max,wa_dura_per_visit_min,wa_dura_per_visit_median,wa_dura_per_visit_mean,wa_dura_per_visit_sum
0,u0001,1143.680091,9150.0,0.0,1130.0,1236.558231,1428225.0
1,u0002,1478.449461,5112.0,0.0,2084.285714,1880.592515,54537.18
2,u0003,1146.996555,8092.0,0.0,956.0,1176.050697,1372451.0
3,u0004,1387.195407,9553.0,0.0,1065.631628,1303.624458,2190089.0
4,u0005,1340.392078,8223.0,0.0,990.0,1250.471406,801552.2


In [242]:
# wa_flow_per_date = []
# for i in range(45):
#     wa_temp = wa[wa.date == str(i+1)]
#     up_temp = wa_temp.groupby(['uid'])['up_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_up_flow_per_visit_in_date'+str(i+1)+'_').reset_index()
#     down_temp = wa_temp.groupby(['uid'])['down_flow'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_down_flow_per_visit_in_date'+str(i+1)+'_').reset_index()
#     wa_flow_per_date.append(up_temp)
#     wa_flow_per_date.append(down_temp)


In [243]:
wa['wa_up_speed'] = wa.up_flow / wa.visit_dura
wa['wa_down_speed'] = wa.down_flow / wa.visit_dura
wa_up_speed_val = wa.groupby(['uid'])['wa_up_speed'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_up_speed_').reset_index()
wa_down_speed_val = wa.groupby(['uid'])['wa_down_speed'].agg(['std','max','min','median','mean','sum']).add_prefix('wa_down_speed_').reset_index()


In [247]:
def cnt(x):
    c = 0
    for i in x:
        if i == 0:
            c = c+1
    return c
wa_special_dura_cnt = wa.groupby(['uid'])['visit_dura'].agg({'wa_name_unique_count': cnt}).reset_index()

In [318]:
feature = [
    voice_cnt,
    voice_unique_num_cnt,
#     voice_unique_head_cnt,
    voice_onehot_len_cnt,
    voice_onehot_type_cnt,
    voice_onehot_inout_cnt,
    voice_dura_val,
    voice_onehot_head_cnt,
    sms_cnt,
    sms_unique_num_cnt,
#     sms_unique_head_cnt,
    sms_onehot_len_cnt,
    sms_onehot_inout_cnt,
    sms_onehot_head_cnt,
    sms_onehot_hour_cnt,
    wa_unique_name_cnt,
#     wa_onehot_name_cnt,
    wa_onehot_type_cnt,
#     wa_visit_cnt_val,
#     wa_visit_dura_val,
    wa_up_flow_val,
    wa_down_flow_val,
#     wa_up_flow_log_val,
#     wa_down_flow_log_val,
    wa_special_dura_cnt,
    wa_up_speed_val,
    wa_down_speed_val,
    wa_dura_per_visit_val
#     wa_up_flow_per_visit_val,
#     wa_down_flow_per_visit_val
]

# feature = feature + wa_flow_per_date

In [319]:
train_feature = uid_train
for feat in feature:
    train_feature=pd.merge(train_feature,feat,how='left',on='uid')

test_feature = uid_test
for feat in feature:
    test_feature=pd.merge(test_feature,feat,how='left',on='uid')

In [320]:
train_feature.to_csv('./train_feature.csv',index=None)
test_feature.to_csv('./test_feature.csv',index=None)

In [321]:
### MODEL ###

In [322]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
# import matplotlib.pyplot as plt
# import seaborn as sns

In [323]:
train = pd.read_csv('./train_feature.csv')
test = pd.read_csv('./test_feature.csv')

In [324]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [325]:
from multiprocessing import cpu_count
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
   'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 24,  # 21 24
    'learning_rate': 0.08,  # 0.08
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
    'num_threads': cpu_count() - 1
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [326]:
def evalMetric(preds,dtrain):
    label = dtrain.get_label()
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    auc = metrics.roc_auc_score(pre.label,pre.preds)
    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)
    f1 = metrics.f1_score(pre.label,pre.preds)
    res = 0.6*auc +0.4*f1
    return 'res',res,True

In [327]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

[5]	cv_agg's res: 0.816324 + 0.0111453
[10]	cv_agg's res: 0.825299 + 0.0100551
[15]	cv_agg's res: 0.833288 + 0.0128544
[20]	cv_agg's res: 0.83684 + 0.011596
[25]	cv_agg's res: 0.841583 + 0.00986277
[30]	cv_agg's res: 0.844465 + 0.0123002
[35]	cv_agg's res: 0.846857 + 0.0115614
[40]	cv_agg's res: 0.849626 + 0.0125914
[45]	cv_agg's res: 0.849252 + 0.0109619
[50]	cv_agg's res: 0.851708 + 0.00996166
[55]	cv_agg's res: 0.851503 + 0.0115371
[60]	cv_agg's res: 0.852997 + 0.0121988
[65]	cv_agg's res: 0.852881 + 0.0121278
[70]	cv_agg's res: 0.851369 + 0.0125643
[75]	cv_agg's res: 0.854461 + 0.0121691
[80]	cv_agg's res: 0.852715 + 0.00981944
[85]	cv_agg's res: 0.856088 + 0.0106066
[90]	cv_agg's res: 0.856656 + 0.00976418
[95]	cv_agg's res: 0.855764 + 0.00949648
[100]	cv_agg's res: 0.855624 + 0.00950705
[105]	cv_agg's res: 0.854895 + 0.0106798
[110]	cv_agg's res: 0.857285 + 0.00826711
[115]	cv_agg's res: 0.856723 + 0.0104178
[120]	cv_agg's res: 0.85831 + 0.00804586
[125]	cv_agg's res: 0.858975 + 

{'res-mean': [0.7595283194763395,
  0.7939788350806052,
  0.7965449088635137,
  0.8061646458232282,
  0.816323817444515,
  0.814748143147645,
  0.8190477168719127,
  0.8188209385544462,
  0.819292282255785,
  0.8252993102479506,
  0.8246882388703553,
  0.8279479605679062,
  0.8300907005809407,
  0.8327256826451027,
  0.8332882266048011,
  0.8317113796220119,
  0.8339072312773901,
  0.8342624292764883,
  0.8322276652606911,
  0.8368402425089759,
  0.8367275326076845,
  0.8379236099464394,
  0.8393355002341717,
  0.8394533571525896,
  0.8415829182219218,
  0.8410557155762324,
  0.8422433215961312,
  0.8431718114856651,
  0.8429937565253723,
  0.8444645747685525,
  0.8443388990182594,
  0.8459325381849044,
  0.8454744480990968,
  0.8471831968590268,
  0.8468566107854604,
  0.8475408160317017,
  0.8465320505941797,
  0.8485427542874895,
  0.8472700767081295,
  0.8496259590558134,
  0.8494021444174923,
  0.8491264666397901,
  0.850122191666066,
  0.8492949493244494,
  0.8492523699585682,
  

In [328]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

[5]	training's res: 0.874999
[10]	training's res: 0.88528
[15]	training's res: 0.891772
[20]	training's res: 0.902412
[25]	training's res: 0.912617
[30]	training's res: 0.920512
[35]	training's res: 0.926799
[40]	training's res: 0.931656
[45]	training's res: 0.939556
[50]	training's res: 0.943477
[55]	training's res: 0.95427
[60]	training's res: 0.959218
[65]	training's res: 0.96636
[70]	training's res: 0.971097
[75]	training's res: 0.978095
[80]	training's res: 0.983037
[85]	training's res: 0.983951
[90]	training's res: 0.986666
[95]	training's res: 0.98981
[100]	training's res: 0.992876
[105]	training's res: 0.99364
[110]	training's res: 0.995478
[115]	training's res: 0.997078
[120]	training's res: 0.997753
[125]	training's res: 0.998657
[130]	training's res: 0.999106
[135]	training's res: 0.999108
[140]	training's res: 0.999555
[145]	training's res: 0.999778
[150]	training's res: 1
[155]	training's res: 1
[160]	training's res: 1
[165]	training's res: 1
[170]	training's res: 1
[175]	

In [329]:
pred = model.predict(test.drop(['uid'],axis=1))
res = pd.DataFrame({'uid':test.uid,'label':pred})
res = res.sort_values(by='label',ascending=False)
res.label = res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

res.to_csv('./result.csv',index=False,header=False,sep=',',columns=['uid','label'])