In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold,GridSearchCV
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

二、特征工程

In [12]:
def get_features(data):
    # 用众数填充年龄为0的数据
    data.loc[data['用户年龄']==0,'用户年龄'] = data['用户年龄'].mode() 
    # 两个重要性比较高的特征
    data['相对网龄'] = data['用户网龄（月）']/data['用户年龄']
    data['网龄年龄差'] = data['用户年龄'] - data['用户网龄（月）']/12
    
    # 构造一些特征
    data['缴费金额能否覆盖当月账单'] = data['缴费用户最近一次缴费金额（元）']-data['用户账单当月总费用（元）']
    data['最近一次交费是否超过平均消费额'] = data['缴费用户最近一次缴费金额（元）']-data['用户近6个月平均消费值（元）']
    data['当月账单是否超过平均消费额'] = data['用户账单当月总费用（元）']-data['用户近6个月平均消费值（元）']
    
    data['通话人均花费'] = data['用户账单当月总费用（元）'] / (data['当月通话交往圈人数']+1)
    
    data['近半年账单'] = data['用户近6个月平均消费值（元）']*6 + data['用户账单当月总费用（元）']
    
    
    data['是否去过高档商场'] = data['当月是否逛过福州仓山万达'] + data['当月是否到过福州山姆会员店']
    data['是否去过高档商场'] = data['是否去过高档商场'].map(lambda x: 1 if x>=1 else 0)

        
    # 相乘组合特征
    data['是否商场_旅游'] = data['是否去过高档商场'] * data['当月是否景点游览']
    data['是否商场_体育馆'] = data['是否去过高档商场'] * data['当月是否体育场馆消费']
    data['是否商场_电影'] = data['是否去过高档商场'] * data['当月是否看电影'] 
    data['是否体育馆_旅游'] = data['当月是否体育场馆消费'] * data['当月是否景点游览']
    data['是否电影_旅游'] = data['当月是否看电影'] * data['当月是否景点游览']
    data['是否电影_体育馆'] = data['当月是否看电影'] * data['当月是否体育场馆消费']    
    
    data['是否商场_旅游_体育馆'] = data['是否去过高档商场'] * data['当月是否景点游览'] * data['当月是否体育场馆消费']
    data['是否商场_电影_体育馆'] = data['是否去过高档商场'] * data['当月是否看电影'] * data['当月是否体育场馆消费']
    data['是否商场_电影_旅游'] = data['是否去过高档商场'] * data['当月是否看电影'] * data['当月是否景点游览']
    data['是否体育馆_电影_旅游'] = data['当月是否体育场馆消费'] * data['当月是否看电影'] * data['当月是否景点游览']
    
    data['是否商场_体育馆_电影_旅游'] = data['是否去过高档商场']*data['当月是否体育场馆消费']*data['当月是否看电影']*data['当月是否景点游览']
       
    
    # 把这些连续的特征离散化
    data['交通类应用使用次数'] = data['当月飞机类应用使用次数'] + data['当月火车类应用使用次数']
    
    discrete_features = ['交通类应用使用次数','当月物流快递类应用使用次数','当月飞机类应用使用次数',
                         '当月火车类应用使用次数','当月旅游资讯类应用使用次数']
    
    def map_discrete(x):
        if x == 0:
            return 0
        elif x <= 5:
            return 1
        elif x<=15:
            return 2
        elif x <=50:
            return 3
        elif x <= 100:
            return 4
        else:
            return 5
        
    for col in discrete_features:
        data[col] = data[col].map(lambda x:map_discrete(x))
        
    return data

In [13]:
# 对数据的基本的处理（预处理）
def base_process(data):
    transform_features = ['相对网龄','网龄年龄差','用户年龄','用户网龄（月）','当月通话交往圈人数',
        '最近一次交费是否超过平均消费额','近三个月月均商场出现次数','当月网购类应用使用次数',
        '当月物流快递类应用使用次数','当月账单是否超过平均消费额','当月金融理财类应用使用总次数',
        '当月视频播放类应用使用次数','当月飞机类应用使用次数','当月火车类应用使用次数',
        '当月旅游资讯类应用使用次数','近半年账单','通话人均花费']
    
    
    user_bill_features = [ '缴费用户最近一次缴费金额（元）', '用户近6个月平均消费值（元）',
                        '用户账单当月总费用（元）', '用户当月账户余额（元）']
    
    log_features = ['当月网购类应用使用次数','当月金融理财类应用使用总次数',
        '当月视频播放类应用使用次数']
    
    for col in transform_features + user_bill_features + log_features:
        up_limit = np.percentile(data[col].values,99.9)
        down_limit = np.percentile(data[col].values,0.1)
        data[col].loc[data[col] > up_limit] = up_limit
        data[col].loc[data[col] < down_limit] = down_limit
            
    # 对数化，x 变成 log（1+x）
    for col in user_bill_features + log_features:
        data[col] = data[col].map(lambda x : np.log1p(x))
    # loc 按索引值定位；iloc 按索引位置定位
        
    train,test = data[:50000],data[50000:]
    return train,test

In [14]:
def load_data():
    train = pd.read_csv('G:/Jupyter_program/Algorithm/DCIC/train_dataset.csv')
    label = train['信用分'].values
    del train['信用分']
    
    test = pd.read_csv('G:/Jupyter_program/Algorithm/DCIC/test_dataset.csv')
    test_id = test['用户编码'].values
    
    data = pd.concat([train,test],axis=0,ignore_index=True)
    del data['用户编码']
    
    return data,label,test_id
    

三、模型构造和调参

In [15]:
data,label,test_id = load_data()
data = get_features(data)
train,test = base_process(data)

  interpolation=interpolation)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [16]:
X_train,X_test,y_train,y_test = train_test_split(train,label,test_size=0.25,random_state=33)

In [21]:
clf1 = lgb.LGBMRegressor(boosting_type='gbdt',num_leaves=31,max_depth=-1,
                n_estimators=2000,subsample=0.8,
                subsample_freq=1,colsample_bytree=0.7,
                random_state=2019,n_jobs=-1)

In [28]:
clf2 = lgb.LGBMRegressor(boosting_type='gbdt',num_leaves=31,max_depth=-1,
                n_estimators=2000,subsample=0.8,
                subsample_freq=1,colsample_bytree=0.7,
                random_state=2018,n_jobs=-1)

In [44]:
clf3 = XGBRegressor(n_estimators=2000,silent=True,
            objective='reg:linear',booster='gbtree',n_jobs=-1,gamma=0,
            subsample=0.8,colsample_bytree=0.7,colsample_bylevel=1,
            scale_pos_weight=1,base_score=0.5,random_state=2017)

In [39]:
clf4 = XGBRegressor(max_depth=4,learning_rate=0.03,n_estimators=2000,silent=True,
            objective='reg:linear',booster='gbtree',n_jobs=-1,gamma=0,
            subsample=0.8,colsample_bytree=0.7,colsample_bylevel=1,
            scale_pos_weight=1,base_score=0.5,random_state=2016)

In [10]:
kf = StratifiedKFold(n_splits=10,random_state=2015,shuffle=False)
best_score = []
sub_list = []

In [45]:
param_test ={
   'reg_alpha' :[0.015,0.03,0.05],
   'reg_lambda':[0.8,1.0,1.2]
}

In [46]:
# grid search 寻找最优超参数
grid_search = GridSearchCV(estimator=clf3,param_grid=param_test,verbose=1,cv=5)

In [47]:
grid_search.fit(X_train,y_train,eval_metric='mae')

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed: 14.2min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=2000,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=2017,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'reg_alpha': [0.015, 0.03, 0.05], 'reg_lambda': [0.8, 1.0, 1.2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [48]:
print(grid_search.score(X_test,y_test))
print(grid_search.best_params_)

0.7906394026966757
{'reg_alpha': 0.03, 'reg_lambda': 0.8}


In [11]:
for i,(train_index,val_index) in enumerate(kf.split(train,label)):
        X_train = train.loc[train_index,:]
        y_train = label[train_index]
        X_val = train.loc[val_index,:]
        y_val = label[val_index]
        
        clf1.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_val,y_val)],
                 eval_metric='mae',early_stopping_rounds=110,verbose=200)
        pred_val1 = clf1.predict(X_val,num_iteration=clf1.best_iteration_)
        val1_mae = mean_absolute_error(y_val,np.round(pred_val1))
        pred_test1 = clf1.predict(test,num_iteration = clf1.best_iteration_)
        
        clf2.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_val,y_val)],
                 eval_metric='rmse',early_stopping_rounds=110,verbose=200)
        pred_val2 = clf2.predict(X_val,num_iteration = clf2.best_iteration_)
        val2_mae = mean_absolute_error(y_val,np.round(pred_val2))
        pred_test2 = clf2.predict(test,num_iteration = clf2.best_iteration_)
        
        clf3.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_val,y_val)],
                 eval_metric='mae',early_stopping_rounds=100,verbose=200)
        pred_val3 = clf3.predict(X_val,ntree_limit=clf3.best_ntree_limit)
        val3_mae = mean_absolute_error(y_val,np.round(pred_val3))
        pred_test3 = clf3.predict(test,ntree_limit=clf3.best_ntree_limit)
        
        clf4.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_val,y_val)],
                 eval_metric='rmse',early_stopping_rounds=100,verbose=200)
        pred_val4 = clf4.predict(X_val,ntree_limit=clf4.best_ntree_limit)
        val4_mae = mean_absolute_error(y_val,np.round(pred_val4))
        pred_test4 = clf4.predict(test,ntree_limit=clf4.best_ntree_limit)
        
        
        pred_val = np.round(pred_val1*0.25 + pred_val2*0.25 + pred_val3*0.25 + pred_val4*0.25)
        vali_mae = mean_absolute_error(y_val,pred_val)
        best_score.append(1/(1+vali_mae))
        
        pred_test = np.round(pred_test1*0.25 + pred_test2*0.25 + pred_test3*0.25 + pred_test4*0.25)
        sub_list.append(pred_test)
        
        print('Round:{:.1f},clf1 score:{:.7f},clf2 score:{:.7f},clf3 score:{:.7f},clf4 score:{:.7f},fusion score:{:.7f}\n'.
             format(i+1,1/(1+val1_mae),1/(1+val2_mae),1/(1+val3_mae),1/(1+val4_mae),1/(1+vali_mae)))



Training until validation scores don't improve for 110 rounds.
[200]	training's l2: 342.63	training's l1: 14.3288	valid_1's l2: 402.545	valid_1's l1: 15.3872
[400]	training's l2: 316.321	training's l1: 13.8047	valid_1's l2: 394.29	valid_1's l1: 15.2445
[600]	training's l2: 297.437	training's l1: 13.4037	valid_1's l2: 392.09	valid_1's l1: 15.2057
[800]	training's l2: 281.672	training's l1: 13.0533	valid_1's l2: 391.425	valid_1's l1: 15.1986
Early stopping, best iteration is:
[777]	training's l2: 283.394	training's l1: 13.0929	valid_1's l2: 391.257	valid_1's l1: 15.1953
Training until validation scores don't improve for 110 rounds.
[200]	training's rmse: 18.5263	training's l2: 343.226	valid_1's rmse: 20.0361	valid_1's l2: 401.447
[400]	training's rmse: 17.7942	training's l2: 316.632	valid_1's rmse: 19.8207	valid_1's l2: 392.859
[600]	training's rmse: 17.2547	training's l2: 297.724	valid_1's rmse: 19.7791	valid_1's l2: 391.212
[800]	training's rmse: 16.7848	training's l2: 281.731	valid_1'

Round:4.0,clf1 score:0.0626391,clf2 score:0.0626757,clf3 score:0.0625000,clf4 score:0.0626578,fusion score:0.0627311

Training until validation scores don't improve for 110 rounds.
[200]	training's l2: 344.597	training's l1: 14.3724	valid_1's l2: 375.476	valid_1's l1: 14.8332
[400]	training's l2: 318.26	training's l1: 13.8499	valid_1's l2: 370.323	valid_1's l1: 14.7471
[600]	training's l2: 299.687	training's l1: 13.4582	valid_1's l2: 369.044	valid_1's l1: 14.7301
[800]	training's l2: 283.699	training's l1: 13.1045	valid_1's l2: 368.6	valid_1's l1: 14.7193
[1000]	training's l2: 269.328	training's l1: 12.7768	valid_1's l2: 368.492	valid_1's l1: 14.72
Early stopping, best iteration is:
[916]	training's l2: 275.13	training's l1: 12.91	valid_1's l2: 368.347	valid_1's l1: 14.7158
Training until validation scores don't improve for 110 rounds.
[200]	training's rmse: 18.5497	training's l2: 344.09	valid_1's rmse: 19.4009	valid_1's l2: 376.395
[400]	training's rmse: 17.8228	training's l2: 317.654

[400]	training's l2: 319.243	training's l1: 13.8683	valid_1's l2: 362.661	valid_1's l1: 14.6599
[600]	training's l2: 299.76	training's l1: 13.4664	valid_1's l2: 362.043	valid_1's l1: 14.6386
[800]	training's l2: 283.536	training's l1: 13.1109	valid_1's l2: 361.889	valid_1's l1: 14.6395
[1000]	training's l2: 268.99	training's l1: 12.7777	valid_1's l2: 361.383	valid_1's l1: 14.6236
Early stopping, best iteration is:
[1045]	training's l2: 266.052	training's l1: 12.7084	valid_1's l2: 361.123	valid_1's l1: 14.6162
Training until validation scores don't improve for 110 rounds.
[200]	training's rmse: 18.5847	training's l2: 345.392	valid_1's rmse: 19.1078	valid_1's l2: 365.107
[400]	training's rmse: 17.8425	training's l2: 318.355	valid_1's rmse: 19.0197	valid_1's l2: 361.749
Early stopping, best iteration is:
[439]	training's rmse: 17.7276	training's l2: 314.267	valid_1's rmse: 19.0094	valid_1's l2: 361.357
[0]	validation_0-mae:586.622	validation_1-mae:587.3
Multiple eval metrics have been pas

In [23]:
pred_test = np.mean(np.array(sub_list[2:]),axis=0)

In [24]:
pred_test = pred_test.astype('int64')
result = pd.Series(pred_test,name='score')
submission = pd.concat([pd.Series(test_id,name='id'),result],axis=1,)

In [25]:
submission.to_csv('G:/Jupyter_program/Algorithm/DCIC/submit_v2.csv',index=False)

In [26]:
importance_features = []
for i in range(0,55):
    importance_features.append((clf1.feature_importances_[i],X_train.columns[i]))


In [27]:
importance_features.sort()

In [28]:
importance_features

[(1, '是否去过高档商场'),
 (2, '当月是否到过福州山姆会员店'),
 (4, '当月是否逛过福州仓山万达'),
 (4, '是否商场_电影'),
 (5, '是否大学生客户'),
 (6, '是否商场_体育馆'),
 (7, '当月飞机类应用使用次数'),
 (9, '是否商场_旅游'),
 (11, '是否黑名单客户'),
 (11, '用户实名制是否通过核实'),
 (12, '是否商场_旅游_体育馆'),
 (16, '是否商场_电影_体育馆'),
 (18, '是否商场_电影_旅游'),
 (19, '当月火车类应用使用次数'),
 (22, '当月是否看电影'),
 (22, '是否体育馆_旅游'),
 (23, '商场_旅游'),
 (23, '当月是否体育场馆消费'),
 (24, '当月物流快递类应用使用次数'),
 (25, '商场_体育馆'),
 (26, '是否体育馆_电影_旅游'),
 (27, '商场_电影'),
 (29, '体育馆_旅游'),
 (31, '是否电影_体育馆'),
 (31, '是否经常逛商场的人'),
 (31, '电影_体育馆'),
 (34, '当月是否景点游览'),
 (35, '电影_旅游'),
 (36, '是否电影_旅游'),
 (44, '用户最近一次缴费距今时长（月）'),
 (60, '是否不良记录'),
 (94, '缴费用户当前是否欠费缴费'),
 (118, '是否4G不健康客户'),
 (239, '当月旅游资讯类应用使用次数'),
 (271, '缴费用户最近一次缴费金额（元）'),
 (303, '用户话费敏感度'),
 (341, '用户当月账户余额（元）'),
 (495, '是否积极缴费'),
 (496, '当月余额是否够用'),
 (535, '近三个月月均商场出现次数'),
 (568, '最近一次交费是否超过平均消费额'),
 (577, '是否有可能超额消费'),
 (638, '缴费金额能否覆盖当月账单'),
 (700, '当月金融理财类应用使用总次数'),
 (701, '当月网购类应用使用次数'),
 (759, '用户账单当月总费用（元）'),
 (817, '近半年账单'),
 (863, '当月视频播放类应用使用次数'),
 (960, '网龄年

In [21]:
len(importance_features)

55