In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# 二、特征工程

In [2]:
def load_data():
    train = pd.read_csv('F:/Jupyter_Program/Algorithm/DCIC/train_dataset.csv')
    
    label = train['信用分'].values
    del train['信用分']
    
    test = pd.read_csv('F:/Jupyter_Program/Algorithm/DCIC/test_dataset.csv')
    test_id = test['用户编码'].values
    
    data = pd.concat([train,test], axis=0, ignore_index=True)
    del data['用户编码']
    
    return data, label, test_id
    

In [3]:
def get_features(data):
    """
    data：both training and testing data
    """
    # 用众数填充年龄为0的数据
    data.loc[data['用户年龄']==0, '用户年龄'] = data['用户年龄'].mode() 
    
    # 两个重要性比较高的特征
    data['用户网龄（年）'] = data['用户网龄（月）']/12
    data['相对网龄'] = data['用户年龄']/(data['用户网龄（年）']+1)
    data['网龄年龄差'] = data['用户年龄'] - data['用户网龄（年）']
    
    # 构造费用相关的一些特征，衡量消费积极性，强特
    data['缴费金额能否覆盖当月账单'] = data['缴费用户最近一次缴费金额（元）'] - data['用户账单当月总费用（元）']
    data['最近一次交费是否超过平均消费额'] = data['缴费用户最近一次缴费金额（元）'] - data['用户近6个月平均消费值（元）']
    data['当月账单是否超过平均消费额'] = data['用户账单当月总费用（元）'] - data['用户近6个月平均消费值（元）']
     
    data['近半年账单'] = data['用户近6个月平均消费值（元）']*6 + data['用户账单当月总费用（元）']
    data['通话人均花费'] = data['用户账单当月总费用（元）'] / (data['当月通话交往圈人数']+1)
    
    # 根据缴费金额是否整数，判断缴费方式
    def ways(x):
        if x==0:
            return -1
        elif x%10==0:
            return 1
        else:
            return 0
    data['缴费方式'] = data['缴费用户最近一次缴费金额（元）'].map(lambda x:ways(x))
    data = pd.get_dummies(data, columns=['缴费方式'])
    
    # 参考开源，新加的
    data['话费稳定性'] = data['用户账单当月总费用（元）']/(data['用户近6个月平均消费值（元）']+1)
    data['余额稳定性'] = data['用户账单当月总费用（元）']/(data['用户当月账户余额（元）']+1)
    
        
    data['次数'] = data['当月网购类应用使用次数'] + data['当月物流快递类应用使用次数']+data['当月金融理财类应用使用总次数'] + data['当月视频播放类应用使用次数']+ data['当月飞机类应用使用次数'] + data['当月火车类应用使用次数'] + data['当月旅游资讯类应用使用次数']
    
    
    # 赛事方微信公众号透露的信息
    data['网龄十年'] =data['用户网龄（月）'].map(lambda x: 1 if x>=10 else 0)
         
    data['是否去过高档商场'] = data['当月是否逛过福州仓山万达'] + data['当月是否到过福州山姆会员店']
    data['是否去过高档商场'] = data['是否去过高档商场'].map(lambda x: 1 if x>=1 else 0)

    # 相乘组合特征
    data['是否商场_旅游'] = data['是否去过高档商场'] * data['当月是否景点游览']
    data['是否商场_体育馆'] = data['是否去过高档商场'] * data['当月是否体育场馆消费']
    data['是否商场_电影'] = data['是否去过高档商场'] * data['当月是否看电影'] 
    data['是否体育馆_旅游'] = data['当月是否体育场馆消费'] * data['当月是否景点游览']
    data['是否电影_旅游'] = data['当月是否看电影'] * data['当月是否景点游览']
    data['是否电影_体育馆'] = data['当月是否看电影'] * data['当月是否体育场馆消费']    
    
    data['是否商场_旅游_体育馆'] = data['是否去过高档商场'] * data['当月是否景点游览'] * data['当月是否体育场馆消费']
    data['是否商场_电影_体育馆'] = data['是否去过高档商场'] * data['当月是否看电影'] * data['当月是否体育场馆消费']
    data['是否商场_电影_旅游'] = data['是否去过高档商场'] * data['当月是否看电影'] * data['当月是否景点游览']
    data['是否体育馆_电影_旅游'] = data['当月是否体育场馆消费'] * data['当月是否看电影'] * data['当月是否景点游览']
    
    data['是否商场_体育馆_电影_旅游'] = data['是否去过高档商场'] * data['当月是否体育场馆消费'] * data['当月是否看电影'] * data['当月是否景点游览']
    
    
    # 把这些连续的特征离散化
    data['交通类应用使用次数'] = data['当月飞机类应用使用次数'] + data['当月火车类应用使用次数']
    
    discrete_features = ['交通类应用使用次数','当月物流快递类应用使用次数','当月飞机类应用使用次数',
                         '当月火车类应用使用次数','当月旅游资讯类应用使用次数']
    
    def map_discrete(x):
        if x == 0:
            return 0
        elif x <= 5:
            return 1
        elif x<=15:
            return 2
        elif x <=50:
            return 3
        elif x<=100:
            return 4
        else:
            return 5
        
    for col in discrete_features:
        data[col] = data[col].map(lambda x:map_discrete(x))
        
    
    return data

In [4]:
# 对数据的基本的处理（预处理）
def base_process(data):
    transform_features = ['相对网龄','网龄年龄差','用户年龄','用户网龄（月）','当月通话交往圈人数',
        '最近一次交费是否超过平均消费额','近三个月月均商场出现次数','当月网购类应用使用次数',
        '当月物流快递类应用使用次数','当月账单是否超过平均消费额','当月金融理财类应用使用总次数',
        '当月视频播放类应用使用次数','当月飞机类应用使用次数','当月火车类应用使用次数',
        '当月旅游资讯类应用使用次数','通话人均花费','次数']
    
    
    user_bill_features = [ '缴费用户最近一次缴费金额（元）', '用户近6个月平均消费值（元）',
                        '用户账单当月总费用（元）', '用户当月账户余额（元）']
    
    log_features = ['当月网购类应用使用次数','当月金融理财类应用使用总次数',
        '当月视频播放类应用使用次数','次数']
    
    for col in transform_features + user_bill_features + log_features:
        up_limit = np.percentile(data[col].values,99.9)
        down_limit = np.percentile(data[col].values,0.1)
        data[col].loc[data[col] > up_limit] = up_limit
        data[col].loc[data[col] < down_limit] = down_limit
            
    # 平滑数据，对数化，x 变成 log（1+x）
    for col in user_bill_features+ log_features:
        data[col] = data[col].map(lambda x : np.log1p(x))
    # loc 按索引值定位；iloc 按索引位置定位
        
    train, test = data[:50000], data[50000:]
    return train, test

# 三、模型构造和调参

In [5]:
data, label, test_id = load_data()
data = get_features(data)
train, test = base_process(data)

  interpolation=interpolation)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [6]:
clf1 = lgb.LGBMRegressor(boosting_type='gbdt', num_leaves=20, max_depth=-1,
                learning_rate=0.015,n_estimators=5000,subsample=0.8,objective='mae',
                subsample_freq=1,colsample_bytree=0.7,reg_alpha=2.2,reg_lambda=1.2,
                random_state=2019,n_jobs=-1)

In [7]:
clf2 = lgb.LGBMRegressor(boosting_type='gbdt',num_leaves=20,max_depth=-1,
                learning_rate=0.015,n_estimators=5000,subsample=0.8,objective='rmse',
                subsample_freq=1,colsample_bytree=0.7,reg_alpha=1.6,reg_lambda=1.8,
                random_state=2018,n_jobs=-1)

In [8]:
clf3 = XGBRegressor(max_depth=4,learning_rate=0.03,n_estimators=2500,silent=True,
            objective='reg:linear',booster='gbtree',n_jobs=-1,gamma=0,
            subsample=0.8,colsample_bytree=0.7,colsample_bylevel=1,reg_alpha=0.03,reg_lambda=0.8,
            scale_pos_weight=1,base_score=0.5,random_state=2017)


In [9]:
clf4 = XGBRegressor(max_depth=4,learning_rate=0.03,n_estimators=2500,silent=True,
            objective='reg:linear',booster='gbtree',n_jobs=-1,gamma=0,
            subsample=0.8,colsample_bytree=0.7,colsample_bylevel=1,reg_alpha=0.05,reg_lambda=1.3,
            scale_pos_weight=1,base_score=0.5,random_state=2016)

In [10]:
kf = StratifiedKFold(n_splits=10,random_state=2015,shuffle=False)
best_score = []
sub_list = []

In [None]:
for i,(train_index,val_index) in enumerate(kf.split(train,label)):
        X_train = train.loc[train_index,:]
        y_train = label[train_index]
        X_val = train.loc[val_index,:]
        y_val = label[val_index]
        
        clf1.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_val,y_val)],
                 eval_metric='mae',early_stopping_rounds=140,verbose=200)
        pred_val1 = clf1.predict(X_val,num_iteration=clf1.best_iteration_)
        val1_mae = mean_absolute_error(y_val,np.round(pred_val1))
        pred_test1 = clf1.predict(test,num_iteration = clf1.best_iteration_)
        
        clf2.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_val,y_val)],
                 eval_metric='rmse',early_stopping_rounds=140,verbose=200)
        pred_val2 = clf2.predict(X_val,num_iteration = clf2.best_iteration_)
        val2_mae = mean_absolute_error(y_val,np.round(pred_val2))
        pred_test2 = clf2.predict(test,num_iteration = clf2.best_iteration_)
    
       
        pred_val = np.round(pred_val1*0.5 + pred_val2*0.5)
        vali_mae = mean_absolute_error(y_val,pred_val)
        best_score.append(1/(1+vali_mae))
        
        pred_test = np.round(pred_test1*0.5 + pred_test2*0.5)
        sub_list.append(pred_test)
        
 
        print('Round:{:.1f},clf1 score:{:.7f},clf2 score:{:.7f},fusion score:{:.7f}\n'.
             format(i+1,1/(1+val1_mae),1/(1+val2_mae),1/(1+vali_mae)))



Training until validation scores don't improve for 140 rounds.
[200]	training's l1: 15.3377	valid_1's l1: 16.2645
[400]	training's l1: 14.497	valid_1's l1: 15.473
[600]	training's l1: 14.2158	valid_1's l1: 15.3331
[800]	training's l1: 14.0225	valid_1's l1: 15.2795
[1000]	training's l1: 13.8623	valid_1's l1: 15.2391
[1200]	training's l1: 13.7213	valid_1's l1: 15.2226
[1400]	training's l1: 13.5922	valid_1's l1: 15.2078
[1600]	training's l1: 13.4725	valid_1's l1: 15.1932
[1800]	training's l1: 13.3633	valid_1's l1: 15.1838
[2000]	training's l1: 13.2575	valid_1's l1: 15.1788
[2200]	training's l1: 13.1569	valid_1's l1: 15.1741
[2400]	training's l1: 13.0636	valid_1's l1: 15.1744
[2600]	training's l1: 12.9729	valid_1's l1: 15.1683
[2800]	training's l1: 12.8861	valid_1's l1: 15.1624
[3000]	training's l1: 12.8046	valid_1's l1: 15.1607
[3200]	training's l1: 12.7258	valid_1's l1: 15.1579
[3400]	training's l1: 12.6501	valid_1's l1: 15.1543
Early stopping, best iteration is:
[3445]	training's l1: 12

In [None]:
# 前两次的结果比较差，舍弃以后模型表现有提升
pred_test = np.mean(np.array(sub_list[2:]), axis=0)

In [None]:
pred_test = pred_test.astype('int64')
result = pd.Series(pred_test, name='score')
submission = pd.concat([pd.Series(test_id, name='id'), result], axis=1)

In [None]:
submission.to_csv('F:/Jupyter_Program/Algorithm/DCIC/submit_v4.csv', index=False)

In [None]:
importance_features = []
for i in range(0,44):
    importance_features.append((clf1.feature_importances_[i], X_train.columns[i]))

importance_features.sort(reverse=True)
importance_features

In [None]:
best_score

In [None]:
np.mean(best_score[2:])