train.csv： 训练集由Criteo 7天内的部分流量组成。每一行对应一个由Criteo提供的显示广告。为了减少数据集的大小，正(点击)和负(未点击)的例子都以不同的比例进行了抽样。示例是按时间顺序排列的<br>
test.csv: 测试集的计算方法与训练集相同，只是针对训练期之后一天的事件<br>

Label： 目标变量， 0表示未点击， 1表示点击<br>
I1-I13: 13列的数值特征， 大部分是计数特征<br>
C1-C26: 26列分类特征， 为了达到匿名的目的， 这些特征的值被哈希成了32位的表示<br>

### 数据导入与简单处理

In [1]:
"""导入包"""

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import log_loss

import gc
from scipy import sparse

import warnings
warnings.filterwarnings('ignore')

In [2]:
"""数据读取与预处理"""

# Read the raw train / test CSV files
path = '/Users/linjiaxi/Desktop/RecommendationSystem/Recommendation-System/algorithms/经典排序模型/GBDT+LR/data/'
df_train, df_test = (pd.read_csv(path + fname) for fname in ('train.csv', 'test.csv'))

# Light preprocessing:
# drop the Id column, merge test with train, fill the missing values
df_train.drop(columns=['Id'], inplace=True)
df_test.drop(columns=['Id'], inplace=True)

# Test rows carry no label; tag them with the sentinel -1
df_test['Label'] = -1

# Stack train on top of test and replace every missing value with -1
data = pd.concat([df_train, df_test]).fillna(-1)


In [3]:
# Preview the first rows of the test frame (Id dropped, Label set to -1 above)
df_test.head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,C18,C19,C20,C21,C22,C23,C24,C25,C26,Label
0,,-1,,,8020.0,26.0,6.0,0.0,80.0,,...,7119e567,1d04f4a4,b1252a9d,d5f54153,,32c7478e,a9d771cd,c9f3bea7,0a47000d,-1
1,,-1,,,17881.0,9.0,8.0,0.0,0.0,,...,51369abb,,,d4b6b7e8,,32c7478e,37821b83,,,-1
2,0.0,0,2.0,13.0,2904.0,104.0,1.0,3.0,100.0,0.0,...,bd17c3da,966f1c31,a458ea53,1d1393f4,ad3062eb,32c7478e,3fdb382b,010f6491,49d68486,-1
3,0.0,1471,51.0,4.0,1573.0,63.0,1.0,4.0,13.0,0.0,...,1f9656b8,21ddcdc9,b1252a9d,602ce342,,3a171ecb,1793a828,e8b83407,70b6702c,-1
4,0.0,16,9.0,17.0,2972.0,621.0,13.0,42.0,564.0,0.0,...,87c6f83c,,,bf8efd4c,c9d4222a,423fab69,f96a556f,,,-1


In [4]:
# Preview the first rows of the training frame (Id dropped above)
df_train.head()

Unnamed: 0,Label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,1,1.0,0,1.0,,227.0,1.0,173.0,18.0,50.0,...,3486227d,e88ffc9d,c393dc22,b1252a9d,57c90cd9,,bcdee96c,4d19a3eb,cb079c2d,456c12a0
1,1,4.0,1,1.0,2.0,27.0,2.0,4.0,2.0,2.0,...,07c540c4,92555263,,,242bb710,,3a171ecb,72c78f11,,
2,1,0.0,806,,,1752.0,142.0,2.0,0.0,50.0,...,07c540c4,25c88e42,21ddcdc9,b1252a9d,a0136dd2,,32c7478e,8fc66e78,001f3601,f37f3967
3,0,2.0,-1,42.0,14.0,302.0,38.0,25.0,38.0,90.0,...,e5ba7672,5aed7436,21ddcdc9,b1252a9d,c3abeb21,,423fab69,1793a828,e8b83407,5cef228f
4,1,0.0,57,2.0,1.0,2891.0,2.0,35.0,1.0,137.0,...,e5ba7672,642f2610,1d1eb838,b1252a9d,1640d50b,ad3062eb,423fab69,45ab94c8,2bf691b1,c84c4aec


In [5]:
"""将特征列分开处理（分为连续型特征和字符型特征）"""
# Column names of the 13 numeric features (I1..I13) and the 26 categorical
# features (C1..C26)
continuous_fea = [f'I{idx}' for idx in range(1, 14)]
category_fea = [f'C{idx}' for idx in range(1, 27)]

### 建模

1. 逻辑回归模型：连续特征需要归一化处理，离散特征需要one-hot处理
2. GBDT模型：树模型连续特征不需要归一化处理，但是离散特征需要one-hot处理
3. LR+GBDT模型：GBDT负责对原始特征做处理与交叉，因此输入GBDT前只需要对离散特征做one-hot处理；GBDT输出的叶子节点编号经one-hot后，再与原特征拼接作为LR的输入（下面的实现中，连续特征在输入LR前同样做了归一化）

#### 逻辑回归模型

In [8]:
def lr_model(data, category_fea, continuous_fea):
    """Train a logistic-regression click baseline and print its log loss.

    Categorical columns are one-hot encoded and continuous columns are
    min-max scaled over the whole frame (labelled and unlabelled rows
    together). Rows whose ``Label`` equals -1 form the unlabelled test set;
    the remaining rows are split 80/20 into training and validation parts.

    Args:
        data: DataFrame with a ``Label`` column plus every column named in
            ``category_fea`` and ``continuous_fea``. It is not modified.
        category_fea: Names of the categorical columns to one-hot encode.
        continuous_fea: Names of the numeric columns to min-max scale.

    Returns:
        None. The training / validation log loss and the first ten predicted
        click probabilities for the test rows are printed.
    """
    category_fea = list(category_fea)
    continuous_fea = list(continuous_fea)

    # One-hot encode all categorical columns in one call. The untouched
    # columns keep their order and the dummy columns are appended per source
    # column, which matches encoding them one by one. get_dummies builds a new
    # frame, so the steps below never write into the caller's DataFrame.
    data = pd.get_dummies(data, columns=category_fea)

    # Min-max scale the continuous columns. The scaler works column by column,
    # so a single call gives the same values as scaling each column on its own.
    if continuous_fea:
        data[continuous_fea] = MinMaxScaler().fit_transform(data[continuous_fea].to_numpy())

    # Separate labelled (train) rows from the unlabelled (test) rows. A plain
    # boolean array is used so that row selection is purely positional.
    is_test = data['Label'].eq(-1).to_numpy()
    target = data.loc[~is_test, 'Label']
    features = data.drop(columns=['Label'])
    train, test = features[~is_test], features[is_test]

    # Hold out 20% of the labelled rows for validation
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=2020)

    # Build and fit the model
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    tr_logloss = log_loss(y_train, lr.predict_proba(x_train)[:, 1])
    val_logloss = log_loss(y_val, lr.predict_proba(x_val)[:, 1])
    print('tr_logloss: ', tr_logloss)
    print('val_logloss: ', val_logloss)

    # predict_proba returns an (n_samples, n_classes) matrix; column 1 is the
    # probability of the positive class, i.e. a click.
    y_pred = lr.predict_proba(test)[:, 1]
    print('predict: ', y_pred[:10])  # click probability of the first ten test rows

In [9]:
# Train and predict on a copy, so the shared `data` frame stays available,
# unchanged, for the other models.
lr_model(data.copy(), category_fea, continuous_fea)

tr_logloss:  0.1242339516477483
val_logloss:  0.44407245698825887
predict:  [0.44783059 0.80628705 0.1756691  0.02070154 0.13984202 0.46490042
 0.43386417 0.07089967 0.07121148 0.27896238]


#### GBDT建模

In [24]:
def gbdt_model(data, category_fea, continuous_fea):
    """Train a LightGBM GBDT click model and print its log loss.

    Categorical columns are one-hot encoded over the whole frame; continuous
    columns are fed to the trees as they are. Rows whose ``Label`` equals -1
    form the unlabelled test set; the remaining rows are split 80/20 into
    training and validation parts, and the validation part drives early
    stopping.

    Args:
        data: DataFrame with a ``Label`` column plus every column named in
            ``category_fea``. It is not modified.
        category_fea: Names of the categorical columns to one-hot encode.
        continuous_fea: Unused here; accepted so that all model functions
            share the same signature.

    Returns:
        None. The training / validation log loss and the first ten predicted
        click probabilities for the test rows are printed.
    """
    # One-hot encode all categorical columns in one call. The result is a new
    # frame with the same column order as encoding them one by one, and the
    # caller's DataFrame is left untouched.
    data = pd.get_dummies(data, columns=list(category_fea))

    # Separate labelled (train) rows from the unlabelled (test) rows. A plain
    # boolean array is used so that row selection is purely positional.
    is_test = data['Label'].eq(-1).to_numpy()
    target = data.loc[~is_test, 'Label']
    features = data.drop(columns=['Label'])
    train, test = features[~is_test], features[is_test]

    # Hold out 20% of the labelled rows for validation
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=2020)

    # Build the model
    gbm = lgb.LGBMClassifier(boosting_type='gbdt',  # gradient-boosted decision trees
                             objective='binary',
                             subsample=0.8,
                             min_child_weight=0.5,
                             colsample_bytree=0.7,
                             num_leaves=100,
                             max_depth=12,
                             learning_rate=0.01,
                             n_estimators=10000,
                             early_stopping_rounds=100
                            )
    gbm.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            eval_names=['train', 'val'],
            eval_metric='binary_logloss'
           )

    # log loss: -(y*log(p) + (1-y)*log(1-p))
    tr_logloss = log_loss(y_train, gbm.predict_proba(x_train)[:, 1])
    val_logloss = log_loss(y_val, gbm.predict_proba(x_val)[:, 1])
    print('tr_logloss: ', tr_logloss)
    print('val_logloss: ', val_logloss)

    # predict_proba returns an (n_samples, n_classes) matrix; column 1 is the
    # probability of the positive class, i.e. a click.
    y_pred = gbm.predict_proba(test)[:, 1]
    print('predict: ', y_pred[:10])  # click probability of the first ten test rows

In [25]:
# Train and predict on a copy, so the shared `data` frame stays available,
# unchanged, for the other models.
gbdt_model(data.copy(), category_fea, continuous_fea)

[LightGBM] [Info] Number of positive: 281, number of negative: 998
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063043 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1632
[LightGBM] [Info] Number of data points in the train set: 1279, number of used features: 149
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.219703 -> initscore=-1.267399
[LightGBM] [Info] Start training from score -1.267399
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[147]	train's binary_logloss: 0.330627	val's binary_logloss: 0.434013
tr_logloss:  0.3306267544890511
val_logloss:  0.4340127102761729
predict:  [0.39638553 0.34273839 0.23514265 0.14554212 0.16483957 0.36111202
 0.16796998 0.10459984 0.0929376  0.30083677]


#### LR+GBDT建模

In [28]:
def gbdt_lr_model(data, category_feature, continuous_feature):
    """Train the two-stage GBDT + LR click model and print its log loss.

    Stage 1 fits a LightGBM classifier on the one-hot encoded frame. Stage 2
    looks up the leaf index each sample reaches in every tree, one-hot encodes
    those indices, appends them to the stage-1 features (with the continuous
    columns min-max scaled) and fits a logistic regression on the result.
    Rows whose ``Label`` equals -1 form the unlabelled test set.

    Args:
        data: DataFrame with a ``Label`` column plus every column named in
            ``category_feature`` and ``continuous_feature``. It is not
            modified.
        category_feature: Names of the categorical columns to one-hot encode.
        continuous_feature: Names of the numeric columns to min-max scale
            before the logistic regression.

    Returns:
        None. The LR training / validation log loss and the first ten
        predicted click probabilities for the test rows are printed.
    """
    category_feature = list(category_feature)
    continuous_feature = list(continuous_feature)

    # One-hot encode all categorical columns in one call. The result is a new
    # frame with the same column order as encoding them one by one, and the
    # caller's DataFrame is left untouched.
    data = pd.get_dummies(data, columns=category_feature)

    # Separate labelled (train) rows from the unlabelled (test) rows. A plain
    # boolean array is used so that row selection is purely positional.
    is_test = data['Label'].eq(-1).to_numpy()
    target = data.loc[~is_test, 'Label']
    features = data.drop(columns=['Label'])
    train, test = features[~is_test], features[is_test]

    # Hold out 20% of the labelled rows to drive GBDT early stopping
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=2020)

    # ---- Stage 1: GBDT ----
    gbm = lgb.LGBMClassifier(
        objective='binary',
        subsample=0.8,
        min_child_weight=0.5,
        colsample_bytree=0.7,
        num_leaves=100,
        max_depth=12,
        learning_rate=0.01,
        n_estimators=1000,
        early_stopping_rounds=100
    )

    gbm.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            eval_names=['train', 'val'],
            eval_metric='binary_logloss'
            )

    model = gbm.booster_

    # pred_leaf=True returns, per sample, the index of the leaf it falls into
    # in each tree (one column per tree).
    gbdt_feats_train = model.predict(train, pred_leaf=True)
    gbdt_feats_test = model.predict(test, pred_leaf=True)
    gbdt_feats_name = ['gbdt_leaf_' + str(i) for i in range(gbdt_feats_train.shape[1])]

    # Reuse the row labels of train / test for the leaf frames: the
    # column-wise concat below aligns on the index, and a default RangeIndex
    # would only line up when the incoming rows happen to be labelled 0..n-1.
    df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train, columns=gbdt_feats_name, index=train.index)
    df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test, columns=gbdt_feats_name, index=test.index)

    train = pd.concat([train, df_train_gbdt_feats], axis=1)
    test = pd.concat([test, df_test_gbdt_feats], axis=1)
    train_len = train.shape[0]
    # Re-stack train and test so scaling and leaf encoding see the same columns
    data = pd.concat([train, test])
    del train
    del test
    gc.collect()

    # ---- Stage 2: LR on original features + one-hot leaf indices ----
    # Min-max scale the continuous columns. The scaler works column by column,
    # so a single call gives the same values as scaling each column on its own.
    if continuous_feature:
        data[continuous_feature] = MinMaxScaler().fit_transform(data[continuous_feature].to_numpy())

    # One-hot encode the leaf index of every tree
    data = pd.get_dummies(data, columns=gbdt_feats_name)

    # Split back by position: the first train_len rows are the labelled rows
    train = data[: train_len]
    test = data[train_len:]
    del data
    gc.collect()

    # NOTE(review): this split (30%, seed 2018) differs from the one used to
    # fit the GBDT (20%, seed 2020), so rows in this validation part may have
    # been seen by the GBDT during training and val-logloss may be optimistic.
    # Kept as is so the reported numbers stay comparable - confirm intent.
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.3, random_state=2018)

    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    tr_logloss = log_loss(y_train, lr.predict_proba(x_train)[:, 1])
    print('tr-logloss: ', tr_logloss)
    val_logloss = log_loss(y_val, lr.predict_proba(x_val)[:, 1])
    print('val-logloss: ', val_logloss)
    # Column 1 of predict_proba is the probability of the positive class (click)
    y_pred = lr.predict_proba(test)[:, 1]
    print(y_pred[:10])

In [29]:
# Train and predict on a copy, so the shared `data` frame stays unchanged.
gbdt_lr_model(data.copy(),category_fea, continuous_fea)

[LightGBM] [Info] Number of positive: 281, number of negative: 998
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007633 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1632
[LightGBM] [Info] Number of data points in the train set: 1279, number of used features: 149
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.219703 -> initscore=-1.267399
[LightGBM] [Info] Start training from score -1.267399
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[147]	train's binary_logloss: 0.330627	val's binary_logloss: 0.434013
tr-logloss:  0.012043875601048587
val-logloss:  0.30527705372305114
[9.42504639e-01 2.76597656e-01 2.27699953e-02 1.86025096e-02
 4.45934326e-03 6.48474724e-01 6.80003741e-03 3.57831915e-03
 4.16062509e-04 5.44962925e-02]
