In [1]:
# -*- coding: utf-8 -*-
#  Content：Porto Seguro是巴西最大的汽车与住房保险公司之一，根据汽车保单持有人的数据建立机器学习模型，分析该持有人是否会
#           在次年提出索赔。数据已进行脱敏使用GBDT+LR模型进行预测，并计算Normalized Cross Entropy.
#  Author:  HuiHui
#  Date:    2020-03-25
#  Reference:
#  DataSet: porto seguro safe driver prediction（kaggle 2017年比赛）

import numpy as np
import pandas as pd
np.random.seed(10)
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomTreesEmbedding, RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,log_loss
from sklearn.pipeline import make_pipeline

#数据加载
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')
# #数据探索
# print('查看数据信息：列名、非空个数、类型等')
# train_data.info()
# print('～'*30)
# print('查看前5条数据')
# train_data.head()

In [2]:
# 将样本集分成测试集和训练集，训练集还应分成gbdt训练集和gbdt测试集（其得到的Xgbdt向量作为lr的训练集）
X = train_data.drop('target',axis=1)
y = train_data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size=0.5)
X_test_data = test_data #最终测试集

In [3]:
# 基于GBDT监督变换
n_estimator = 10 #使用到的决策树的数量
grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd.fit(X_train, y_train)
# 得到OneHot编码
grd_enc = OneHotEncoder(categories='auto')
grd_enc.fit(grd.apply(X_train)[:, :, 0])
#print(grd_enc.transform(grd.apply(X_train)[:, :, 0]).toarray()) #打印OneHot编码

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [4]:
# 使用OneHot编码作为特征，训练LR
grd_lm = LogisticRegression(solver='lbfgs', max_iter=1000)
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)#apply(X_train_lr)返回训练数据X_train_lr在训练好的模型里每棵树中所处的叶子节点的位置（索引）
# 使用LR进行预测
y_pred_grd_lm = grd_lm.predict_proba(grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1] #predict_proba预测概率
# 计算logloss；Normalized Cross Entropy怎么计算❓
logloss=log_loss(y_test, y_pred_grd_lm)
print("logloss: " + str(logloss))

#保存最终测试集预测结果至submission.csv
y_pred= grd_lm.predict_proba(grd_enc.transform(grd.apply(X_test_data)[:, :, 0]))[:, 1]
result=pd.DataFrame(columns=("id","target"))
result["id"]=X_test_data["id"]
result["target"]=np.around(y_pred, decimals=4) # 保留4位小数
result.to_csv("./submission.csv",index=False) # index=False避免产生Unnamed:0列


logloss: 0.15479607581081992
