In [33]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import datetime as dt
from datetime import date
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.linear_model import SGDClassifier, LogisticRegression
import lightgbm as lgb

# 获取数据

In [3]:
def get_processed_data():
    dataset1 = pd.read_csv('./GenerateData1.csv')
    dataset2 = pd.read_csv('./GenerateData2.csv')
    dataset3 = pd.read_csv('./GenerateData3.csv') 
    
    dataset1.label.replace(-1, 0, inplace=True)
    dataset2.label.replace(-1, 0, inplace=True)
    
    dataset1.drop_duplicates(inplace=True)
    dataset2.drop_duplicates(inplace=True)
    dataset3.drop_duplicates(inplace=True) 
    # 按照行或列进行合并,axis=0为列索引，axis=1为行索引 因为特征处理都一样, 所以按照列索引
    dataset12 = pd.concat([dataset1, dataset2], axis=0)

    dataset12.fillna(-1, inplace=True)
#     dataset3.fillna(0, inplace=True)

    return dataset12, dataset3

In [4]:
dataset12, dataset3 = get_processed_data()

In [5]:
predict_dataset = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset12_label = dataset12.label
# 降低维度, 把没有必要的字段删除
dataset12_x = dataset12.drop(['user_id','label','coupon_id','day_gap_before','day_gap_after'],axis=1)
dataset3.fillna(-1, inplace=True)
dataset3_x = dataset3.drop(['user_id','coupon_id','date_received','day_gap_before','day_gap_after'],axis=1)

# 数据分割 

In [6]:
x_train, x_test, y_train, y_test = train_test_split(dataset12_x, dataset12_label, test_size=0.25, random_state=88)

In [7]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((328240, 52), (109414, 52), (328240,), (109414,))

# 模型训练

## 随机森林 score:0.7790

In [10]:
model = RandomForestClassifier(n_estimators=190, 
                               criterion='gini', 
                               bootstrap=True,  
                               max_depth=15, 
                               max_features=24, 
                               min_samples_leaf=5, 
                               oob_score=True, 
                               random_state=0, 
                               n_jobs=-1)

In [11]:
model.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features=24,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=190,
                       n_jobs=-1, oob_score=True, random_state=0, verbose=0,
                       warm_start=False)

In [13]:
model.score(x_test, y_test)

0.9399071416820517

In [14]:
y_predict_proba = model.predict_proba(x_test)

In [17]:
y_predict_proba[:, 1].itemsize

8

In [20]:
print("AUC",roc_auc_score(y_test,y_predict_proba[:,1]))

AUC 0.8979076720483452


In [21]:
dataset_preds = dataset3[['user_id','coupon_id','date_received']]
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("rdf_preds1.csv",index=None,header=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## GBDT score:0.7297

In [24]:
model =GradientBoostingClassifier(learning_rate=0.1,
                                 n_estimators=190,
                                 min_samples_split=5,
                                 min_samples_leaf=5,
                                 max_depth=15,
                                 random_state=0,
                                 max_features=24,)
model.fit(x_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=15,
                           max_features=24, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=5, min_samples_split=5,
                           min_weight_fraction_leaf=0.0, n_estimators=190,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [25]:
model.score(x_test, y_test)

0.9373754729742081

In [32]:
x_test.shape

(109414, 52)

In [28]:
y_predict_proba = model.predict_proba(x_test)
print("AUC准确率:", roc_auc_score(y_test,y_predict_proba[:,1]))

AUC准确率: 0.8692195079846718


In [30]:
y_predict_proba

array([[4.16554643e-01, 5.83445357e-01],
       [9.98395049e-01, 1.60495139e-03],
       [9.87593646e-01, 1.24063544e-02],
       ...,
       [9.46224733e-01, 5.37752665e-02],
       [8.65366794e-01, 1.34633206e-01],
       [9.99404371e-01, 5.95628598e-04]])

In [31]:
y_predict_proba.itemsize

8

In [29]:
dataset_preds = dataset3[['user_id','coupon_id','date_received']]
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("gbdt_preds2.csv",index=None,header=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## lightGBM  score:0.7869

In [34]:
# 1.boosting_type=‘gbdt’# 提升树的类型 gbdt,dart,goss,rf
# 2.num_leaves=32#树的最大叶子数，对比xgboost一般为2^(max_depth)
# 3.max_depth=-1#最大树的深度
# 4.learning_rate#学习率
# 5.n_estimators=10: 拟合的树的棵树，相当于训练轮数
# 6.subsample=1.0: 训练样本采样率 
# 7.colsample_bytree=1.0: 训练特征采样率 列
# 8.subsample_freq=1: 子样本频率
# 9.reg_alpha=0.0: L1正则化系数
# 10.reg_lambda=0.0: L2正则化系数
# 11.random_state=None: 随机种子数
# 12.n_jobs=-1: 并行运行多线程核心数
# 13.silent=True: 训练过程是否打印日志信息
# 14.min_split_gain=0.0: 最小分割增益
# 15.min_child_weight=0.001: 分支结点的最小权重
# 16.sub_feature: LightGBM将在每次迭代(树)中随机选择部分特性 即随机选择70%的特性

model = lgb.LGBMClassifier(
                    learning_rate = 0.01,
                    boosting_type = 'gbdt',
                    objective = 'binary',
                    metric = 'logloss',
                    max_depth = 5,
                    sub_feature = 0.7,
                    num_leaves = 3,
                    colsample_bytree = 0.7,
                    n_estimators = 5000,
                    early_stop = 50)


In [35]:
model.fit(x_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
               early_stop=50, importance_type='split', learning_rate=0.01,
               max_depth=5, metric='logloss', min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=5000,
               n_jobs=-1, num_leaves=3, objective='binary', random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, sub_feature=0.7,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
               verbose=-1)

In [36]:
model.score(x_test, y_test)

0.9350448754272762

In [37]:
y_predict_proba = model.predict_proba(x_test)

In [38]:
print("AUC准确率:", roc_auc_score(y_test,y_predict_proba[:,1]))

AUC准确率: 0.8819782907036887


In [39]:
dataset_preds = dataset3[['user_id','coupon_id','date_received']]
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("lightGBM_preds.csv",index=None,header=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## 逻辑回归 score:0.6932

In [49]:
model = LogisticRegression(max_iter=1000, n_jobs=-1, l1_ratio=0.01, random_state=22)

In [50]:
model.fit(x_train, y_train)

  "(penalty={})".format(self.penalty))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=0.01, max_iter=1000,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=22,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [51]:
model.score(x_test, y_test)

0.9256036704626465

In [52]:
y_predict_proba = model.predict_proba(x_test)

In [56]:
print("AUC准确率:", roc_auc_score(y_test,y_predict_proba[:,1]))

AUC准确率: 0.8190074583111724


In [57]:
dataset_preds = dataset3[['user_id','coupon_id','date_received']]
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("LOG_preds1.csv",index=None,header=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## 逻辑回归 SGDClassifier score:0.6119

In [58]:
#fit_intercept：是否计算偏置
model = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    fit_intercept=True,
    max_iter=100,
    shuffle=True,
    alpha = 0.01,
    l1_ratio = 0.01,
    n_jobs=1)

In [59]:
model.fit(x_train, y_train)



SGDClassifier(alpha=0.01, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.01, learning_rate='optimal', loss='log', max_iter=100,
              n_iter_no_change=5, n_jobs=1, penalty='elasticnet', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [60]:
model.score(x_test, y_test)

0.8991719524009725

In [61]:
y_predict_proba = model.predict_proba(x_test)

In [62]:
print("AUC准确率:", roc_auc_score(y_test,y_predict_proba[:,1]))

AUC准确率: 0.7653896044259063


In [63]:
dataset_preds = dataset3[['user_id','coupon_id','date_received']]
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("SGD_preds1.csv",index=None,header=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## xgboost模型 XGBClassifier score:0.7551

In [64]:
from xgboost import XGBClassifier

In [65]:
# 可以使用一个参数, 其他参数不变进行调优. 类似于随机森林模型的调优方法
model = XGBClassifier(max_depth=15, learning_rate=0.01,eta=1, gamma=0, n_jobs=-1)

In [66]:
model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [67]:
model.score(x_test, y_test)

0.939221671815307

In [68]:
y_predict_proba = model.predict_proba(x_test)

In [69]:
print("AUC准确率:", roc_auc_score(y_test,y_predict_proba[:,1]))

AUC准确率: 0.892734335681119


In [70]:
dataset_preds = dataset3[['user_id','coupon_id','date_received']]
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("XGBC_preds1.csv",index=None,header=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
