In [2]:
import os
import numpy as np
import pandas as pd
from datetime import date
import datetime
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# model features
cate_var = ['Distance', 'discount_type', 'weekday', 'weekday_type']
conti_var = ['discount_rate', 'discount_total', 'discount_price',
             'user_total_coupon','user_use_coupon','user_coupon_use_ratio', 
             #'user_mean_days', 'user_coupon_mean_days',
             'send_coupon_cnt',
             'user_purch_cnt',
             'user_recieved_coupon_cnt',
             'coupon_id_recieved_coupon_cnt',
             'auser_total_cnt', 'auser_use_cnt', 'coupon_use_last_days', 'acoupon_ratio'
            ]

In [5]:
def generate_result(model, test_data, output):
    
    y_valid_pred = model.predict_proba(test_data)
    output['pred_prob'] = y_valid_pred[:, 1]
    output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
    output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
    output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x: x.strftime('%Y%m%d'))
    output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
    output.reset_index(drop=True, inplace=True)
    out = output.groupby("uid", as_index=False).mean()
    out = out[["uid", "pred_prob"]]
    out.columns = ["uid", "label"]
    print(out.shape)
    return out

def generate_feature(df, conti_var, cate_var):
    df_f = df[conti_var]
    for i in cate_var:
        tmp = pd.get_dummies(df[i], prefix=i)
        df_f = pd.concat([df_f, tmp], axis=1)
    return df_f

In [None]:
# build model for valid data

In [5]:
pretrain_received = pd.read_pickle("./pretrain_received.pkl")
valid_received = pd.read_pickle("./valid_received.pkl")

In [6]:
pretrain_df = generate_feature(pretrain_received, conti_var, cate_var)
valid_df = generate_feature(valid_received, conti_var, cate_var)
pretrain_df = pretrain_df.fillna(0)
valid_df = valid_df.fillna(0)

In [8]:
model = GradientBoostingClassifier()
model.fit(pretrain_df, pretrain_received['label'])
y_valid_pred = model.predict_proba(valid_df)

In [13]:
valid1 = valid_received.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

# Validation AUC: 0.787, Accuracy: 0.952
# Validation AUC: 0.800, Accuracy: 0.943
# Validation AUC: 0.822, Accuracy: 0.943x
# Validation AUC: 0.806, Accuracy: 0.943
# Validation AUC: 0.859, Accuracy: 0.932x
# Validation AUC: 0.812, Accuracy: 0.932
# Validation AUC: 0.829, Accuracy: 0.943
# Validation AUC: 0.837, Accuracy: 0.943x
# Validation AUC: 0.844, Accuracy: 0.944
# Validation AUC: 0.856, Accuracy: 0.944
auc_score = roc_auc_score(y_true=valid_received.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid_received.label, y_pred=y_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.856, Accuracy: 0.944


In [None]:
# build model for test data

In [3]:
train_received = pd.read_pickle("./train_received_t.pkl")
test_received = pd.read_pickle("./test_received_t.pkl")

In [6]:
train_df = generate_feature(train_received, conti_var, cate_var)
test_df = generate_feature(test_received, conti_var, cate_var)
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [7]:
# model GradientBoosting
model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=300)
model.fit(train_df, train_received['label'])
output = test_received[["User_id", "Coupon_id", "Date_received"]]
out = generate_result(model, test_df, output)
out.to_csv("add_purch_cnt_gdbt_3.csv", header=["uid", "label"], index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


(304096, 2)


In [36]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

gdbc_param_grid = {
    'n_estimators': [50, 100, 200, 300], # 樹有幾棵
    'learning_rate': [.1, .5, .01, .05]
    }
gdbc = GradientBoostingClassifier()

gdbc_random = RandomizedSearchCV(param_distributions=gdbc_param_grid,
                   estimator=gdbc,
                   scoring="roc_auc",verbose=1,
                   n_iter=50, 
                   cv=4,
                   n_jobs=-1)

In [None]:
# Fit randomized_mse to the data
gdbc_random.fit(pretrain_df, pretrain_received['label'])
print("Best parameters found: ", gdbc_random.best_params_)
print("Best accuracy found: ", gdbc_random.best_score_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 4 folds for each of 16 candidates, totalling 64 fits


In [35]:
# model xgboost
xgbc = XGBClassifier(n_estimators=200, max_depth=10)

xgbc.fit(train_df, train_received['label'])
output = test_received[["User_id", "Coupon_id", "Date_received"]]
out = generate_result(xgbc, test_df, output)
out.to_csv("xgbc_3.csv", header=["uid", "label"], index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


(304096, 2)
