In [1]:
import pandas as pd
import numpy as np

import warnings

warnings.filterwarnings("ignore")

In [2]:
new_data = pd.read_csv("./dataset/features.csv")

##### 預處理


In [3]:
# 壓縮資料
fcols = new_data.select_dtypes("float").columns
icols = new_data.select_dtypes("integer").columns
new_data[fcols] = new_data[fcols].apply(pd.to_numeric, downcast="float")
new_data[icols] = new_data[icols].apply(pd.to_numeric, downcast="integer")

new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522341 entries, 0 to 522340
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   user_id          522341 non-null  int32  
 1   merchant_id      522341 non-null  int16  
 2   label            260864 non-null  float32
 3   data             522341 non-null  object 
 4   age_range        522341 non-null  float32
 5   gender           522341 non-null  float32
 6   uniq_item_id     522341 non-null  int16  
 7   total_cat_id     522341 non-null  int8   
 8   total_time_temp  522341 non-null  int8   
 9   clicks           522341 non-null  int16  
 10  shopping_cart    522341 non-null  int8   
 11  purchases        522341 non-null  int8   
 12  favourites       522341 non-null  int16  
 13  purchases_ctr    522341 non-null  float32
dtypes: float32(4), int16(4), int32(1), int8(4), object(1)
memory usage: 19.9+ MB


In [4]:
train = new_data[new_data["data"] == "train"]
test = new_data[new_data["data"] == "test"].drop(["data", "label"], axis=1)

In [5]:
X, y = train.drop(["user_id", "merchant_id", "label", "data"], axis=1), train["label"]

# 無限值替換成0
X.replace([np.inf, -np.inf], 0, inplace=True)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

### 找出最佳的模型參數


In [7]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

##### 隨機森林(RandomForest)


In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
# 調整參數

rf_1 = RandomForestClassifier()

# n_estimators:決策數的各數, max_depath:樹的最大深度, min_samples_split:根據屬性劃分節點時，每個劃分最少的樣本數, min_samples_leaf:葉子節點最少的樣本數數
params = {
    "n_estimators": [50, 100],
    "max_depth": [1, 5, 10],
    "min_samples_split": [1, 2, 100],
    "min_samples_leaf": [1, 10, 50],
}

# 窮舉搜索
grid_search_1 = GridSearchCV(
    rf_1, params, cv=3, scoring="roc_auc"
)  # estimator：機械學習算法/param_grid：優化的參數和取值/cv：交叉驗證的折數/scoring：準確度的判斷標準

In [10]:
grid_search_1.fit(X_train, y_train)

In [11]:
display(grid_search_1.best_params_)
display(grid_search_1.best_score_)

{'max_depth': 10,
 'min_samples_leaf': 50,
 'min_samples_split': 100,
 'n_estimators': 100}

np.float64(0.6181883449262512)

##### 邏輯迴歸(LogisticRegression)


In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
# 調整參數

clf_1 = LogisticRegression()

# slover: 優化算法選擇參數, C:正規化係數 λ 的倒數, penalty:懲罰項
params = {
    "solver": ["liblinear", "saga", "lbfgs", "newton-cg"],
    "C": [0.01, 0.1, 1],
    "penalty": ["l1", "l2"],
}

grid_search_2 = GridSearchCV(clf_1, params, cv=5, scoring="roc_auc")

In [14]:
grid_search_2.fit(X_train, y_train)

In [15]:
display(grid_search_2.best_params_)
display(grid_search_2.best_score_)

{'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}

np.float64(0.6147690206571472)

##### XGBoost


In [16]:
from xgboost import XGBClassifier

In [17]:
# 調整參數

xgc_1 = XGBClassifier()

params = {
    "eta": [0.1, 0.2],
    "gamma": [1, 5, 50],
    "min_child_weight": [10, 100, 500],
    "max_depth": [1, 5, 50],
    "subsample": [0.5],
    "objective": ["binary:logistic"],
    "eval_metric": ["auc"],
}

grid_search_3 = GridSearchCV(xgc_1, params, cv=3, scoring="roc_auc")

In [18]:
grid_search_3.fit(X_train, y_train)

In [19]:
display(grid_search_3.best_params_)
display(grid_search_3.best_score_)

{'eta': 0.1,
 'eval_metric': 'auc',
 'gamma': 5,
 'max_depth': 5,
 'min_child_weight': 100,
 'objective': 'binary:logistic',
 'subsample': 0.5}

np.float64(0.6201131970923502)

##### LightGBM


In [20]:
from lightgbm import LGBMClassifier

In [21]:
lgbm_1 = LGBMClassifier()
params = {
    "boosting_type": ["gbdt", "dart", "goss"],
    "learning_rate": [0.01, 0.05],
    "n_estimators": [100, 500],
    "num_leaves": [10, 30, 100],
    "max_depth": [50, 60, 70],
    "subsample": [0.5],
    "min_split_gain": [0.05],
}

grid_search_4 = GridSearchCV(lgbm_1, params, cv=3, scoring="roc_auc")

In [22]:
grid_search_4.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 8466, number of negative: 130661
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003074 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 692
[LightGBM] [Info] Number of data points in the train set: 139127, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.060851 -> initscore=-2.736548
[LightGBM] [Info] Start training from score -2.736548
[LightGBM] [Info] Number of positive: 8466, number of negative: 130661
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003838 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 688
[LightGBM] [Info] Number of data points in the train set: 139127, number of used features: 10
[LightGBM] [Info] 

In [23]:
display(grid_search_4.best_params_)
display(grid_search_4.best_score_)

{'boosting_type': 'goss',
 'learning_rate': 0.01,
 'max_depth': 50,
 'min_split_gain': 0.05,
 'n_estimators': 500,
 'num_leaves': 10,
 'subsample': 0.5}

np.float64(0.6201558501258555)

### 參考資料

GridSearchCV

https://www.cnblogs.com/wj-1314/p/10422159.html

RandomForest 參數

https://blog.csdn.net/qq_16633405/article/details/61200502

LogisticRegression 參數

https://blog.csdn.net/jark_/article/details/78342644

LGBMClassifier 參數

https://blog.csdn.net/qq_41185868/article/details/109252743

XGBoost 參數

https://blog.csdn.net/han_xiaoyang/article/details/52665396
