In [23]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [24]:
new_data = pd.read_csv('./dataset/features.csv')

##### 預處理

In [25]:
# 壓縮資料
fcols = new_data.select_dtypes('float').columns
icols = new_data.select_dtypes('integer').columns
new_data[fcols] = new_data[fcols].apply(pd.to_numeric, downcast='float')
new_data[icols] = new_data[icols].apply(pd.to_numeric, downcast='integer')

new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47511 entries, 0 to 47510
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   user_id          47511 non-null  int32  
 1   merchant_id      47511 non-null  int16  
 2   label            23687 non-null  float32
 3   data             47511 non-null  object 
 4   age_range        47511 non-null  float32
 5   gender           47511 non-null  float32
 6   uniq_item_id     47511 non-null  int16  
 7   total_cat_id     47511 non-null  int8   
 8   total_time_temp  47511 non-null  int8   
 9   clicks           47511 non-null  int16  
 10  shopping_cart    47511 non-null  int8   
 11  purchases        47511 non-null  int8   
 12  favourites       47511 non-null  int8   
 13  purchases_ctr    47511 non-null  float32
dtypes: float32(4), int16(3), int32(1), int8(5), object(1)
memory usage: 1.8+ MB


In [26]:
train = new_data[new_data["data"] == "train"]
test = new_data[new_data["data"] == "test"].drop(["data", "label"], axis = 1)

In [27]:
X, y = train.drop(["user_id", "merchant_id", "label", 'data'], axis = 1), train["label"]

# 無限值替換成0
X.replace([np.inf, -np.inf], 0, inplace = True)

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2)

### 找出最佳的模型參數

In [30]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

##### 隨機森林(RandomForest)

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [32]:
# 調整參數

rf_1 = RandomForestClassifier()

# n_estimators:決策數的各數, max_depath:樹的最大深度, min_samples_split:根據屬性劃分節點時，每個劃分最少的樣本數, min_samples_leaf:葉子節點最少的樣本數數
params = { "n_estimators":[50, 100],  
          "max_depth":[1, 5, 10],
          "min_samples_split":[1, 2, 100],
          "min_samples_leaf":[1, 10, 50]
         }

# 窮舉搜索
grid_search_1 = GridSearchCV(rf_1, params, cv = 3, scoring = "roc_auc") # estimator：機械學習算法/param_grid：優化的參數和取值/cv：交叉驗證的折數/scoring：準確度的判斷標準

In [33]:
grid_search_1.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [1, 5, 10],
                         'min_samples_leaf': [1, 10, 50],
                         'min_samples_split': [1, 2, 100],
                         'n_estimators': [50, 100]},
             scoring='roc_auc')

In [34]:
display(grid_search_1.best_params_)
display(grid_search_1.best_score_)

{'max_depth': 5,
 'min_samples_leaf': 50,
 'min_samples_split': 2,
 'n_estimators': 50}

0.6325991680535511

##### 邏輯迴歸(LogisticRegression)

In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
# 調整參數

clf_1 = LogisticRegression()

# slover: 優化算法選擇參數, C:正規化係數 λ 的倒數, penalty:懲罰項
params = {  "solver":["liblinear", "saga", 'lbfgs', 'newton-cg'],
        "C":[0.01, 0.1, 1],
        "penalty":["l1", "l2"]
        }

grid_search_2 = GridSearchCV(clf_1, params, cv = 5, scoring = "roc_auc")

In [37]:
grid_search_2.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [0.01, 0.1, 1], 'penalty': ['l1', 'l2'],
                         'solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg']},
             scoring='roc_auc')

In [38]:
display(grid_search_2.best_params_)
display(grid_search_2.best_score_)

{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}

0.6246054585918518

##### XGBoost

In [39]:
from xgboost import XGBClassifier

In [40]:
# 調整參數

xgc_1 = XGBClassifier()

params = {"eta":[0.1, 0.2],
          "gamma":[1, 5, 50],
          "min_child_weight":[10, 100, 500],
          "max_depth":[1, 5, 50],
          "subsample":[0.5],
          "objective":["binary:logistic"],
          "eval_metric":  ["auc"]
         }

grid_search_3 = GridSearchCV(xgc_1, params,cv = 3,scoring = "roc_auc")

In [41]:
grid_search_3.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_ca...
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                        

In [42]:
display(grid_search_3.best_params_)
display(grid_search_3.best_score_)

{'eta': 0.1,
 'eval_metric': 'auc',
 'gamma': 1,
 'max_depth': 1,
 'min_child_weight': 10,
 'objective': 'binary:logistic',
 'subsample': 0.5}

0.6326085133804678

##### LightGBM

In [43]:
from lightgbm import LGBMClassifier

In [44]:
lgbm_1 = LGBMClassifier()
params = {    "boosting_type":["gbdt", "dart", "goss"],
          "learning_rate":[0.01, 0.05],
          "n_estimators":[100, 500],
          "num_leaves":[10, 30, 100],
          "max_depth":[50, 60, 70],
          "subsample":[0.5],
          "min_split_gain":[0.05]
         }

grid_search_4 = GridSearchCV(lgbm_1,params,cv = 3,scoring = "roc_auc")

In [45]:
grid_search_4.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=LGBMClassifier(),
             param_grid={'boosting_type': ['gbdt', 'dart', 'goss'],
                         'learning_rate': [0.01, 0.05],
                         'max_depth': [50, 60, 70], 'min_split_gain': [0.05],
                         'n_estimators': [100, 500],
                         'num_leaves': [10, 30, 100], 'subsample': [0.5]},
             scoring='roc_auc')

In [46]:
display(grid_search_4.best_params_)
display(grid_search_4.best_score_)

{'boosting_type': 'goss',
 'learning_rate': 0.05,
 'max_depth': 50,
 'min_split_gain': 0.05,
 'n_estimators': 100,
 'num_leaves': 10,
 'subsample': 0.5}

0.6279458491634625

### 參考資料

1.GridSearchCV
 
https://www.cnblogs.com/wj-1314/p/10422159.html

2.RandomForest 參數

https://blog.csdn.net/qq_16633405/article/details/61200502

3.LogisticRegression 參數

https://blog.csdn.net/jark_/article/details/78342644

4.LGBMClassifier 參數

https://blog.csdn.net/qq_41185868/article/details/109252743

5.XGBoost 參數

https://blog.csdn.net/han_xiaoyang/article/details/52665396