# Ex1 - 監督式學習

## 筆記

關於 XGBoost 參數（括號內為預設值）:
* eta(0.3) // learning rate(典型值0.01~0.2)
* min_child_weight(1) // 子節點所需的最小樣本權重(hessian)總和，值調大可避免overfitting，太大會underfitting
* max_depth(6) // 樹的最大深度，避免overfitting(典型值3~10)
* gamma(0) // 指定節點分裂所需的最小loss function下降值，值越大算法越保守
* subsample(1) // 每棵樹隨機採樣訓練樣本的比例，越小越不容易overfitting(典型值0.5~1)
* scale_pos_weight(1) // 正樣本的權重，用來處理樣本比例不平衡問題(常用值: 負樣本數 / 正樣本數)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
import kagglehub

# General settings. TEST_SIZE is fixed by the exercise and must not be changed.
RANDOM_SEED = 42
TEST_SIZE = 0.3

################################ Data preparation ####################################

# Load the dataset (downloaded through kagglehub).
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
# Class label: 0 = non-fraud, 1 = fraud.
data["Class"] = data["Class"].astype(int)

# Drop the "Time" column so it is not used as a feature.
data = data.drop(["Time"], axis=1)

# Count fraudulent and normal transactions (the dataset is extremely imbalanced).
fraud = data[data["Class"] == 1]
nonfraud = data[data["Class"] == 0]
n_fraud = len(fraud)
n_nonfraud = len(nonfraud)
n_total = n_fraud + n_nonfraud
print(f"Fraudulent:{n_fraud}, non-fraudulent:{n_nonfraud}")
print(
    f"the positive class (frauds) percentage: {n_fraud}/{n_total} ({n_fraud / n_total * 100:.3f}%)"
)

# Feature matrix: every column except the label, as a NumPy array.
feature_columns = [col for col in data.columns if col != "Class"]
X = data[feature_columns].to_numpy()
# Label vector.
Y = data["Class"].to_numpy()

# Split into training and test sets (same split parameters as before).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

# Standardize "Amount" AFTER the split: the scaler is fit on the training rows
# only and then applied to the test rows, so no test-set statistics leak into
# preprocessing. Note that `data["Amount"]` and `X` are left unscaled; only
# X_train / X_test carry the standardized column.
amount_col = [feature_columns.index("Amount")]
amount_scaler = StandardScaler()
X_train[:, amount_col] = amount_scaler.fit_transform(X_train[:, amount_col])
X_test[:, amount_col] = amount_scaler.transform(X_test[:, amount_col])

########################## train model ################################

# Class-imbalance weight for the positive (fraud) class. The negative/positive
# ratio is taken from the TRAINING labels only, so the test labels do not
# influence any model hyper-parameter.
n_pos_train = int(np.sum(y_train == 1))
n_neg_train = int(np.sum(y_train == 0))
if n_pos_train == 0:
    raise ValueError("y_train contains no positive (fraud) samples")

# The original script used twice the plain negative/positive ratio; that
# multiplier is kept here as an explicit, named tuning knob.
POS_WEIGHT_MULTIPLIER = 2
pos_weight = POS_WEIGHT_MULTIPLIER * n_neg_train / n_pos_train

xgb_model = XGBClassifier(
    n_estimators=50,
    max_depth=8,
    # Compensates for the heavy class imbalance.
    scale_pos_weight=pos_weight,
    eval_metric="logloss",
    random_state=RANDOM_SEED,  # 42
)

xgb_model.fit(
    X_train,
    y_train,
)

# Predicted probability of the positive (fraud) class for every test sample.
y_proba = xgb_model.predict_proba(X_test)[:, 1]

# Try several decision thresholds. Each report is preceded by a header naming
# its threshold and positive-class F1, so the printed reports can be told apart.
# NOTE(review): choosing a threshold from test-set results gives an optimistic
# estimate; a separate validation split would be needed for an unbiased choice.
for threshold in (0.5, 0.55, 0.6, 0.65, 0.7):
    y_pred_thresh = (y_proba > threshold).astype(int)
    f1_thresh = f1_score(y_test, y_pred_thresh)
    print(f"===== threshold = {threshold:.2f} (fraud F1 = {f1_thresh:.4f}) =====")
    print(classification_report(y_test, y_pred_thresh))

# F1 on both splits using the model's own predict(); a large gap between the
# two values indicates overfitting.
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

print("Train F1:", f1_train)
print("Test  F1:", f1_test)

  from .autonotebook import tqdm as notebook_tqdm


Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 492/284807 (0.173%)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.90      0.86      0.88       136

    accuracy                           1.00     85443
   macro avg       0.95      0.93      0.94     85443
weighted avg       1.00      1.00      1.00     85443

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.90      0.86      0.88       136

    accuracy                           1.00     85443
   macro avg       0.95      0.93      0.94     85443
weighted avg       1.00      1.00      1.00     85443

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.93      0.86      0.89       136

    accuracy                           1.00     85443
   macro avg       0.96      0