In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import precision_recall_curve
from sklearn.utils import resample

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import kagglehub

RANDOM_SEED = 42
TEST_SIZE = 0.3

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# 載入資料集
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# 移除Time欄位，標準化Amount
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

# 檢視資料分佈
fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]
print(f'詐欺交易: {len(fraud)}, 正常交易: {len(nonfraud)}')
print(f'詐欺交易比例: {len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%')

詐欺交易: 492, 正常交易: 284315
詐欺交易比例: 0.173%


In [4]:
print("\n=== 監督式學習 ===")

X = data.drop('Class', axis=1).values
y = data['Class'].values

# 資料分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, 
                                                    random_state=RANDOM_SEED, stratify=y)

# 先過採樣到合理比例，再適度欠採樣
oversample = SMOTE(sampling_strategy=0.3, random_state=RANDOM_SEED, k_neighbors=5)  # 30%比例
undersample = RandomUnderSampler(sampling_strategy=0.3, random_state=RANDOM_SEED)   # 最終30%比例

# 使用pipeline組合
pipeline = Pipeline([('oversample', oversample), ('undersample', undersample)])
X_train_balanced, y_train_balanced = pipeline.fit_resample(X_train, y_train)

print(f"平衡後資料分佈: 正常交易:{np.sum(y_train_balanced==0)}, 詐欺交易:{np.sum(y_train_balanced==1)}")

def create_enhanced_features(X):
    """創建增強特徵"""
    X_enhanced = X.copy()
    
    # 添加統計特徵
    X_enhanced = np.column_stack([
        X_enhanced,
        np.sum(X**2, axis=1),  # 平方和
        np.mean(X, axis=1),    # 平均值
        np.std(X, axis=1),     # 標準差
        np.max(X, axis=1),     # 最大值
        np.min(X, axis=1)      # 最小值
    ])
    
    return X_enhanced

X_train_enhanced = create_enhanced_features(X_train_balanced)
X_test_enhanced = create_enhanced_features(X_test)

print(f"增強特徵後維度: {X_train_enhanced.shape}")



# 創建多個不同的模型
rf_model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=15,
    min_samples_leaf=10,
    random_state=RANDOM_SEED,
    n_jobs=-1
)

rf_model.fit(X_train_enhanced, y_train_balanced)


# 使用更高的閾值提高精確度
y_pred_proba = rf_model.predict_proba(X_test_enhanced)[:, 1]
print(f"詐欺交易預測機率分佈 - 最小值: {y_pred_proba.min():.3f}, 最大值: {y_pred_proba.max():.3f}, 平均值: {y_pred_proba.mean():.3f}")

# 動態閾值選擇
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
best_threshold_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_threshold_idx]

print(f"最佳閾值: {best_threshold:.3f}")
y_pred_improved = (y_pred_proba >= best_threshold).astype(int)

# 評估函數
def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    print(f'\n{model_name} 評估結果:')
    print('=' * 45)
    print(f' Accuracy: {accuracy:.4f}')
    print(f' Precision: {precision:.4f}')
    print(f' Recall: {recall:.4f}')
    print(f' F1 Score: {f1:.4f}')
    print(f"\n詳細分類報告:")
    print(classification_report(y_true, y_pred))
    return accuracy, precision, recall, f1

# 評估改進的監督學習結果
sup_results = evaluation(y_test, y_pred_improved, "監督學習")


=== 監督式學習 ===
平衡後資料分佈: 正常交易:199020, 詐欺交易:59706
增強特徵後維度: (258726, 34)
詐欺交易預測機率分佈 - 最小值: 0.000, 最大值: 1.000, 平均值: 0.008
最佳閾值: 0.842

監督學習 評估結果:
 Accuracy: 0.9995
 Precision: 0.9565
 Recall: 0.7432
 F1 Score: 0.8365

詳細分類報告:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.96      0.74      0.84       148

    accuracy                           1.00     85443
   macro avg       0.98      0.87      0.92     85443
weighted avg       1.00      1.00      1.00     85443



In [5]:
from pygame import init


print("\n=== 非監督式學習 ===")

# === 資料預處理 ===
scaler = RobustScaler()  # 對異常值更穩健
X_scaled = scaler.fit_transform(X)


# === 欠抽樣正常樣本 ===
# 分離正常樣本和異常樣本
normal_samples = X[y == 0]
fraud_samples = X[y == 1]

# 確認異常樣本數量
num_fraud_samples = len(fraud_samples)

# 欠抽樣正常樣本，使其與異常樣本數量一致
normal_downsampled = resample(
    normal_samples,
    replace=False,  # 不重複抽樣
    n_samples=num_fraud_samples,  # 與異常樣本數量一致
    random_state=42
)

# 合併欠抽樣的正常樣本與異常樣本
X_resampled = np.vstack([normal_downsampled, fraud_samples])
y_resampled = np.hstack([np.zeros(num_fraud_samples), np.ones(num_fraud_samples)])

# 重新分割資料集
X_train_scaled, X_test_scaled, y_train_unsup, y_test_unsup = train_test_split(
    X_resampled, y_resampled, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y_resampled)



# === 使用PCA降維來改善聚類效果 ===
pca = PCA(random_state=42)
pca.fit(X_train_scaled)
explained_variance = np.cumsum(pca.explained_variance_ratio_)
optimal_components = np.argmax(explained_variance >= 0.95) + 1
print(f"最佳主成分數目: {optimal_components}")
pca = PCA(n_components=optimal_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# === 聚類與異常檢測 ===
best_score = -1
best_k = 2
best_model = None

for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, init='random', random_state=42, n_init=100)
    kmeans.fit(X_train_pca[y_train_unsup == 0])  # 僅用正常樣本訓練
    
    # 預測測試集
    test_labels = kmeans.predict(X_test_pca)
    
    # 計算每個 cluster 的詐欺率
    cluster_fraud_rates = []
    for i in range(k):
        mask = (test_labels == i)
        if np.sum(mask) > 0:
            fraud_rate = np.sum(y_test_unsup[mask] == 1) / np.sum(mask)
            cluster_fraud_rates.append(fraud_rate)
        else:
            cluster_fraud_rates.append(0)
    
    # 選擇異常 cluster
    mean_fraud_rate = np.mean(cluster_fraud_rates)
    anomaly_clusters = [i for i, rate in enumerate(cluster_fraud_rates) if rate > mean_fraud_rate]
    y_pred_cluster = np.isin(test_labels, anomaly_clusters).astype(int)
    
    # 計算 F1 分數
    f1 = f1_score(y_test_unsup, y_pred_cluster)
    if f1 > best_score:
        best_score = f1
        best_k = k
        best_model = kmeans
        best_pred = y_pred_cluster

print(f"最佳聚類數: {best_k}")


# 評估改進的非監督學習結果
unsup_results = evaluation(y_test_unsup, best_pred, "改進的非監督學習 (KMeans + PCA)")

pygame 2.6.1 (SDL 2.28.4, Python 3.11.9)
Hello from the pygame community. https://www.pygame.org/contribute.html

=== 非監督式學習 ===
最佳主成分數目: 8
最佳聚類數: 7

改進的非監督學習 (KMeans + PCA) 評估結果:
 Accuracy: 0.8480
 Precision: 0.7910
 Recall: 0.9459
 F1 Score: 0.8615

詳細分類報告:
              precision    recall  f1-score   support

         0.0       0.93      0.75      0.83       148
         1.0       0.79      0.95      0.86       148

    accuracy                           0.85       296
   macro avg       0.86      0.85      0.85       296
weighted avg       0.86      0.85      0.85       296



In [6]:
print("\n" + "="*60)
print("結果總結:")
print("="*60)
print(f"監督學習 - F1 Score: {sup_results[3]:.4f}")
print(f"非監督學習 - F1 Score: {unsup_results[3]:.4f}")


結果總結:
監督學習 - F1 Score: 0.8365
非監督學習 - F1 Score: 0.8615
