In [None]:
import pandas as pd
import kagglehub
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

# Download the dataset and load it into a DataFrame
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
# Make sure the target column is stored as integers.
data['Class'] = data['Class'].astype(int)

# Train/test split: 30% held out for testing, stratified on the target so
# both partitions keep the same class proportions.
X = data.drop(columns=['Class'])
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Train the baseline model (labeled data)
baseline_clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
baseline_clf.fit(X_train, y_train)

# Use the baseline model to predict on the training set and turn the
# predicted probabilities into pseudo-labels (pseudo-labeling).
# Column 1 of predict_proba is the second entry of classes_ (class 1 when
# the labels are 0/1).
# NOTE(review): these predictions are made on X_train, the same rows the
# baseline was just fitted on and which already have true labels in y_train;
# no separate unlabeled pool appears in this cell. The pseudo-labels are
# therefore likely to mostly repeat y_train -- confirm this is the intended
# demonstration.
y_train_pred_prob = baseline_clf.predict_proba(X_train)[:, 1]
threshold = 0.5
pseudo_labels = (y_train_pred_prob >= threshold).astype(int)

# Append the pseudo-labeled copy to the training set (simple demonstration
# of a semi-supervised strategy). Every training row appears twice in
# X_semi: once paired with its true label and once with its pseudo-label.
X_semi = pd.concat([X_train, X_train])
y_semi = np.concatenate([y_train, pseudo_labels])

# Retrain the model on the augmented set (semi-supervised)
semi_clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
semi_clf.fit(X_semi, y_semi)

# Predict on the test set (semi-supervised model, threshold 0.5)
# NOTE(review): the 0.5 cut-off is repeated here as a literal rather than
# reusing `threshold`; keep the two in sync if either changes.
y_pred_semi_prob = semi_clf.predict_proba(X_test)[:, 1]
y_pred_semi = (y_pred_semi_prob >= 0.5).astype(int)

# Evaluation helper
def print_evaluation(y_true, y_pred, model_name="Model Evaluation"):
    """Print headline classification metrics followed by the full report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to score against ``y_true``.
        model_name: Heading printed on the first line of the output.

    Returns:
        None. All results are written to stdout.
    """
    # Score everything up front so that a failing scorer produces no
    # partial output.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    full_report = classification_report(y_true, y_pred)

    output_lines = [
        f"{model_name}:",
        "=============================================",
        f"        Accuracy: {accuracy}",
        f" Precision Score: {precision}",
        f"    Recall Score: {recall}",
        # The trailing newline leaves a blank line before the report.
        f"        F1 Score: {f1}\n",
        "Classification Report:",
        full_report,
    ]
    for line in output_lines:
        print(line)

# Report test-set metrics for the retrained (semi-supervised) model.
print_evaluation(y_test, y_pred_semi, model_name="Semi-Supervised Model Evaluation")


Semi-Supervised Model Evaluation:
        Accuracy: 0.9994499256814484
 Precision Score: 0.9719626168224299
    Recall Score: 0.7027027027027027
        F1 Score: 0.8156862745098039

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.97      0.70      0.82       148

    accuracy                           1.00     85443
   macro avg       0.99      0.85      0.91     85443
weighted avg       1.00      1.00      1.00     85443

