# In [2]:
# Challenge 2: Ensemble Learning to Improve Prediction Results
# Goal: Combine supervised and unsupervised learning to improve prediction results

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import (classification_report, accuracy_score,
                             precision_score, recall_score, f1_score)
from sklearn.cluster import KMeans
import xgboost as xgb
import kagglehub

# General settings
RANDOM_SEED = 42  # shared seed for the split and for every model in this script
TEST_SIZE = 0.3   # fraction of rows held out for evaluation

# Load dataset
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# Preprocessing
data = data.drop(['Time'], axis=1)
# NOTE: 'Amount' is deliberately NOT standardized here. Fitting a scaler on the
# full dataset before the split leaks test-set statistics into the training
# features. It is also unnecessary: `scaler` below standardizes every column
# (including 'Amount') using training-set statistics only, and standardizing
# an already affinely-rescaled column yields the same result.

X = data.drop(columns=['Class']).values
y = data['Class'].values

# Stratify on the label so train and test keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
)

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Supervised Models
# Both classifiers are fit on X_train (not on the StandardScaler output
# X_train_scaled). For each one, column 1 of predict_proba is kept as the
# per-row score on the test split.
print("Training Random Forest...")
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=RANDOM_SEED,
).fit(X_train, y_train)
rf_pred_proba = rf_model.predict_proba(X_test)[:, 1]

print("Training XGBoost...")
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    random_state=RANDOM_SEED,
    eval_metric='logloss',
).fit(X_train, y_train)
xgb_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

# Unsupervised Models
def _min_max_normalize(values):
    """Linearly rescale a NumPy array to the [0, 1] range.

    Args:
        values: Non-empty NumPy array of scores.

    Returns:
        Array of the same shape where the smallest input maps to 0 and the
        largest to 1. If every input is equal, an all-zero array is returned
        instead of dividing by zero.
    """
    lo = values.min()
    span = values.max() - lo
    if span == 0:
        # Every entry equals `lo`, so this difference is an all-zero array.
        return values - lo
    return (values - lo) / span


print("Training Isolation Forest...")
iso_forest = IsolationForest(contamination=0.002, random_state=RANDOM_SEED)
# Fit on class-0 training rows only; scores then measure deviation from them.
iso_forest.fit(X_train_scaled[y_train == 0])
iso_pred_scores = iso_forest.decision_function(X_test_scaled)
# decision_function is lower for more abnormal samples (per scikit-learn), so
# negate before rescaling: the most abnormal test row gets 1, the least gets 0.
# NOTE: the min/max come from the test scores themselves, so the resulting
# scale is relative to this particular test set.
iso_pred_proba = _min_max_normalize(-iso_pred_scores)

print("Training K-Means...")
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=RANDOM_SEED)
kmeans.fit(X_train_scaled[y_train == 0])
# One column per cluster centre: distance from each test row to that centre.
distances = kmeans.transform(X_test_scaled)
# The score is the distance to the nearest centre. It is rescaled using the
# min/max of those nearest distances; using the max over *all* centre
# distances would compress the score so that it never reaches 1.
nearest_centroid_dist = distances.min(axis=1)
kmeans_pred_proba = _min_max_normalize(nearest_centroid_dist)

# Ensemble Methods
print("\nEvaluating Ensemble Methods...")

def evaluate_model(y_true, y_pred, name):
    """Print accuracy, precision, recall, F1 and a classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        name: Label used in the printed section heading.

    Returns:
        None. All results are written to stdout.
    """
    # zero_division=0 is passed to every metric that accepts it, so a model
    # that predicts no positives (or a split without positives) is reported
    # as 0 consistently instead of warning on some metrics but not others.
    print(f"\n{name} Evaluation")
    print("=" * 50)
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.6f}")
    print(f"Precision: {precision_score(y_true, y_pred, zero_division=0):.6f}")
    print(f"Recall: {recall_score(y_true, y_pred, zero_division=0):.6f}")
    print(f"F1 Score: {f1_score(y_true, y_pred, zero_division=0):.6f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

# Per-model test scores, in the order matched by `weights` further down:
# Random Forest, XGBoost, Isolation Forest, K-Means.
member_scores = [rf_pred_proba, xgb_pred_proba, iso_pred_proba, kmeans_pred_proba]

# Simple Average: unweighted mean of the four scores, cut at 0.5.
ensemble_avg = np.mean(member_scores, axis=0)
ensemble_avg_pred = (ensemble_avg > 0.5).astype(int)
evaluate_model(y_test, ensemble_avg_pred, "Ensemble - Simple Average")

# Weighted Average: the two supervised models carry 0.4 each, the two
# unsupervised ones 0.1 each. Terms are accumulated left to right from 0.
weights = [0.4, 0.4, 0.1, 0.1]
ensemble_weighted = 0
for weight, scores in zip(weights, member_scores):
    ensemble_weighted = ensemble_weighted + weight * scores
ensemble_weighted_pred = (ensemble_weighted > 0.5).astype(int)
evaluate_model(y_test, ensemble_weighted_pred, "Ensemble - Weighted Average")

# Threshold-based Voting: each model casts a 0/1 vote (cut-off 0.5 for the
# supervised models, 0.7 for the unsupervised ones); a row is flagged when at
# least two of the four models vote 1.
rf_pred = (rf_pred_proba > 0.5).astype(int)
xgb_pred = (xgb_pred_proba > 0.5).astype(int)
iso_pred_binary = (iso_pred_proba > 0.7).astype(int)
kmeans_pred_binary = (kmeans_pred_proba > 0.7).astype(int)
vote_count = sum([rf_pred, xgb_pred, iso_pred_binary, kmeans_pred_binary])
ensemble_vote = (vote_count >= 2).astype(int)
evaluate_model(y_test, ensemble_vote, "Ensemble - Threshold Voting")

# Identify best ensemble method: highest F1 on the test split; on a tie the
# method listed first wins.
ensemble_methods = {
    "Simple Average": ensemble_avg_pred,
    "Weighted Average": ensemble_weighted_pred,
    "Threshold Voting": ensemble_vote,
}
f1_by_method = {
    method: f1_score(y_test, predictions)
    for method, predictions in ensemble_methods.items()
}
best_method = max(f1_by_method, key=f1_by_method.get)
best_score = f1_by_method[best_method]

banner = "=" * 60
print("\n" + banner)
print("CHALLENGE 2 COMPLETED")
print(f"Best Ensemble Method: {best_method}")
print(f"Best F1 Score: {best_score:.6f}")
print(banner)


# Output: ModuleNotFoundError: No module named 'numpy'