In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.semi_supervised import LabelSpreading
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import kagglehub

# General settings
RANDOM_SEED = 42
TEST_SIZE = 0.3

# Load dataset
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# Prepare data
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

# Print class distribution
fraud = data[data['Class'] == 1]
nonfraud = data[data['Class'] == 0]
print(f'Fraudulent: {len(fraud)}, Non-fraudulent: {len(nonfraud)}')
print(f'The positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} '
      f'({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')

# Extract features and labels
X = np.asarray(data.drop(columns=['Class']))
y = np.asarray(data['Class'])

# Split data
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y)
fraud_idx = np.where(y_train_full == 1)[0]
nonfraud_idx = np.where(y_train_full == 0)[0]
n_labeled_normals = 500
np.random.seed(RANDOM_SEED)
labeled_normals_idx = np.random.choice(nonfraud_idx, size=n_labeled_normals, replace=False)
labeled_idx = np.concatenate([fraud_idx, labeled_normals_idx])
unlabeled_idx = np.setdiff1d(nonfraud_idx, labeled_normals_idx)
y_semi = np.full_like(y_train_full, -1)
y_semi[labeled_idx] = y_train_full[labeled_idx]
ls_model = LabelSpreading(kernel='knn', n_neighbors=10, alpha=0.1)
ls_model.fit(X_train_full, y_semi)
label_distributions = ls_model.label_distributions_
confidence = label_distributions.max(axis=1)
high_confidence_mask = confidence >= 0.9
X_filtered = X_train_full[high_confidence_mask]
y_filtered = ls_model.transduction_[high_confidence_mask]
rf = RandomForestClassifier(class_weight='balanced', random_state=RANDOM_SEED)
rf.fit(X_filtered, y_filtered)
y_proba = rf.predict_proba(X_test_scaled)[:, 1]
threshold = 0.8
y_pred = (y_proba > threshold).astype(int)

# Define evaluation function
def evaluation(y_true, y_pred, model_name="Model"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print('         Accuracy:', accuracy)
    print('  Precision Score:', precision)
    print('     Recall Score:', recall)
    print('         F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

evaluation(y_test, y_pred)

Fraudulent: 492, Non-fraudulent: 284315
The positive class (frauds) percentage: 492/284807 (0.173%)

Model Evaluation:
         Accuracy: 0.9987594068560327
  Precision Score: 0.7058823529411765
     Recall Score: 0.4864864864864865
         F1 Score: 0.576

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.71      0.49      0.58       148

    accuracy                           1.00     85443
   macro avg       0.85      0.74      0.79     85443
weighted avg       1.00      1.00      1.00     85443

