**Prepare the Dataset**

In [1]:
import numpy as np
import pandas as pd

# Generate a synthetic dataset for financial transactions
np.random.seed(42)

data = {
    'transaction_id': np.arange(1, 10001),
    'amount': np.random.uniform(1, 10000, 10000),
    'transaction_type': np.random.choice(['withdrawal', 'deposit', 'transfer'], 10000),
    'oldbalanceOrg': np.random.uniform(0, 10000, 10000),
    'newbalanceOrig': np.random.uniform(0, 10000, 10000),
    'oldbalanceDest': np.random.uniform(0, 10000, 10000),
    'newbalanceDest': np.random.uniform(0, 10000, 10000)
}

df = pd.DataFrame(data)

# Create a label column for anomalies (fraudulent transactions)
df['is_fraud'] = np.random.choice([0, 1], size=(10000,), p=[0.98, 0.02])  # Simulate 2% fraud cases


**Feature Engineering**

In [2]:
# Feature Engineering: creating new features
df['transaction_change'] = df['newbalanceOrig'] - df['oldbalanceOrg']
df['dest_change'] = df['newbalanceDest'] - df['oldbalanceDest']

# Convert categorical feature 'transaction_type' into dummy variables
df = pd.get_dummies(df, columns=['transaction_type'], drop_first=True)

# Drop columns that won't be used in the model
df.drop(['transaction_id'], axis=1, inplace=True)

# Split the dataset into features and labels
X = df.drop(['is_fraud'], axis=1)
y = df['is_fraud']


**Autoencoder for Anomaly Detection**

In [3]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Split the dataset into training (only non-fraud data) and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Using only non-fraudulent data for training the autoencoder
X_train_normal = X_train[y_train == 0]

# Autoencoder model
autoencoder = Sequential([
    Dense(16, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(8, activation='relu'),
    Dense(16, activation='relu'),
    Dense(X_train.shape[1], activation='sigmoid')
])

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train_normal, X_train_normal, epochs=10, batch_size=64, validation_data=(X_test, X_test))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - loss: 22258240.0000 - val_loss: 22006508.0000
Epoch 2/10
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 22163226.0000 - val_loss: 22005582.0000
Epoch 3/10
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 22293392.0000 - val_loss: 22004670.0000
Epoch 4/10
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 22121462.0000 - val_loss: 22004198.0000
Epoch 5/10
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 22061412.0000 - val_loss: 22004188.0000
Epoch 6/10
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - loss: 22201930.0000 - val_loss: 22004176.0000
Epoch 7/10
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - loss: 22308868.0000 - val_loss: 22004152.0000
Epoch 8/10
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x78dd4963efe0>

**Isolation Forest Model**

In [4]:
from sklearn.ensemble import IsolationForest

# Train Isolation Forest
isolation_forest = IsolationForest(contamination=0.02, random_state=42)
isolation_forest.fit(X_train)

# Predict anomalies
y_pred_if = isolation_forest.predict(X_test)
y_pred_if = np.where(y_pred_if == -1, 1, 0)  # Convert -1 to 1 for anomaly


**One-Class SVM Model**

In [5]:
from sklearn.svm import OneClassSVM

# Train One-Class SVM
ocsvm = OneClassSVM(kernel='rbf', gamma=0.001, nu=0.05)
ocsvm.fit(X_train_normal)

# Predict anomalies
y_pred_svm = ocsvm.predict(X_test)
y_pred_svm = np.where(y_pred_svm == -1, 1, 0)


**Risk Scoring System**

In [6]:
# Combine predictions from all models (Autoencoder, Isolation Forest, One-Class SVM)
y_pred_autoencoder = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - y_pred_autoencoder, 2), axis=1)
y_pred_ae = np.where(mse > np.percentile(mse, 98), 1, 0)

# Average the results from all models for a final risk score
risk_score = (y_pred_if + y_pred_svm + y_pred_ae) / 3.0

# Define a risk threshold (you can fine-tune this based on the distribution)
risk_threshold = 0.5
y_final_pred = np.where(risk_score > risk_threshold, 1, 0)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


**Model Evaluation**

In [7]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion Matrix and Classification Report
conf_matrix = confusion_matrix(y_test, y_final_pred)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_report(y_test, y_final_pred))


Confusion Matrix:
 [[1890   73]
 [  36    1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97      1963
           1       0.01      0.03      0.02        37

    accuracy                           0.95      2000
   macro avg       0.50      0.49      0.49      2000
weighted avg       0.96      0.95      0.95      2000

