In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier  # For building the fraud detection model
from sklearn.model_selection import train_test_split  # To split data into training and testing sets
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,
    roc_curve, auc  # For evaluating the model
)

In [6]:
import matplotlib.pyplot as plt  # For visualization

In [7]:
file_path = r"C:\Users\e1ure\Jupyter Notebook Files from Laptop\Jupyter\PS_log.csv"  # Ensure this file exists in your directory

df = pd.read_csv(file_path)   #this will read the cleaned csv file that was saved and store it in df now to be used for analysis

In [8]:
# Step 2: Explore and preprocess the data
# ------------------------------------------------------------
# Let's preprocess the data to prepare it for training the model.

# Encode the 'type' column into numerical values (e.g., CASH-IN -> 0, PAYMENT -> 1, etc.)
df['type'] = df['type'].astype('category').cat.codes

In [9]:
# Fill any missing values with 0 (common for balance columns)
df.fillna(0, inplace=True)

In [10]:
# Feature Explanation:
# - 'type': Encoded transaction type
# - 'amount': Amount involved in the transaction
# - 'oldbalanceOrg', 'newbalanceOrig': Original account balances before and after the transaction
# - 'oldbalanceDest', 'newbalanceDest': Destination account balances before and after the transaction

#Step 3: Define features (X) and the target (y)
# ------------------------------------------------------------
# X: Features we use to predict fraud (independent variables)
# y: The target column ('isFraud'), indicating if a transaction is fraudulent #dependent variable
X = df[['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']]
y = df['isFraud']

In [11]:
# Step 4: Split the data into training and testing sets
# ------------------------------------------------------------
# Splitting the dataset into 70% training and 30% testing.
# This ensures the model is evaluated on unseen data for reliability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
# Step 5: Train the model
# ------------------------------------------------------------
# Using a Random Forest Classifier because:
# - It works well with imbalanced datasets
# - It handles both categorical and numerical features well
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)  # Train the model on the training data

In [13]:
# Step 6: Make predictions
# ------------------------------------------------------------
# Predict the outcomes for the test data
y_pred = model.predict(X_test)  # Predicted class labels (0 or 1)
y_prob = model.predict_proba(X_test)[:, 1]  # Predicted probabilities for the positive class (isFraud = 1)

In [14]:
# Step 7: Evaluate the model
# ------------------------------------------------------------
# Metrics are used to assess the model's performance
print("=== Evaluation Metrics ===")
accuracy = accuracy_score(y_test, y_pred)  # Overall correctness
precision = precision_score(y_test, y_pred)  # Fraction of correct fraud predictions
recall = recall_score(y_test, y_pred)  # Fraction of fraud cases detected
f1 = f1_score(y_test, y_pred)  # Harmonic mean of precision and recall
conf_matrix = confusion_matrix(y_test, y_pred)  # Breakdown of predictions (true/false positives/negatives)

=== Evaluation Metrics ===


In [15]:
# Print the metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.9997081914892503
Precision: 0.9790816326530613
Recall: 0.7880903490759753
F1 Score: 0.8732650739476678
Confusion Matrix:
[[1906310      41]
 [    516    1919]]
