In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
)
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Dataset

In [2]:
# Load the raw transactions into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
data = pd.read_csv('Data/Fraud.csv')

In [None]:
# Show the dtype pandas inferred for each column of the raw data.
data.dtypes

In [None]:
# Preview the first rows of the raw data (DataFrame.head shows 5 rows by default).
data.head()

In [None]:
# Print the per-column summary of the raw data (non-null counts, dtypes, memory usage).
data.info()

In [None]:
# Report the (rows, columns) shape of the raw data.
print("Shape of the dataset:", data.shape)

In [None]:
# Count the missing values in each column of the raw data.
data.isnull().sum()

In [None]:
# Summary statistics, transposed so that each described column becomes one row.
data.describe().transpose()

In [None]:
# Class balance of the full dataset: one bar per 'isFraud' value, each
# annotated with its share of all transactions.
plt.figure(figsize=(10, 6))
ax = sns.countplot(
    data=data,
    x='isFraud',
    palette=['#00C853', '#D50000'],  # green = not fraud, red = fraud
    alpha=0.85,
)
ax.set_title('Fraud vs Non-Fraud Transactions', fontsize=16, fontweight='bold')
ax.set_xlabel('Fraudulent Transaction', fontsize=14)
ax.set_ylabel('Transaction Count', fontsize=14)
plt.xticks(ticks=[0, 1], labels=['Not Fraud (0)', 'Fraud (1)'], fontsize=12)
plt.yticks(fontsize=12)

# Percentage label just above each bar.
total = len(data)
label_gap = total * 0.001  # vertical offset between the bar top and its label
for patch in ax.patches:
    height = patch.get_height()
    ax.text(
        patch.get_x() + patch.get_width() / 2,
        height + label_gap,
        f'{(height / total) * 100:.2f}%',
        ha='center', fontsize=12, color='black', fontweight='bold',
    )

plt.tight_layout()
plt.show()


In [10]:
# Reduce the dataset to 100,000 rows with a stratified sample, so the
# fraud / non-fraud ratio of the full data is preserved.
# Only the sampled part is kept: indexing the result (instead of unpacking the
# remainder into `_`) avoids holding a reference to the unused rows and avoids
# shadowing IPython's `_` (last output) variable.
reshaped_data = train_test_split(
    data,
    train_size=100000,
    stratify=data['isFraud'],
    random_state=42,
)[0]

In [None]:
# Sanity check: report the size of the sampled frame.
n_rows, n_cols = reshaped_data.shape
print(f"Reshaped data has {n_rows} rows and {n_cols} columns.")

In [12]:
# Save the sampled data to a new CSV file in the current working directory;
# index=False leaves the DataFrame index out of the file.
reshaped_data.to_csv('reshaped_fraud_data.csv', index=False)

In [None]:
# Initial exploration of the sampled data.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
print("Dataset Info:")
reshaped_data.info()
print("\nMissing Values:")
print(reshaped_data.isnull().sum())

In [None]:
# Visualizations
# Transaction type distribution, bars ordered from most to least frequent type.
type_order = reshaped_data['type'].value_counts().index
plt.figure(figsize=(8, 6))
ax = sns.countplot(data=reshaped_data, x='type', order=type_order, palette='viridis')
ax.set_title('Transaction Type Distribution')
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Distribution of transaction amounts up to the 95th percentile.
# The rows are filtered *before* plotting so that the 50 bins and the KDE are
# computed over the displayed range; binning the full range and then cropping
# the x-axis would leave the visible region with almost no resolution.
amount_cap = reshaped_data['amount'].quantile(0.95)
typical_amounts = reshaped_data.loc[reshaped_data['amount'] <= amount_cap, 'amount']

plt.figure(figsize=(10, 6))
sns.histplot(typical_amounts, bins=50, kde=True, color='green')
plt.title('Transaction Amount Distribution')
plt.xlabel('Transaction Amount')
plt.ylabel('Count')  # histplot's default statistic is a count, not a density
plt.xlim(0, amount_cap)
plt.show()

In [None]:
# Class balance of the 100,000-row sample: one bar per 'isFraud' value, each
# annotated with its share of the sampled transactions.
plt.figure(figsize=(10, 6))
class_colors = ['#00C853', '#D50000']  # green = not fraud, red = fraud
ax = sns.countplot(data=reshaped_data, x='isFraud', palette=class_colors, alpha=0.85)
ax.set_title('Fraud vs Non-Fraud Transactions', fontsize=16, fontweight='bold')
ax.set_xlabel('Fraudulent Transaction', fontsize=14)
ax.set_ylabel('Transaction Count', fontsize=14)
plt.xticks(ticks=[0, 1], labels=['Not Fraud (0)', 'Fraud (1)'], fontsize=12)
plt.yticks(fontsize=12)

# Percentage label just above each bar.
total = len(reshaped_data)
for rect in ax.patches:
    bar_height = rect.get_height()
    x_center = rect.get_x() + rect.get_width() / 2
    y_label = bar_height + (total * 0.001)
    share = f'{(bar_height / total) * 100:.2f}%'
    ax.text(x_center, y_label, share,
            ha='center', fontsize=12, color='black', fontweight='bold')

plt.tight_layout()
plt.show()


<h3>Data Cleaning & Preprocessing</h3>

In [17]:
# Encode the categorical 'type' column as integer codes.
# The fitted encoder is kept (instead of being discarded) so the mapping stays
# available: type_encoder.classes_[code] is the original label for `code`.
type_encoder = LabelEncoder()
reshaped_data['type'] = type_encoder.fit_transform(reshaped_data['type'])

In [18]:
# Target is 'isFraud'; the features are every remaining column except
# 'isFlaggedFraud' and the two name columns.
excluded_cols = ['isFraud', 'isFlaggedFraud', 'nameOrig', 'nameDest']
y = reshaped_data['isFraud']
X = reshaped_data.drop(columns=excluded_cols)

In [19]:
# Memory optimisation: convert each feature column with pd.to_numeric,
# downcasting it to the smallest float dtype that can hold its values.
X = X.apply(lambda column: pd.to_numeric(column, downcast='float'))

In [20]:
# Split first, then normalize.
# The scaler is fit on the training rows only and then applied to the test
# rows; fitting it on the full data before splitting would leak test-set
# statistics (mean / variance) into training.
# The split is stratified on y so the rare fraud class keeps the same
# proportion in both the training and the test set.
numerical_cols = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Work on explicit copies so the column assignments below never write into X.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [22]:
# Handle class imbalance using SMOTE.
# Only the training split is passed to fit_resample; the test split is not
# touched here. random_state=42 fixes the sampler's seed.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Compare the class counts of the training labels before and after SMOTE.
for stage, labels in (("Before", y_train), ("After", pd.Series(y_train_resampled))):
    print(f"{stage} Resampling: {labels.value_counts().to_dict()}")

# Building the Model

In [None]:
# Model Selection
# Each menu key maps to (display name, zero-argument factory). The factory is
# only called for the chosen entry, so a single estimator is ever constructed.
model_menu = {
    "L": ("LightGBM", lambda: LGBMClassifier(device='gpu', random_state=42)),  # GPU acceleration enabled
    "S": ("SVM", lambda: SVC(kernel='linear', probability=True, random_state=42)),
    "R": ("Random Forest", lambda: RandomForestClassifier(n_jobs=-1, random_state=42)),  # Parallel processing
    "G": ("Logistic Regression", lambda: LogisticRegression(max_iter=1000, random_state=42)),
}

print("Welcome to the Fraud Detection System!")
print("Please select a model for training:")
print("Press 'L' for LightGBM")
print("Press 'S' for Support Vector Machine (SVM)")
print("Press 'R' for Random Forest")
print("Press 'G' for Logistic Regression")

user_choice = input("Enter your choice: ").strip().upper()

# Unknown input falls back to the LightGBM entry.
if user_choice not in model_menu:
    print("Invalid choice! Defaulting to LightGBM.")
model_name, build_model = model_menu.get(user_choice, model_menu["L"])
model = build_model()

<h3>Training Part</h3>

In [None]:
# Train the selected model on the SMOTE-resampled training set.
# Fitting on the original (X_train, y_train) would leave the resampled data
# unused and train on the imbalanced classes.
print(f"\nTraining {model_name} model...")
model.fit(X_train_resampled, y_train_resampled)

In [None]:
# Predict on the test split and print the per-class precision / recall / F1 report.
class_names = ['Not Fraud', 'Fraud']
y_pred = model.predict(X_test)
print("\nEvaluation Metrics:")
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:

# Confusion matrix as an annotated heatmap (rows: actual class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred)
tick_labels = ['Not Fraud', 'Fraud']
plt.figure(figsize=(8, 6))
ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Evaluate the model on the test split.
# precision / recall / F1 are called with scikit-learn's defaults, i.e. binary
# averaging that scores the positive class (label 1); accuracy covers both classes.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [None]:
# Print the headline metrics for the trained model, four decimals each.
print(f"\nResults for {model_name}:")
metric_rows = (
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("Accuracy", accuracy),
)
for metric_label, metric_value in metric_rows:
    print(f"{metric_label}: {metric_value:.4f}")
print("-" * 50)

<br>Now you can enter transaction details to check for fraud.
Follow these instructions for each input:
1. Transaction Type (type): 0 = CASH_IN, 1 = CASH_OUT, 2 = DEBIT, 3 = PAYMENT, 4 = TRANSFER
2. Transaction Amount (amount): Positive float, e.g., 1000.50
3. Original Account Balance (oldbalanceOrg): Positive float, e.g., 50000.00
4. New Account Balance (newbalanceOrig): Positive float, e.g., 45000.00
5. Recipient's Original Balance (oldbalanceDest): Positive float, e.g., 200000.00
6. Recipient's New Balance (newbalanceDest): Positive float, e.g., 250000.50
<br>

In [31]:
# Collect the transaction to score from the user.
# int() / float() raise ValueError on non-numeric input. The type code is also
# range-checked: the prompt documents codes 0-4, and any other integer would
# otherwise be passed to the model and scored silently.
transaction_type = int(input("Enter transaction type (0-4): "))
if not 0 <= transaction_type <= 4:
    raise ValueError(
        f"Transaction type must be an integer between 0 and 4, got {transaction_type}."
    )
amount = float(input("Enter transaction amount: "))
oldbalanceOrg = float(input("Enter original account balance: "))
newbalanceOrig = float(input("Enter new account balance after transaction: "))
oldbalanceDest = float(input("Enter recipient's original account balance: "))
newbalanceDest = float(input("Enter recipient's new account balance: "))

In [32]:
# Wrap the entered values in a one-row DataFrame; the keys are the feature
# column names and their order defines the column order.
transaction_record = {
    'type': transaction_type,
    'amount': amount,
    'oldbalanceOrg': oldbalanceOrg,
    'newbalanceOrig': newbalanceOrig,
    'oldbalanceDest': oldbalanceDest,
    'newbalanceDest': newbalanceDest,
}
input_data = pd.DataFrame([transaction_record])

In [33]:
# Normalize input data with the already-fitted scaler (transform only, no refit).
# NOTE: this overwrites the columns in place, so re-running the cell would
# scale values that are already scaled.
input_data[numerical_cols] = scaler.transform(input_data[numerical_cols])

In [None]:
# The model also expects a 'step' feature; it is set to 0 for the new
# transaction (an assumption, as the user is not asked for it).
input_data = input_data.assign(step=0)

# Put the columns in the same order as the training features.
input_data = input_data.loc[:, X.columns]

# Predict fraud: take the single prediction for the one-row frame.
predictions = model.predict(input_data)
fraud_prediction = predictions[0]


<h3>Final Prediction</h3>

In [None]:
# Report the verdict: a prediction of 1 means fraud, anything else means not fraud.
verdict = (
    "\n⚠️ Prediction: This transaction is FRAUDULENT!"
    if fraud_prediction == 1
    else "\n✅ Prediction: This transaction is NOT FRAUDULENT."
)
print(verdict)