<a href="https://colab.research.google.com/github/Aasish357/Fraud-Detection-System-Using/blob/main/Fraud_Reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
import pickle
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
from typing import List
import warnings
warnings.filterwarnings("ignore")


In [37]:
# Step 1: Generate Synthetic Dataset (mimics Kaggle Credit Card Fraud)
def generate_synthetic_data(n_samples: int = 10000, fraud_ratio: float = 0.0017) -> pd.DataFrame:
    np.random.seed(42)
    # 30 features: V1-V28 (PCA components), Amount, Time
    data = {
        **{f'V{i}': np.random.normal(0, 1, n_samples) for i in range(1, 29)},
        'Amount': np.random.exponential(100, n_samples),
        'Time': np.random.uniform(0, 172800, n_samples)  # 2 days in seconds
    }

    df = pd.DataFrame(data)

    # Generate fraud labels (0 = legit, 1 = fraud)
    df['Class'] = 0
    fraud_indices = np.random.choice(n_samples, size=int(n_samples * fraud_ratio), replace=False)
    df.loc[fraud_indices, 'Class'] = 1

    # Add fraud patterns: higher Amount, slight feature shifts
    df.loc[fraud_indices, 'Amount'] *= np.random.uniform(2, 5)
    for i in range(1, 29):
        df.loc[fraud_indices, f'V{i}'] += np.random.normal(1, 0.5, len(fraud_indices))
    return df

In [40]:

# Step 2: Preprocess Data
def preprocess_data(df: pd.DataFrame, test_size: float = 0.2):
    X = df.drop('Class', axis=1)
    y = df['Class']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y, random_state=42)

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Handle imbalance with SMOTE
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

    return X_train_smote, X_test_scaled, y_train_smote, y_test, scaler


In [71]:
# Step 3: Train Models
class FraudModels:
    def __init__(self):
        self.lr_model = LogisticRegression(random_state=42)
        self.rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
        self.autoencoder = None

    def train_logistic_regression(self, X_train, y_train):
        self.lr_model.fit(X_train, y_train)
        pickle.dump(self.lr_model, open('lr_model.pkl', 'wb'))

    def train_random_forest(self, X_train, y_train):
        self.rf_model.fit(X_train, y_train)
        pickle.dump(self.rf_model, open('rf_model.pkl', 'wb'))

    def train_autoencoder(self, X_train, y_train, epochs=50, batch_size=256):
        # Use only non-fraud data for training
        X_normal = X_train[y_train == 0]

        # Autoencoder architecture
        input_dim = X_train.shape[1]
        input_layer = Input(shape=(input_dim,))
        encoder = Dense(14, activation='relu')(input_layer)
        encoder = Dense(7, activation='relu')(encoder)
        decoder = Dense(14, activation='relu')(encoder)
        decoder = Dense(input_dim, activation='linear')(decoder)

        self.autoencoder = Model(inputs=input_layer, outputs=decoder)
        self.autoencoder.compile(optimizer='adam', loss='mse')

        self.autoencoder.fit(
            X_normal, X_normal,
            epochs=epochs, batch_size=batch_size,
            shuffle=True, verbose=0
        )
        self.autoencoder.save('autoencoder.h5')

    def predict_lr(self, X):
        return self.lr_model.predict(X)

    def predict_rf(self, X):
        return self.rf_model.predict(X)

    def predict_autoencoder(self, X, threshold=2.0):
        reconstructions = self.autoencoder.predict(X, verbose=0)
        mse = np.mean(np.power(X - reconstructions, 2), axis=1)
        return (mse > threshold).astype(int)

In [72]:

# Step 4: Evaluate Models
def evaluate_model(y_true, y_pred, model_name: str):
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)

    print(f"\n{model_name} Results:")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print("Classification Report:")
    print(classification_report(y_true, y_pred))

    return recall


In [73]:
# Step 5: FastAPI Deployment
app = FastAPI(title="Fraud Detection API")

class Transaction(BaseModel):
    features: List[float]

@app.post("/predict")
async def predict_fraud(transaction: Transaction):
    # Load models and scaler
    lr_model = pickle.load(open('lr_model.pkl', 'rb'))
    rf_model = pickle.load(open('rf_model.pkl', 'rb'))
    autoencoder = tf.keras.models.load_model('autoencoder.h5')
    scaler = pickle.load(open('scaler.pkl', 'rb'))

    # Prepare input
    X = np.array([transaction.features])
    X_scaled = scaler.transform(X)

    # Predict
    lr_pred = lr_model.predict(X_scaled)[0]
    rf_pred = rf_model.predict(X_scaled)[0]
    reconstructions = autoencoder.predict(X_scaled, verbose=0)
    mse = np.mean(np.power(X_scaled - reconstructions, 2), axis=1)
    auto_pred = int(mse[0] > 2.0)

    return {
        "LogisticRegression": "Fraud" if lr_pred else "Legit",
        "RandomForest": "Fraud" if rf_pred else "Legit",
        "Autoencoder": "Fraud" if auto_pred else "Legit"
    }

In [74]:
# Main Execution
def run_fraud_detection():
    print("=== Fraud Detection in Financial Transactions ===\n")

    # Generate and preprocess data
    df = generate_synthetic_data(n_samples=10000)
    X_train, X_test, y_train, y_test, scaler = preprocess_data(df)
    pickle.dump(scaler, open('scaler.pkl', 'wb'))

    # Initialize and train models
    models = FraudModels()
    print("Training Logistic Regression...")
    models.train_logistic_regression(X_train, y_train)
    print("Training Random Forest...")
    models.train_random_forest(X_train, y_train)
    print("Training Autoencoder...")
    models.train_autoencoder(X_train, y_train)

    # Evaluate models
    lr_pred = models.predict_lr(X_test)
    rf_pred = models.predict_rf(X_test)
    auto_pred = models.predict_autoencoder(X_test)

    lr_recall = evaluate_model(y_test, lr_pred, "Logistic Regression")
    rf_recall = evaluate_model(y_test, rf_pred, "Random Forest")
    auto_recall = evaluate_model(y_test, auto_pred, "Autoencoder")

    # Compare to baseline (no SMOTE, simple LR)
    baseline_lr = LogisticRegression(random_state=42)
    X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
        df.drop('Class', axis=1), df['Class'], test_size=0.2, stratify=df['Class'], random_state=42
    )
    scaler_baseline = StandardScaler()
    X_train_raw_scaled = scaler_baseline.fit_transform(X_train_raw)
    baseline_lr.fit(X_train_raw_scaled, y_train_raw)
    baseline_pred = baseline_lr.predict(scaler_baseline.transform(X_test_raw))
    baseline_recall = evaluate_model(y_test_raw, baseline_pred, "Baseline Logistic Regression")

    # Quantify false negative reduction
    fn_reduction = ((baseline_recall - max(lr_recall, rf_recall, auto_recall)) / baseline_recall) * 100
    print(f"\nFalse Negative Reduction: ~{abs(fn_reduction):.0f}% (target: 30%)")

    print("\nTo test API, run: uvicorn fraud_detection:app --reload")
    print("Then visit http://127.0.0.1:8000/docs to test predictions.")
if __name__ == "__main__":
   run_fraud_detection()

=== Fraud Detection in Financial Transactions ===

Training Logistic Regression...
Training Random Forest...
Training Autoencoder...





Logistic Regression Results:
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
ROC-AUC: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1997
           1       1.00      1.00      1.00         3

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


Random Forest Results:
Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000
ROC-AUC: 0.5000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1997
           1       0.00      0.00      0.00         3

    accuracy                           1.00      2000
   macro avg       0.50      0.50      0.50      2000
weighted avg       1.00      1.00      1.00      2000


Autoencoder Results:
Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000
ROC-AUC: 0.5000
Classification Report:
   