In [1]:
!pip install faker flwr

Collecting faker
  Downloading Faker-27.0.0-py3-none-any.whl.metadata (15 kB)
Collecting flwr
  Downloading flwr-1.10.0-py3-none-any.whl.metadata (15 kB)
Collecting iterators<0.0.3,>=0.0.2 (from flwr)
  Downloading iterators-0.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting pathspec<0.13.0,>=0.12.1 (from flwr)
  Downloading pathspec-0.12.1-py3-none-any.whl.metadata (21 kB)
Collecting protobuf<5.0.0,>=4.25.2 (from flwr)
  Downloading protobuf-4.25.4-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting pycryptodome<4.0.0,>=3.18.0 (from flwr)
  Downloading pycryptodome-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting tomli-w<2.0.0,>=1.0.0 (from flwr)
  Downloading tomli_w-1.0.0-py3-none-any.whl.metadata (4.9 kB)
Collecting typer<0.10.0,>=0.9.0 (from typer[all]<0.10.0,>=0.9.0->flwr)
  Downloading typer-0.9.4-py3-none-any.whl.metadata (14 kB)
Collecting colorama<0.5.0,>=0.4.3 (from typer[all]<0.10.0,>=0.9.0->flwr)
  Downloading col

In [None]:
import numpy as np
import pandas as pd
import random
from faker import Faker
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import flwr as fl
import pickle
import multiprocessing as mp

# Initialize Faker for generating synthetic data.
# Module-level instance used by generate_credit_card_data; it is created
# without a seed here, so generated values differ from run to run.
fake = Faker()

# Function to generate synthetic credit card transaction data
def generate_credit_card_data(num_records, fraud_percentage, seed=None):
    """Build a DataFrame of synthetic credit card transactions.

    Args:
        num_records: Number of rows to generate.
        fraud_percentage: Probability, as a fraction in [0, 1] (not a value
            out of 100), that a row is labelled as fraud.
        seed: Optional seed. When given, both the global ``random`` module and
            the shared ``fake`` instance are seeded before any value is drawn,
            so repeated calls with the same seed draw the same random
            sequence. ``None`` (the default) leaves the generators untouched.

    Returns:
        pandas.DataFrame with one row per transaction and the columns
        "ID", "Transaction Date", "Credit Card Number", "Merchant",
        "Transaction Amount", "Transaction Type", "Account Balance",
        "Customer Age", "Customer Gender" and "Is Fraud" (0 or 1).
    """
    if seed is not None:
        # Note: this reseeds the process-wide `random` state, not a private generator.
        random.seed(seed)
        fake.seed_instance(seed)

    data = {
        "ID": range(1, num_records + 1),
        "Transaction Date": [fake.date_this_year() for _ in range(num_records)],
        "Credit Card Number": [fake.credit_card_number(card_type=None) for _ in range(num_records)],
        "Merchant": [fake.company() for _ in range(num_records)],
        "Transaction Amount": [round(random.uniform(1, 1000), 2) for _ in range(num_records)],
        "Transaction Type": [random.choice(["POS", "Online", "Direct Debit"]) for _ in range(num_records)],
        "Account Balance": [round(random.uniform(0, 100000), 2) for _ in range(num_records)],
        "Customer Age": [random.randint(18, 90) for _ in range(num_records)],
        "Customer Gender": [random.choice(["Male", "Female"]) for _ in range(num_records)],
        # Each row is independently marked as fraud with probability `fraud_percentage`.
        "Is Fraud": [1 if random.random() < fraud_percentage else 0 for _ in range(num_records)]
    }
    return pd.DataFrame(data)

# Preprocess the data
def preprocess_data(df):
    """Turn a raw transaction frame into model-ready features and labels.

    Identifier-like columns are dropped, the two categorical columns are
    integer-encoded with a fresh LabelEncoder each, and the three numeric
    columns are standardised with a StandardScaler fitted on this frame only.

    Args:
        df: DataFrame with the columns produced by generate_credit_card_data.

    Returns:
        Tuple ``(X, y)``: the feature DataFrame (everything except
        "Is Fraud") and the "Is Fraud" label Series.
    """
    categorical_cols = ("Transaction Type", "Customer Gender")
    numeric_cols = ["Transaction Amount", "Account Balance", "Customer Age"]

    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    features = df.drop(columns=["ID", "Transaction Date", "Credit Card Number", "Merchant"])

    for col in categorical_cols:
        features[col] = LabelEncoder().fit_transform(features[col])

    features[numeric_cols] = StandardScaler().fit_transform(features[numeric_cols])

    labels = features["Is Fraud"]
    return features.drop(columns=["Is Fraud"]), labels

# Define FL client class
class CreditCardClient(fl.client.NumPyClient):
    """Flower NumPy client that trains a linear fraud classifier on local data.

    Model parameters travel as a two-element list: the flattened coefficient
    vector followed by the intercept array. Incoming coefficients are reshaped
    to a single row, so the wrapped model is treated as a binary classifier.

    Args:
        model: Estimator exposing ``fit``/``predict`` and the ``coef_`` /
            ``intercept_`` attributes (a scikit-learn ``LogisticRegression``
            in this file).
        x_train, y_train: Local training features and labels.
        x_test, y_test: Local hold-out features and labels used by ``evaluate``.
    """

    def __init__(self, model, x_train, y_train, x_test, y_test):
        self.model = model
        self.x_train = x_train
        self.y_train = y_train
        # Fixed: this previously stored y_test, so evaluate() predicted on labels.
        self.x_test = x_test
        self.y_test = y_test

    def _set_parameters(self, parameters):
        """Install ``[coef, intercept]`` received from the server on the model."""
        self.model.coef_ = np.array(parameters[0]).reshape(1, -1)
        self.model.intercept_ = np.array(parameters[1])
        if not hasattr(self.model, "classes_"):
            # A never-fitted scikit-learn classifier has no classes_, which
            # predict() needs; derive it from the local training labels.
            # NOTE(review): assumes y_train contains every class - confirm.
            self.model.classes_ = np.unique(self.y_train)

    def get_parameters(self, config=None):
        """Return the current ``[coef, intercept]`` of the model.

        Before the first ``fit`` the model has no ``coef_`` yet (this is the
        state when the server asks a client for initial parameters), so
        zero-valued parameters of the right shape are returned instead of
        raising ``AttributeError``.
        """
        if not hasattr(self.model, "coef_"):
            n_features = np.shape(self.x_train)[1]
            return [np.zeros(n_features), np.zeros(1)]
        return [self.model.coef_.ravel(), self.model.intercept_]

    def fit(self, parameters, config):
        """Load the global parameters, train locally and return the update.

        Returns:
            ``(parameters, num_training_examples, metrics)`` with an empty
            metrics dict.
        """
        self._set_parameters(parameters)
        # NOTE: scikit-learn only continues from the installed coefficients
        # when the estimator was built with warm_start=True; otherwise fit()
        # re-initialises them and the global parameters are ignored.
        self.model.fit(self.x_train, self.y_train)
        return self.get_parameters(), len(self.x_train), {}

    def evaluate(self, parameters, config):
        """Score the global parameters on the local hold-out split.

        Returns:
            ``(loss, num_test_examples, metrics)`` where loss is
            ``1 - accuracy`` and metrics carries the accuracy itself.
        """
        self._set_parameters(parameters)
        predictions = self.model.predict(self.x_test)
        accuracy = float(accuracy_score(self.y_test, predictions))
        return 1 - accuracy, len(self.x_test), {"accuracy": accuracy}

# Federated Learning
def start_fl_server(num_rounds=3):
    """Run a blocking Flower server with the FedAvg strategy on 127.0.0.1:8081.

    Args:
        num_rounds: Number of federated rounds to run before the server returns.
    """
    strategy = fl.server.strategy.FedAvg()
    # Flower 1.x expects a ServerConfig object here; a plain dict has no
    # `num_rounds` attribute and fails once the server reads its configuration.
    fl.server.start_server(
        server_address="127.0.0.1:8081",
        strategy=strategy,
        config=fl.server.ServerConfig(num_rounds=num_rounds),
    )

def start_fl_client(client_id):
    """Load one client's pickled dataset and run it as a Flower client.

    The data is read from ``client_<client_id>_data.pkl``, split 80/20 in row
    order into train and test parts, and handed to a CreditCardClient that
    connects to the server at 127.0.0.1:8081. Blocks until the client
    disconnects.

    Args:
        client_id: Index used to locate the client's pickle file.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only use this with pickle files you created yourself.
    with open(f'client_{client_id}_data.pkl', 'rb') as f:
        X, y = pickle.load(f)

    # warm_start=True makes fit() start from the coef_/intercept_ the client
    # installs from the server instead of re-initialising them, so the global
    # model is no longer thrown away at the start of every local round.
    model = LogisticRegression(warm_start=True)

    split_index = int(0.8 * len(X))
    x_train, x_test = X[:split_index], X[split_index:]
    y_train, y_test = y[:split_index], y[split_index:]

    client = CreditCardClient(model, x_train, y_train, x_test, y_test)
    fl.client.start_numpy_client(server_address="127.0.0.1:8081", client=client)

def run_federated_learning(num_clients, num_rounds=3, server_shutdown_timeout=10.0):
    """Run one federated training session with a server and client processes.

    Starts the Flower server in its own process, then one process per client
    (client ids ``0 .. num_clients - 1``), waits for every client to exit and
    finally shuts the server down.

    Args:
        num_clients: Number of client processes to launch.
        num_rounds: Number of federated rounds passed to the server.
        server_shutdown_timeout: Seconds to wait for the server to exit on its
            own after the clients are done before it is terminated.
    """
    try:
        mp.set_start_method("spawn")
    except RuntimeError:
        # The start method was already set earlier in this interpreter
        # (e.g. the cell was re-run); keep whatever is in effect.
        # NOTE(review): with "spawn" the targets must be importable from the
        # main module, which may not hold inside a notebook kernel - confirm.
        pass

    server_process = mp.Process(target=start_fl_server, args=(num_rounds,))
    server_process.start()

    client_processes = []
    try:
        for i in range(num_clients):
            p = mp.Process(target=start_fl_client, args=(i,))
            p.start()
            # Only track processes that actually started, so join() is valid.
            client_processes.append(p)

        for p in client_processes:
            p.join()
    finally:
        # Do not leave clients behind if start-up or the join was interrupted.
        for p in client_processes:
            if p.is_alive():
                p.terminate()
                p.join()

        # Let the server finish its last round instead of killing it as soon
        # as the clients are done; fall back to terminate if it does not exit.
        server_process.join(timeout=server_shutdown_timeout)
        if server_process.is_alive():
            server_process.terminate()
            server_process.join()

# Centralized training and evaluation
def centralized_training_evaluation(X, y):
    """Fit a logistic regression on ``(X, y)`` and report its training accuracy.

    Every feature column is standardised first. Accuracy is measured on the
    same rows the model was trained on, not on a hold-out split.

    Args:
        X: Feature matrix.
        y: Target labels.

    Returns:
        Accuracy of the fitted model on ``(X, y)``.
    """
    X_scaled = StandardScaler().fit_transform(X)

    clf = LogisticRegression()
    clf.fit(X_scaled, y)

    return accuracy_score(y, clf.predict(X_scaled))

# User Inputs
# Guarded so that child processes which re-import this module (multiprocessing
# "spawn") do not re-run the prompts and start the whole experiment again.
if __name__ == "__main__":
    num_records = int(input("Enter the number of records (up to 100,000): "))
    fraud_percent_input = float(input("Enter the percentage of fraud cases (up to 10%): "))
    fraud_percentage = fraud_percent_input / 100
    num_clients = int(input("Enter the number of client datasets (between 3-10): "))

    # Enforce the limits the prompts advertise before doing any work.
    if not 1 <= num_records <= 100_000:
        raise ValueError("Number of records must be between 1 and 100,000.")
    if not 0 < fraud_percent_input <= 10:
        raise ValueError("Fraud percentage must be greater than 0 and at most 10.")
    # One dataset is held out below, and FedAvg's default settings wait for at
    # least two connected clients, so fewer than 3 datasets would hang.
    if not 3 <= num_clients <= 10:
        raise ValueError("Number of client datasets must be between 3 and 10.")

    # Generate and preprocess data
    client_data = []
    for i in range(num_clients):
        df = generate_credit_card_data(num_records, fraud_percentage)
        X, y = preprocess_data(df)
        client_data.append((X, y))
        # Client processes read their dataset back from this file.
        with open(f'client_{i}_data.pkl', 'wb') as f:
            pickle.dump((X, y), f)

    # Run Federated Learning on num_clients - 1 clients
    print("\nStarting Federated Learning...")
    run_federated_learning(num_clients - 1, num_rounds=3)

    # Evaluate on the remaining (held-out) dataset
    print("\nEvaluating Federated Model on the remaining dataset...")
    # The held-out features are used exactly as preprocess_data produced them.
    # Re-standardising every column here (as before) made the test features
    # differ from the ones the model below is fitted on.
    X_test, y_test = client_data[-1]

    # NOTE(review): the aggregated parameters never leave the server process,
    # so this is NOT the federated model. It is a fresh model trained on the
    # last participating client's data, used as a stand-in. To report real
    # federated accuracy, the server would have to persist its final parameters.
    X_last_client, y_last_client = client_data[num_clients - 2]

    model = LogisticRegression()
    model.fit(X_last_client, y_last_client)
    predictions = model.predict(X_test)
    federated_accuracy = accuracy_score(y_test, predictions)

    # Centralized Model for Comparison
    print("\nRunning Centralized Model on one of the client datasets...")
    X, y = client_data[0]
    centralized_accuracy = centralized_training_evaluation(X, y)

    # Output the results
    print(f"\nFederated model accuracy on the remaining dataset: {federated_accuracy:.4f}")
    print(f"Centralized model accuracy: {centralized_accuracy:.4f}")
