In [5]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import flwr as fl
import pickle
import multiprocessing as mp

# Load the Excel data
file_path = 'Customer data.xlsx'  # Ensure the file is in the correct directory or provide the full path
df = pd.read_excel(file_path)

  df = pd.read_excel(file_path)


**The preprocess_customer_data function prepares customer data for analysis by performing the following steps**

Dropping Irrelevant Columns: It removes the 'customer_id' and 'name' columns from the DataFrame, as they are not useful for numerical analysis or modeling.

Encoding Categorical Features: It encodes several categorical columns ('gender', 'job_title', 'job_industry_category', 'wealth_segment', 'deceased_indicator', and 'owns_car') into numeric format using LabelEncoder, facilitating their use in machine learning algorithms.

Standardizing Numerical Features: It standardizes selected numerical columns ('past_3_years_bike_related_purchases', 'age', 'tenure') using StandardScaler, ensuring that all features contribute equally to the model.

Defining Target Variable: It separates the target variable ('gender') from the feature set, returning a DataFrame of features (X) and a Series of target values (y)

In [6]:
# Function to preprocess the Excel data
def preprocess_customer_data(df):
    # Drop irrelevant columns like 'name' since they are not numeric or relevant for scaling
    df = df.drop(columns=["customer_id", "name"])  # Remove name and customer_id columns
    
    # Encode categorical columns
    label_encoders = {}
    categorical_cols = ["gender", "job_title", "job_industry_category", "wealth_segment", "deceased_indicator", "owns_car"]
    
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))  # Convert to string before encoding
        label_encoders[col] = le  # Store the encoder if needed for inverse transformation

    # Standardize numerical features (only numeric columns)
    scaler = StandardScaler()
    numerical_cols = ["past_3_years_bike_related_purchases", "age", "tenure"]
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # Assuming 'gender' as target for the sake of example (adjust as needed)
    X = df.drop(columns=["gender"])
    y = df["gender"]
    
    return X, y


**The CustomerDataClient class enables federated learning on the client side by inheriting from fl.client.NumPyClient**

Key Features:

Initialization: Accepts a model and training/testing datasets, storing them for later use.

Get Parameters: Returns the model's coefficients and intercept for use in federated updates.

Fit Method: Updates the model with received parameters and trains it on local data, returning the new parameters and training sample count.

Evaluate Method: Sets model parameters, evaluates performance on the test dataset, and returns accuracy metrics.

In [7]:
# Define a class for handling federated learning on client side
class CustomerDataClient(fl.client.NumPyClient):
    def __init__(self, model, x_train, y_train, x_test, y_test):
        self.model = model
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test

    def get_parameters(self):
        return [self.model.coef_.ravel(), self.model.intercept_]

    def fit(self, parameters, config):
        self.model.coef_ = np.array(parameters[0]).reshape(1, -1)
        self.model.intercept_ = np.array(parameters[1])
        self.model.fit(self.x_train, self.y_train)
        return [self.model.coef_.ravel(), self.model.intercept_], len(self.x_train), {}

    def evaluate(self, parameters, config):
        self.model.coef_ = np.array(parameters[0]).reshape(1, -1)
        self.model.intercept_ = np.array(parameters[1])
        predictions = self.model.predict(self.x_test)
        accuracy = accuracy_score(self.y_test, predictions)
        return 1 - accuracy, len(self.x_test), {"accuracy": accuracy}

In [8]:
# Federated Learning server
def start_fl_server(num_rounds=3):
    strategy = fl.server.strategy.FedAvg()
    fl.server.start_server(server_address="127.0.0.1:8081", strategy=strategy, config={"num_rounds": num_rounds})


In [9]:
# Federated Learning client
def start_fl_client(client_id):
    with open(f'client_{client_id}_data.pkl', 'rb') as f:
        X, y = pickle.load(f)

    model = LogisticRegression()
    split_index = int(0.8 * len(X))
    x_train, x_test = X[:split_index], X[split_index:]
    y_train, y_test = y[:split_index], y[split_index:]

    client = CustomerDataClient(model, x_train, y_train, x_test, y_test)
    fl.client.start_numpy_client(server_address="127.0.0.1:8081", client=client)

In [10]:
# Run Federated Learning experiment
def run_federated_learning(num_clients, num_rounds=3):
    try:
        mp.set_start_method("spawn")
    except RuntimeError:
        pass

    server_process = mp.Process(target=start_fl_server, args=(num_rounds,))
    server_process.start()

    client_processes = []
    for i in range(num_clients):
        p = mp.Process(target=start_fl_client, args=(i,))
        client_processes.append(p)
        p.start()

    for p in client_processes:
        p.join()

    server_process.terminate()

**The centralized_training_evaluation function trains and evaluates a Logistic Regression model using a centralized dataset.**

**Key Steps:**

Numeric Data Selection: Filters the input feature set (X) to retain only numeric columns.

Data Cleaning: Creates a mask to remove rows with missing values in either X or the target variable (y), ensuring alignment between features and labels.

Data Scaling: Standardizes the cleaned feature set using StandardScaler.

Model Training: Trains a Logistic Regression model on the scaled data.

Prediction and Evaluation: Makes predictions on the training data and calculates the accuracy score.

In [22]:
def centralized_training_evaluation(X, y):
    # Ensure we are working only with numeric data for this function
    X_numeric = X.select_dtypes(include=[np.number])
    
    # Make sure both X and y don't have mismatched rows
    mask = ~np.isnan(y) & ~np.isnan(X_numeric).any(axis=1)
    
    # Filter X and y based on the mask
    X_cleaned = X_numeric[mask]
    y_cleaned = y[mask]

    # Train a Logistic Regression model on the entire dataset
    scaler = StandardScaler()
    X_cleaned_scaled = scaler.fit_transform(X_cleaned)
    
    model = LogisticRegression()
    model.fit(X_cleaned_scaled, y_cleaned)
    
    # Predict and evaluate using accuracy score
    predictions = model.predict(X_cleaned_scaled)
    centralized_accuracy = accuracy_score(y_cleaned, predictions)
    
    return centralized_accuracy


**The simulate_federated_learning function simulates a federated learning scenario with three clients.**

**Key Steps:**

Client Data Preparation: Splits the dataset into three subsets for each client, saving the data to files.

Federated Learning: Initiates federated learning by calling run_federated_learning for a specified number of rounds.

Model Evaluation: Evaluates the federated model on the last client's data after cleaning it to ensure only numeric values and handling NaN/infinite values.

Training and Prediction: Trains a Logistic Regression model on the second-to-last client's data and predicts on the last client's test data.

Accuracy Calculation: Computes and prints the accuracy for both the federated and centralized models for comparison.

In [23]:
def simulate_federated_learning():
    num_clients = 3  # Set the number of clients
    client_data = []

    # Split data into multiple clients
    for i in range(num_clients):
        X_split = X.sample(frac=1/num_clients, random_state=i)
        y_split = y.loc[X_split.index]
        client_data.append((X_split, y_split))
        with open(f'client_{i}_data.pkl', 'wb') as f:
            pickle.dump((X_split, y_split), f)

    print("\nStarting Federated Learning...")
    run_federated_learning(num_clients - 1, num_rounds=3)

    print("\nEvaluating Federated Model on the remaining client dataset...")
    X_test, y_test = client_data[-1]
    
    # Ensure only numeric columns are scaled and NaN/infinite values are handled
    X_test = X_test.select_dtypes(include=[np.number])  # Only select numeric columns
    X_test = np.nan_to_num(X_test, nan=0.0, posinf=1e10, neginf=-1e10)  # Replace NaN, inf values

    with open(f'client_{num_clients - 2}_data.pkl', 'rb') as f:
        X_last_client, y_last_client = pickle.load(f)

    # Apply same cleaning for last client data
    X_last_client = X_last_client.select_dtypes(include=[np.number])  # Ensure only numeric data
    X_last_client = np.nan_to_num(X_last_client, nan=0.0, posinf=1e10, neginf=-1e10)  # Replace NaN, inf values
    y_last_client = np.nan_to_num(y_last_client, nan=0.0, posinf=1e10, neginf=-1e10)  # Same for labels
    
    # Train and evaluate the model
    model = LogisticRegression()
    model.fit(X_last_client, y_last_client)
    
    # Predict on test data
    predictions = model.predict(X_test)
    
    # Calculate federated accuracy
    federated_accuracy = accuracy_score(y_test, predictions)

    print("\nRunning Centralized Model for comparison...")
    centralized_accuracy = centralized_training_evaluation(X, y)

    print(f"\nFederated model accuracy: {federated_accuracy:.4f}")
    print(f"Centralized model accuracy: {centralized_accuracy:.4f}")

In [24]:
# Start the simulation
simulate_federated_learning()


Starting Federated Learning...

Evaluating Federated Model on the remaining client dataset...

Running Centralized Model for comparison...

Federated model accuracy: 0.5116
Centralized model accuracy: 0.5263


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
