In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split
import numpy as np

# Load the dataset into a pandas dataframe
df = pd.read_csv('ADSXLIST_07Sep2023.csv')

# Common features
common_features = ['AXRASH', 'AXMUSCLE', 'AXURNFRQ', 'AXENERGY', 'AXDROWSY', 'AXDIZZY', 'AXBREATH', 'AXCOUGH']

# Define categorical features for one-hot encoding
categorical_features = ['VISCODE', 'VISCODE2', 'SITEID']
# Request an explicit integer dtype for the indicator columns: pandas >= 2.0
# produces bool dummies by default, and bool columns are not matched by the
# np.number filter below, so they would silently vanish from the feature list.
df = pd.get_dummies(df, columns=categorical_features, dtype=np.uint8)

# Keep only numeric columns as features (drops dates and other string columns)
# and make sure the target variable 'Phase' is never used as a feature
numeric_columns = [
    column
    for column in df.select_dtypes(include=[np.number]).columns
    if column != 'Phase'
]

# Define the number of iterations and splits
num_iterations = 10
num_splits = 6

# Lists to store global accuracy and precision for each iteration
global_accuracy_all_iterations = []
global_precision_all_iterations = []

for iteration in range(num_iterations):

    # One slot per node. Only nodes 3 to 6 are evaluated below, so the first
    # two slots keep their 0 placeholder.
    iteration_global_accuracy = [0] * num_splits
    iteration_global_precision = [0] * num_splits

    # Shuffle the data randomly for each iteration (the seed depends on the iteration)
    df_shuffled = df.sample(frac=1, random_state=iteration * 123)

    # Initial Splitting of the data into equally sized sections; when the row
    # count is not a multiple of num_splits the trailing rows belong to no node
    split_size = len(df_shuffled) // num_splits
    data_splits = [df_shuffled.iloc[i * split_size: (i + 1) * split_size] for i in range(num_splits)]

    # Data Shifting: Move 25% data from each node to the next (the last node
    # wraps around to the first). Splits are updated in order, so a node samples
    # from rows it has just received from its predecessor as well.
    for i in range(num_splits):
        next_index = (i + 1) % num_splits
        data_to_shift = data_splits[i].sample(frac=0.25, random_state=iteration)
        data_splits[i] = data_splits[i].drop(data_to_shift.index)
        data_splits[next_index] = pd.concat([data_splits[next_index], data_to_shift])

    # Train local models. Every node is trained on the same numeric_columns so
    # that the coefficient arrays line up and can be averaged afterwards.
    local_models = []
    for i in range(num_splits):
        split_data = data_splits[i]
        # Nodes 1 and 2 train on a filtered part of their split; nodes 3 to 6
        # train on all of their rows.
        if i == 0:  # Node 1
            split_data = split_data[split_data['AXRASH'] == 2]
        elif i == 1:  # Node 2
            split_data = split_data[split_data['AXCOUGH'] == 1]

        X = split_data[numeric_columns].fillna(0)
        y = split_data['Phase']

        # Only the training part (60%) is used; the local hold-out part is discarded
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.4, random_state=359)

        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)
        local_models.append(model)

    # Fit the global model on one row per class purely to initialise its fitted
    # attributes (classes_ and feature metadata). This has to happen BEFORE the
    # averaged parameters are installed: fit() replaces coef_ and intercept_, so
    # fitting afterwards would throw the federated average away.
    subset = df_shuffled.drop_duplicates(subset='Phase').head(10)
    subset_X = subset[numeric_columns].fillna(0)
    subset_y = subset['Phase']
    global_model = LogisticRegression(max_iter=1000)
    global_model.fit(subset_X, subset_y)

    # Lists for storing model parameters. A local model only takes part in the
    # average when it was trained on exactly the global class set; otherwise
    # its coefficient rows would refer to different classes.
    coefficients_list = []
    intercepts_list = []
    for i, model in enumerate(local_models):
        if np.array_equal(model.classes_, global_model.classes_):
            coefficients_list.append(model.coef_)
            intercepts_list.append(model.intercept_)
        else:
            print(f"Iteration {iteration}: node {i + 1} left out of the average "
                  f"(its training labels do not cover the global classes)")

    if not coefficients_list:
        raise ValueError(
            f"Iteration {iteration}: no local model matches the global classes; nothing to average"
        )

    # Federated Averaging: Calculate mean of coefficients and intercepts
    avg_coefficients = np.mean(coefficients_list, axis=0)
    avg_intercepts = np.mean(intercepts_list, axis=0)
    global_model.coef_ = avg_coefficients
    global_model.intercept_ = avg_intercepts

    # Sending global model back to nodes 3 to 6 for testing
    # NOTE(review): each node's split still contains the rows its local model
    # was trained on, so these scores are not held-out estimates.
    nodes_to_test = [2, 3, 4, 5]  # Python uses 0-indexing, so node 3 is index 2, etc.
    for node_index in nodes_to_test:
        split_data = data_splits[node_index]
        X_test = split_data[numeric_columns].fillna(0)
        y_test = split_data['Phase']

        y_pred = global_model.predict(X_test)
        node_accuracy = accuracy_score(y_test, y_pred)
        node_precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)

        # Store the results for each node
        iteration_global_accuracy[node_index] = node_accuracy
        iteration_global_precision[node_index] = node_precision
    # Store global accuracy and precision for this iteration
    global_accuracy_all_iterations.append(iteration_global_accuracy)
    global_precision_all_iterations.append(iteration_global_precision)

# Assemble the per-iteration metrics into a table: one row per iteration, where
# each cell holds that iteration's list of per-node scores
metric_columns = {}
metric_columns['Global Accuracy'] = global_accuracy_all_iterations
metric_columns['Global Precision'] = global_precision_all_iterations
results_df = pd.DataFrame(metric_columns)

# Write the table to disk without the row index
results_df.to_csv('model_results_global.csv', index=False)

print("Results saved to 'model_results_global.csv'")


Results saved to 'model_results_global.csv'


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split
import numpy as np

# Load the dataset into a pandas dataframe
df = pd.read_csv('ADSXLIST_07Sep2023.csv')

# Common features
common_features = ['AXRASH', 'AXMUSCLE', 'AXURNFRQ', 'AXENERGY', 'AXDROWSY', 'AXDIZZY', 'AXBREATH', 'AXCOUGH']

# Define categorical features for one-hot encoding
categorical_features = ['VISCODE', 'VISCODE2', 'SITEID']
# Request an explicit integer dtype for the indicator columns: pandas >= 2.0
# produces bool dummies by default, and bool columns are not matched by the
# np.number filter below, so they would silently vanish from the feature list.
df = pd.get_dummies(df, columns=categorical_features, dtype=np.uint8)

# Keep only numeric columns as features (drops dates and other string columns)
# and make sure the target variable 'Phase' is never used as a feature
numeric_columns = [
    column
    for column in df.select_dtypes(include=[np.number]).columns
    if column != 'Phase'
]

# Define the number of iterations and splits
num_iterations = 10
num_splits = 6

# Lists to store global accuracy and precision for each iteration
global_accuracy_all_iterations = []
global_precision_all_iterations = []

for iteration in range(num_iterations):

    # One slot per node. Only nodes 3 to 6 are trained and evaluated here, so
    # the first two slots keep their 0 placeholder.
    iteration_global_accuracy = [0] * num_splits
    iteration_global_precision = [0] * num_splits

    # Shuffle the data randomly for each iteration (the seed depends on the iteration)
    df_shuffled = df.sample(frac=1, random_state=iteration * 123)

    # Initial Splitting of the data into equally sized sections; when the row
    # count is not a multiple of num_splits the trailing rows belong to no node
    split_size = len(df_shuffled) // num_splits
    data_splits = [df_shuffled.iloc[i * split_size: (i + 1) * split_size] for i in range(num_splits)]

    # Data Shifting: Move 25% data from each node to the next (the last node
    # wraps around to the first). Splits are updated in order, so a node samples
    # from rows it has just received from its predecessor as well.
    for i in range(num_splits):
        next_index = (i + 1) % num_splits
        data_to_shift = data_splits[i].sample(frac=0.25, random_state=iteration)
        data_splits[i] = data_splits[i].drop(data_to_shift.index)
        data_splits[next_index] = pd.concat([data_splits[next_index], data_to_shift])

    # Train local models (only for nodes 3 to 6), keeping the node index with each model
    local_models = []
    for i in range(2, num_splits):  # Start from index 2 (Node 3) to num_splits (Node 6)
        split_data = data_splits[i]

        X = split_data[numeric_columns].fillna(0)
        y = split_data['Phase']

        # Only the training part (60%) is used; the local hold-out part is discarded
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.4, random_state=359)

        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)
        local_models.append((i, model))

    # Fit the global model on one row per class (taken from the rows held by
    # nodes 3 to 6) purely to initialise its fitted attributes (classes_ and
    # feature metadata). This has to happen BEFORE the averaged parameters are
    # installed: fit() replaces coef_ and intercept_, so fitting afterwards
    # would throw the federated average away.
    # The participating rows are gathered by concatenating the splits instead of
    # combining indexes with `|`, which is deprecated as a set union and acts as
    # an elementwise logical operation on pandas >= 2.0.
    federated_index = pd.concat(data_splits[2:num_splits]).index
    subset = df_shuffled[df_shuffled.index.isin(federated_index)].drop_duplicates(subset='Phase').head(10)
    subset_X = subset[numeric_columns].fillna(0)
    subset_y = subset['Phase']
    global_model = LogisticRegression(max_iter=1000)
    global_model.fit(subset_X, subset_y)

    # Lists for storing model parameters. A local model only takes part in the
    # average when it was trained on exactly the global class set; otherwise
    # its coefficient rows would refer to different classes.
    coefficients_list = []
    intercepts_list = []
    for i, model in local_models:
        if np.array_equal(model.classes_, global_model.classes_):
            coefficients_list.append(model.coef_)
            intercepts_list.append(model.intercept_)
        else:
            print(f"Iteration {iteration}: node {i + 1} left out of the average "
                  f"(its training labels do not cover the global classes)")

    if not coefficients_list:
        raise ValueError(
            f"Iteration {iteration}: no local model matches the global classes; nothing to average"
        )

    # Federated Averaging: Calculate mean of coefficients and intercepts (only from nodes 3 to 6)
    avg_coefficients = np.mean(coefficients_list, axis=0)
    avg_intercepts = np.mean(intercepts_list, axis=0)
    global_model.coef_ = avg_coefficients
    global_model.intercept_ = avg_intercepts

    # Testing the global model on nodes 3 to 6
    # NOTE(review): each node's split still contains the rows its local model
    # was trained on, so these scores are not held-out estimates.
    for node_index in range(2, num_splits):  # Nodes 3 to 6
        split_data = data_splits[node_index]
        X_test = split_data[numeric_columns].fillna(0)
        y_test = split_data['Phase']

        y_pred = global_model.predict(X_test)
        node_accuracy = accuracy_score(y_test, y_pred)
        node_precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)

        # Store results for each node
        iteration_global_accuracy[node_index] = node_accuracy
        iteration_global_precision[node_index] = node_precision

    # Store global accuracy and precision for this iteration
    global_accuracy_all_iterations.append(iteration_global_accuracy)
    global_precision_all_iterations.append(iteration_global_precision)

# Pair each column title with its per-iteration metric lists and build the
# table from that mapping: one row per iteration, where each cell holds that
# iteration's list of per-node scores
column_titles = ('Global Accuracy', 'Global Precision')
column_values = (global_accuracy_all_iterations, global_precision_all_iterations)
results_df = pd.DataFrame(dict(zip(column_titles, column_values)))

# Write the table to disk without the row index.
# NOTE: this is the same path the first cell writes to, so that earlier export is replaced.
results_df.to_csv('model_results_global.csv', index=False)

print("Results saved to 'model_results_global.csv'")


  subset = df_shuffled[df_shuffled.index.isin(data_splits[2].index | data_splits[3].index | data_splits[4].index | data_splits[5].index)].drop_duplicates(subset='Phase').head(10)
  subset = df_shuffled[df_shuffled.index.isin(data_splits[2].index | data_splits[3].index | data_splits[4].index | data_splits[5].index)].drop_duplicates(subset='Phase').head(10)
  subset = df_shuffled[df_shuffled.index.isin(data_splits[2].index | data_splits[3].index | data_splits[4].index | data_splits[5].index)].drop_duplicates(subset='Phase').head(10)
  subset = df_shuffled[df_shuffled.index.isin(data_splits[2].index | data_splits[3].index | data_splits[4].index | data_splits[5].index)].drop_duplicates(subset='Phase').head(10)
  subset = df_shuffled[df_shuffled.index.isin(data_splits[2].index | data_splits[3].index | data_splits[4].index | data_splits[5].index)].drop_duplicates(subset='Phase').head(10)
  subset = df_shuffled[df_shuffled.index.isin(data_splits[2].index | data_splits[3].index | data_splits[4

Results saved to 'model_results_global.csv'


  subset = df_shuffled[df_shuffled.index.isin(data_splits[2].index | data_splits[3].index | data_splits[4].index | data_splits[5].index)].drop_duplicates(subset='Phase').head(10)
