In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split
import numpy as np

# Load the dataset into a pandas dataframe
df = pd.read_csv('ADSXLIST_07Sep2023.csv')

# Columns referred to as the "common" feature set
common_features = ['AXRASH', 'AXMUSCLE', 'AXURNFRQ', 'AXENERGY', 'AXDROWSY', 'AXDIZZY', 'AXBREATH', 'AXCOUGH']

# Define categorical features for one-hot encoding.
# dtype=int is explicit on purpose: pandas >= 2.0 emits bool dummy columns by
# default, and bool is NOT matched by select_dtypes(include=[np.number]) below,
# so without it the encoded columns would silently vanish from the feature set.
categorical_features = ['VISCODE', 'VISCODE2', 'SITEID']
df = pd.get_dummies(df, columns=categorical_features, dtype=int)

# Exclude any non-numeric columns (e.g., dates or other string columns)
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

# Ensure 'Phase' (target variable) is not in the features list
if 'Phase' in numeric_columns:
    numeric_columns.remove('Phase')

# Define the number of iterations and splits
num_iterations = 10  # independent shuffles / federated rounds
num_splits = 6       # number of simulated nodes

# Lists to store global accuracy and precision for each iteration
# (one inner list of per-node scores is appended per iteration)
global_accuracy_all_iterations = []
global_precision_all_iterations = []

# Sorted class labels of the target. Every local model has to be trained on
# exactly these classes so that the rows of its coef_/intercept_ line up with the
# other nodes' before averaging; the averaged model also needs them as `classes_`.
all_classes = np.unique(df['Phase'])

for iteration in range(num_iterations):
    # Shuffle the data randomly for each iteration
    df_shuffled = df.sample(frac=1, random_state=iteration * 123)

    # Initial splitting of the data into num_splits equally sized sections
    # (the len(df) % num_splits trailing rows are not assigned to any node)
    split_size = len(df_shuffled) // num_splits
    data_splits = [df_shuffled.iloc[i * split_size: (i + 1) * split_size] for i in range(num_splits)]

    # Data shifting: move 25% of the rows from each node to the next (wrapping around).
    # Nodes are processed in order, so a node samples after receiving rows from its predecessor.
    for i in range(num_splits):
        next_index = (i + 1) % num_splits
        data_to_shift = data_splits[i].sample(frac=0.25, random_state=iteration)
        data_splits[i] = data_splits[i].drop(data_to_shift.index)
        data_splits[next_index] = pd.concat([data_splits[next_index], data_to_shift])

    coefficients_list = []
    intercepts_list = []

    # Train local models and collect their parameters
    for i in range(num_splits):
        split_data = data_splits[i]

        # Nodes 1 and 2 only keep rows with AXRASH == 1 and AXRASH == 2 respectively
        if i in (0, 1):
            split_data = split_data[split_data['AXRASH'] == (i + 1)]

        # All nodes share one feature space (numeric_columns); parameter averaging
        # is only meaningful when every coefficient vector has the same layout.
        X = split_data[numeric_columns].fillna(0)
        y = split_data['Phase']

        # train_test_split cannot produce a non-empty train set from fewer than 2 rows
        if len(split_data) < 2:
            print(f"Iteration {iteration}: skipping node {i + 1} (fewer than 2 rows)")
            continue

        # Only the 60% training part is used; the held-out part is discarded
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.4, random_state=359)

        # A node that has not seen every class would yield coef_ rows that do not
        # correspond to the same classes as the other nodes (or fail to fit at all).
        if list(np.unique(y_train)) != list(all_classes):
            print(f"Iteration {iteration}: skipping node {i + 1} (training data does not cover all classes)")
            continue

        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)

        coefficients_list.append(model.coef_)
        intercepts_list.append(model.intercept_)

    if not coefficients_list:
        raise ValueError(f"Iteration {iteration}: no node produced a usable local model")

    # Federated Averaging: unweighted mean of coefficients and intercepts
    avg_coefficients = np.mean(coefficients_list, axis=0)
    avg_intercepts = np.mean(intercepts_list, axis=0)

    # Build the global model directly from the averaged parameters.
    # It must NOT be passed through fit(): fitting would re-estimate coef_ and
    # intercept_ from the fitting data and throw the federated average away.
    global_model = LogisticRegression(max_iter=1000)
    global_model.coef_ = avg_coefficients
    global_model.intercept_ = avg_intercepts
    global_model.classes_ = all_classes
    # Input metadata a fitted estimator would carry, so predict() can validate the columns
    global_model.n_features_in_ = len(numeric_columns)
    global_model.feature_names_in_ = np.asarray(numeric_columns, dtype=object)

    # Evaluate the global model on each node's data, excluding nodes 1 and 2, and store the results
    iteration_global_accuracy = []
    iteration_global_precision = []

    # NOTE(review): each node is scored on its full partition, which includes the
    # rows its local model was trained on -- consider scoring on the held-out split.
    for i in range(2, num_splits):
        split_data = data_splits[i]
        # Same feature layout as used for training the averaged coefficients
        X = split_data[numeric_columns].fillna(0)
        y = split_data['Phase']

        y_pred = global_model.predict(X)
        iteration_global_accuracy.append(accuracy_score(y, y_pred))
        iteration_global_precision.append(precision_score(y, y_pred, average='weighted', zero_division=0))

    # Store global accuracy and precision for this iteration
    global_accuracy_all_iterations.append(iteration_global_accuracy)
    global_precision_all_iterations.append(iteration_global_precision)

# Assemble the metrics table: one row per iteration, each cell holding that
# iteration's list of per-node scores.
metric_columns = {
    'Global Accuracy': global_accuracy_all_iterations,
    'Global Precision': global_precision_all_iterations,
}
results_df = pd.DataFrame(metric_columns)

# Persist the table without the row index, then report where it went
results_df.to_csv('model_resultsg.csv', index=False)
print("Results saved to 'model_resultsg.csv'")


Results saved to 'model_resultsg.csv'


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split
import numpy as np

# Load the dataset into a pandas dataframe
df = pd.read_csv('ADSXLIST_07Sep2023.csv')

# Columns referred to as the "common" feature set
common_features = ['AXRASH', 'AXMUSCLE', 'AXURNFRQ', 'AXENERGY', 'AXDROWSY', 'AXDIZZY', 'AXBREATH', 'AXCOUGH']

# Define categorical features for one-hot encoding.
# dtype=int is explicit on purpose: pandas >= 2.0 emits bool dummy columns by
# default, and bool is NOT matched by select_dtypes(include=[np.number]) below,
# so without it the encoded columns would silently vanish from the feature set.
categorical_features = ['VISCODE', 'VISCODE2', 'SITEID']
df = pd.get_dummies(df, columns=categorical_features, dtype=int)

# Exclude any non-numeric columns (e.g., dates or other string columns)
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

# Ensure 'Phase' (target variable) is not in the features list
if 'Phase' in numeric_columns:
    numeric_columns.remove('Phase')

# Define the number of iterations and splits
num_iterations = 10  # independent shuffles / federated rounds
num_splits = 6       # number of simulated nodes

# Lists to store global accuracy and precision for each iteration
# (one inner list of per-node scores is appended per iteration)
global_accuracy_all_iterations = []
global_precision_all_iterations = []

# Sorted class labels of the target. Every local model has to be trained on
# exactly these classes so that the rows of its coef_/intercept_ line up with the
# other nodes' before averaging; the averaged model also needs them as `classes_`.
all_classes = np.unique(df['Phase'])

for iteration in range(num_iterations):
    # Shuffle with a seed that differs per iteration
    df_shuffled = df.sample(frac=1, random_state=iteration * 123)

    # Equal-sized initial partitions (len(df) % num_splits trailing rows are left out)
    split_size = len(df_shuffled) // num_splits
    data_splits = [df_shuffled.iloc[i * split_size: (i + 1) * split_size] for i in range(num_splits)]

    # Move 25% of each node's rows to the next node, wrapping around at the end
    for i in range(num_splits):
        next_index = (i + 1) % num_splits
        data_to_shift = data_splits[i].sample(frac=0.25, random_state=iteration)
        data_splits[i] = data_splits[i].drop(data_to_shift.index)
        data_splits[next_index] = pd.concat([data_splits[next_index], data_to_shift])

    coefficients_list = []
    intercepts_list = []

    # Train one local model per node and collect its parameters
    for i in range(num_splits):
        split_data = data_splits[i]

        # Nodes 1 and 2 only keep rows with AXRASH == 1 and AXRASH == 2 respectively
        if i in (0, 1):
            split_data = split_data[split_data['AXRASH'] == (i + 1)]

        # All nodes share one feature space (numeric_columns); parameter averaging
        # is only meaningful when every coefficient vector has the same layout.
        X = split_data[numeric_columns].fillna(0)
        y = split_data['Phase']

        # train_test_split cannot produce a non-empty train set from fewer than 2 rows
        if len(split_data) < 2:
            print(f"Iteration {iteration}: skipping node {i + 1} (fewer than 2 rows)")
            continue

        # Only the 60% training part is used; the held-out part is discarded
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.4, random_state=359)

        # A node that has not seen every class would yield coef_ rows that do not
        # correspond to the same classes as the other nodes (or fail to fit at all).
        if list(np.unique(y_train)) != list(all_classes):
            print(f"Iteration {iteration}: skipping node {i + 1} (training data does not cover all classes)")
            continue

        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)

        coefficients_list.append(model.coef_)
        intercepts_list.append(model.intercept_)

    if not coefficients_list:
        raise ValueError(f"Iteration {iteration}: no node produced a usable local model")

    # Federated Averaging: unweighted mean of coefficients and intercepts
    avg_coefficients = np.mean(coefficients_list, axis=0)
    avg_intercepts = np.mean(intercepts_list, axis=0)

    # Build the global model directly from the averaged parameters.
    # It must NOT be passed through fit(): fitting would re-estimate coef_ and
    # intercept_ from the fitting data and throw the federated average away.
    global_model = LogisticRegression(max_iter=1000)
    global_model.coef_ = avg_coefficients
    global_model.intercept_ = avg_intercepts
    global_model.classes_ = all_classes
    # Input metadata a fitted estimator would carry, so predict() can validate the columns
    global_model.n_features_in_ = len(numeric_columns)
    global_model.feature_names_in_ = np.asarray(numeric_columns, dtype=object)

    # Per-node scores for this iteration, stored in node order 1..num_splits.
    # (Previously node 2's score was appended after the last node, so list
    # positions did not match node numbers.)
    iteration_global_accuracy = []
    iteration_global_precision = []

    # Nodes 1 and 2: scored on every row of the full dataset with AXRASH == 1 / AXRASH == 2
    for i in range(2):
        node_data = df[df['AXRASH'] == i + 1]
        X_node = node_data[numeric_columns].fillna(0)
        y_node = node_data['Phase']

        y_pred_node = global_model.predict(X_node)
        iteration_global_accuracy.append(accuracy_score(y_node, y_pred_node))
        iteration_global_precision.append(
            precision_score(y_node, y_pred_node, average='weighted', zero_division=0)
        )

    # Nodes 3 to 6: scored on their own partition.
    # NOTE(review): this includes the rows each local model was trained on.
    for i in range(2, num_splits):
        split_data = data_splits[i]
        X = split_data[numeric_columns].fillna(0)
        y = split_data['Phase']

        y_pred = global_model.predict(X)
        iteration_global_accuracy.append(accuracy_score(y, y_pred))
        iteration_global_precision.append(precision_score(y, y_pred, average='weighted', zero_division=0))

    global_accuracy_all_iterations.append(iteration_global_accuracy)
    global_precision_all_iterations.append(iteration_global_precision)

# Assemble the metrics table: one row per iteration, each cell holding that
# iteration's list of per-node scores.
metric_columns = {
    'Global Accuracy': global_accuracy_all_iterations,
    'Global Precision': global_precision_all_iterations,
}
results_df = pd.DataFrame(metric_columns)

# Persist the table without the row index, then report where it went
results_df.to_csv('model_resultsg1.csv', index=False)
print("Results saved to 'model_resultsg1.csv'")


Results saved to 'model_resultsg1.csv'


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split
import numpy as np

# Load the dataset into a pandas dataframe
df = pd.read_csv('ADSXLIST_07Sep2023.csv')

# Columns referred to as the "common" feature set
common_features = ['AXRASH', 'AXMUSCLE', 'AXURNFRQ', 'AXENERGY', 'AXDROWSY', 'AXDIZZY', 'AXBREATH', 'AXCOUGH']

# Define categorical features for one-hot encoding.
# dtype=int is explicit on purpose: pandas >= 2.0 emits bool dummy columns by
# default, and bool is NOT matched by select_dtypes(include=[np.number]) below,
# so without it the encoded columns would silently vanish from the feature set.
categorical_features = ['VISCODE', 'VISCODE2', 'SITEID']
df = pd.get_dummies(df, columns=categorical_features, dtype=int)

# Exclude any non-numeric columns (e.g., dates or other string columns)
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

# Ensure 'Phase' (target variable) is not in the features list
if 'Phase' in numeric_columns:
    numeric_columns.remove('Phase')

# Define the number of iterations and splits
num_iterations = 10  # independent shuffles / federated rounds
num_splits = 6       # number of simulated nodes

# Lists to store global accuracy and precision for Node 1 and Node 2 for each iteration.
# Scores are appended alternately (node 1, then node 2) once per iteration.
global_accuracy_node_1_2 = []
global_precision_node_1_2 = []

# Sorted class labels of the target. Every local model has to be trained on
# exactly these classes so that the rows of its coef_/intercept_ line up with the
# other nodes' before averaging; the averaged model also needs them as `classes_`.
all_classes = np.unique(df['Phase'])

for iteration in range(num_iterations):
    # Shuffle with a seed that differs per iteration
    df_shuffled = df.sample(frac=1, random_state=iteration * 123)

    # Equal-sized initial partitions (len(df) % num_splits trailing rows are left out)
    split_size = len(df_shuffled) // num_splits
    data_splits = [df_shuffled.iloc[i * split_size: (i + 1) * split_size] for i in range(num_splits)]

    # Move 25% of each node's rows to the next node, wrapping around at the end
    for i in range(num_splits):
        next_index = (i + 1) % num_splits
        data_to_shift = data_splits[i].sample(frac=0.25, random_state=iteration)
        data_splits[i] = data_splits[i].drop(data_to_shift.index)
        data_splits[next_index] = pd.concat([data_splits[next_index], data_to_shift])

    coefficients_list = []
    intercepts_list = []

    # Train one local model per node and collect its parameters
    for i in range(num_splits):
        split_data = data_splits[i]

        # Nodes 1 and 2 only keep rows with AXRASH == 1 and AXRASH == 2 respectively
        if i in (0, 1):
            split_data = split_data[split_data['AXRASH'] == (i + 1)]

        # All nodes share one feature space (numeric_columns); parameter averaging
        # is only meaningful when every coefficient vector has the same layout.
        X = split_data[numeric_columns].fillna(0)
        y = split_data['Phase']

        # train_test_split cannot produce a non-empty train set from fewer than 2 rows
        if len(split_data) < 2:
            print(f"Iteration {iteration}: skipping node {i + 1} (fewer than 2 rows)")
            continue

        # Only the 60% training part is used; the held-out part is discarded
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.4, random_state=359)

        # A node that has not seen every class would yield coef_ rows that do not
        # correspond to the same classes as the other nodes (or fail to fit at all).
        if list(np.unique(y_train)) != list(all_classes):
            print(f"Iteration {iteration}: skipping node {i + 1} (training data does not cover all classes)")
            continue

        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)

        coefficients_list.append(model.coef_)
        intercepts_list.append(model.intercept_)

    if not coefficients_list:
        raise ValueError(f"Iteration {iteration}: no node produced a usable local model")

    # Federated Averaging: unweighted mean of coefficients and intercepts
    avg_coefficients = np.mean(coefficients_list, axis=0)
    avg_intercepts = np.mean(intercepts_list, axis=0)

    # Build the global model directly from the averaged parameters.
    # It must NOT be passed through fit(): fitting would re-estimate coef_ and
    # intercept_ from the fitting data and throw the federated average away.
    global_model = LogisticRegression(max_iter=1000)
    global_model.coef_ = avg_coefficients
    global_model.intercept_ = avg_intercepts
    global_model.classes_ = all_classes
    # Input metadata a fitted estimator would carry, so predict() can validate the columns
    global_model.n_features_in_ = len(numeric_columns)
    global_model.feature_names_in_ = np.asarray(numeric_columns, dtype=object)

    # Evaluate the global model only for Node 1 and Node 2, on every row of the
    # full dataset with AXRASH == 1 / AXRASH == 2. Scores are appended in the
    # order node 1, node 2 so even positions of each list belong to node 1 and
    # odd positions to node 2.
    for i in range(2):
        node_data = df[df['AXRASH'] == i + 1]
        X_node = node_data[numeric_columns].fillna(0)
        y_node = node_data['Phase']

        y_pred_node = global_model.predict(X_node)

        global_accuracy_node_1_2.append(accuracy_score(y_node, y_pred_node))
        global_precision_node_1_2.append(
            precision_score(y_node, y_pred_node, average='weighted', zero_division=0)
        )

# The score lists alternate node 1 / node 2, so de-interleave them first:
# even positions belong to node 1, odd positions to node 2.
node_1_accuracy = global_accuracy_node_1_2[0::2]
node_1_precision = global_precision_node_1_2[0::2]
node_2_accuracy = global_accuracy_node_1_2[1::2]
node_2_precision = global_precision_node_1_2[1::2]

# One row per iteration, one column per (node, metric) pair
results_df = pd.DataFrame({
    'Node 1 Global Accuracy': node_1_accuracy,
    'Node 1 Global Precision': node_1_precision,
    'Node 2 Global Accuracy': node_2_accuracy,
    'Node 2 Global Precision': node_2_precision,
})

# Persist the table without the row index, then report where it went
results_df.to_csv('model_results_nodes_1_2.csv', index=False)
print("Results saved to 'model_results_nodes_1_2.csv'")


Results saved to 'model_results_nodes_1_2.csv'
