In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import train_test_split

In [3]:
# Preprocessing function
def preprocess_data(file_path, n_components=35, batch_size=500):
    dataset = pd.read_csv(file_path)
    features = dataset.drop('Target', axis=1)
    attacks = dataset['Target']

    # Standardize features
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    # Incremental PCA
    ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    for batch in np.array_split(scaled_features, len(features) // batch_size):
        ipca.partial_fit(batch)

    transformed_features = ipca.transform(scaled_features)
    new_data = pd.DataFrame(transformed_features, columns=[f'PC{i+1}' for i in range(n_components)])
    new_data['Target'] = attacks.values

    # Split data
    X_new = new_data.drop('Target', axis=1)
    y_new = new_data['Target']
    return train_test_split(X_new, y_new, test_size=0.3, random_state=0)

In [5]:
import pandas as pd
import xgboost as xgb

# Parameters for XGBoostClassifier
best_params = {
    'colsample_bytree': 0.6,
    'learning_rate': 0.1,
    'max_depth': 10,
    'n_estimators': 50,
    'subsample': 1.0,
    "eval_metric": "logloss"
}

client_data = [
    preprocess_data("processed_data/data1.csv"),
    preprocess_data("processed_data/data2.csv"),
    preprocess_data("processed_data/data3.csv"),
    preprocess_data("processed_data/data4.csv"),
]

# Initialize a global model
global_model = xgb.XGBClassifier(**best_params)



In [6]:
X_train, X_test, y_train, y_test = client_data[0]
global_model.fit(X_train, y_train)

In [17]:
# Train the model sequentially on each client's data
for i, (X_train, X_test, y_train, y_test) in enumerate(client_data):
    print(f"Training on client {i + 1}...")
    global_model.fit(X_train, y_train, xgb_model=global_model)  # Continue training the existing model

# Save the final model
global_model.save_model("final_global_model.json")
print("Final model trained and saved as 'final_global_model.json'.")


Training on client 1...
Training on client 2...
Training on client 3...
Training on client 4...
Final model trained and saved as 'final_global_model.json'.


In [9]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Parameters for XGBoostClassifier
best_params = {
    'colsample_bytree': 0.6,
    'learning_rate': 0.1,
    'max_depth': 10,
    'n_estimators': 50,
    'subsample': 1.0,
    "eval_metric": "logloss"
}

# Initialize a global model
global_model = xgb.XGBClassifier(**best_params)
X_train, X_test, y_train, y_test = client_data[0]
global_model.fit(X_train, y_train)

# DataFrame to store evaluation metrics
evaluation = pd.DataFrame(columns=[
    'Client', 
    'Data',  # 'Train' or 'Test'
    'Accuracy', 
    'Precision', 
    'Recall', 
    'F1_Score'
])

# Train the model sequentially on each client's data and evaluate
for i, (X_train, X_test, y_train, y_test) in enumerate(client_data[1:]):
    print(f"\nTraining on client {i + 1}...")
    
    # Fit the model on this client's data
    global_model.fit(X_train, y_train, xgb_model=global_model)  # Continue training the existing model
    
    # Evaluate on training data
    y_train_pred = global_model.predict(X_train)
    train_metrics = {
        'Client': i + 1,
        'Data': 'Train',
        'Accuracy': accuracy_score(y_train, y_train_pred),
        'Precision': precision_score(y_train, y_train_pred, average='weighted'),
        'Recall': recall_score(y_train, y_train_pred, average='weighted'),
        'F1_Score': f1_score(y_train, y_train_pred, average='weighted')
    }
    
    # Evaluate on test data
    y_test_pred = global_model.predict(X_test)
    test_metrics = {
        'Client': i + 1,
        'Data': 'Test',
        'Accuracy': accuracy_score(y_test, y_test_pred),
        'Precision': precision_score(y_test, y_test_pred, average='weighted'),
        'Recall': recall_score(y_test, y_test_pred, average='weighted'),
        'F1_Score': f1_score(y_test, y_test_pred, average='weighted')
    }
    
    # Store metrics in DataFrame
    evaluation = pd.concat([evaluation, pd.DataFrame([train_metrics, test_metrics])], ignore_index=True)
    
    # Print metrics for this client
    print(f"Client {i + 1} - Training Metrics:")
    print(f"  Accuracy:  {train_metrics['Accuracy']:.4f}")
    print(f"  Precision: {train_metrics['Precision']:.4f}")
    print(f"  Recall:    {train_metrics['Recall']:.4f}")
    print(f"  F1 Score:  {train_metrics['F1_Score']:.4f}")
    
    print(f"Client {i + 1} - Test Metrics:")
    print(f"  Accuracy:  {test_metrics['Accuracy']:.4f}")
    print(f"  Precision: {test_metrics['Precision']:.4f}")
    print(f"  Recall:    {test_metrics['Recall']:.4f}")
    print(f"  F1 Score:  {test_metrics['F1_Score']:.4f}")

# Save the final model
global_model.save_model("final_global_model.json")
print("\nFinal model trained and saved as 'final_global_model.json'.")

# Save evaluation metrics to a CSV file for later analysis
evaluation.to_csv("evaluation_metrics.csv", index=False)
print("\nEvaluation metrics saved to 'evaluation_metrics.csv'.")



Training on client 1...


  evaluation = pd.concat([evaluation, pd.DataFrame([train_metrics, test_metrics])], ignore_index=True)


Client 1 - Training Metrics:
  Accuracy:  0.9985
  Precision: 0.9985
  Recall:    0.9985
  F1 Score:  0.9985
Client 1 - Test Metrics:
  Accuracy:  0.9980
  Precision: 0.9980
  Recall:    0.9980
  F1 Score:  0.9980

Training on client 2...
Client 2 - Training Metrics:
  Accuracy:  0.9975
  Precision: 0.9975
  Recall:    0.9975
  F1 Score:  0.9975
Client 2 - Test Metrics:
  Accuracy:  0.9967
  Precision: 0.9967
  Recall:    0.9967
  F1 Score:  0.9967

Training on client 3...
Client 3 - Training Metrics:
  Accuracy:  0.9960
  Precision: 0.9960
  Recall:    0.9960
  F1 Score:  0.9960
Client 3 - Test Metrics:
  Accuracy:  0.9946
  Precision: 0.9946
  Recall:    0.9946
  F1 Score:  0.9946

Final model trained and saved as 'final_global_model.json'.

Evaluation metrics saved to 'evaluation_metrics.csv'.


In [21]:
evaluation

Unnamed: 0,Client,Data,Accuracy,Precision,Recall,F1_Score
0,1,Train,0.998548,0.998548,0.998548,0.998548
1,1,Test,0.997968,0.997969,0.997968,0.997968
2,2,Train,0.997501,0.997498,0.997501,0.997498
3,2,Test,0.996717,0.996714,0.996717,0.996715
4,3,Train,0.996014,0.996013,0.996014,0.996013
5,3,Test,0.994638,0.994637,0.994638,0.994637
