In [None]:
import os
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

# Configuration
CONFIG = {
    'data_path': r'C:\Users\USER\Documents\NTUST\Conference_Workshop_Seminar\Android\Dataset\AndMal2020-dynamic-BeforeAndAfterReboot\Cleaned_Files\normalized_dataset',
    'num_clients': 3,
    'epochs': 5,
    'learning_rate': 0.001,
    'batch_size': 32,
    'communication_rounds': 5,
    'train_split_ratio': 0.8  # 80% train, 20% validation
}

# Improved Android Malware Dataset
class AndroidMalwareDataset:
    def __init__(self, file_paths):
        self.data = []
        self.labels = []
        self.max_features = 0
        
        # First pass: determine the maximum number of features
        for file_path in file_paths:
            try:
                df = pd.read_csv(file_path)
                if 'Category' not in df.columns:
                    print(f"'Category' column not found in {file_path}")
                    continue
                
                # Count features (excluding Category column)
                num_features = df.select_dtypes(include=[np.number]).drop(['Category'], axis=1, errors='ignore').shape[1]
                self.max_features = max(self.max_features, num_features)
                
            except Exception as e:
                print(f"Error checking features in {file_path}: {e}")
        
        print(f"Maximum feature dimension across all files: {self.max_features}")
        
        # Collect all feature data
        all_features = []
        
        # Second pass: load data and collect features
        for file_path in file_paths:
            try:
                df = pd.read_csv(file_path)
                if 'Category' not in df.columns:
                    continue
                
                # Process features: Keep only numerical columns
                features_df = df.select_dtypes(include=[np.number]).drop(['Category'], axis=1, errors='ignore')
                features = features_df.values
                
                # Pad features to match max_features
                if features.shape[1] < self.max_features:
                    padding = np.zeros((features.shape[0], self.max_features - features.shape[1]))
                    features = np.hstack((features, padding))
                
                all_features.append(features)
                
            except Exception as e:
                print(f"Error collecting features from {file_path}: {e}")
        
        # Third pass: pad and add to dataset
        for file_path in file_paths:
            try:
                df = pd.read_csv(file_path)
                if 'Category' not in df.columns:
                    continue
                
                # Process features: Keep only numerical columns
                features_df = df.select_dtypes(include=[np.number]).drop(['Category'], axis=1, errors='ignore')
                features = features_df.values
                
                # Pad features to match max_features
                if features.shape[1] < self.max_features:
                    padding = np.zeros((features.shape[0], self.max_features - features.shape[1]))
                    features = np.hstack((features, padding))
                
                # Extract labels from the 'Category' column
                labels = df['Category'].values
                
                # Add to dataset
                for i in range(features.shape[0]):
                    self.data.append(features[i])
                    self.labels.append(labels[i])
                
            except Exception as e:
                print(f"Error loading {file_path}: {e}")
        
        # Convert category names to numerical labels
        self.label_encoder = LabelEncoder()
        self.encoded_labels = self.label_encoder.fit_transform(self.labels)
        self.num_classes = len(self.label_encoder.classes_)
        
        print(f"Loaded {len(self.data)} samples across {self.num_classes} malware families")
        print(f"Each sample has {self.max_features} features")
        print(f"Malware families: {self.label_encoder.classes_}")

    def get_data(self):
        return np.array(self.data), np.array(self.encoded_labels)

# Federated Learning Server
class FederatedServer:
    def __init__(self, dataset):
        self.dataset = dataset
        self.global_model = self._create_ensemble_model()
        self.global_accuracy_history = []
        
        # Prepare distributed datasets for clients
        self.client_datasets = self._prepare_client_datasets()
    
    def _create_ensemble_model(self):
        # Define base classifiers
        svm = SVC(probability=True)
        rf = RandomForestClassifier()
        dt = DecisionTreeClassifier()
        
        # Create ensemble model
        ensemble_model = VotingClassifier(estimators=[
            ('svm', svm),
            ('rf', rf),
            ('dt', dt)
        ], voting='soft')
        
        return ensemble_model
    
    def _prepare_client_datasets(self):
        # Split into train and validation
        X, y = self.dataset.get_data()
        train_size = int(CONFIG['train_split_ratio'] * len(X))
        val_size = len(X) - train_size
        X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=train_size, test_size=val_size, random_state=42)
        
        # Distribute training data among clients
        client_datasets = []
        total_train_samples = len(X_train)
        samples_per_client = total_train_samples // CONFIG['num_clients']
        
        for i in range(CONFIG['num_clients']):
            start_idx = i * samples_per_client
            end_idx = start_idx + samples_per_client if i < CONFIG['num_clients'] - 1 else total_train_samples
            client_subset = (X_train[start_idx:end_idx], y_train[start_idx:end_idx])
            client_datasets.append(client_subset)
        
        return {
            'train_datasets': client_datasets,
            'validation_dataset': (X_val, y_val)
        }
    
    def distribute_model(self):
        return self.global_model
    
    def evaluate_global_model(self):
        X_val, y_val = self.client_datasets['validation_dataset']
        self.global_model.fit(X_val, y_val)  # Fit the global model before evaluation
        y_pred = self.global_model.predict(X_val)
        
        accuracy = np.mean(y_pred == y_val) * 100
        self.global_accuracy_history.append(accuracy)
        
        return accuracy
    
    def generate_confusion_matrix(self, title="Global Model Confusion Matrix"):
        X_val, y_val = self.client_datasets['validation_dataset']
        y_pred = self.global_model.predict(X_val)
        
        # Convert numeric labels back to family names for better readability
        class_names = self.dataset.label_encoder.classes_
        cm = confusion_matrix(y_val, y_pred)
        
        plt.figure(figsize=(12, 10))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=class_names,
                   yticklabels=class_names)
        plt.title(title)
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.tight_layout()
        plt.savefig(f'{title.replace(" ", "_")}.png')
        plt.close()
        
        print(f"\nClassification Report for {title}:")
        print(classification_report(y_val, y_pred, target_names=class_names))
        
        return cm
    
    def run_federated_learning(self):
        print("\nClient Dataset Sizes:")
        for client_id, client_dataset in enumerate(self.client_datasets['train_datasets']):
            print(f"Client {client_id}: {len(client_dataset[0])} samples")
        
        round_client_accuracies = []
        
        for round in range(CONFIG['communication_rounds']):
            print(f"\nCommunication Round {round + 1}")
            
            client_models = []
            client_accuracies = []
            
            for client_id, (X_train, y_train) in enumerate(self.client_datasets['train_datasets']):
                client_model = self._create_ensemble_model()
                
                # Train client model
                client_model.fit(X_train, y_train)
                
                # Evaluate client model
                client_accuracy = self._evaluate_client_model(client_model, X_train, y_train)
                client_accuracies.append(client_accuracy)
                
                # Send model to server
                client_models.append(client_model)
            
            global_accuracy = self.evaluate_global_model()
            round_client_accuracies.append(client_accuracies)
            
            print(f"Client Accuracies: {[f'{acc:.2f}%' for acc in client_accuracies]}")
            print(f"Global Validation Accuracy: {global_accuracy:.2f}%")
        
        # Generate final confusion matrix
        self.generate_confusion_matrix("Final Global Model Confusion Matrix")
        
        return global_accuracy
    
    def _evaluate_client_model(self, model, X_train, y_train):
        y_pred = model.predict(X_train)
        accuracy = np.mean(y_pred == y_train) * 100
        return accuracy

# Main execution block
def main():
    print("Android Malware Detection using Federated Learning")
    print("-------------------------------------------------")
    
    results_dir = 'federated_learning_results'
    os.makedirs(results_dir, exist_ok=True)
    
    # Set random seeds for reproducibility
    np.random.seed(42)
    
    print("\nLoading Android malware datasets...")
    # Load all CSV files with 'after_reboot' in the filename
    files = glob.glob(CONFIG['data_path'] + '/*after_reboot*.csv')
    print(f"Found {len(files)} files matching the pattern")
    
    # Create dataset
    dataset = AndroidMalwareDataset(files)
    
    if dataset.num_classes == 0:
        print("Error: Failed to load dataset correctly. Check the file paths and data format.")
        return
    
    print(f"\nDataset Statistics:")
    print(f"Number of samples: {len(dataset.data)}")
    print(f"Number of features: {dataset.max_features}")
    print(f"Number of malware families: {dataset.num_classes}")
    
    print("\nInitializing Federated Learning Server...")
    server = FederatedServer(dataset)
    
    try:
        print("\nStarting Federated Learning Training...")
        final_accuracy = server.run_federated_learning()
        
        # Save results
        results_file = os.path.join(results_dir, 'final_results.txt')
        with open(results_file, 'w') as f:
            f.write(f"Final Validation Accuracy: {final_accuracy:.2f}%\n")
            f.write(f"Number of malware families: {dataset.num_classes}\n")
            f.write(f"Malware families: {', '.join(dataset.label_encoder.classes_)}\n")
        
        print(f"\nResults saved to {results_file}")
        print("Training completed successfully!")
    
    except Exception as e:
        print(f"An error occurred during training: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Android Malware Detection using Federated Learning
-------------------------------------------------

Loading Android malware datasets...
Found 12 files matching the pattern
Maximum feature dimension across all files: 121


AttributeError: 'AndroidMalwareDataset' object has no attribute 'scaler'

In [2]:
import os
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

# Configuration
CONFIG = {
    'data_path': r'C:\Users\USER\Documents\NTUST\Conference_Workshop_Seminar\Android\Dataset\AndMal2020-dynamic-BeforeAndAfterReboot\Cleaned_Files\normalized_dataset',
    'num_clients': 3,
    'epochs': 5,
    'learning_rate': 0.001,
    'batch_size': 32,
    'communication_rounds': 5,
    'train_split_ratio': 0.8  # 80% train, 20% validation
}

# Improved Android Malware Dataset
class AndroidMalwareDataset:
    def __init__(self, file_paths):
        self.data = []
        self.labels = []
        self.max_features = 0
        
        # First pass: determine the maximum number of features
        for file_path in file_paths:
            try:
                df = pd.read_csv(file_path)
                if 'Category' not in df.columns:
                    print(f"'Category' column not found in {file_path}")
                    continue
                
                # Count features (excluding Category column)
                num_features = df.select_dtypes(include=[np.number]).drop(['Category'], axis=1, errors='ignore').shape[1]
                self.max_features = max(self.max_features, num_features)
                
            except Exception as e:
                print(f"Error checking features in {file_path}: {e}")
        
        print(f"Maximum feature dimension across all files: {self.max_features}")
        
        # Collect all feature data
        all_features = []
        
        # Second pass: load data and collect features
        for file_path in file_paths:
            try:
                df = pd.read_csv(file_path)
                if 'Category' not in df.columns:
                    continue
                
                # Process features: Keep only numerical columns
                features_df = df.select_dtypes(include=[np.number]).drop(['Category'], axis=1, errors='ignore')
                features = features_df.values
                
                # Pad features to match max_features
                if features.shape[1] < self.max_features:
                    padding = np.zeros((features.shape[0], self.max_features - features.shape[1]))
                    features = np.hstack((features, padding))
                
                all_features.append(features)
                
            except Exception as e:
                print(f"Error collecting features from {file_path}: {e}")
        
        # Third pass: pad and add to dataset
        for file_path in file_paths:
            try:
                df = pd.read_csv(file_path)
                if 'Category' not in df.columns:
                    continue
                
                # Process features: Keep only numerical columns
                features_df = df.select_dtypes(include=[np.number]).drop(['Category'], axis=1, errors='ignore')
                features = features_df.values
                
                # Pad features to match max_features
                if features.shape[1] < self.max_features:
                    padding = np.zeros((features.shape[0], self.max_features - features.shape[1]))
                    features = np.hstack((features, padding))
                
                # Extract labels from the 'Category' column
                labels = df['Category'].values
                
                # Add to dataset
                for i in range(features.shape[0]):
                    self.data.append(features[i])
                    self.labels.append(labels[i])
                
            except Exception as e:
                print(f"Error loading {file_path}: {e}")
        
        # Convert category names to numerical labels
        self.label_encoder = LabelEncoder()
        self.encoded_labels = self.label_encoder.fit_transform(self.labels)
        self.num_classes = len(self.label_encoder.classes_)
        
        print(f"Loaded {len(self.data)} samples across {self.num_classes} malware families")
        print(f"Each sample has {self.max_features} features")
        print(f"Malware families: {self.label_encoder.classes_}")

    def get_data(self):
        return np.array(self.data), np.array(self.encoded_labels)

# Federated Learning Server
class FederatedServer:
    def __init__(self, dataset):
        self.dataset = dataset
        self.global_model = self._create_ensemble_model()
        self.global_accuracy_history = []
        
        # Prepare distributed datasets for clients
        self.client_datasets = self._prepare_client_datasets()
    
    def _create_ensemble_model(self):
        # Define base classifiers
        svm = SVC(probability=True)
        rf = RandomForestClassifier()
        dt = DecisionTreeClassifier()
        
        # Create ensemble model
        ensemble_model = VotingClassifier(estimators=[
            ('svm', svm),
            ('rf', rf),
            ('dt', dt)
        ], voting='soft')
        
        return ensemble_model
    
    def _prepare_client_datasets(self):
        # Split into train and validation
        X, y = self.dataset.get_data()
        train_size = int(CONFIG['train_split_ratio'] * len(X))
        val_size = len(X) - train_size
        X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=train_size, test_size=val_size, random_state=42)
        
        # Distribute training data among clients
        client_datasets = []
        total_train_samples = len(X_train)
        samples_per_client = total_train_samples // CONFIG['num_clients']
        
        for i in range(CONFIG['num_clients']):
            start_idx = i * samples_per_client
            end_idx = start_idx + samples_per_client if i < CONFIG['num_clients'] - 1 else total_train_samples
            client_subset = (X_train[start_idx:end_idx], y_train[start_idx:end_idx])
            client_datasets.append(client_subset)
        
        return {
            'train_datasets': client_datasets,
            'validation_dataset': (X_val, y_val)
        }
    
    def distribute_model(self):
        return self.global_model
    
    def evaluate_global_model(self):
        X_val, y_val = self.client_datasets['validation_dataset']
        self.global_model.fit(X_val, y_val)  # Fit the global model before evaluation
        y_pred = self.global_model.predict(X_val)
        
        accuracy = np.mean(y_pred == y_val) * 100
        self.global_accuracy_history.append(accuracy)
        
        return accuracy
    
    def generate_confusion_matrix(self, title="Global Model Confusion Matrix"):
        X_val, y_val = self.client_datasets['validation_dataset']
        y_pred = self.global_model.predict(X_val)
        
        # Convert numeric labels back to family names for better readability
        class_names = self.dataset.label_encoder.classes_
        cm = confusion_matrix(y_val, y_pred)
        
        plt.figure(figsize=(12, 10))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=class_names,
                   yticklabels=class_names)
        plt.title(title)
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.tight_layout()
        plt.savefig(f'{title.replace(" ", "_")}.png')
        plt.close()
        
        print(f"\nClassification Report for {title}:")
        print(classification_report(y_val, y_pred, target_names=class_names))
        
        return cm
    
    def run_federated_learning(self):
        print("\nClient Dataset Sizes:")
        for client_id, client_dataset in enumerate(self.client_datasets['train_datasets']):
            print(f"Client {client_id}: {len(client_dataset[0])} samples")
        
        round_client_accuracies = []
        
        for round in range(CONFIG['communication_rounds']):
            print(f"\nCommunication Round {round + 1}")
            
            client_models = []
            client_accuracies = []
            
            for client_id, (X_train, y_train) in enumerate(self.client_datasets['train_datasets']):
                client_model = self._create_ensemble_model()
                
                # Train client model
                client_model.fit(X_train, y_train)
                
                # Evaluate client model
                client_accuracy = self._evaluate_client_model(client_model, X_train, y_train)
                client_accuracies.append(client_accuracy)
                
                # Send model to server
                client_models.append(client_model)
            
            global_accuracy = self.evaluate_global_model()
            round_client_accuracies.append(client_accuracies)
            
            print(f"Client Accuracies: {[f'{acc:.2f}%' for acc in client_accuracies]}")
            print(f"Global Validation Accuracy: {global_accuracy:.2f}%")
        
        # Generate final confusion matrix
        self.generate_confusion_matrix("Final Global Model Confusion Matrix")
        
        return global_accuracy
    
    def _evaluate_client_model(self, model, X_train, y_train):
        y_pred = model.predict(X_train)
        accuracy = np.mean(y_pred == y_train) * 100
        return accuracy

# Main execution block
def main():
    print("Android Malware Detection using Federated Learning")
    print("-------------------------------------------------")
    
    results_dir = 'federated_learning_results'
    os.makedirs(results_dir, exist_ok=True)
    
    # Set random seeds for reproducibility
    np.random.seed(42)
    
    print("\nLoading Android malware datasets...")
    # Load all CSV files with 'after_reboot' in the filename
    files = glob.glob(CONFIG['data_path'] + '/*after_reboot*.csv')
    print(f"Found {len(files)} files matching the pattern")
    
    # Create dataset
    dataset = AndroidMalwareDataset(files)
    
    if dataset.num_classes == 0:
        print("Error: Failed to load dataset correctly. Check the file paths and data format.")
        return
    
    print(f"\nDataset Statistics:")
    print(f"Number of samples: {len(dataset.data)}")
    print(f"Number of features: {dataset.max_features}")
    print(f"Number of malware families: {dataset.num_classes}")
    
    print("\nInitializing Federated Learning Server...")
    server = FederatedServer(dataset)
    
    try:
        print("\nStarting Federated Learning Training...")
        final_accuracy = server.run_federated_learning()
        
        # Save results
        results_file = os.path.join(results_dir, 'final_results.txt')
        with open(results_file, 'w') as f:
            f.write(f"Final Validation Accuracy: {final_accuracy:.2f}%\n")
            f.write(f"Number of malware families: {dataset.num_classes}\n")
            f.write(f"Malware families: {', '.join(dataset.label_encoder.classes_)}\n")
        
        print(f"\nResults saved to {results_file}")
        print("Training completed successfully!")
    
    except Exception as e:
        print(f"An error occurred during training: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Android Malware Detection using Federated Learning
-------------------------------------------------

Loading Android malware datasets...
Found 12 files matching the pattern
Maximum feature dimension across all files: 121
Loaded 22029 samples across 12 malware families
Each sample has 121 features
Malware families: ['Adware' 'Backdoor' 'FileInfector' 'PUA' 'Ransomware' 'Riskware'
 'Scareware' 'Trojan' 'Trojan_Banker' 'Trojan_Dropper' 'Trojan_SMS'
 'Trojan_Spy']

Dataset Statistics:
Number of samples: 22029
Number of features: 121
Number of malware families: 12

Initializing Federated Learning Server...

Starting Federated Learning Training...

Client Dataset Sizes:
Client 0: 5874 samples
Client 1: 5874 samples
Client 2: 5875 samples

Communication Round 1
Client Accuracies: ['100.00%', '100.00%', '100.00%']
Global Validation Accuracy: 100.00%

Communication Round 2
Client Accuracies: ['100.00%', '100.00%', '100.00%']
Global Validation Accuracy: 100.00%

Communication Round 3
Client Acc

In [4]:
# Genetic Algorithm for Client Selection in Federated Learning
# Improved Android Malware Detection using Federated Learning
# Date: 2021-09-30
import os
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

# Configuration
CONFIG = {
    'data_path': r'C:\Users\USER\Documents\NTUST\Conference_Workshop_Seminar\Android\Dataset\AndMal2020-dynamic-BeforeAndAfterReboot\Cleaned_Files\normalized_dataset',
    'num_clients': 3,
    'epochs': 5,
    'learning_rate': 0.001,
    'batch_size': 32,
    'communication_rounds': 5,
    'train_split_ratio': 0.8  # 80% train, 20% validation
}

# Improved Android Malware Dataset
class AndroidMalwareDataset:
    def __init__(self, file_paths):
        self.data = []
        self.labels = []
        self.max_features = 0
        
        # First pass: determine the maximum number of features
        for file_path in file_paths:
            try:
                df = pd.read_csv(file_path)
                if 'Category' not in df.columns:
                    print(f"'Category' column not found in {file_path}")
                    continue
                
                # Count features (excluding Category column)
                num_features = df.select_dtypes(include=[np.number]).drop(['Category'], axis=1, errors='ignore').shape[1]
                self.max_features = max(self.max_features, num_features)
                
            except Exception as e:
                print(f"Error checking features in {file_path}: {e}")
        
        print(f"Maximum feature dimension across all files: {self.max_features}")
        
        # Collect all feature data
        all_features = []
        
        # Second pass: load data and collect features
        for file_path in file_paths:
            try:
                df = pd.read_csv(file_path)
                if 'Category' not in df.columns:
                    continue
                
                # Process features: Keep only numerical columns
                features_df = df.select_dtypes(include=[np.number]).drop(['Category'], axis=1, errors='ignore')
                features = features_df.values
                
                # Pad features to match max_features
                if features.shape[1] < self.max_features:
                    padding = np.zeros((features.shape[0], self.max_features - features.shape[1]))
                    features = np.hstack((features, padding))
                
                all_features.append(features)
                
            except Exception as e:
                print(f"Error collecting features from {file_path}: {e}")
        
        # Third pass: pad and add to dataset
        for file_path in file_paths:
            try:
                df = pd.read_csv(file_path)
                if 'Category' not in df.columns:
                    continue
                
                # Process features: Keep only numerical columns
                features_df = df.select_dtypes(include=[np.number]).drop(['Category'], axis=1, errors='ignore')
                features = features_df.values
                
                # Pad features to match max_features
                if features.shape[1] < self.max_features:
                    padding = np.zeros((features.shape[0], self.max_features - features.shape[1]))
                    features = np.hstack((features, padding))
                
                # Extract labels from the 'Category' column
                labels = df['Category'].values
                
                # Add to dataset
                for i in range(features.shape[0]):
                    self.data.append(features[i])
                    self.labels.append(labels[i])
                
            except Exception as e:
                print(f"Error loading {file_path}: {e}")
        
        # Convert category names to numerical labels
        self.label_encoder = LabelEncoder()
        self.encoded_labels = self.label_encoder.fit_transform(self.labels)
        self.num_classes = len(self.label_encoder.classes_)
        
        print(f"Loaded {len(self.data)} samples across {self.num_classes} malware families")
        print(f"Each sample has {self.max_features} features")
        print(f"Malware families: {self.label_encoder.classes_}")

    def get_data(self):
        return np.array(self.data), np.array(self.encoded_labels)

# Genetic Algorithm for Client Selection
class GeneticAlgorithm:
    def __init__(self, num_clients, population_size, generations, mutation_rate):
        self.num_clients = num_clients
        self.population_size = population_size
        self.generations = generations
        self.mutation_rate = mutation_rate

    def initialize_population(self):
        return [self.random_chromosome() for _ in range(self.population_size)]

    def random_chromosome(self):
        return [random.randint(0, 1) for _ in range(self.num_clients)]

    def fitness(self, chromosome, server):
        selected_clients = [i for i, gene in enumerate(chromosome) if gene == 1]
        if not selected_clients:
            return 0
        accuracy = server.evaluate_clients(selected_clients)
        return accuracy

    def selection(self, population, fitnesses):
        selected = random.choices(population, weights=fitnesses, k=self.population_size)
        return selected

    def crossover(self, parent1, parent2):
        point = random.randint(1, self.num_clients - 1)
        child1 = parent1[:point] + parent2[point:]
        child2 = parent2[:point] + parent1[point:]
        return child1, child2

    def mutate(self, chromosome):
        for i in range(self.num_clients):
            if random.random() < self.mutation_rate:
                chromosome[i] = 1 - chromosome[i]
        return chromosome

    def run(self, server):
        population = self.initialize_population()
        for generation in range(self.generations):
            fitnesses = [self.fitness(chromosome, server) for chromosome in population]
            population = self.selection(population, fitnesses)
            next_population = []
            for i in range(0, self.population_size, 2):
                parent1, parent2 = population[i], population[i + 1]
                child1, child2 = self.crossover(parent1, parent2)
                next_population.extend([self.mutate(child1), self.mutate(child2)])
            population = next_population
        best_chromosome = max(population, key=lambda chrom: self.fitness(chrom, server))
        return [i for i, gene in enumerate(best_chromosome) if gene == 1]

# Federated Learning Server
class FederatedServer:
    def __init__(self, dataset):
        self.dataset = dataset
        self.global_model = self._create_ensemble_model()
        self.global_accuracy_history = []
        
        # Prepare distributed datasets for clients
        self.client_datasets = self._prepare_client_datasets()
    
    def _create_ensemble_model(self):
        # Define base classifiers
        svm = SVC(probability=True)
        rf = RandomForestClassifier()
        dt = DecisionTreeClassifier()
        
        # Create ensemble model
        ensemble_model = VotingClassifier(estimators=[
            ('svm', svm),
            ('rf', rf),
            ('dt', dt)
        ], voting='soft')
        
        return ensemble_model
    
    def _prepare_client_datasets(self):
        # Split into train and validation
        X, y = self.dataset.get_data()
        train_size = int(CONFIG['train_split_ratio'] * len(X))
        val_size = len(X) - train_size
        X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=train_size, test_size=val_size, random_state=42)
        
        # Distribute training data among clients
        client_datasets = []
        total_train_samples = len(X_train)
        samples_per_client = total_train_samples // CONFIG['num_clients']
        
        for i in range(CONFIG['num_clients']):
            start_idx = i * samples_per_client
            end_idx = start_idx + samples_per_client if i < CONFIG['num_clients'] - 1 else total_train_samples
            client_subset = (X_train[start_idx:end_idx], y_train[start_idx:end_idx])
            client_datasets.append(client_subset)
        
        return {
            'train_datasets': client_datasets,
            'validation_dataset': (X_val, y_val)
        }
    
    def distribute_model(self):
        return self.global_model
    
    def evaluate_global_model(self):
        X_val, y_val = self.client_datasets['validation_dataset']
        self.global_model.fit(X_val, y_val)  # Fit the global model before evaluation
        y_pred = self.global_model.predict(X_val)
        
        accuracy = np.mean(y_pred == y_val) * 100
        self.global_accuracy_history.append(accuracy)
        
        return accuracy
    
    def generate_confusion_matrix(self, title="Global Model Confusion Matrix"):
        X_val, y_val = self.client_datasets['validation_dataset']
        y_pred = self.global_model.predict(X_val)
        
        # Convert numeric labels back to family names for better readability
        class_names = self.dataset.label_encoder.classes_
        cm = confusion_matrix(y_val, y_pred)
        
        plt.figure(figsize=(12, 10))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=class_names,
                   yticklabels=class_names)
        plt.title(title)
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.tight_layout()
        plt.savefig(f'{title.replace(" ", "_")}.png')
        plt.close()
        
        print(f"\nClassification Report for {title}:")
        print(classification_report(y_val, y_pred, target_names=class_names))
        
        return cm
    
    def run_federated_learning_with_ga(self):
        ga = GeneticAlgorithm(num_clients=CONFIG['num_clients'], population_size=10, generations=5, mutation_rate=0.1)
        for round in range(CONFIG['communication_rounds']):
            print(f"\nCommunication Round {round + 1}")
            selected_clients = ga.run(self)
            print(f"Selected Clients: {selected_clients}")
            client_models = []
            for client_id in selected_clients:
                X_train, y_train = self.client_datasets['train_datasets'][client_id]
                client_model = self._create_ensemble_model()
                client_model.fit(X_train, y_train)
                client_models.append(client_model)
            global_accuracy = self.evaluate_global_model()
            print(f"Global Validation Accuracy: {global_accuracy:.2f}%")
        self.generate_confusion_matrix("Final Global Model Confusion Matrix")
        return global_accuracy
    
    def evaluate_clients(self, selected_clients):
        accuracies = []
        for client_id in selected_clients:
            X_train, y_train = self.client_datasets['train_datasets'][client_id]
            client_model = self._create_ensemble_model()
            client_model.fit(X_train, y_train)
            accuracy = self._evaluate_client_model(client_model, X_train, y_train)
            accuracies.append(accuracy)
        return sum(accuracies) / len(accuracies) if accuracies else 0
    
    def _evaluate_client_model(self, model, X_train, y_train):
        y_pred = model.predict(X_train)
        accuracy = np.mean(y_pred == y_train) * 100
        return accuracy

# Main execution block
def main():
    print("Android Malware Detection using Federated Learning")
    print("-------------------------------------------------")
    
    results_dir = 'federated_learning_results'
    os.makedirs(results_dir, exist_ok=True)
    
    # Set random seeds for reproducibility
    np.random.seed(42)
    
    print("\nLoading Android malware datasets...")
    # Load all CSV files with 'after_reboot' in the filename
    files = glob.glob(CONFIG['data_path'] + '/*after_reboot*.csv')
    print(f"Found {len(files)} files matching the pattern")
    
    # Create dataset
    dataset = AndroidMalwareDataset(files)
    
    if dataset.num_classes == 0:
        print("Error: Failed to load dataset correctly. Check the file paths and data format.")
        return
    
    print(f"\nDataset Statistics:")
    print(f"Number of samples: {len(dataset.data)}")
    print(f"Number of features: {dataset.max_features}")
    print(f"Number of malware families: {dataset.num_classes}")
    
    print("\nInitializing Federated Learning Server...")
    server = FederatedServer(dataset)
    
    try:
        print("\nStarting Federated Learning Training with Genetic Algorithm...")
        final_accuracy = server.run_federated_learning_with_ga()
        
        # Save results
        results_file = os.path.join(results_dir, 'final_results.txt')
        with open(results_file, 'w') as f:
            f.write(f"Final Validation Accuracy: {final_accuracy:.2f}%\n")
            f.write(f"Number of malware families: {dataset.num_classes}\n")
            f.write(f"Malware families: {', '.join(dataset.label_encoder.classes_)}\n")
        
        print(f"\nResults saved to {results_file}")
        print("Training completed successfully!")
    
    except Exception as e:
        print(f"An error occurred during training: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

Android Malware Detection using Federated Learning
-------------------------------------------------

Loading Android malware datasets...
Found 12 files matching the pattern


Maximum feature dimension across all files: 121
Loaded 22029 samples across 12 malware families
Each sample has 121 features
Malware families: ['Adware' 'Backdoor' 'FileInfector' 'PUA' 'Ransomware' 'Riskware'
 'Scareware' 'Trojan' 'Trojan_Banker' 'Trojan_Dropper' 'Trojan_SMS'
 'Trojan_Spy']

Dataset Statistics:
Number of samples: 22029
Number of features: 121
Number of malware families: 12

Initializing Federated Learning Server...

Starting Federated Learning Training with Genetic Algorithm...

Communication Round 1
Selected Clients: [1]
Global Validation Accuracy: 100.00%

Communication Round 2
Selected Clients: [0, 2]
Global Validation Accuracy: 100.00%

Communication Round 3
Selected Clients: [0, 1]
Global Validation Accuracy: 100.00%

Communication Round 4
Selected Clients: [0]
Global Validation Accuracy: 100.00%

Communication Round 5
Selected Clients: [0, 1, 2]
Global Validation Accuracy: 100.00%

Classification Report for Final Global Model Confusion Matrix:
                pre