In [None]:
!pip install datasets


In [None]:
import tensorflow as tf
from sklearn.utils import shuffle
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from datasets import load_dataset
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import load_model,Model
from sklearn.metrics import classification_report
import tenseal as ts
import numpy as np
import glob
import os
import cv2

In [None]:
# 1. Load the AG News corpus
dataset = load_dataset('ag_news')

# Pull the text and label columns out of the train and test splits
train_texts, train_labels = dataset['train']['text'], dataset['train']['label']
test_texts, test_labels = dataset['test']['text'], dataset['test']['label']


# Subset the dataset (similar to image classification approach)
def subset_data(texts, labels, num_classes=4, samples_per_class=1000):
    """Take the first ``samples_per_class`` examples of each class, then shuffle them.

    Args:
        texts: Sequence of documents, aligned with ``labels``.
        labels: Sequence of integer class ids.
        num_classes: Classes ``0 .. num_classes - 1`` are sampled.
        samples_per_class: Maximum number of examples kept per class; a class with
            fewer examples contributes all of them.

    Returns:
        ``(selected_texts, selected_labels)`` as two aligned lists, shuffled together
        with a fixed seed (``random_state=42``).
    """
    # Convert once up front instead of rebuilding both arrays on every class iteration.
    text_array = np.array(texts)
    label_array = np.array(labels)

    selected_texts = []
    selected_labels = []

    for label in range(num_classes):
        indices = np.where(label_array == label)[0]
        selected_indices = indices[:samples_per_class]  # Select first N samples per class
        selected_texts.extend(text_array[selected_indices])
        selected_labels.extend(label_array[selected_indices])

    # Shuffle the texts and labels together to maintain the mapping
    selected_texts, selected_labels = shuffle(selected_texts, selected_labels, random_state=42)

    return selected_texts, selected_labels


# Extract subsets (1000 samples per class)
train_texts, train_label = subset_data(train_texts, train_labels, num_classes=4, samples_per_class=1000)

# 3. Preprocess the data (tokenization and padding)
vocab_size = 10000    # vocabulary cap handed to the tokenizer
max_length = 100      # every sequence is padded / truncated to this many tokens
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

# The tokenizer is fitted on the training subset only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_texts)

# Texts -> integer sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Integer sequences -> fixed-length matrices
train_padded = pad_sequences(
    train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type
)
test_padded = pad_sequences(
    test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type
)

# Integer labels -> one-hot rows
train_labels = to_categorical(np.array(train_label), num_classes=4)
test_labels = to_categorical(np.array(test_labels), num_classes=4)

# 4. Function to build the model
def define_text_classification_model():
    """Build and compile the embedding -> average-pool -> dense classifier.

    The final layer emits four raw logits (no softmax), so the loss is configured
    with ``from_logits=True``. Reads the module-level ``max_length`` and ``vocab_size``.

    Returns:
        The compiled Keras ``Model``.
    """
    tokens = Input(shape=(max_length,), name='input')

    # Stack the layers functionally, threading one tensor through
    features = Embedding(vocab_size, 64, input_length=max_length, name='embedding')(tokens)
    features = GlobalAveragePooling1D(name='global_avg_pooling')(features)
    features = Dense(64, activation='relu', name='hidden_dense')(features)
    logits = Dense(4, name='logits')(features)

    classifier = Model(inputs=tokens, outputs=logits)
    classifier.compile(
        optimizer='adam',
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )
    return classifier

# 5. Save initial weights
# Build one reference model and snapshot its weights; create_model_with_initial_weights()
# loads this snapshot into every model it builds.
initial_model = define_text_classification_model()
initial_weights = initial_model.get_weights()

# Function to create a new model with the same initial weights
def create_model_with_initial_weights():
    """Return a freshly built model loaded with the module-level ``initial_weights``."""
    fresh_model = define_text_classification_model()
    fresh_model.set_weights(initial_weights)
    return fresh_model

# 6. Run the test harness with subset data
def run_text_classification_test_harness():
    """Train a model on the centralised subset, report test accuracy, and save it.

    Uses the module-level ``train_padded`` / ``train_labels`` for training and
    ``test_padded`` / ``test_labels`` for validation and evaluation. The trained
    model is written to ``text_classification_model.keras``.
    """
    classifier = create_model_with_initial_weights()

    # 20 epochs, validating against the test split after each one
    classifier.fit(
        train_padded,
        train_labels,
        epochs=20,
        validation_data=(test_padded, test_labels),
        batch_size=64,
    )

    _, accuracy = classifier.evaluate(test_padded, test_labels, verbose=0)
    print(f"> Test Accuracy: {accuracy * 100:.2f}%")

    classifier.save('text_classification_model.keras')

# 7. Run the test harness
run_text_classification_test_harness()

# 8. Predict sample text (the saved model emits raw logits, not probabilities)
sample_texts = ["The stock market is volatile."]
sample_sequences = tokenizer.texts_to_sequences(sample_texts)
sample_padded = pad_sequences(
    sample_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Reload the model that the harness just wrote to disk
loaded_model = load_model('text_classification_model.keras')

# Logits -> probabilities -> predicted class index
logits = loaded_model.predict(sample_padded)
softmax_output = tf.nn.softmax(logits, axis=-1)
predicted_class = tf.argmax(softmax_output, axis=-1).numpy()[0]
print(f"Predicted class: {predicted_class}")

In [None]:
# Show the layer-by-layer summary of the model reloaded from disk.
loaded_model.summary()

In [None]:
def subset_ag_news(args, num_classes=4, random_state=None):
    """Draw a class-imbalanced sample of the AG News training split.

    Args:
        args: Sequence of ``num_classes`` integers; ``args[c]`` is how many
            examples of class ``c`` to draw.
        num_classes: Number of classes (``0 .. num_classes - 1``) to sample.
        random_state: Optional seed for the initial shuffle that decides *which*
            examples of each class are drawn. ``None`` (the default) keeps the
            original behaviour of a different draw on every call.

    Returns:
        ``(selected_texts, selected_labels)`` as two aligned lists, shuffled
        together with a fixed seed (``random_state=42``).

    Raises:
        ValueError: If ``len(args) != num_classes`` or a class is asked for more
            examples than the training split contains.
    """
    if len(args) != num_classes:
        raise ValueError(f"Exactly {num_classes} integer arguments are required for the first {num_classes} classes.")

    # Load AG News dataset
    dataset = load_dataset('ag_news')

    # Extract texts and labels
    train_texts = dataset['train']['text']
    train_labels = dataset['train']['label']

    # Shuffle the dataset before selecting a subset
    train_texts, train_labels = shuffle(train_texts, train_labels, random_state=random_state)

    # Convert once up front instead of rebuilding both arrays on every class iteration.
    text_array = np.array(train_texts)
    label_array = np.array(train_labels)

    selected_texts = []
    selected_labels = []

    # Select the subset of samples for the first `num_classes` classes
    for label in range(num_classes):
        indices = np.where(label_array == label)[0]
        num_samples = args[label]

        if num_samples > len(indices):
            raise ValueError(f"Requested {num_samples} samples for class {label}, but there are only {len(indices)} samples available.")

        # Select the specified number of samples for this class
        selected_indices = indices[:num_samples]
        selected_texts.extend(text_array[selected_indices])
        selected_labels.extend(label_array[selected_indices])

    # Shuffle the selected texts and labels together to maintain the mapping
    selected_texts, selected_labels = shuffle(selected_texts, selected_labels, random_state=42)

    return selected_texts, selected_labels


def feature_extractor(texts, model = loaded_model):
    """Run ``texts`` through ``model`` and return the row-wise L2-normalised outputs.

    Args:
        texts: Iterable of raw strings.
        model: Model used for prediction; defaults to the ``loaded_model`` object that
            existed when this function was defined.

    Returns:
        2-D array with one unit-length row per input text.
    """
    # Same tokenizer and padding settings as used for training
    token_ids = tokenizer.texts_to_sequences(texts)
    model_inputs = pad_sequences(token_ids, maxlen=max_length, padding='post', truncating='post')

    raw_outputs = model.predict(model_inputs, verbose=0)

    # Divide every row by its Euclidean norm
    return raw_outputs / np.linalg.norm(raw_outputs, axis=1, keepdims=True)

In [None]:
def count_zeros(number):
    """Count the zeros between the decimal point and the first significant digit.

    For example ``0.5 -> 0``, ``0.05 -> 1``, ``0.004 -> 2``.

    Raises:
        ValueError: If ``number`` is not strictly between 0 and 1.
    """
    if number >= 1 or number <= 0:
        raise ValueError("Number should be between 0 and 1, exclusive.")

    # Start at -1 so the shift that finally reaches >= 1 is not counted as a zero.
    zeros = -1
    scaled = number
    while scaled < 1:
        scaled *= 10
        zeros += 1
    return zeros

# Base under-sampling method for text classification
def base_under_sampling(texts, labels, LD):
    """Repeatedly drop redundant samples of the largest class until ``min/max >= 0.05``.

    Each pass embeds the samples of the currently largest class with
    ``feature_extractor``, builds their cosine-similarity matrix, ranks the samples
    by mean similarity to the rest of the class and removes the top-ranked ones.

    Args:
        texts: Sequence of documents, aligned with ``labels``.
        labels: Sequence of integer class ids.
        LD: Per-class sample counts (the local distribution).

    Returns:
        ``(texts, labels, distribution)``. When the input is already balanced the
        three arguments are returned unchanged; otherwise numpy arrays of the
        surviving texts/labels and the updated class counts.
    """
    LI = min(LD) / max(LD)
    if LI >= 0.05:
        print("Client already balanced")
        return texts, labels, LD

    x_train = np.array(texts)
    y_train = np.array(labels)
    distr = LD

    while LI < 0.05:
        selected_class = np.argmax(distr)
        num = count_zeros(LI)
        # Number of top-ranked rows used as the initial selection; computed once per
        # pass instead of being re-derived at every use.
        seed_size = int((10 ** (num + 1)) / 5)

        # Get indices of the most over-represented class
        class_indices = np.where(y_train == selected_class)[0]
        class_texts = x_train[class_indices]

        # Extract feature vectors for the class (normalised model outputs)
        feature_vectors = feature_extractor(class_texts)

        # Compute cosine similarity matrix
        cosine_sim_matrix = cosine_similarity(feature_vectors)
        n = cosine_sim_matrix.shape[0]
        print(n)

        # Rank rows by mean similarity, most similar first
        row_means = np.mean(cosine_sim_matrix, axis=1)
        sorted_indices = np.argsort(row_means)[::-1]
        print(num + 1)

        # Select the top rows with the highest mean values
        selected_indices = sorted_indices[:seed_size]

        # Calculate variance of the selected rows
        selected_var = np.var(cosine_sim_matrix[selected_indices], axis=0)

        # Walk the remaining rows, keeping a row only if it lowers the total variance.
        # The walk stops at the first row that does not get added (the selection is
        # then still at its seed size).
        for i in sorted_indices[seed_size:]:
            temp_indices = np.append(selected_indices, i)
            temp_var = np.var(cosine_sim_matrix[temp_indices], axis=0)
            if np.sum(temp_var) < np.sum(selected_var):
                selected_var = temp_var
                selected_indices = temp_indices
            if len(selected_indices) == seed_size:
                break

        print(f"Texts to be removed: {len(selected_indices)} from class {selected_class}")

        # Translate selected indices to the original dataset
        remove_indices = class_indices[selected_indices]

        # Remove selected samples
        mask = np.ones(len(y_train), dtype=bool)
        mask[remove_indices] = False
        x_train = x_train[mask]
        y_train = y_train[mask]

        # Update the class distribution (on a copy, so the caller's LD is untouched)
        new_distr = np.copy(distr)
        new_distr[selected_class] -= len(remove_indices)
        distr = new_distr
        LI = min(distr) / max(distr)
        print(LI)

    return x_train, y_train, distr

In [None]:
import random
from nltk.corpus import wordnet
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

def over_sampling_from_folder_client(texts, labels, LD, min_imbalance=0.05):
    """Over-sample the minority class with SMOTE on TF-IDF features.

    Synthetic TF-IDF rows are turned back into text by joining the terms that have
    non-zero weight (``TfidfVectorizer.inverse_transform``), so generated documents
    are bags of words rather than sentences.

    Args:
        texts: Sequence of documents, aligned with ``labels``.
        labels: Sequence of integer class ids.
        LD: Per-class sample counts for ``labels``.
        min_imbalance: Stop once ``min(count) / max(count)`` reaches this ratio.

    Returns:
        ``(texts, labels, distribution)``. When the input already meets
        ``min_imbalance`` the three arguments are returned unchanged; otherwise new
        lists (the caller's sequences are not modified) and the updated counts.
    """
    LI = min(LD) / max(LD)
    print(f"Initial imbalance ratio: {LI}")
    if LI >= min_imbalance:
        print("Class imbalance is already below the threshold")
        return texts, labels, LD

    # Work on copies so the caller's sequences are not mutated and array inputs work.
    texts = list(texts)
    labels = list(labels)
    distr = np.array(LD)
    num_classes = len(distr)

    # Vectorize the texts
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(texts)

    ts_total = min(LD)
    while LI < min_imbalance:
        # Re-derive the minority class and the cap from the *current* counts; using
        # the stale input counts would keep targeting a class that has already grown.
        minority_class = np.argmin(distr)
        target_samples = int((1 / LI) + ts_total)
        print(target_samples)
        ts_total += target_samples
        max_samples = int(max(distr) * 0.5)  # Adjust this value as needed
        print(max_samples)

        # SMOTE's dict strategy is the desired class size *after* resampling, so it
        # must exceed the current size; otherwise nothing can be added and looping
        # again would never make progress.
        requested = min(target_samples, max_samples)
        if requested <= distr[minority_class]:
            print("No further over-sampling possible for class", minority_class)
            break

        smote = SMOTE(sampling_strategy={minority_class: requested})

        # Apply SMOTE and resample both features (X) and labels (y)
        X_resampled, y_resampled = smote.fit_resample(X, labels)

        # Only keep the newly added samples (appended at the end by SMOTE)
        new_samples = len(y_resampled) - len(labels)
        new_X_resampled = X_resampled[-new_samples:]
        new_y_resampled = y_resampled[-new_samples:]

        # Reconstruct text data from augmented features
        augmented_texts = vectorizer.inverse_transform(new_X_resampled)
        texts.extend(' '.join(terms) for terms in augmented_texts)
        labels.extend(new_y_resampled)

        # Re-vectorize to ensure texts and labels stay in sync
        X = vectorizer.fit_transform(texts)

        # Recalculate class distribution
        distr = np.bincount(labels, minlength=num_classes)
        LI = min(distr) / max(distr)
        print(f"Updated imbalance ratio: {LI}")

    return texts, labels, distr


In [None]:
class Server:
    """Singleton coordinator that owns a CKKS context and drives client re-balancing.

    The first ``Server()`` call creates the instance and its TenSEAL context; later
    calls return the same object. Clients register themselves via ``add_client``.
    """

    _instance = None  # Class variable to store the single instance

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, building it and its CKKS context on first use."""
        if cls._instance is None:
            # object.__new__ rejects extra arguments, so none are forwarded.
            instance = super().__new__(cls)
            # Generate CKKS keys upon instantiation
            instance.__context = ts.context(
                ts.SCHEME_TYPE.CKKS,
                poly_modulus_degree=8192,
                coeff_mod_bit_sizes=[60, 40, 40, 60]
            )
            instance.__context.generate_galois_keys()
            instance.__context.global_scale = 2**40
            instance.__public_key = instance.__context.public_key()
            instance.__relin_keys = instance.__context.relin_keys()
            instance.__total_distr = None
            instance.enc_norm_distr = None
            instance.clients = []
            # Publish only once fully initialised, so a failure above does not leave
            # a half-built singleton behind.
            cls._instance = instance
        return cls._instance

    def encrypt_with_public_key(self, data):
        """Encrypt a vector of data using the server's CKKS context."""
        return ts.ckks_vector(self.__context, data)

    def decryption(self):
        """Decrypt the cumulative class distribution accumulated by the clients."""
        return Client.cumulative_encrypted_distribution.decrypt()

    def decrypt(self, encrypted_data):
        """Decrypt the encrypted data."""
        return encrypted_data.decrypt()

    def global_imbalance(self):
        """Return min/max of the (rounded) global class counts."""
        total_distr = [round(x) for x in self.__total_distr]
        return min(total_distr) / max(total_distr)

    def norm_enc(self):
        """Return the L2-normalised global class distribution as an encrypted vector."""
        total_distr = [round(x) for x in self.__total_distr]
        total_distribution = np.linalg.norm(total_distr)
        normalized_distr = [x / total_distribution for x in total_distr]
        return ts.ckks_vector(self.__context, normalized_distr)

    def add_client(self, client):
        """Add a client to the server's list."""
        self.clients.append(client)

    def cos_sim(self):
        """Return the cosine similarity between the global distribution and each client's.

        Each client computes the dot product against the encrypted normalised global
        distribution; the server decrypts the results. Order matches ``self.clients``.
        """
        client_similarities = np.array([])
        self.enc_norm_distr = self.norm_enc()
        for client in self.clients:
            temp = client.cos_sim_calc(self.enc_norm_distr)
            temp_decrypt = temp.decrypt()
            client_similarities = np.append(client_similarities, temp_decrypt)
        return client_similarities

    def similarity_comparison(self, enc_selected_vectors, calling_client):
        """Ask every client except ``calling_client`` to multiply the encrypted vector
        with its own features; returns one result per other client, in client order."""
        results = []
        for client in self.clients:
            if client != calling_client:
                result = client.plain_enc_mul(enc_selected_vectors)
                results.append(result)
        return results

    def balance_check(self):
        """Drive under-/over-sampling on the clients until global imbalance >= 0.1.

        Clients are visited from most to least similar to the global distribution.
        Each round triggers an under-sampling step (``sampling=0``) followed by an
        over-sampling step (``sampling=1``) on the selected client. A client that
        returns ``0`` (already balanced) is skipped; once every client has been
        skipped the loop gives up.

        Returns:
            Always ``0``.
        """
        self.__total_distr = self.decryption()  # Get class distribution
        print("Initial Distribution: ", self.__total_distr)

        global_imbalance = self.global_imbalance()
        print("Initial Imbalance: ", global_imbalance)

        global_similarity_cl = self.cos_sim()
        print("Initial Global Similarity: ", global_similarity_cl)

        k = 1
        GI_flag1, GI_flag2 = 0, 0
        sorted_indices = np.argsort(global_similarity_cl)
        rounds = 0
        # Wrap around after the last registered client instead of assuming four.
        num_clients = len(self.clients)

        while global_imbalance < 0.1:
            selected_client_index = sorted_indices[-k]
            selected_client = self.clients[selected_client_index]
            print("Selected client: ", selected_client_index)

            # --- under-sampling step ---
            enc_results = selected_client.trigger(GI_flag1, GI_flag2, 0)
            if enc_results == 0:
                k = k + 1
                if k > num_clients:  # Handle wrap-around
                    k = 1
                    sorted_indices = np.argsort(global_similarity_cl)
                    selected_client_index = sorted_indices[-k]
                    selected_client = self.clients[selected_client_index]
                    if(selected_client.trigger(GI_flag1,GI_flag2, 0) == 0):
                        print("That's the best balance you can get")
                        print("number of rounds  : ", rounds)
                        break
                GI_flag1, GI_flag2 = 0, 0
                continue

            prev_global_imbalance = global_imbalance
            self.__total_distr = enc_results.decrypt()
            global_imbalance = self.global_imbalance()
            global_similarity_cl = self.cos_sim()

            print("Global Imbalance: ", global_imbalance)
            print("Global Similarities: ", global_similarity_cl)

            # No improvement: tell the client via the flag on its next trigger
            if prev_global_imbalance >= global_imbalance:
                GI_flag1 = 1

            # --- over-sampling step ---
            enc_results = selected_client.trigger(GI_flag1, GI_flag2, 1)
            if enc_results == 0:
                k += 1
                if k > num_clients:
                    k = 1
                    sorted_indices = np.argsort(global_similarity_cl)
                    selected_client_index = sorted_indices[-k]
                    selected_client = self.clients[selected_client_index]
                    if(selected_client.trigger(GI_flag1,GI_flag2, 0) == 0):
                        print("That's the best balance you can get")
                        print("number of rounds  : ", rounds)
                        break
                GI_flag1, GI_flag2 = 0, 0
                continue

            prev_global_imbalance = global_imbalance
            self.__total_distr = enc_results.decrypt()
            global_imbalance = self.global_imbalance()
            global_similarity_cl = self.cos_sim()

            print("Updated Global Imbalance: ", global_imbalance)
            print("Updated Global Similarities: ", global_similarity_cl)

            if prev_global_imbalance >= global_imbalance:
                GI_flag2 = 1

            rounds += 1

        print("Balancing completed in rounds: ", rounds)
        return 0

In [None]:
import math

class Client:
    """A federated participant holding a private, class-imbalanced sample of AG News.

    On construction a client draws its dataset, computes feature vectors for it,
    registers with the server and adds its class counts to the shared encrypted
    cumulative distribution.
    """

    cumulative_encrypted_distribution = None  # Class variable to store cumulative encrypted distribution

    def __init__(self, distr, server, client_id):
        """Create the client and register it with ``server``.

        Args:
            distr: Four integers, the number of samples to draw for each class.
            server: The ``Server`` this client registers with.
            client_id: Identifier used in log messages.

        Raises:
            ValueError: If ``distr`` does not have exactly four entries.
        """
        if len(distr) != 4:
            raise ValueError("class_samples must be a list or array of four integers.")

        # Client-owned CKKS context, used to encrypt the vectors sent to peers
        self.context = ts.context(
            ts.SCHEME_TYPE.CKKS,
            poly_modulus_degree=8192,
            coeff_mod_bit_sizes=[60, 40, 40, 60]
        )
        self.context.generate_galois_keys()
        self.context.global_scale = 2**40

        self.__distr = distr  # Private attribute
        self.__dataset = self.__allocate_dataset(distr)  # (texts, labels)
        self.__feature_vectors = feature_extractor(self.__dataset[0])
        self.__imbalance = min(self.__distr) / max(self.__distr)
        self.client_id = client_id
        self.server = server
        self._method_called = False
        self.i = None
        self.j = None
        self.server.add_client(self)
        self.__update_cumulative_distribution()

    def __allocate_dataset(self, distr):
        """Draw ``distr[c]`` samples of every class ``c`` from AG News."""
        return subset_ag_news(distr)

    def cos_sim_calc(self, enc_distr):
        """Return the encrypted dot product of ``enc_distr`` with this client's
        L2-normalised class distribution."""
        total = np.linalg.norm(self.__distr)
        norm_distr = [x/total for x in self.__distr]
        vec = enc_distr.mul(norm_distr)
        return vec.sum()

    def encrypt_server_pub_key(self):
        """Return ``(enc_counts, enc_fractions)``: the class counts and the
        sum-normalised class counts, both encrypted by the server."""
        total_distribution = sum(self.__distr)
        normalized_distr = [x/total_distribution for x in self.__distr]
        return self.server.encrypt_with_public_key(self.__distr), self.server.encrypt_with_public_key(normalized_distr)

    def use_dataset(self):
        """Print dataset / feature sizes and return ``(texts, labels, distribution)``."""
        x_subset, y_subset = self.__dataset
        print("Dataset Shape:", len(x_subset), len(y_subset))
        print("Feature vector:", self.__feature_vectors.shape)
        return x_subset, y_subset, self.__distr

    def __update_cumulative_distribution(self):
        """Add this client's class counts to the shared encrypted running total."""
        if Client.cumulative_encrypted_distribution is None:
            encrypted_distribution, _ = self.encrypt_server_pub_key()
            Client.cumulative_encrypted_distribution = encrypted_distribution
        else:
            Client.cumulative_encrypted_distribution = Client.cumulative_encrypted_distribution.add_(self.__distr)

    def plain_enc_mul(self, enc_vector):
        """Multiply ``enc_vector`` by this client's transposed feature matrix.

        The matrix is processed in column chunks of 3000; one encrypted result is
        returned per chunk.
        """
        chunk = 3000
        plain_matrix = np.transpose(np.array(self.__feature_vectors))
        n_columns = plain_matrix.shape[1]
        results = []
        for part in range(math.ceil(n_columns / chunk)):
            start = part * chunk
            stop = min(start + chunk, n_columns)
            results.append(enc_vector.matmul(plain_matrix[:, start:stop]))
        return results

    def under_sampling(self, selected_class, feature_vectors):
        """Remove samples of ``selected_class`` that are redundant across clients.

        Candidates are the samples of the class with the highest mean cosine
        similarity to the rest of the class. A candidate is removed when more than
        0.98 similarity is found with at least 300 samples held by the other
        clients (computed on encrypted vectors through the server).

        Args:
            selected_class: Class id being under-sampled.
            feature_vectors: Feature vectors of this client's samples of that class,
                in dataset order.

        Returns:
            Always ``0``.
        """
        cosine_sim_matrix = cosine_similarity(feature_vectors)
        n = cosine_sim_matrix.shape[0]
        print(n)

        # Rank rows by mean similarity, most similar first
        row_means = np.mean(cosine_sim_matrix, axis=1)
        sorted_indices = np.argsort(row_means)[::-1]

        # Seed the selection with the top 1/15 of the rows
        seed_size = int(n/15)
        selected_indices = sorted_indices[:seed_size]

        # Calculate the variance of the selected rows
        selected_var = np.var(cosine_sim_matrix[selected_indices], axis=0)

        # Walk the remaining rows, keeping a row only if it lowers the total variance.
        # The walk stops at the first row that does not get added.
        for i in sorted_indices[seed_size:]:
            temp_indices = np.append(selected_indices, i)
            temp_var = np.var(cosine_sim_matrix[temp_indices], axis=0)
            if np.sum(temp_var) < np.sum(selected_var):
                selected_var = temp_var
                selected_indices = temp_indices
            if len(selected_indices) == seed_size:
                break
        print("Samples selected : ",len(selected_indices))

        samples_to_remove = []
        for i in selected_indices:
            # Use the client's own server reference (not a module-level global) and
            # accept any number of peers rather than exactly three.
            enc_query = ts.ckks_vector(self.context, feature_vectors[i])
            peer_results = self.server.similarity_comparison(enc_query, self)
            similarities = np.array([])
            for peer_chunks in peer_results:
                for enc_chunk in peer_chunks:
                    similarities = np.concatenate((similarities, enc_chunk.decrypt()))
            c = np.sum(similarities > 0.98)
            if(c >= 300):
                samples_to_remove.append(i)
        print("images to be removed : ",len(samples_to_remove)," of class ",(selected_class))

        # Convert lists to numpy arrays for easy indexing
        X_train = np.array(self.__dataset[0])
        Y_train = np.array(self.__dataset[1])
        f_vectors = np.array(self.__feature_vectors)

        # Get indices of the samples belonging to the selected class
        selected_class_indices = np.where(Y_train == selected_class)[0]

        # Translate samples_to_remove indices to indices in the original dataset
        remove_indices = selected_class_indices[samples_to_remove]

        # Create masks for removing samples
        mask = np.ones(len(Y_train), dtype=bool)
        mask[remove_indices] = False

        # Apply masks to drop the selected samples everywhere
        X_train = X_train[mask]
        Y_train = Y_train[mask]
        f_vectors = f_vectors[mask]

        # Swap this client's contribution in the running total: old counts out ...
        Client.cumulative_encrypted_distribution = Client.cumulative_encrypted_distribution.sub_(self.__distr)
        new_distr = np.copy(self.__distr)
        new_distr[selected_class] -= len(remove_indices)
        self.__distr = new_distr
        self.__dataset = (X_train, Y_train) + tuple(self.__dataset[2:])
        self.__feature_vectors = f_vectors
        # ... new counts in.
        Client.cumulative_encrypted_distribution = Client.cumulative_encrypted_distribution.add_(self.__distr)
        return 0


    def over_sampling_from_folder_client(self, min_imbalance = 0.05):
        """Grow the minority class with SMOTE on TF-IDF features (one pass).

        Synthetic rows are converted back to text by joining the terms with
        non-zero TF-IDF weight. The dataset, feature vectors, class counts and the
        shared encrypted running total are all updated.

        Args:
            min_imbalance: Nothing is done when ``min/max`` of the class counts is
                already at least this ratio.

        Returns:
            Always ``0``.
        """
        LD = self.__distr
        print(LD)
        LI = min(LD) / max(LD)
        print(f"Initial imbalance ratio: {LI}")
        if LI >= min_imbalance:
            print("Class imbalance is already below the threshold")
            # Previously returned local names that were not assigned yet
            # (UnboundLocalError); match the method's normal return value instead.
            return 0

        texts = list(self.__dataset[0])
        labels = self.__dataset[1]

        # Vectorize the texts
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(texts)

        # Identify the minority class and its target size after resampling
        minority_class = np.argmin(LD)
        target_samples = int((1 / LI) + LD[minority_class])
        print(target_samples)
        smote = SMOTE(sampling_strategy={minority_class : target_samples})

        # Apply SMOTE and resample both features (X) and labels (y)
        X_resampled, y_resampled = smote.fit_resample(X, labels)

        # Only keep the newly added samples (appended at the end by SMOTE)
        new_samples = len(y_resampled) - len(labels)
        new_X_resampled = X_resampled[-new_samples:]
        new_y_resampled = y_resampled[-new_samples:]

        # Reconstruct text data from augmented features
        augmented_texts = vectorizer.inverse_transform(new_X_resampled)
        texts.extend([' '.join(text) for text in augmented_texts])
        labels = np.concatenate((labels, new_y_resampled))
        print(new_samples," : samples added successfully in class",minority_class)

        # Recalculate class distribution and the ratio that is reported
        distr = np.bincount(labels, minlength=len(LD))
        LI = min(distr) / max(distr)
        print(f"Updated imbalance ratio: {LI}")

        # Swap this client's contribution in the running total (old out, new in) so
        # the previous counts are not counted twice.
        Client.cumulative_encrypted_distribution = Client.cumulative_encrypted_distribution.sub_(LD)
        self.__distr = distr
        self.__dataset = (texts, labels) + tuple(self.__dataset[2:])
        self.__feature_vectors = feature_extractor(texts)
        Client.cumulative_encrypted_distribution = Client.cumulative_encrypted_distribution.add_(self.__distr)
        return 0


    def trigger(self, GI_flag1, GI_flag2, sampling):
        """Run one re-balancing step on this client.

        Args:
            GI_flag1: ``1`` when the previous under-sampling step did not improve the
                global imbalance; advances which class is under-sampled next.
            GI_flag2: ``1`` when the previous over-sampling step did not improve the
                global imbalance; only increments the counter ``self.j``.
            sampling: ``0`` for under-sampling, ``1`` for over-sampling.

        Returns:
            ``0`` if the client is already balanced (``min/max >= 0.05``), otherwise
            the shared encrypted cumulative distribution. ``None`` for any other
            ``sampling`` value.
        """
        self.__imbalance = min(self.__distr) / max(self.__distr)
        print(self.__imbalance)
        if self.__imbalance >= 0.05:
            print("Client ", self.client_id, " is already balanced")
            return 0

        if not self._method_called:
            self.i = 1
            self.j = 0
            self._method_called = True

        if sampling == 0:
            if GI_flag1 == 1:
                self.i += 1
                # NOTE(review): wrapping to 0 makes sorted_indices[-0] pick the
                # *smallest* class for under-sampling; wrapping to 1 may have been
                # intended - confirm before changing.
                if self.i == 3:
                    self.i = 0
            sorted_indices = np.argsort(self.__distr)
            selected_class = sorted_indices[-self.i]
            # The labels start out as a plain list; comparing a list to an int is a
            # single False, so convert to an array before the element-wise test.
            label_array = np.asarray(self.__dataset[1])
            selected_class_indices = np.where(label_array == selected_class)[0]
            selected_feature_vectors = [self.__feature_vectors[index] for index in selected_class_indices]
            self.under_sampling(selected_class, selected_feature_vectors)
            return Client.cumulative_encrypted_distribution

        if sampling == 1:
            if GI_flag2 == 1:
                self.j += 1
            self.over_sampling_from_folder_client()
            return Client.cumulative_encrypted_distribution

In [None]:
# Example usage:
server = Server()

# Per-client class counts, one entry per class
distr1 = [10, 500, 700, 4000]
distr2 = [20, 700, 500, 3000]
distr3 = [30, 400, 600, 3000]
distr4 = [100, 50, 200, 10]

# Alternative scenarios kept for reference:
#   distr1 = [10, 30, 700, 4000];  distr2 = [20, 40, 500, 3000]
#   distr3 = [30, 40, 600, 3000];  distr4 = [50, 50, 200, 10]
#
#   distr1 = [10, 30, 3600, 4000]; distr2 = [20, 40, 2700, 3000]
#   distr3 = [30, 40, 2600, 3000]; distr4 = [100, 50, 200, 10]
#
#   distr1 = [2, 100, 600, 4000];  distr2 = [3, 200, 700, 3000]
#   distr3 = [5, 150, 800, 3000];  distr4 = [30, 50, 20, 10]

# Clients are created in order with ids 0..3; each one registers itself with the server
client_1, client_2, client_3, client_4 = (
    Client(class_counts, server, client_id)
    for client_id, class_counts in enumerate((distr1, distr2, distr3, distr4))
)

In [None]:
# Gather every client's (texts, labels, class counts) and tally the total sample count
x_train = []
y_train = []
X = 0
client_distributions = []
for client in (client_1, client_2, client_3, client_4):
    x_t, y_t, client_distr = client.use_dataset()
    X += len(y_t)
    x_train.append(x_t)
    y_train.append(y_t)
    client_distributions.append(client_distr)

# Per-client class counts, in client order
a, b, c, d = client_distributions
for client_distr in client_distributions:
    print(client_distr)
print(X)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

# Function to update the client's local model
def client_update(local_model, texts, labels, epochs=4, batch_size=64):
    """Train ``local_model`` in place on one client's data and return it.

    Args:
        local_model: Model exposing a Keras-style ``fit``.
        texts: Model inputs for this client.
        labels: Targets aligned with ``texts``.
        epochs: Local training epochs per federated round.
        batch_size: Mini-batch size for local training.

    Returns:
        The same ``local_model`` object, after training.
    """
    local_model.fit(texts, labels, epochs=epochs, batch_size=batch_size, verbose=0)
    return local_model

# Function to update the global model on the server
def server_update(local_models):
    """Return a new global model whose weights are the plain mean of the clients'.

    Every weight tensor is averaged element-wise across ``local_models`` with equal
    weight per model.
    """
    per_model_weights = [client_model.get_weights() for client_model in local_models]

    # zip(*) groups the i-th weight tensor of every model together
    averaged_weights = []
    for tensor_group in zip(*per_model_weights):
        averaged_weights.append(np.mean(tensor_group, axis=0))

    updated_global_model = create_model_with_initial_weights()
    updated_global_model.set_weights(averaged_weights)
    return updated_global_model

# Function to evaluate the global model
def evaluate(global_model, texts, labels):
    """Return the accuracy (second value of ``global_model.evaluate``) on the given data."""
    metrics = global_model.evaluate(texts, labels, verbose=0)
    _, accuracy = metrics
    return accuracy


# Encode every client's texts. `tokenizer` comes from an earlier cell and is
# used as-is (it is not re-fitted per client), so all clients share one
# word-to-id mapping.
train_sequences0, train_sequences1, train_sequences2, train_sequences3 = [
    tokenizer.texts_to_sequences(client_texts) for client_texts in x_train[:4]
]
train_padded0, train_padded1, train_padded2, train_padded3 = [
    pad_sequences(client_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    for client_sequences in (train_sequences0, train_sequences1, train_sequences2, train_sequences3)
]

# One-hot encode the global training labels only while they are still 1-D class
# ids. This statement is repeated in several cells; calling to_categorical on an
# already one-hot array would append another axis of size 4 on every execution.
train_labels = np.array(train_labels)
if train_labels.ndim == 1:
    train_labels = to_categorical(train_labels, num_classes=4)

# Per-client training inputs and one-hot targets (4 classes).
x_train_A, x_train_B, x_train_C, x_train_D = train_padded0, train_padded1, train_padded2, train_padded3
y_train_A, y_train_B, y_train_C, y_train_D = [
    to_categorical(np.array(client_labels), num_classes=4) for client_labels in y_train[:4]
]

# The test split was padded and encoded in an earlier cell.
x_test, y_test = test_padded, test_labels

print(x_train_A.shape)
print(y_train_A.shape)

# Every client and the server start from a model built by the same factory.
initial_model_A = create_model_with_initial_weights()
initial_model_B = create_model_with_initial_weights()
initial_model_C = create_model_with_initial_weights()
initial_model_D = create_model_with_initial_weights()
global_model = create_model_with_initial_weights()

# Pair each client model with its local data so the round loop can treat all
# clients uniformly.
client_models = [initial_model_A, initial_model_B, initial_model_C, initial_model_D]
client_datasets = [
    (x_train_A, y_train_A),
    (x_train_B, y_train_B),
    (x_train_C, y_train_C),
    (x_train_D, y_train_D),
]

# Federated learning
num_rounds = 20
rounds = []
accuracies = []

for round_num in range(num_rounds):
    # The global model is scored BEFORE this round's training, so entry k is
    # the accuracy after k completed rounds.
    accuracy = evaluate(global_model, x_test, y_test)
    print(f"Round {round_num}: Accuracy = {accuracy * 100}")

    # Broadcast the current global weights, then train each client locally.
    # client_update trains in place and returns the same model object.
    global_weights = global_model.get_weights()
    for client_model, (client_x, client_y) in zip(client_models, client_datasets):
        client_model.set_weights(global_weights)
        client_update(client_model, client_x, client_y)

    # Aggregate model updates on the server
    global_model = server_update(client_models)
    rounds.append(round_num)
    accuracies.append(accuracy * 100)

# Final evaluation after the last aggregation. The label uses num_rounds rather
# than the loop variable so it is also correct when the loop body never ran.
accuracy = evaluate(global_model, x_test, y_test)
print(f"Round {num_rounds}: Final Accuracy = {accuracy * 100}")
rounds.append(num_rounds)
accuracies.append(accuracy * 100)

# Per-class precision/recall on the test set; y_test is one-hot, hence argmax.
probabilities = global_model.predict(x_test)
predicted_labels = np.argmax(probabilities, axis=1)
report = classification_report(y_test.argmax(axis=1), predicted_labels)
print(report)

# Accuracy-per-round curve for this run.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(rounds, accuracies, marker='o', label='Federated Model')
ax.set_title("Round vs Accuracy")
ax.set_xlabel("Round Number")
ax.set_ylabel("Accuracy (%)")
ax.grid(True)
ax.legend()
ax.set_ylim(0, 100)
plt.show()

# Persist the accuracy history (one value per line) and the trained global model.
np_accuracy = np.array(accuracies)
np.savetxt('accuracy.txt', np_accuracy, fmt='%f', delimiter=',')
global_model.save('text_model.keras')

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Requires the trained `global_model` plus `x_test` / `y_test` from the cells above.

# Class scores for every test sample (one column per class).
y_pred_prob = global_model.predict(x_test)

# y_test is assumed to be one-hot encoded, so its columns serve directly as the
# per-class binary targets.
y_test_binary = y_test
n_classes = y_test.shape[1]

# One ROC curve and AUC per class, each computed from that class's column.
fpr, tpr, roc_auc = {}, {}, {}
for class_idx in range(n_classes):
    class_truth = y_test_binary[:, class_idx]
    class_score = y_pred_prob[:, class_idx]
    fpr[class_idx], tpr[class_idx], _ = roc_curve(class_truth, class_score)
    roc_auc[class_idx] = auc(fpr[class_idx], tpr[class_idx])

# Draw all class curves on one figure; colours repeat if there are more classes
# than entries in the palette.
plt.figure()
colors = ['aqua', 'darkorange', 'cornflowerblue', 'green']
for class_idx in range(n_classes):
    curve_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(class_idx, roc_auc[class_idx])
    plt.plot(fpr[class_idx], tpr[class_idx], color=colors[class_idx % len(colors)], lw=2,
             label=curve_label)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Multiclass')
plt.legend(loc="lower right")
plt.show()

# Numeric AUC per class, in class order.
for class_idx, class_auc in roc_auc.items():
    print(f'AUC for class {class_idx}: {class_auc}')

In [None]:
# Under-sample each client's data. x_train / y_train are overwritten in place,
# so re-running this cell re-samples the already re-sampled data.
# NOTE(review): the third return value (LD_*) is presumably the client's label
# distribution after sampling — confirm against base_under_sampling.
sampling_targets = (distr1, distr2, distr3, distr4)
resampled_distributions = []
for client_idx, target_distr in enumerate(sampling_targets):
    x_train[client_idx], y_train[client_idx], client_ld = base_under_sampling(
        x_train[client_idx], y_train[client_idx], target_distr
    )
    resampled_distributions.append(client_ld)
LD_0, LD_1, LD_2, LD_3 = resampled_distributions

# Y is the grand total over all four clients' LD values; the training cell
# divides it by X.
y = LD_0 + LD_1 + LD_2 + LD_3
Y = np.sum(y)

In [None]:
# Values returned by the sampling step for each client, followed by their total.
for shown in (LD_0, LD_1, LD_2, LD_3, Y):
    print(shown)
print()
# The distr* arguments that were passed to the sampling step, for comparison.
for shown in (distr1, distr2, distr3, distr4):
    print(shown)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

def client_update(local_model, texts, labels, epochs=4, batch_size=64):
    """Train one client's local model in place on that client's data.

    Args:
        local_model: Object exposing a Keras-style ``fit`` method; it is
            trained in place.
        texts: Training inputs, forwarded to ``fit`` unchanged.
        labels: Training targets, forwarded to ``fit`` unchanged.
        epochs: Number of local epochs per call. Defaults to 4, the value
            that used to be hard-coded.
        batch_size: Mini-batch size. Defaults to 64, the value that used to
            be hard-coded.

    Returns:
        The same ``local_model`` object that was passed in.
    """
    # verbose=0 keeps per-epoch training logs out of the notebook output.
    local_model.fit(texts, labels, epochs=epochs, batch_size=batch_size, verbose=0)
    return local_model

def server_update(local_models, sample_counts=None):
    """Aggregate client models into a fresh global model by averaging weights.

    Args:
        local_models: Non-empty sequence of models exposing ``get_weights()``.
            Their weight lists are expected to line up tensor by tensor.
        sample_counts: Optional per-client weighting factors (for example the
            number of training samples per client), one entry per model. When
            omitted every client contributes equally, which is the original
            behaviour.

    Returns:
        A new model built by ``create_model_with_initial_weights()`` whose
        weights are the (optionally weighted) element-wise average of the
        clients' weights.

    Raises:
        ValueError: If ``local_models`` is empty, or ``sample_counts`` does not
            contain exactly one entry per model.
    """
    local_models = list(local_models)
    # Fail early with a clear message instead of building a model and then
    # failing inside set_weights with an empty weight list.
    if not local_models:
        raise ValueError("server_update needs at least one local model to average")
    if sample_counts is not None and len(sample_counts) != len(local_models):
        raise ValueError("sample_counts must contain exactly one entry per local model")

    local_weights = [model.get_weights() for model in local_models]
    # zip(*...) regroups the weights tensor by tensor, so every tensor is
    # averaged across the clients (axis 0 is the client axis).
    if sample_counts is None:
        averaged_weights = [np.mean(tensor_group, axis=0) for tensor_group in zip(*local_weights)]
    else:
        averaged_weights = [
            np.average(np.stack(tensor_group), axis=0, weights=sample_counts)
            for tensor_group in zip(*local_weights)
        ]

    # A fresh model is created for every aggregation and filled with the average.
    updated_global_model = create_model_with_initial_weights()
    updated_global_model.set_weights(averaged_weights)
    return updated_global_model

def evaluate(global_model, texts, labels):
    """Return the accuracy that ``global_model.evaluate`` reports.

    ``global_model.evaluate`` must return exactly two values; the first is
    discarded and the second is returned. This assumes the model was compiled
    with accuracy as its only metric.
    """
    loss_and_accuracy = global_model.evaluate(texts, labels, verbose=0)
    _loss, test_accuracy = loss_and_accuracy
    return test_accuracy


# Encode every client's (re-sampled) texts. `tokenizer` comes from an earlier
# cell and is used as-is (it is not re-fitted per client), so all clients share
# one word-to-id mapping.
train_sequences0, train_sequences1, train_sequences2, train_sequences3 = [
    tokenizer.texts_to_sequences(client_texts) for client_texts in x_train[:4]
]
train_padded0, train_padded1, train_padded2, train_padded3 = [
    pad_sequences(client_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    for client_sequences in (train_sequences0, train_sequences1, train_sequences2, train_sequences3)
]

# One-hot encode the global training labels only while they are still 1-D class
# ids. This statement is repeated in several cells; calling to_categorical on an
# already one-hot array would append another axis of size 4 on every execution.
train_labels = np.array(train_labels)
if train_labels.ndim == 1:
    train_labels = to_categorical(train_labels, num_classes=4)

# Per-client training inputs and one-hot targets (4 classes).
x_train_A, x_train_B, x_train_C, x_train_D = train_padded0, train_padded1, train_padded2, train_padded3
y_train_A, y_train_B, y_train_C, y_train_D = [
    to_categorical(np.array(client_labels), num_classes=4) for client_labels in y_train[:4]
]

# The test split was padded and encoded in an earlier cell.
x_test, y_test = test_padded, test_labels

print(x_train_A.shape)
print(y_train_A.shape)

# Every client and the server start from a model built by the same factory.
initial_model_A = create_model_with_initial_weights()
initial_model_B = create_model_with_initial_weights()
initial_model_C = create_model_with_initial_weights()
initial_model_D = create_model_with_initial_weights()
global_model = create_model_with_initial_weights()

# Pair each client model with its local data so the round loop can treat all
# clients uniformly.
client_models = [initial_model_A, initial_model_B, initial_model_C, initial_model_D]
client_datasets = [
    (x_train_A, y_train_A),
    (x_train_B, y_train_B),
    (x_train_C, y_train_C),
    (x_train_D, y_train_D),
]

# Federated learning
num_rounds = 20
rounds1 = []
accuracies1 = []

# Y (total after re-sampling) and X (total of the original client datasets)
# come from earlier cells. Reported accuracy is divided by max(1, Y/X), i.e. it
# is scaled down only when Y exceeds X.
denom = Y/X
accuracy_scale = max(1, denom)

for round_num in range(num_rounds):
    # The global model is scored BEFORE this round's training, so entry k is
    # the (normalised) accuracy after k completed rounds.
    accuracy = evaluate(global_model, x_test, y_test)
    accuracy = accuracy / accuracy_scale
    print(f"Round {round_num}: Accuracy = {accuracy * 100}")

    # Broadcast the current global weights, then train each client locally.
    # client_update trains in place and returns the same model object.
    global_weights = global_model.get_weights()
    for client_model, (client_x, client_y) in zip(client_models, client_datasets):
        client_model.set_weights(global_weights)
        client_update(client_model, client_x, client_y)

    # Aggregate model updates on the server
    global_model = server_update(client_models)
    rounds1.append(round_num)
    accuracies1.append(accuracy * 100)

# Final evaluation after the last aggregation. The label uses num_rounds rather
# than the loop variable so it is also correct when the loop body never ran.
accuracy = evaluate(global_model, x_test, y_test)
accuracy = accuracy / accuracy_scale
print(f"Round {num_rounds}: Final Accuracy = {accuracy * 100}")
rounds1.append(num_rounds)
accuracies1.append(accuracy * 100)

# Per-class precision/recall on the test set (not normalised); y_test is
# one-hot, hence argmax.
probabilities = global_model.predict(x_test)
predicted_labels = np.argmax(probabilities, axis=1)
report = classification_report(y_test.argmax(axis=1), predicted_labels)
print(report)

# Accuracy-per-round curve for the under-sampling run.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(rounds1, accuracies1, marker='o', label='Federated Model')
ax.set_title("Round vs Accuracy")
ax.set_xlabel("Round Number")
ax.set_ylabel("Accuracy (%)")
ax.grid(True)
ax.legend()
ax.set_ylim(0, 100)
plt.show()

# Persist the global model trained on the under-sampled data.
global_model.save('text_model_US.keras')

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Requires the trained `global_model` plus `x_test` / `y_test` from the cells above.

# Class scores for every test sample (one column per class).
y_pred_prob = global_model.predict(x_test)

# y_test is assumed to be one-hot encoded, so its columns serve directly as the
# per-class binary targets.
y_test_binary = y_test
n_classes = y_test.shape[1]

# One ROC curve and AUC per class, each computed from that class's column.
fpr, tpr, roc_auc = {}, {}, {}
for class_idx in range(n_classes):
    class_truth = y_test_binary[:, class_idx]
    class_score = y_pred_prob[:, class_idx]
    fpr[class_idx], tpr[class_idx], _ = roc_curve(class_truth, class_score)
    roc_auc[class_idx] = auc(fpr[class_idx], tpr[class_idx])

# Draw all class curves on one figure; colours repeat if there are more classes
# than entries in the palette.
plt.figure()
colors = ['aqua', 'darkorange', 'cornflowerblue', 'green']
for class_idx in range(n_classes):
    curve_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(class_idx, roc_auc[class_idx])
    plt.plot(fpr[class_idx], tpr[class_idx], color=colors[class_idx % len(colors)], lw=2,
             label=curve_label)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Multiclass')
plt.legend(loc="lower right")
plt.show()

# Numeric AUC per class, in class order.
for class_idx, class_auc in roc_auc.items():
    print(f'AUC for class {class_idx}: {class_auc}')

In [None]:
# Start from fresh copies of every client's dataset (the previous experiment
# overwrote x_train / y_train in place).
x_train, y_train = [], []
client_summaries = []
for fl_client in (client_1, client_2, client_3, client_4):
    x_t, y_t, client_summary = fl_client.use_dataset()
    x_train.append(x_t)
    y_train.append(y_t)
    client_summaries.append(client_summary)
a, b, c, d = client_summaries

# Show the third value returned by use_dataset() for each client, in client order.
for shown in (a, b, c, d):
    print(shown)

In [None]:
# Over-sample each client's data. x_train / y_train are overwritten in place,
# so re-running this cell re-samples the already re-sampled data.
# NOTE(review): the third return value (LD_*) is presumably the client's label
# distribution after sampling — confirm against over_sampling_from_folder_client.
sampling_targets = (distr1, distr2, distr3, distr4)
resampled_distributions = []
for client_idx, target_distr in enumerate(sampling_targets):
    x_train[client_idx], y_train[client_idx], client_ld = over_sampling_from_folder_client(
        x_train[client_idx], y_train[client_idx], target_distr
    )
    resampled_distributions.append(client_ld)
LD_0, LD_1, LD_2, LD_3 = resampled_distributions

# Y is the grand total over all four clients' LD values; the training cell
# divides it by X.
y = LD_0 + LD_1 + LD_2 + LD_3
Y = np.sum(y)

In [None]:
# Values returned by the sampling step for each client, followed by their total.
for shown in (LD_0, LD_1, LD_2, LD_3, Y):
    print(shown)
print()
# The distr* arguments that were passed to the sampling step, for comparison.
for shown in (distr1, distr2, distr3, distr4):
    print(shown)

In [None]:
def client_update(local_model, texts, labels, epochs=4, batch_size=64):
    """Train one client's local model in place on that client's data.

    Args:
        local_model: Object exposing a Keras-style ``fit`` method; it is
            trained in place.
        texts: Training inputs, forwarded to ``fit`` unchanged.
        labels: Training targets, forwarded to ``fit`` unchanged.
        epochs: Number of local epochs per call. Defaults to 4, the value
            that used to be hard-coded.
        batch_size: Mini-batch size. Defaults to 64, the value that used to
            be hard-coded.

    Returns:
        The same ``local_model`` object that was passed in.
    """
    # verbose=0 keeps per-epoch training logs out of the notebook output.
    local_model.fit(texts, labels, epochs=epochs, batch_size=batch_size, verbose=0)
    return local_model

def server_update(local_models, sample_counts=None):
    """Aggregate client models into a fresh global model by averaging weights.

    Args:
        local_models: Non-empty sequence of models exposing ``get_weights()``.
            Their weight lists are expected to line up tensor by tensor.
        sample_counts: Optional per-client weighting factors (for example the
            number of training samples per client), one entry per model. When
            omitted every client contributes equally, which is the original
            behaviour.

    Returns:
        A new model built by ``create_model_with_initial_weights()`` whose
        weights are the (optionally weighted) element-wise average of the
        clients' weights.

    Raises:
        ValueError: If ``local_models`` is empty, or ``sample_counts`` does not
            contain exactly one entry per model.
    """
    local_models = list(local_models)
    # Fail early with a clear message instead of building a model and then
    # failing inside set_weights with an empty weight list.
    if not local_models:
        raise ValueError("server_update needs at least one local model to average")
    if sample_counts is not None and len(sample_counts) != len(local_models):
        raise ValueError("sample_counts must contain exactly one entry per local model")

    local_weights = [model.get_weights() for model in local_models]
    # zip(*...) regroups the weights tensor by tensor, so every tensor is
    # averaged across the clients (axis 0 is the client axis).
    if sample_counts is None:
        averaged_weights = [np.mean(tensor_group, axis=0) for tensor_group in zip(*local_weights)]
    else:
        averaged_weights = [
            np.average(np.stack(tensor_group), axis=0, weights=sample_counts)
            for tensor_group in zip(*local_weights)
        ]

    # A fresh model is created for every aggregation and filled with the average.
    updated_global_model = create_model_with_initial_weights()
    updated_global_model.set_weights(averaged_weights)
    return updated_global_model

def evaluate(global_model, texts, labels):
    """Return the accuracy that ``global_model.evaluate`` reports.

    ``global_model.evaluate`` must return exactly two values; the first is
    discarded and the second is returned. This assumes the model was compiled
    with accuracy as its only metric.
    """
    loss_and_accuracy = global_model.evaluate(texts, labels, verbose=0)
    _loss, test_accuracy = loss_and_accuracy
    return test_accuracy


# Encode every client's (re-sampled) texts. `tokenizer` comes from an earlier
# cell and is used as-is (it is not re-fitted per client), so all clients share
# one word-to-id mapping.
train_sequences0, train_sequences1, train_sequences2, train_sequences3 = [
    tokenizer.texts_to_sequences(client_texts) for client_texts in x_train[:4]
]
train_padded0, train_padded1, train_padded2, train_padded3 = [
    pad_sequences(client_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    for client_sequences in (train_sequences0, train_sequences1, train_sequences2, train_sequences3)
]

# One-hot encode the global training labels only while they are still 1-D class
# ids. This statement is repeated in several cells; calling to_categorical on an
# already one-hot array would append another axis of size 4 on every execution.
train_labels = np.array(train_labels)
if train_labels.ndim == 1:
    train_labels = to_categorical(train_labels, num_classes=4)

# Per-client training inputs and one-hot targets (4 classes).
x_train_A, x_train_B, x_train_C, x_train_D = train_padded0, train_padded1, train_padded2, train_padded3
y_train_A, y_train_B, y_train_C, y_train_D = [
    to_categorical(np.array(client_labels), num_classes=4) for client_labels in y_train[:4]
]

# The test split was padded and encoded in an earlier cell.
x_test, y_test = test_padded, test_labels


# Every client and the server start from a model built by the same factory.
initial_model_A = create_model_with_initial_weights()
initial_model_B = create_model_with_initial_weights()
initial_model_C = create_model_with_initial_weights()
initial_model_D = create_model_with_initial_weights()
global_model = create_model_with_initial_weights()

# Pair each client model with its local data so the round loop can treat all
# clients uniformly.
client_models = [initial_model_A, initial_model_B, initial_model_C, initial_model_D]
client_datasets = [
    (x_train_A, y_train_A),
    (x_train_B, y_train_B),
    (x_train_C, y_train_C),
    (x_train_D, y_train_D),
]

# Federated learning
num_rounds = 20
rounds2 = []
accuracies2 = []

# Y (total after re-sampling) and X (total of the original client datasets)
# come from earlier cells. Reported accuracy is divided by max(1, Y/X), i.e. it
# is scaled down only when Y exceeds X.
denom = Y/X
print(denom)
accuracy_scale = max(1, denom)

for round_num in range(num_rounds):
    # The global model is scored BEFORE this round's training, so entry k is
    # the (normalised) accuracy after k completed rounds.
    accuracy = evaluate(global_model, x_test, y_test)
    accuracy = accuracy / accuracy_scale
    print(f"Round {round_num}: Accuracy = {accuracy * 100}")

    # Broadcast the current global weights, then train each client locally.
    # client_update trains in place and returns the same model object.
    global_weights = global_model.get_weights()
    for client_model, (client_x, client_y) in zip(client_models, client_datasets):
        client_model.set_weights(global_weights)
        client_update(client_model, client_x, client_y)

    # Aggregate model updates on the server
    global_model = server_update(client_models)
    rounds2.append(round_num)
    accuracies2.append(accuracy * 100)

# Final evaluation after the last aggregation. The label uses num_rounds rather
# than the loop variable so it is also correct when the loop body never ran.
accuracy = evaluate(global_model, x_test, y_test)
accuracy = accuracy / accuracy_scale
print(f"Round {num_rounds}: Final Accuracy = {accuracy * 100}")
rounds2.append(num_rounds)
accuracies2.append(accuracy * 100)

# Per-class precision/recall on the test set (not normalised); y_test is
# one-hot, hence argmax.
probabilities = global_model.predict(x_test)
predicted_labels = np.argmax(probabilities, axis=1)
report = classification_report(y_test.argmax(axis=1), predicted_labels)
print(report)

# Accuracy-per-round curve for the over-sampling run.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(rounds2, accuracies2, marker='o', label='Federated Model')
ax.set_title("Round vs Accuracy")
ax.set_xlabel("Round Number")
ax.set_ylabel("Accuracy (%)")
ax.grid(True)
ax.legend()
ax.set_ylim(0, 100)
plt.show()

# Persist the global model trained on the over-sampled data.
global_model.save('text_model_OS.keras')

In [None]:
# Class scores for every test sample (one column per class).
# roc_curve / auc are imported by an earlier ROC cell.
y_pred_prob = global_model.predict(x_test)

# y_test is assumed to be one-hot encoded, so its columns serve directly as the
# per-class binary targets.
y_test_binary = y_test
n_classes = y_test.shape[1]

# One ROC curve and AUC per class, each computed from that class's column.
fpr, tpr, roc_auc = {}, {}, {}
for class_idx in range(n_classes):
    class_truth = y_test_binary[:, class_idx]
    class_score = y_pred_prob[:, class_idx]
    fpr[class_idx], tpr[class_idx], _ = roc_curve(class_truth, class_score)
    roc_auc[class_idx] = auc(fpr[class_idx], tpr[class_idx])

# Draw all class curves on one figure; colours repeat if there are more classes
# than entries in the palette.
plt.figure()
colors = ['aqua', 'darkorange', 'cornflowerblue', 'green']
for class_idx in range(n_classes):
    curve_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(class_idx, roc_auc[class_idx])
    plt.plot(fpr[class_idx], tpr[class_idx], color=colors[class_idx % len(colors)], lw=2,
             label=curve_label)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Multiclass')
plt.legend(loc="lower right")
plt.show()

# Numeric AUC per class, in class order.
for class_idx, class_auc in roc_auc.items():
    print(f'AUC for class {class_idx}: {class_auc}')

In [None]:
# Start from fresh copies of every client's dataset (the previous experiment
# overwrote x_train / y_train in place).
x_train, y_train = [], []
client_summaries = []
for fl_client in (client_1, client_2, client_3, client_4):
    x_t, y_t, client_summary = fl_client.use_dataset()
    x_train.append(x_t)
    y_train.append(y_t)
    client_summaries.append(client_summary)
a, b, c, d = client_summaries

# Y is the grand total over the four clients' summaries.
y = a + b + c + d
Y = np.sum(y)

# Show each client's summary in client order, then the total.
for shown in (a, b, c, d, Y):
    print(shown)

In [None]:
# Ask the server to run its balance check.
# NOTE(review): balance_check() is defined outside this view; the client
# datasets are re-read right after this call, which suggests it changes what
# the clients hold — confirm.
server.balance_check()

In [None]:
# Re-read every client's dataset after server.balance_check() has run.
x_train, y_train = [], []
client_summaries = []
for fl_client in (client_1, client_2, client_3, client_4):
    x_t, y_t, client_summary = fl_client.use_dataset()
    x_train.append(x_t)
    y_train.append(y_t)
    client_summaries.append(client_summary)
a, b, c, d = client_summaries

# Y is the grand total over the four clients' summaries; the training cell
# divides it by X.
y = a + b + c + d
Y = np.sum(y)

# Show each client's summary in client order, then the total.
for shown in (a, b, c, d, Y):
    print(shown)

Dataset Shape: 5382 5382
Feature vector: (5382, 4)
Dataset Shape: 4185 4185
Feature vector: (4185, 4)
Dataset Shape: 3777 3777
Feature vector: (3777, 4)
Dataset Shape: 360 360
Feature vector: (360, 4)
[ 389  500  700 3793]
[ 161  700  500 2824]
[ 144  400  600 2633]
[100, 50, 200, 10]
13704


In [None]:
def client_update(local_model, texts, labels, epochs=4, batch_size=64):
    """Train one client's local model in place on that client's data.

    Args:
        local_model: Object exposing a Keras-style ``fit`` method; it is
            trained in place.
        texts: Training inputs, forwarded to ``fit`` unchanged.
        labels: Training targets, forwarded to ``fit`` unchanged.
        epochs: Number of local epochs per call. Defaults to 4, the value
            that used to be hard-coded.
        batch_size: Mini-batch size. Defaults to 64, the value that used to
            be hard-coded.

    Returns:
        The same ``local_model`` object that was passed in.
    """
    # verbose=0 keeps per-epoch training logs out of the notebook output.
    local_model.fit(texts, labels, epochs=epochs, batch_size=batch_size, verbose=0)
    return local_model

def server_update(local_models, sample_counts=None):
    """Aggregate client models into a fresh global model by averaging weights.

    Args:
        local_models: Non-empty sequence of models exposing ``get_weights()``.
            Their weight lists are expected to line up tensor by tensor.
        sample_counts: Optional per-client weighting factors (for example the
            number of training samples per client), one entry per model. When
            omitted every client contributes equally, which is the original
            behaviour.

    Returns:
        A new model built by ``create_model_with_initial_weights()`` whose
        weights are the (optionally weighted) element-wise average of the
        clients' weights.

    Raises:
        ValueError: If ``local_models`` is empty, or ``sample_counts`` does not
            contain exactly one entry per model.
    """
    local_models = list(local_models)
    # Fail early with a clear message instead of building a model and then
    # failing inside set_weights with an empty weight list.
    if not local_models:
        raise ValueError("server_update needs at least one local model to average")
    if sample_counts is not None and len(sample_counts) != len(local_models):
        raise ValueError("sample_counts must contain exactly one entry per local model")

    local_weights = [model.get_weights() for model in local_models]
    # zip(*...) regroups the weights tensor by tensor, so every tensor is
    # averaged across the clients (axis 0 is the client axis).
    if sample_counts is None:
        averaged_weights = [np.mean(tensor_group, axis=0) for tensor_group in zip(*local_weights)]
    else:
        averaged_weights = [
            np.average(np.stack(tensor_group), axis=0, weights=sample_counts)
            for tensor_group in zip(*local_weights)
        ]

    # A fresh model is created for every aggregation and filled with the average.
    updated_global_model = create_model_with_initial_weights()
    updated_global_model.set_weights(averaged_weights)
    return updated_global_model

def evaluate(global_model, texts, labels):
    """Return the accuracy that ``global_model.evaluate`` reports.

    ``global_model.evaluate`` must return exactly two values; the first is
    discarded and the second is returned. This assumes the model was compiled
    with accuracy as its only metric.
    """
    loss_and_accuracy = global_model.evaluate(texts, labels, verbose=0)
    _loss, test_accuracy = loss_and_accuracy
    return test_accuracy


# Encode every client's texts. `tokenizer` comes from an earlier cell and is
# used as-is (it is not re-fitted per client), so all clients share one
# word-to-id mapping.
train_sequences0, train_sequences1, train_sequences2, train_sequences3 = [
    tokenizer.texts_to_sequences(client_texts) for client_texts in x_train[:4]
]
train_padded0, train_padded1, train_padded2, train_padded3 = [
    pad_sequences(client_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    for client_sequences in (train_sequences0, train_sequences1, train_sequences2, train_sequences3)
]

# One-hot encode the global training labels only while they are still 1-D class
# ids. This statement is repeated in several cells; calling to_categorical on an
# already one-hot array would append another axis of size 4 on every execution.
train_labels = np.array(train_labels)
if train_labels.ndim == 1:
    train_labels = to_categorical(train_labels, num_classes=4)

# Per-client training inputs and one-hot targets (4 classes).
x_train_A, x_train_B, x_train_C, x_train_D = train_padded0, train_padded1, train_padded2, train_padded3
y_train_A, y_train_B, y_train_C, y_train_D = [
    to_categorical(np.array(client_labels), num_classes=4) for client_labels in y_train[:4]
]

# The test split was padded and encoded in an earlier cell.
x_test, y_test = test_padded, test_labels


# Every client and the server start from a model built by the same factory.
initial_model_A = create_model_with_initial_weights()
initial_model_B = create_model_with_initial_weights()
initial_model_C = create_model_with_initial_weights()
initial_model_D = create_model_with_initial_weights()
global_model = create_model_with_initial_weights()

# Pair each client model with its local data so the round loop can treat all
# clients uniformly.
client_models = [initial_model_A, initial_model_B, initial_model_C, initial_model_D]
client_datasets = [
    (x_train_A, y_train_A),
    (x_train_B, y_train_B),
    (x_train_C, y_train_C),
    (x_train_D, y_train_D),
]

# Federated learning
num_rounds = 20
rounds3 = []
accuracies3 = []

# Y (total over the clients' current datasets) and X (total of the original
# client datasets) come from earlier cells. Reported accuracy is divided by
# max(1, Y/X), i.e. it is scaled down only when Y exceeds X.
denom = Y/X
accuracy_scale = max(1, denom)

for round_num in range(num_rounds):
    # The global model is scored BEFORE this round's training, so entry k is
    # the (normalised) accuracy after k completed rounds.
    accuracy = evaluate(global_model, x_test, y_test)
    accuracy = accuracy / accuracy_scale
    print(f"Round {round_num}: Accuracy = {accuracy * 100}")

    # Broadcast the current global weights, then train each client locally.
    # client_update trains in place and returns the same model object.
    global_weights = global_model.get_weights()
    for client_model, (client_x, client_y) in zip(client_models, client_datasets):
        client_model.set_weights(global_weights)
        client_update(client_model, client_x, client_y)

    # Aggregate model updates on the server
    global_model = server_update(client_models)
    rounds3.append(round_num)
    accuracies3.append(accuracy * 100)

# Final evaluation after the last aggregation. The label uses num_rounds rather
# than the loop variable so it is also correct when the loop body never ran.
accuracy = evaluate(global_model, x_test, y_test)
accuracy = accuracy / accuracy_scale
print(f"Round {num_rounds}: Final Accuracy = {accuracy * 100}")
rounds3.append(num_rounds)
accuracies3.append(accuracy * 100)

# Per-class precision/recall on the test set (not normalised); y_test is
# one-hot, hence argmax.
probabilities = global_model.predict(x_test)
predicted_labels = np.argmax(probabilities, axis=1)
report = classification_report(y_test.argmax(axis=1), predicted_labels)
print(report)

# Accuracy-per-round curve for the FLICKER run.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(rounds3, accuracies3, marker='o', label='Federated Model')
ax.set_title("Round vs Accuracy")
ax.set_xlabel("Round Number")
ax.set_ylabel("Accuracy (%)")
ax.grid(True)
ax.legend()
ax.set_ylim(0, 100)
plt.show()

# Persist the global model from this run.
global_model.save('text_FLICKER.keras')

In [None]:
# Class scores for every test sample (one column per class).
# roc_curve / auc are imported by an earlier ROC cell.
y_pred_prob = global_model.predict(x_test)

# y_test is assumed to be one-hot encoded, so its columns serve directly as the
# per-class binary targets.
y_test_binary = y_test
n_classes = y_test.shape[1]

# One ROC curve and AUC per class, each computed from that class's column.
fpr, tpr, roc_auc = {}, {}, {}
for class_idx in range(n_classes):
    class_truth = y_test_binary[:, class_idx]
    class_score = y_pred_prob[:, class_idx]
    fpr[class_idx], tpr[class_idx], _ = roc_curve(class_truth, class_score)
    roc_auc[class_idx] = auc(fpr[class_idx], tpr[class_idx])

# Draw all class curves on one figure; colours repeat if there are more classes
# than entries in the palette.
plt.figure()
colors = ['aqua', 'darkorange', 'cornflowerblue', 'green']
for class_idx in range(n_classes):
    curve_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(class_idx, roc_auc[class_idx])
    plt.plot(fpr[class_idx], tpr[class_idx], color=colors[class_idx % len(colors)], lw=2,
             label=curve_label)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Multiclass')
plt.legend(loc="lower right")
plt.show()

# Numeric AUC per class, in class order.
for class_idx, class_auc in roc_auc.items():
    print(f'AUC for class {class_idx}: {class_auc}')

In [None]:
# Overlay the accuracy histories of the four experiments run above.
# Each entry: (round numbers, accuracies in %, marker, legend label).
comparison_series = [
    (rounds, accuracies, 'o', 'Training with Class Imbalance'),
    (rounds1, accuracies1, 'h', 'Under Sampling'),
    (rounds2, accuracies2, 's', 'Over Sampling'),
    (rounds3, accuracies3, '>', 'FLICKER'),
]

fig, ax = plt.subplots(figsize=(10, 6))
for series_rounds, series_accuracies, series_marker, series_label in comparison_series:
    ax.plot(series_rounds, series_accuracies, marker=series_marker, markersize=8, linewidth=1,
            label=series_label)

ax.set_title("Round vs Accuracy", fontsize=16, fontweight='bold')
ax.set_xlabel("Round Number", fontsize=14, fontweight='bold')
ax.set_ylabel("Normalized Accuracy (%)", fontsize=14, fontweight='bold')
ax.grid(True)
ax.legend(fontsize=14)  # legend distinguishes the four strategies
# The y-axis is fixed to 20-90 %; points outside that window are not shown.
ax.set_ylim(20, 90)
plt.show()