In [1]:
import flwr as fl
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from numpy import newaxis
import math
import os
import pandas as pd
import torch.nn as nn
from scipy.stats import chi2
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Function
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from centralized import DomainAdaptationModel,ReviewDataset
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
class DomainAdaptationClient(fl.client.NumPyClient):
    def __init__(self, model, train_loader, test_loader, device):
        self.model = model
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.device = device

    # def get_parameters(self):
    #     return [val.cpu().numpy() for val in self.model.state_dict().values()]

    def set_parameters(self, parameters):
        state_dict = {k: torch.tensor(v) for k, v in zip(self.model.state_dict().keys(), parameters)}
        self.model.load_state_dict(state_dict, strict=False)

    def fit(self, parameters, config):
        self.set_parameters(parameters)
        print("Starting Trainning...")
        self.model.train()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
        for epoch in range(1):  # Perform a single epoch of training
            for batch in self.train_loader:
                input_ids, attention_mask, token_type_ids, labels = [x.to(self.device) for x in batch]
                print("input_ids shape:", input_ids.shape)
                print("attention_mask shape:", attention_mask.shape)
                if token_type_ids is not None:
                    print("token_type_ids shape:", token_type_ids.shape)
                inputs = {
                        "input_ids": input_ids.squeeze(axis=1),
                        "attention_mask": attention_mask.squeeze(axis=1),
                        "token_type_ids" : token_type_ids.squeeze(axis=1),
                        "labels" : labels
                    }
                for k, v in inputs.items():
                    inputs[k] = v.to(device)
                # sentiment_pred, domain_pred = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
                sentiment_pred, domain_pred = self.model(**inputs)
                loss = self.compute_loss(sentiment_pred, domain_pred, labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        return self.get_parameters(), len(self.train_loader.dataset), {}

    def compute_loss(self, sentiment_pred, domain_pred, labels):
        sentiment_loss = torch.nn.CrossEntropyLoss()(sentiment_pred, labels)
        domain_loss = torch.nn.CrossEntropyLoss()(domain_pred, labels)  # Assuming domain labels are in labels
        return sentiment_loss + domain_loss

    def evaluate(self, parameters, config):
        self.set_parameters(parameters)
        self.model.eval()
        correct = 0
        total = 0
        print("Starting evaluation...")
        total_batches = len(self.test_loader)
        with torch.no_grad():
            for batch_idx, batch in enumerate(self.test_loader):
                input_ids, attention_mask, token_type_ids, labels = [x.to(self.device) for x in batch]
                inputs = {
                        "input_ids": input_ids.squeeze(axis=1),
                        "attention_mask": attention_mask.squeeze(axis=1),
                        "token_type_ids" : token_type_ids.squeeze(axis=1),
                        "labels" : labels
                    }
                for k, v in inputs.items():
                    inputs[k] = v.to(device)
                # sentiment_pred, domain_pred = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
                sentiment_pred, domain_pred = self.model(**inputs)
                _, predicted = torch.max(sentiment_pred, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                print(f"Batch {batch_idx + 1}/{total_batches} completed: {(batch_idx + 1) / total_batches * 100:.2f}%")
        accuracy = correct / total
        return float(accuracy), len(self.test_loader.dataset),{"accuracy": accuracy}
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
torch.cuda.is_available()

True

In [10]:
#Model
model = DomainAdaptationModel()
tokenizer = AutoTokenizer.from_pretrained('jackaduma/SecBERT')

model.to(device)
# state_dict_path = 'C:/Users/hl100/Downloads/' + 'size250k_1epoch_1_model.bin'
# model.load_state_dict(torch.load(state_dict_path, map_location=device))
#Data
df_full = pd.read_csv('D:\Hoc\SecBert\SecBERT\multilabel-train\dataset_capec.csv')
# df_full = pd.read_csv('E:\Work_DatPT\Study\Master\SecBERT\dataset_capec.csv')
df_full['text'] = df_full['text'].str.replace('/',' ')
df_train = df_full.groupby('label').head(30)
# df_train = df_full
df_train = df_train.dropna(subset=['label'])
label_counts = df_train['label'].value_counts()
print(label_counts)
print(df_train.size)


train_df, test_df = train_test_split(df_train, test_size=0.2, random_state=42)
train_texts = train_df['text'].values
train_labels = train_df['label'].values
test_texts = test_df['text'].values
test_labels = test_df['label'].values

X_train, X_test, Y_train, Y_test = train_test_split(df_train['text'], df_train['label'],test_size=0.3, stratify=df_train['label'], shuffle = True)
df_train = pd.concat([X_train, Y_train], axis=1)
df_test = pd.concat([X_test, Y_test], axis=1)

# Tokenize the loaded texts for training and testing
train_dataset = ReviewDataset(df_train)
test_dataset = ReviewDataset(df_test)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers = 2)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers = 2) 


  df_full = pd.read_csv('D:\Hoc\SecBert\SecBERT\multilabel-train\dataset_capec.csv')


label
000 - Normal                              30
126 - Path Traversal                      30
66 - SQL Injection                        30
272 - Protocol Manipulation               30
310 - Scanning for Vulnerable Software    30
242 - Code Injection                      30
153 - Input Data Manipulation             30
194 - Fake the Source of Data             30
34 - HTTP Response Splitting              30
Name: count, dtype: int64
540




In [16]:
# Simulate clients
client = DomainAdaptationClient(model, train_loader, test_loader, device)
fl.client.start_numpy_client(server_address="localhost:8088", client=client)

	Instead, use `flwr.client.start_client()` by ensuring you first call the `.to_client()` method as shown below: 
	flwr.client.start_client(
		server_address='<IP>:<PORT>',
		client=FlowerClient().to_client(), # <-- where FlowerClient is of type flwr.client.NumPyClient object
	)
	Using `start_numpy_client()` is deprecated.

            This is a deprecated feature. It will be removed
            entirely in future versions of Flower.
        
[92mINFO [0m:      
[92mINFO [0m:      Received: evaluate message 861785b4-b6da-472b-943e-fc5c41f3c07f


Starting evaluation...
input_ids shape: torch.Size([8, 1, 512])
attention_mask shape: torch.Size([8, 1, 512])
token_type_ids shape: torch.Size([8, 1, 512])
input_ids shape: torch.Size([8, 1, 512])
attention_mask shape: torch.Size([8, 1, 512])
token_type_ids shape: torch.Size([8, 1, 512])
input_ids shape: torch.Size([8, 1, 512])
attention_mask shape: torch.Size([8, 1, 512])
token_type_ids shape: torch.Size([8, 1, 512])
input_ids shape: torch.Size([8, 1, 512])
attention_mask shape: torch.Size([8, 1, 512])
token_type_ids shape: torch.Size([8, 1, 512])
input_ids shape: torch.Size([8, 1, 512])
attention_mask shape: torch.Size([8, 1, 512])
token_type_ids shape: torch.Size([8, 1, 512])
input_ids shape: torch.Size([8, 1, 512])
attention_mask shape: torch.Size([8, 1, 512])
token_type_ids shape: torch.Size([8, 1, 512])
input_ids shape: torch.Size([8, 1, 512])
attention_mask shape: torch.Size([8, 1, 512])
token_type_ids shape: torch.Size([8, 1, 512])
input_ids shape: torch.Size([8, 1, 512])
atten

[92mINFO [0m:      Sent reply
[92mINFO [0m:      
[92mINFO [0m:      Received: train message 7e04cf37-15ca-4da1-afd1-82a8ff08b087


Starting Trainning...
input_ids shape: torch.Size([8, 1, 512])
attention_mask shape: torch.Size([8, 1, 512])
token_type_ids shape: torch.Size([8, 1, 512])


[91mERROR [0m:     Client raised an exception.
Traceback (most recent call last):
  File "c:\Users\hl100\AppData\Local\Programs\Python\Python312\Lib\site-packages\flwr\client\app.py", line 526, in start_client_internal
    reply_message = client_app(message=message, context=context)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\hl100\AppData\Local\Programs\Python\Python312\Lib\site-packages\flwr\client\client_app.py", line 143, in __call__
    return self._call(message, context)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\hl100\AppData\Local\Programs\Python\Python312\Lib\site-packages\flwr\client\client_app.py", line 126, in ffn
    out_message = handle_legacy_message_from_msgtype(
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\hl100\AppData\Local\Programs\Python\Python312\Lib\site-packages\flwr\client\message_handler\message_handler.py", line 129, in handle_legacy_message_from_msgtype
    fit_res = maybe_call_f

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
