In [1]:
import torch

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Check PyTorch version
print(f"PyTorch version: {torch.__version__}")

# Check CUDA version
print(f"CUDA version: {torch.version.cuda}")

Using device: cuda
PyTorch version: 2.7.0+cu126
CUDA version: 12.6


In [2]:
import pandas as pd

data_df = pd.read_csv("data/dataset.csv")
data_df['Score'] = data_df['Score'].apply(lambda x: sum(map(float, x.split(','))) / 2 / 100)  # Normalize score
data_df

Unnamed: 0,Keywords,Score,Class
0,reliable application,0.205,Reliability
1,issue alerts,0.205,Reliability
2,low complexity,0.205,Reliability
3,fast algorithms,0.205,Reliability
4,machine learning,0.205,Reliability
...,...,...,...
137,High-security safety connection,0.855,Safety
138,Comprehensive resource segmentation,0.855,Safety
139,Full device trust validation,0.855,Safety
140,Advanced asset segmentation,0.855,Safety


In [3]:
for tf in data_df["Class"].unique():
    print(tf)
    tf_df = data_df[data_df["Class"] == tf]
    print(tf_df.describe())
    print(tf_df["Score"].unique())

    
    print("\n")

Reliability
           Score
count  32.000000
mean    0.561250
std     0.289535
min     0.205000
25%     0.205000
50%     0.605000
75%     0.905000
max     0.905000
[0.205 0.605 0.905]


Privacy
           Score
count  30.000000
mean    0.638333
std     0.253708
min     0.305000
25%     0.305000
50%     0.705000
75%     0.905000
max     0.905000
[0.305 0.705 0.905]


Security
           Score
count  30.000000
mean    0.763333
std     0.102076
min     0.550000
25%     0.681250
50%     0.762500
75%     0.825000
max     0.950000
[0.9   0.75  0.825 0.675 0.775 0.875 0.65  0.95  0.725 0.625 0.7   0.85
 0.55 ]


Resilience
           Score
count  21.000000
mean    0.584762
std     0.278368
min     0.170000
25%     0.505000
50%     0.505000
75%     0.840000
max     0.840000
[0.17  0.505 0.84 ]


Safety
           Score
count  29.000000
mean    0.515345
std     0.274636
min     0.205000
25%     0.205000
50%     0.555000
75%     0.855000
max     0.855000
[0.205 0.555 0.855]




In [4]:
from sklearn.model_selection import train_test_split
# Split the data into train and validation sets
train_df, val_df = train_test_split(data_df, test_size=0.2, random_state=42)

In [None]:
from transformers import BertModel
import torch.nn as nn

class BERTForQuantification(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased'):
        super(BERTForQuantification, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        
        # Separate heads for each class (Reliability, Privacy, Security)
        self.reliability_head = nn.Linear(self.bert.config.hidden_size, 1)
        self.privacy_head = nn.Linear(self.bert.config.hidden_size, 1)
        self.security_head = nn.Linear(self.bert.config.hidden_size, 1)
        self.resilience_head = nn.Linear(self.bert.config.hidden_size, 1)
        self.safety_head = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, class_type):
        # BERT pooled output
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        
        # Use the correct regression head based on class_type
        if class_type == "Reliability":
            score = self.reliability_head(pooled_output)
        elif class_type == "Privacy":
            score = self.privacy_head(pooled_output)
        elif class_type == "Security":
            score = self.security_head(pooled_output)
        elif class_type == "Resilience":
            score = self.resilience_head(pooled_output)
        elif class_type == "Safety":
            score = self.safety_head(pooled_output)
        else:
            raise ValueError(f"Invalid class_type. Must be one of: Reliability, Privacy, Security.")
        
        return score

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error, r2_score
import os

class Trainer:
    def __init__(
        self,
        model,
        train_dataloader,
        val_dataloader,
        device,
        save_dir,
        lr=2e-5,
        weight_decay=0.01,
        early_stopping_patience=3,
        max_grad_norm=1.0
    ):
        self.model = model.to(device)
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.device = device
        self.save_dir = save_dir
        os.makedirs(self.save_dir, exist_ok=True)

        self.optimizer = optim.AdamW(
            filter(lambda p: p.requires_grad, self.model.parameters()),
            lr=lr,
            weight_decay=weight_decay
        )
        self.criterion = nn.MSELoss()
        self.max_grad_norm = max_grad_norm
        self.early_stopping_patience = early_stopping_patience

    def train(self, epochs):
        best_val_loss = float("inf")
        patience_counter = 0

        for epoch in range(1, epochs + 1):
            print(f"\nEpoch {epoch}/{epochs}")
            self.model.train()
            running_loss = 0.0
            all_targets = []
            all_predictions = []

            for batch in tqdm(self.train_dataloader, desc="Training"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                score = batch['score'].to(self.device)
                class_type = batch['class_type']

                self.optimizer.zero_grad()

                # Forward pass
                outputs = []
                for i in range(len(input_ids)):
                    output = self.model(
                        input_ids[i].unsqueeze(0),
                        attention_mask[i].unsqueeze(0),
                        class_type[i]
                    )
                    outputs.append(output)

                outputs = torch.cat(outputs).squeeze(1)
                loss = self.criterion(outputs, score)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
                self.optimizer.step()

                running_loss += loss.item()
                all_predictions.extend(outputs.detach().cpu().numpy())
                all_targets.extend(score.cpu().numpy())

            # Training metrics
            mae, mse, rmse, r2, mape = self.calculate_metrics(all_targets, all_predictions)
            avg_train_loss = running_loss / len(self.train_dataloader)
            print(f"Training loss: {avg_train_loss:.4f}")
            print(f"Training Metrics - MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}, MAPE: {mape:.4f}")

            # Validation
            val_loss = self.validate(epoch)

            # Early stopping check
            if val_loss < best_val_loss:
                print("✅ Validation loss improved. Saving model.")
                best_val_loss = val_loss
                patience_counter = 0
                self.save_model(epoch)
            else:
                patience_counter += 1
                print(f"⚠️ Validation loss did not improve. Patience: {patience_counter}/{self.early_stopping_patience}")
                if patience_counter >= self.early_stopping_patience:
                    print("⏹️ Early stopping triggered.")
                    break

    def validate(self, epoch):
        self.model.eval()
        running_val_loss = 0.0
        all_targets = []
        all_predictions = []

        with torch.no_grad():
            for batch in tqdm(self.val_dataloader, desc="Validation"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                score = batch['score'].to(self.device)
                class_type = batch['class_type']

                outputs = []
                for i in range(len(input_ids)):
                    output = self.model(
                        input_ids[i].unsqueeze(0),
                        attention_mask[i].unsqueeze(0),
                        class_type[i]
                    )
                    outputs.append(output)

                outputs = torch.cat(outputs).squeeze(1)
                val_loss = self.criterion(outputs, score)
                running_val_loss += val_loss.item()

                all_predictions.extend(outputs.cpu().numpy())
                all_targets.extend(score.cpu().numpy())

        mae, mse, rmse, r2, mape = self.calculate_metrics(all_targets, all_predictions)
        avg_val_loss = running_val_loss / len(self.val_dataloader)

        print(f"Validation loss: {avg_val_loss:.4f}")
        print(f"Validation Metrics - MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}, MAPE: {mape:.4f}")

        self.final_val_loss = avg_val_loss
        return avg_val_loss

    def calculate_metrics(self, targets, predictions):
        mae = mean_absolute_error(targets, predictions)
        mse = ((torch.tensor(targets) - torch.tensor(predictions)) ** 2).mean().item()
        rmse = mse ** 0.5
        r2 = r2_score(targets, predictions)
        mape = torch.mean(
            torch.abs((torch.tensor(targets) - torch.tensor(predictions)) / torch.tensor(targets))
        ).item() * 100
        return mae, mse, rmse, r2, mape

    def save_model(self, epoch):
        save_path = os.path.join(self.save_dir, f"best_model.pth")
        torch.save(self.model.state_dict(), save_path)
        print(f"Model saved at {save_path}")

    def get_validation_loss(self):
        return getattr(self, "final_val_loss", None)


In [7]:
import nltk
from nltk.corpus import wordnet
import random

nltk.download("wordnet")

def synonym_replacement(text, num_replacements=1):
    words = text.split()
    new_words = words.copy()
    candidates = [i for i, w in enumerate(words) if len(wordnet.synsets(w)) > 0]

    if not candidates:
        return text

    random.shuffle(candidates)
    n_replacements = min(num_replacements, len(candidates))

    for i in candidates[:n_replacements]:
        synonyms = wordnet.synsets(words[i])
        lemmas = [l.name().replace("_", " ") for s in synonyms for l in s.lemmas()]
        lemmas = list(set([l for l in lemmas if l.lower() != words[i].lower()]))
        if lemmas:
            new_words[i] = random.choice(lemmas)

    return " ".join(new_words)


[nltk_data] Downloading package wordnet to /home/bilito/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import BertTokenizer
import random
from nltk.corpus import wordnet

# Define augmentation techniques
def synonym_replacement(text, n=2):
    """
    Replace n random words in the text with their synonyms.
    """
    words = text.split()
    new_words = words.copy()
    random_word_list = list(set(words))
    random.shuffle(random_word_list)
    
    num_replaced = 0
    for word in random_word_list:
        synonyms = wordnet.synsets(word)
        if synonyms:
            synonym = random.choice(synonyms).lemmas()[0].name()
            if synonym != word:  # Ensure synonym is different
                new_words = [synonym if w == word else w for w in new_words]
                num_replaced += 1
            if num_replaced >= n:
                break
    return ' '.join(new_words)

def augment_text(text):
    """
    Apply a random augmentation technique to the input text.
    """
    techniques = [synonym_replacement]  # Add more techniques here if needed
    augmentation = random.choice(techniques)
    return augmentation(text)

# Dataset class
class TextDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        text = sample['Keywords']  # Text input
        class_type = sample['Class']  # Class type: Reliability, Privacy, etc.
        score = sample['Score']  # The actual score as the target

        # Tokenize the text using the provided tokenizer
        tokens = self.tokenizer(text, padding='max_length', max_length=self.max_len, truncation=True, return_tensors="pt")

        # Return the tokenized inputs and the corresponding score and class
        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'class_type': class_type,
            'score': torch.tensor(score, dtype=torch.float),
        }

def load_data_from_csv(file_path, n_augmentations=3):
    """
    Load dataset from CSV and augment the training data.
    """
    df = pd.read_csv(file_path)
    
    # Normalize scores between 0 and 1 if needed
    df['Score'] = df['Score'].apply(lambda x: sum(map(float, x.split(','))) / 2 / 100)  # Normalize score
    
    # Apply data augmentation
    augmented_rows = []
    for _, row in df.iterrows():
        original_text = row['Keywords']
        for _ in range(n_augmentations):
            augmented_text = augment_text(original_text)
            augmented_rows.append({'Keywords': augmented_text, 'Score': row['Score'], 'Class': row['Class']})
    
    # Append augmented rows to the original dataset
    augmented_df = pd.DataFrame(augmented_rows)
    df = pd.concat([df, augmented_df], ignore_index=True)

    return df

def load_dataset(file_path, batch_size=16, max_len=128, augment=True):
    """
    Loads the dataset from CSV, tokenizes the inputs, and returns DataLoader for training and validation.
    """
    # Load and optionally augment the data
    df = load_data_from_csv(file_path) if augment else pd.read_csv(file_path)
    data = df.to_dict(orient='records')

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    dataset = TextDataset(data, tokenizer, max_len=max_len)

    # Split the dataset into training and validation sets
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    # DataLoader objects
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    return train_dataloader, val_dataloader

In [9]:
# Load the dataset from CSV file
train_dataloader, val_dataloader = load_dataset(file_path='./data/dataset.csv', batch_size=32, augment=True)

# Initialize the model
model = BERTForQuantification()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


trainer = Trainer(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    device=device,
    save_dir="./models"
)

trainer.train(epochs=500)


Epoch 1/500


Training: 100%|██████████| 15/15 [00:15<00:00,  1.05s/it]


Training loss: 0.2399
Training Metrics - MAE: 0.4079, MSE: 0.2468, RMSE: 0.4968, R²: -2.6437, MAPE: 73.7513


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.16it/s]


Validation loss: 0.0793
Validation Metrics - MAE: 0.2295, MSE: 0.0792, RMSE: 0.2814, R²: -0.2424, MAPE: 60.9207
✅ Validation loss improved. Saving model.
Model saved at ./models/bert_model_epoch_1.pth

Epoch 2/500


Training: 100%|██████████| 15/15 [00:15<00:00,  1.00s/it]


Training loss: 0.0646
Training Metrics - MAE: 0.2088, MSE: 0.0665, RMSE: 0.2579, R²: 0.0183, MAPE: 54.7371


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.04it/s]


Validation loss: 0.0470
Validation Metrics - MAE: 0.1710, MSE: 0.0462, RMSE: 0.2150, R²: 0.2747, MAPE: 44.0199
✅ Validation loss improved. Saving model.
Model saved at ./models/bert_model_epoch_2.pth

Epoch 3/500


Training: 100%|██████████| 15/15 [00:15<00:00,  1.04s/it]


Training loss: 0.0365
Training Metrics - MAE: 0.1526, MSE: 0.0368, RMSE: 0.1919, R²: 0.4561, MAPE: 40.1383


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.14it/s]


Validation loss: 0.0346
Validation Metrics - MAE: 0.1429, MSE: 0.0339, RMSE: 0.1842, R²: 0.4679, MAPE: 37.3584
✅ Validation loss improved. Saving model.
Model saved at ./models/bert_model_epoch_3.pth

Epoch 4/500


Training: 100%|██████████| 15/15 [00:15<00:00,  1.01s/it]


Training loss: 0.0214
Training Metrics - MAE: 0.1178, MSE: 0.0221, RMSE: 0.1486, R²: 0.6741, MAPE: 29.0098


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.16it/s]


Validation loss: 0.0228
Validation Metrics - MAE: 0.1167, MSE: 0.0234, RMSE: 0.1529, R²: 0.6334, MAPE: 24.0660
✅ Validation loss improved. Saving model.
Model saved at ./models/bert_model_epoch_4.pth

Epoch 5/500


Training: 100%|██████████| 15/15 [00:14<00:00,  1.02it/s]


Training loss: 0.0150
Training Metrics - MAE: 0.0923, MSE: 0.0156, RMSE: 0.1248, R²: 0.7701, MAPE: 20.3929


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.18it/s]


Validation loss: 0.0159
Validation Metrics - MAE: 0.1012, MSE: 0.0166, RMSE: 0.1287, R²: 0.7402, MAPE: 22.6432
✅ Validation loss improved. Saving model.
Model saved at ./models/bert_model_epoch_5.pth

Epoch 6/500


Training: 100%|██████████| 15/15 [00:14<00:00,  1.02it/s]


Training loss: 0.0126
Training Metrics - MAE: 0.0821, MSE: 0.0126, RMSE: 0.1122, R²: 0.8143, MAPE: 19.4923


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.21it/s]


Validation loss: 0.0122
Validation Metrics - MAE: 0.0848, MSE: 0.0127, RMSE: 0.1128, R²: 0.8006, MAPE: 18.5505
✅ Validation loss improved. Saving model.
Model saved at ./models/bert_model_epoch_6.pth

Epoch 7/500


Training: 100%|██████████| 15/15 [00:15<00:00,  1.01s/it]


Training loss: 0.0108
Training Metrics - MAE: 0.0764, MSE: 0.0107, RMSE: 0.1032, R²: 0.8427, MAPE: 17.2254


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.08it/s]


Validation loss: 0.0187
Validation Metrics - MAE: 0.0978, MSE: 0.0187, RMSE: 0.1368, R²: 0.7064, MAPE: 20.0636
⚠️ Validation loss did not improve. Patience: 1/3

Epoch 8/500


Training: 100%|██████████| 15/15 [00:14<00:00,  1.00it/s]


Training loss: 0.0082
Training Metrics - MAE: 0.0671, MSE: 0.0083, RMSE: 0.0910, R²: 0.8777, MAPE: 14.5114


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.13it/s]


Validation loss: 0.0125
Validation Metrics - MAE: 0.0789, MSE: 0.0132, RMSE: 0.1148, R²: 0.7933, MAPE: 15.1694
⚠️ Validation loss did not improve. Patience: 2/3

Epoch 9/500


Training: 100%|██████████| 15/15 [00:15<00:00,  1.01s/it]


Training loss: 0.0056
Training Metrics - MAE: 0.0589, MSE: 0.0059, RMSE: 0.0770, R²: 0.9124, MAPE: 12.8349


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.17it/s]


Validation loss: 0.0116
Validation Metrics - MAE: 0.0746, MSE: 0.0123, RMSE: 0.1111, R²: 0.8064, MAPE: 14.2923
✅ Validation loss improved. Saving model.
Model saved at ./models/bert_model_epoch_9.pth

Epoch 10/500


Training: 100%|██████████| 15/15 [00:15<00:00,  1.02s/it]


Training loss: 0.0045
Training Metrics - MAE: 0.0502, MSE: 0.0045, RMSE: 0.0667, R²: 0.9342, MAPE: 10.8766


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.21it/s]


Validation loss: 0.0106
Validation Metrics - MAE: 0.0724, MSE: 0.0113, RMSE: 0.1062, R²: 0.8231, MAPE: 13.9320
✅ Validation loss improved. Saving model.
Model saved at ./models/bert_model_epoch_10.pth

Epoch 11/500


Training: 100%|██████████| 15/15 [00:15<00:00,  1.01s/it]


Training loss: 0.0041
Training Metrics - MAE: 0.0505, MSE: 0.0042, RMSE: 0.0645, R²: 0.9386, MAPE: 11.1773


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.18it/s]


Validation loss: 0.0098
Validation Metrics - MAE: 0.0695, MSE: 0.0104, RMSE: 0.1020, R²: 0.8367, MAPE: 12.6959
✅ Validation loss improved. Saving model.
Model saved at ./models/bert_model_epoch_11.pth

Epoch 12/500


Training: 100%|██████████| 15/15 [00:15<00:00,  1.01s/it]


Training loss: 0.0041
Training Metrics - MAE: 0.0475, MSE: 0.0037, RMSE: 0.0612, R²: 0.9446, MAPE: 10.4879


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.15it/s]


Validation loss: 0.0085
Validation Metrics - MAE: 0.0611, MSE: 0.0089, RMSE: 0.0944, R²: 0.8602, MAPE: 11.7142
✅ Validation loss improved. Saving model.
Model saved at ./models/bert_model_epoch_12.pth

Epoch 13/500


Training: 100%|██████████| 15/15 [00:15<00:00,  1.03s/it]


Training loss: 0.0029
Training Metrics - MAE: 0.0423, MSE: 0.0030, RMSE: 0.0546, R²: 0.9561, MAPE: 9.2908


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.07it/s]


Validation loss: 0.0099
Validation Metrics - MAE: 0.0672, MSE: 0.0105, RMSE: 0.1024, R²: 0.8355, MAPE: 12.6180
⚠️ Validation loss did not improve. Patience: 1/3

Epoch 14/500


Training: 100%|██████████| 15/15 [00:15<00:00,  1.02s/it]


Training loss: 0.0027
Training Metrics - MAE: 0.0400, MSE: 0.0027, RMSE: 0.0517, R²: 0.9605, MAPE: 9.1517


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.02it/s]


Validation loss: 0.0097
Validation Metrics - MAE: 0.0650, MSE: 0.0104, RMSE: 0.1021, R²: 0.8363, MAPE: 11.7729
⚠️ Validation loss did not improve. Patience: 2/3

Epoch 15/500


Training: 100%|██████████| 15/15 [00:15<00:00,  1.04s/it]


Training loss: 0.0025
Training Metrics - MAE: 0.0397, MSE: 0.0027, RMSE: 0.0515, R²: 0.9608, MAPE: 8.5732


Validation: 100%|██████████| 4/4 [00:00<00:00,  4.03it/s]

Validation loss: 0.0086
Validation Metrics - MAE: 0.0618, MSE: 0.0093, RMSE: 0.0966, R²: 0.8537, MAPE: 11.5974
⚠️ Validation loss did not improve. Patience: 3/3
⏹️ Early stopping triggered.



