In [None]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117


In [None]:
!pip install transformers torch wandb accelerate

In [None]:
!pip install huggingface

In [4]:
import os
import torch
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset
import wandb

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Step 1: Login to W&B using the provided API key
wandb.login(key="348d74f5eaafa788d29878c98e6f78509d6e52d1")

# Initialize W&B project
wandb.init(
    project="subdomain-prediction",
    config={
        "learning_rate": 5e-5,
        "architecture": "distilgpt2",
        "dataset": "n0kovo_subdomains",
        "epochs": 10,  # Increased epochs for better learning
    }
)

# Step 2: Load the subdomain dataset from the subprediction directory
with open("subdomains.txt", "r") as f:
    subdomains = f.read().splitlines()

# Provide more diverse base domains for better training
base_domains = ["example.com", "testdomain.com", "mycompany.com"]
dataset = [{"input": base_domain, "output": subdomain} for base_domain in base_domains for subdomain in subdomains]

# Step 3: Define the dataset class
class SubdomainDataset(Dataset):
    def __init__(self, tokenizer, dataset):
        self.tokenizer = tokenizer
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        input_text = self.dataset[idx]["input"]
        output_text = self.dataset[idx]["output"]

        input_ids = self.tokenizer(input_text, padding="max_length", truncation=True, max_length=64, return_tensors="pt")["input_ids"].squeeze()
        labels = self.tokenizer(output_text, padding="max_length", truncation=True, max_length=64, return_tensors="pt")["input_ids"].squeeze()

        return {"input_ids": input_ids, "labels": labels}


# Step 4: Load the model and tokenizer from local path (or online if possible)
model_name = "distilgpt2"  # Use local path if downloaded manually
proxies = {
    "http": "http://localhost:7890",
    "https": "http://localhost:7890",
}

tokenizer = AutoTokenizer.from_pretrained("distilgpt2", proxies=proxies)
model = AutoModelForCausalLM.from_pretrained("distilgpt2", proxies=proxies)

model.resize_token_embeddings(len(tokenizer))
model.to(device)  # Move model to GPU or CPU based on availability

# Step 5: Prepare the dataset
train_dataset = SubdomainDataset(tokenizer, dataset)

# Step 6: Configure training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    num_train_epochs=10,
    logging_dir='./logs',
    logging_steps=50,
    save_steps=10000,
    save_total_limit=2,
    eval_strategy="no",
    report_to="wandb",
    fp16=True,  # Enable mixed precision training for faster computation on GPU
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
)

# Step 7: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

# Step 8: Start Training
trainer.train()

# Save the final model post-training
trainer.save_model("./results/final_model")

# Log the final metrics and close W&B
wandb.finish()


Using device: cuda




ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [5]:
import os
import torch
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset
import wandb

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Step 1: Login to W&B using the provided API key
wandb.login(key="348d74f5eaafa788d29878c98e6f78509d6e52d1")  # Make sure to use your actual W&B API key here

# Initialize W&B project
wandb.init(
    project="subdomain-prediction",
    config={
        "learning_rate": 5e-5,
        "architecture": "distilgpt2",
        "dataset": "n0kovo_subdomains",
        "epochs": 10,  # Increased epochs for better learning
    }
)

# Step 2: Load the subdomain dataset from the subprediction directory
with open("subdomains.txt", "r") as f:
    subdomains = f.read().splitlines()

# Provide more diverse base domains for better training
base_domains = ["example.com", "testdomain.com", "mycompany.com"]
dataset = [{"input": base_domain, "output": subdomain} for base_domain in base_domains for subdomain in subdomains]

# Step 3: Define the dataset class
class SubdomainDataset(Dataset):
    def __init__(self, tokenizer, dataset):
        self.tokenizer = tokenizer
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        input_text = self.dataset[idx]["input"]
        output_text = self.dataset[idx]["output"]

        # Tokenizing input and output with padding and truncation
        input_ids = self.tokenizer(input_text, padding="max_length", truncation=True, max_length=64, return_tensors="pt")["input_ids"].squeeze()
        labels = self.tokenizer(output_text, padding="max_length", truncation=True, max_length=64, return_tensors="pt")["input_ids"].squeeze()

        return {"input_ids": input_ids, "labels": labels}

# Step 4: Load the model and tokenizer from Hugging Face with proxy settings
model_name = "distilgpt2"
proxies = {
    "http": "http://localhost:7890",
    "https": "http://localhost:7890",
}

tokenizer = AutoTokenizer.from_pretrained(model_name, proxies=proxies)
model = AutoModelForCausalLM.from_pretrained(model_name, proxies=proxies)

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Resize the model's token embeddings to accommodate the new special token if added
model.resize_token_embeddings(len(tokenizer))

# Move model to device (GPU or CPU)
model.to(device)

# Step 5: Prepare the dataset
train_dataset = SubdomainDataset(tokenizer, dataset)

# Step 6: Configure training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    num_train_epochs=10,
    logging_dir='./logs',
    logging_steps=50,
    save_steps=10000,
    save_total_limit=2,
    evaluation_strategy="no",
    report_to="wandb",  # Report to Weights and Biases (W&B)
    fp16=torch.cuda.is_available(),  # Enable mixed precision training only if CUDA is available
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
)

# Step 7: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

# Step 8: Start Training
trainer.train()

# Step 9: Save the final model post-training
trainer.save_model("./results/final_model")

# Log the final metrics and close W&B
wandb.finish()


Using device: cuda




VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))



Step,Training Loss
50,6.4498
100,0.6351
150,0.6276
200,0.662
250,0.6048
300,0.5789
350,0.6086
400,0.6003
450,0.6324
500,0.604


KeyboardInterrupt: 

In [2]:
import os
import torch
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset
import wandb
from sklearn.model_selection import train_test_split

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Step 1: Securely login to W&B using an environment variable
wandb.login(key="348d74f5eaafa788d29878c98e6f78509d6e52d1")

# Initialize W&B project
wandb.init(
    project="subdomain-prediction",
    config={
        "learning_rate": 1e-5,
        "architecture": "distilgpt2",
        "dataset": "n0kovo_subdomains",
        "epochs": 5,
    }
)

# Step 2: Load the subdomain prefixes from the file
with open("subdomains.txt", "r") as f:
    subdomain_prefixes = f.read().splitlines()

# Normalize subdomain prefixes and base domains
subdomain_prefixes = [subdomain.strip().lower() for subdomain in subdomain_prefixes if subdomain.strip()]
base_domains = [domain.strip().lower() for domain in ["example.com", "testdomain.com", "mycompany.com"]]

# Ensure subdomain prefixes are not empty
if not subdomain_prefixes:
    raise ValueError("The 'subdomains.txt' file is empty or not properly loaded.")

# Prepare the dataset by combining base domains with subdomain prefixes
dataset = []
for base_domain in base_domains:
    for subdomain_prefix in subdomain_prefixes:
        dataset.append({"input": base_domain, "output": subdomain_prefix})

# Check the number of samples
print(f"Total dataset samples: {len(dataset)}")

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(dataset, test_size=0.1, random_state=42)
print(f"Training samples: {len(train_data)}, Validation samples: {len(val_data)}")

# Step 3: Define the dataset class
class SubdomainDataset(Dataset):
    def __init__(self, tokenizer, dataset, max_length=128):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        base_domain = self.dataset[idx]["input"]
        subdomain_prefix = self.dataset[idx]["output"]
        combined_text = f"{base_domain} [SEP] {subdomain_prefix}"

        encoding = self.tokenizer(
            combined_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        input_ids = encoding["input_ids"].squeeze()
        attention_mask = encoding["attention_mask"].squeeze()

        labels = input_ids.clone()
        sep_index = (input_ids == self.tokenizer.sep_token_id).nonzero(as_tuple=True)
        if sep_index[0].numel() > 0:
            sep_idx = sep_index[0][0]
            labels[:sep_idx + 1] = -100  # Ignore the base domain and separator
        else:
            labels[:] = -100  # If no [SEP] token found, ignore all tokens

        return {
            "input_ids": input_ids.to(device),
            "attention_mask": attention_mask.to(device),
            "labels": labels.to(device)
        }

# Step 4: Load the model and tokenizer
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add special tokens if they don't exist
special_tokens = {'pad_token': '[PAD]', 'sep_token': '[SEP]'}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

# Move model to device
model.to(device)

# Step 5: Prepare the datasets
train_dataset = SubdomainDataset(tokenizer, train_data)
val_dataset = SubdomainDataset(tokenizer, val_data)

# Step 6: Configure training arguments with gradient accumulation
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,  # Adjust based on your GPU memory
    gradient_accumulation_steps=8,  # Simulate a larger batch size
    num_train_epochs=5,
    logging_dir='./logs',
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=200,
    report_to="wandb",
    fp16=torch.cuda.is_available(),
    learning_rate=1e-5,
    warmup_steps=100,
    weight_decay=0.01,
)

# Step 7: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)


# Step 8: Start Training
trainer.train()

# Step 9: Save the final model post-training
trainer.save_model("./results/final_model")

# Log the final metrics and close W&B
wandb.finish()


Using device: cuda




Total dataset samples: 1500000
Training samples: 1350000, Validation samples: 150000




Step,Training Loss,Validation Loss
200,0.3071,0.295197
400,0.2884,0.280848
600,0.2771,0.273253
800,0.2783,0.266993
1000,0.2694,0.264242
1200,0.2596,0.260103
1400,0.2622,0.258251
1600,0.2574,0.256345
1800,0.2617,0.255079
2000,0.2582,0.253471


KeyboardInterrupt: 

In [3]:
import os
import torch
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import wandb
from sklearn.model_selection import train_test_split

# Ensure main block for Windows multiprocessing
if __name__ == '__main__':
    # Check if CUDA is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Step 1: Securely login to W&B using an environment variable
    
    # Initialize W&B project
    wandb.init(
        project="subdomain-prediction",
        config={
            "learning_rate": 1e-5,
            "architecture": "distilgpt2",
            "dataset": "n0kovo_subdomains",
            "epochs": 5,
        }
    )
    
    wandb.login(key="348d74f5eaafa788d29878c98e6f78509d6e52d1")


    # Step 2: Load the subdomain prefixes from the file
    with open("subdomains.txt", "r") as f:
        subdomain_prefixes = f.read().splitlines()

    # Normalize subdomain prefixes and base domains
    subdomain_prefixes = [subdomain.strip().lower() for subdomain in subdomain_prefixes if subdomain.strip()]
    base_domains = [domain.strip().lower() for domain in ["example.com", "testdomain.com", "mycompany.com"]]

    # Ensure subdomain prefixes are not empty
    if not subdomain_prefixes:
        raise ValueError("The 'subdomains.txt' file is empty or not properly loaded.")

    # Prepare the dataset by combining base domains with subdomain prefixes
    dataset = []
    for base_domain in base_domains:
        for subdomain_prefix in subdomain_prefixes:
            dataset.append({"input": base_domain, "output": subdomain_prefix})

    # Optionally reduce dataset size for testing
    # dataset = dataset[:100000]  # Uncomment to use first 100,000 samples

    # Check the number of samples
    print(f"Total dataset samples: {len(dataset)}")

    # Split the dataset into training and validation sets
    train_data, val_data = train_test_split(dataset, test_size=0.1, random_state=42)
    print(f"Training samples: {len(train_data)}, Validation samples: {len(val_data)}")

    # Step 3: Define the dataset class
    class SubdomainDataset(Dataset):
        def __init__(self, tokenizer, dataset, max_length=128):
            self.tokenizer = tokenizer
            self.dataset = dataset
            self.max_length = max_length

        def __len__(self):
            return len(self.dataset)

        def __getitem__(self, idx):
            base_domain = self.dataset[idx]["input"]
            subdomain_prefix = self.dataset[idx]["output"]
            combined_text = f"{base_domain} [SEP] {subdomain_prefix}"

            encoding = self.tokenizer(
                combined_text,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            )

            input_ids = encoding["input_ids"].squeeze()
            attention_mask = encoding["attention_mask"].squeeze()

            labels = input_ids.clone()

            # Ensure sep_token_id is set
            if self.tokenizer.sep_token_id is None:
                self.tokenizer.sep_token_id = self.tokenizer.convert_tokens_to_ids('[SEP]')

            sep_index = (input_ids == self.tokenizer.sep_token_id).nonzero(as_tuple=True)
            if sep_index[0].numel() > 0:
                sep_idx = sep_index[0][0]
                labels[:sep_idx + 1] = -100  # Ignore the base domain and separator
            else:
                labels[:] = -100  # If no [SEP] token found, ignore all tokens

            return {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "labels": labels
            }

    # Step 4: Load the model and tokenizer
    model_name = "distilgpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Add special tokens if they don't exist
    special_tokens = {'pad_token': '[PAD]', 'sep_token': '[SEP]'}
    added_tokens = tokenizer.add_special_tokens(special_tokens)
    if added_tokens > 0:
        model.resize_token_embeddings(len(tokenizer))

    # Move model to device
    model.to(device)
    print(next(model.parameters()).device)  # Should output 'cuda:0'

    # Step 5: Prepare the datasets
    train_dataset = SubdomainDataset(tokenizer, train_data)
    val_dataset = SubdomainDataset(tokenizer, val_data)

    # Use DataCollatorForLanguageModeling
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Step 6: Configure training arguments with optimizations
    training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=32,  # Increased batch size
        gradient_accumulation_steps=1,   # No gradient accumulation
        num_train_epochs=5,
        logging_dir='./logs',
        logging_steps=500,     # Less frequent logging
        save_steps=2000,       # Less frequent saving
        save_total_limit=2,
        evaluation_strategy="steps",
        eval_steps=1000,
        report_to="wandb",
        fp16=True,  # Ensure mixed precision is enabled
        learning_rate=1e-5,
        warmup_steps=100,
        weight_decay=0.01,
        dataloader_num_workers=0,  # Set to 0 for Windows
        dataloader_pin_memory=True
    )

    # Step 7: Initialize Trainer with data collator
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # Step 8: Start Training
    trainer.train()

    # Step 9: Save the final model post-training
    trainer.save_model("./results/final_model")

    # Log the final metrics and close W&B
    wandb.finish()


Using device: cuda


VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▇▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▂▁▁▃▁▁▁▂▁▁▁▁▁▁▁▁▁▁▆▇█▅▇▇
eval/samples_per_second,▇███████████▇▅██████████████▇██████▃▂▁▄▂
eval/steps_per_second,▇█████████████████▇███████████████▆▃▂▁▄▂
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇███
train/global_step,▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁
train/loss,█▆▇▇▅▅▄▄▄▄▄▃▃▄▃▂▃▃▃▃▂▃▃▃▃▂▃▂▂▂▂▃▂▃▂▂▂▁▁▁

0,1
eval/loss,0.2279
eval/runtime,372.423
eval/samples_per_second,402.768
eval/steps_per_second,50.346
train/epoch,0.3603
train/global_step,15200.0
train/grad_norm,0.20164
train/learning_rate,1e-05
train/loss,0.2333




Total dataset samples: 1500000
Training samples: 1350000, Validation samples: 150000
cuda:0




Step,Training Loss,Validation Loss
1000,2.9002,2.70692
2000,2.7243,2.628881
3000,2.6679,2.584461
4000,2.6041,2.551131
5000,2.6072,2.526572
6000,2.576,2.506913
7000,2.5522,2.489658
8000,2.53,2.474399
9000,2.5153,2.460489
10000,2.5101,2.44687


0,1
eval/loss,█▇▆▅▄▄▄▄▄▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▁▆▁▁▂▁▁▁▁▁▁▂▁▁▁▁▂▂▁▁▁██████▅▂▂▂▂▂▂▂▂▂▂█
eval/samples_per_second,▄███▇███████▇████▇▇▁▁▂▇▇▇▇▇▇▇▇▇▇▇▇▇▆▇▅▁▂
eval/steps_per_second,▄▇████████████████▇▇▂▁▂▂▁▂▂▇▇▇▇▇▇▇▇▇▇▅▂▂
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇██
train/global_step,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇██
train/grad_norm,█▆▄▄▅▃▄▅▅▃▁▂▃▂▃▃▅▄▄▂▅▃▂▁▃▁▃▄▂▃▁▃▄▂▁▃▃▂▄▄
train/learning_rate,█████████▇▇▇▇▆▆▆▆▆▆▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁
train/loss,█▆▅▅▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,2.04739
eval/runtime,330.9302
eval/samples_per_second,453.268
eval/steps_per_second,56.658
total_flos,2.20469133312e+17
train/epoch,5.0
train/global_step,210940.0
train/grad_norm,2.94231
train/learning_rate,0.0
train/loss,2.0924


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file ./results/final_model\config.json
Model config GPT2Config {
  "_name_or_path": "./results/final_model",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "

Generated Subdomains:


In [1]:
import os
import torch
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling, EarlyStoppingCallback
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import random

# Ensure main block for Windows multiprocessing
if __name__ == '__main__':
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Step 1: Load the existing model and tokenizer
    model_path = "./results/final_model"  # Path to your previously trained model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    model.to(device)

    # Step 2: Load and prepare the new dataset
    # Assuming 'new_subdomains.txt' contains the new subdomain prefixes
    with open("new_subdomains.txt", "r") as f:
        new_subdomain_prefixes = f.read().splitlines()

    # Normalize and clean the new subdomain prefixes
    new_subdomain_prefixes = [subdomain.strip().lower() for subdomain in new_subdomain_prefixes if subdomain.strip()]
    base_domains = [domain.strip().lower() for domain in ["example.com", "testdomain.com", "mycompany.com"]]

    # Prepare the new dataset
    new_dataset = []
    for base_domain in base_domains:
        for subdomain_prefix in new_subdomain_prefixes:
            new_dataset.append({"input": base_domain, "output": subdomain_prefix})

    # Check the number of samples
    print(f"New dataset samples: {len(new_dataset)}")

    # Split the new dataset into training and validation sets
    train_data_new, val_data_new = train_test_split(new_dataset, test_size=0.1, random_state=42)
    print(f"New training samples: {len(train_data_new)}, New validation samples: {len(val_data_new)}")

    # Step 3: Combine the original and new datasets to prevent catastrophic forgetting
    # Load the original dataset (assuming you have 'train_data' and 'val_data' from previous training)
    # If you don't have them, you can skip combining and just use the new dataset
    # For this example, we'll proceed with the new dataset only

    # If you have the original data, combine datasets
    # combined_train_data = train_data + train_data_new
    # combined_val_data = val_data + val_data_new

    # If you don't have the original data, just use the new data
    combined_train_data = train_data_new
    combined_val_data = val_data_new

    # Shuffle the combined datasets
    random.shuffle(combined_train_data)
    random.shuffle(combined_val_data)

    # Step 4: Define the dataset class
    class SubdomainDataset(Dataset):
        def __init__(self, tokenizer, dataset, max_length=128):
            self.tokenizer = tokenizer
            self.dataset = dataset
            self.max_length = max_length

        def __len__(self):
            return len(self.dataset)

        def __getitem__(self, idx):
            base_domain = self.dataset[idx]["input"]
            subdomain_prefix = self.dataset[idx]["output"]
            combined_text = f"{base_domain} [SEP] {subdomain_prefix}"

            encoding = self.tokenizer(
                combined_text,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            )

            input_ids = encoding["input_ids"].squeeze()
            attention_mask = encoding["attention_mask"].squeeze()

            labels = input_ids.clone()

            # Ensure sep_token_id is set
            if self.tokenizer.sep_token_id is None:
                self.tokenizer.sep_token_id = self.tokenizer.convert_tokens_to_ids('[SEP]')

            sep_index = (input_ids == self.tokenizer.sep_token_id).nonzero(as_tuple=True)
            if sep_index[0].numel() > 0:
                sep_idx = sep_index[0][0]
                labels[:sep_idx + 1] = -100  # Ignore the base domain and separator
            else:
                labels[:] = -100  # If no [SEP] token found, ignore all tokens

            return {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "labels": labels
            }

    # Step 5: Prepare the datasets
    train_dataset = SubdomainDataset(tokenizer, combined_train_data)
    val_dataset = SubdomainDataset(tokenizer, combined_val_data)

    # Step 6: Set up training arguments
    from transformers import get_linear_schedule_with_warmup

    training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=32,
    num_train_epochs=3,
    learning_rate=5e-6,
    warmup_steps=500,
    logging_dir='./logs',
    logging_strategy="steps",     # Log at each logging step
    logging_steps=100,            # Log every 100 steps
    log_level='info',             # Set the logging level
    save_steps=1000,
    eval_steps=1000,
    eval_strategy="steps",
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=0,
    dataloader_pin_memory=True,
    load_best_model_at_end=True,
    report_to="all",  
    run_name="subdomain_prediction_run_final", # Report to all integrations (e.g., TensorBoard)
    )


    # Step 7: Define the data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Step 8: Initialize the Trainer with early stopping
    from transformers import EarlyStoppingCallback

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    # Step 9: Start training
    trainer.train()

    # Step 10: Save the fine-tuned model
    trainer.save_model("./results/final_model_finetuned")

    # Step 11: Evaluate the updated model
    model = AutoModelForCausalLM.from_pretrained("./results/final_model_finetuned")
    model.to(device)
    model.eval()

    # Test the model with some examples
    base_domain = "example.com"
    input_text = f"{base_domain} [SEP]"

    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=128,
        truncation=True,
    ).to(device)

    # Generate subdomains
    outputs = model.generate(
        inputs["input_ids"],
        max_length=20,
        num_return_sequences=5,
        do_sample=True,
        top_k=10,
        top_p=0.8,
        temperature=1.0,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Post-process and display the generated subdomains
    max_subdomain_parts = 5  # Limit the number of parts in the subdomain

    print("Generated Subdomains:")
    for output in outputs:
        text = tokenizer.decode(output, skip_special_tokens=True)
        subdomain_prefix = text.split('[SEP]')[-1].strip()
        if subdomain_prefix:
            full_subdomain = f"{subdomain_prefix}.{base_domain}"
            # Only display subdomains with a reasonable number of parts
            if full_subdomain.count('.') <= max_subdomain_parts:
                print(full_subdomain)


Using device: cuda
New dataset samples: 8999997
New training samples: 8099997, New validation samples: 900000


Using auto half precision backend
***** Running training *****
  Num examples = 8,099,997
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 759,375
  Number of trainable parameters = 81,914,112
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: reyadweb34 (reyadweb34-hasn). Use `wandb login --relogin` to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011288888888925108, max=1.0…

Step,Training Loss,Validation Loss
1000,3.0521,2.991019
2000,3.0351,2.953824
3000,3.0047,2.933663
4000,2.9943,2.923324
5000,2.9853,2.914265
6000,2.9585,2.908148
7000,2.9519,2.90222
8000,2.9522,2.897186
9000,2.9575,2.893144
10000,2.937,2.888259



***** Running Evaluation *****
  Num examples = 900000
  Batch size = 8
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Configuration saved in ./results\checkpoint-1000\generation_config.json
Model weights saved in ./results\checkpoint-1000\model.safetensors
tokenizer config file saved in ./results\checkpoint-1000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-1000\special_tokens_map.json
Deleting older checkpoint [results\checkpoint-210000] due to args.save_total_limit

***** Running Evaluation *****
  Num examples = 900000
  Batch size = 8
Saving model checkpoint to ./results\checkpoint-2000
Configuration saved in ./results\checkpoint-2000\config.json
Configuration saved in ./results\checkpoint-2000\generation_config.json
Model weights saved in ./results\checkpoint-2000\model.safetensors
tokenizer config file saved in ./results\checkpoint-2000\tokenizer_config.json
Special tokens file saved in

Generated Subdomains:
example.com  vps-6a6d8a5b-5f2.example.com
example.com  bst-b4b2a7c8-d9c.example.com
example.com  cn-dev-dns-f8b7b3d.example.com
example.com  vps-9c8b5c8c-c4b.example.com
example.com  d7b5a8f9d2f7a9e.example.com


In [13]:

model_path = "./results/final_model_finetuned"  # Path to your fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model.to(device)
model.eval()

# Adjusted generation parameters
outputs = model.generate(
    inputs["input_ids"],
    max_length=20,
    num_return_sequences=5,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Improved post-processing
import re

# Define a regex pattern for realistic subdomains
pattern = re.compile(r'^[a-z0-9]+([-][a-z0-9]+)*$')

print("Generated Subdomains:")
for output in outputs:
    text = tokenizer.decode(output, skip_special_tokens=True)
    subdomain_prefix = text.split('[SEP]')[-1].strip()
    if subdomain_prefix:
        # Apply regex filtering
        if pattern.match(subdomain_prefix):
            full_subdomain = f"{subdomain_prefix}.{base_domain}"
            print(full_subdomain)


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file ./results/final_model_finetuned\config.json
Model config GPT2Config {
  "_name_or_path": "./results/final_model_finetuned",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_la

Generated Subdomains:


In [1]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset
import random
import logging
from nltk.translate.bleu_score import sentence_bleu
import nltk
from Levenshtein import distance as levenshtein_distance

# Download NLTK data (run this once)
nltk.download('punkt')

# Set up logging (optional)
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
logging.getLogger("transformers").setLevel(logging.INFO)

if __name__ == '__main__':
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load the fine-tuned model and tokenizer
    model = AutoModelForCausalLM.from_pretrained("./results/final_model_finetuned").to(device)
    tokenizer = AutoTokenizer.from_pretrained("./results/final_model_finetuned")

    # Define the evaluation function
    def evaluate_model(model, tokenizer, test_data, max_length=20):
        model.eval()
        predictions = []
        ground_truths = []

        for sample in test_data:
            base_domain = sample["input"]
            true_subdomain = sample["output"]
            input_text = f"{base_domain} [SEP]"

            inputs = tokenizer(
                input_text,
                return_tensors="pt",
                max_length=128,
                truncation=True,
            ).to(device)

            with torch.no_grad():
                outputs = model.generate(
                    inputs["input_ids"],
                    max_length=max_length,
                    num_return_sequences=1,
                    do_sample=True,
                    top_k=50,
                    top_p=0.95,
                    temperature=0.7,
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                )

            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            predicted_subdomain = generated_text.split('[SEP]')[-1].strip()

            # Append to lists
            predictions.append(predicted_subdomain)
            ground_truths.append(true_subdomain)

        return predictions, ground_truths

    # Define metrics calculation functions
    def calculate_exact_match(predictions, ground_truths):
        correct = 0
        total = len(ground_truths)
        for pred, true in zip(predictions, ground_truths):
            if pred.lower() == true.lower():
                correct += 1
        accuracy = correct / total
        return accuracy

    def calculate_bleu_score(predictions, ground_truths):
        total_bleu = 0
        for pred, true in zip(predictions, ground_truths):
            reference = [true.lower().split('.')]
            candidate = pred.lower().split('.')
            bleu_score = sentence_bleu(reference, candidate)
            total_bleu += bleu_score
        average_bleu = total_bleu / len(ground_truths)
        return average_bleu

    def calculate_average_levenshtein(predictions, ground_truths):
        total_distance = 0
        for pred, true in zip(predictions, ground_truths):
            distance = levenshtein_distance(pred.lower(), true.lower())
            total_distance += distance
        average_distance = total_distance / len(ground_truths)
        return average_distance

    # Example test data (replace this with your actual test set)
    test_data = [
        {"input": "example.com", "output": "www"},
        {"input": "example.com", "output": "mail"},
        {"input": "example.com", "output": "blog"},
        {"input": "example.com", "output": "shop"},
        {"input": "testdomain.com", "output": "api"},
        {"input": "testdomain.com", "output": "dev"},
        {"input": "testdomain.com", "output": "staging"},
        {"input": "testdomain.com", "output": "support"},
        {"input": "mycompany.com", "output": "portal"},
        {"input": "mycompany.com", "output": "login"},
        {"input": "mycompany.com", "output": "dashboard"},
        {"input": "mycompany.com", "output": "admin"},
        # Add more samples as needed
    ]

    # Run evaluation
    predictions, ground_truths = evaluate_model(model, tokenizer, test_data)

    # Calculate metrics
    accuracy = calculate_exact_match(predictions, ground_truths)
    print(f"\nExact Match Accuracy: {accuracy * 100:.2f}%")

    average_bleu = calculate_bleu_score(predictions, ground_truths)
    print(f"Average BLEU Score: {average_bleu * 100:.2f}")

    average_levenshtein = calculate_average_levenshtein(predictions, ground_truths)
    print(f"Average Levenshtein Distance: {average_levenshtein:.2f}")

    # Print predictions and ground truths
    print("\nPredictions vs Ground Truths:")
    for base_domain, pred, true in zip([sample["input"] for sample in test_data], predictions, ground_truths):
        print(f"Base Domain: {base_domain}, True Subdomain: {true}, Predicted Subdomain: {pred}")


[nltk_data] Error loading punkt: <urlopen error [Errno 11004]
[nltk_data]     getaddrinfo failed>
loading configuration file ./results/final_model_finetuned\config.json
Model config GPT2Config {
  "_name_or_path": "./results/final_model_finetuned",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": tru

Using device: cuda


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256
}

All model checkpoint weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized from the model checkpoint at ./results/final_model_finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.
loading configuration file ./results/final_model_finetuned\generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256
}

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.



Exact Match Accuracy: 0.00%
Average BLEU Score: 0.00
Average Levenshtein Distance: 31.17

Predictions vs Ground Truths:
Base Domain: example.com, True Subdomain: www, Predicted Subdomain: example.com  bst-3c7f9c2d-c8f
Base Domain: example.com, True Subdomain: mail, Predicted Subdomain: example.com  bst-1a6d5e1c-2d68
Base Domain: example.com, True Subdomain: blog, Predicted Subdomain: example.com  bst-5a892ba3-b9e9-
Base Domain: example.com, True Subdomain: shop, Predicted Subdomain: example.com  s-vdi-hclnxs-dns-a0
Base Domain: testdomain.com, True Subdomain: api, Predicted Subdomain: testdomain.com  nah-rvr-01-dns-d8b
Base Domain: testdomain.com, True Subdomain: dev, Predicted Subdomain: testdomain.com  vmi547962d5c6d6f09
Base Domain: testdomain.com, True Subdomain: staging, Predicted Subdomain: testdomain.com  c9a4b4b1f4b2b0
Base Domain: testdomain.com, True Subdomain: support, Predicted Subdomain: testdomain.com  bst-b8dd6b9c-7d8
Base Domain: mycompany.com, True Subdomain: portal, 

In [3]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
from sklearn.metrics import accuracy_score

# Load the trained model and tokenizer
model_path = "./results/final_model_finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")

# Define test samples and expected subdomains
test_domains = ["example.com", "testdomain.com", "mycompany.com"]
ground_truth_subdomains = [["www", "support", "blog"], ["dev", "api", "mail"], ["portal", "sales", "helpdesk"]]

# Initialize results
total_predictions = 0
correct_predictions = 0
inference_times = []

# Loop over each test domain to generate predictions and calculate inference time
for base_domain, correct_subdomains in zip(test_domains, ground_truth_subdomains):
    input_text = f"{base_domain} [SEP]"
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding=True
    ).to(model.device)

    # Measure inference time
    start_time = time.time()
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # Explicitly passing attention_mask
        max_length=20,
        num_return_sequences=5,
        do_sample=True,
        top_k=10,
        top_p=0.8,
        temperature=1.0,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    end_time = time.time()

    # Calculate inference time
    inference_times.append(end_time - start_time)

    # Process generated outputs
    generated_subdomains = []
    for output in outputs:
        text = tokenizer.decode(output, skip_special_tokens=True)
        subdomain = text.split('[SEP]')[-1].strip()
        if subdomain:
            generated_subdomains.append(subdomain)

    # Calculate accuracy for this base domain
    correct_predictions += sum(1 for subdomain in generated_subdomains if subdomain in correct_subdomains)
    total_predictions += len(generated_subdomains)

# Calculate overall accuracy and average inference time per domain
accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
average_inference_time = sum(inference_times) / len(inference_times)

print(f"Model Accuracy: {accuracy:.2f}")
print(f"Average Inference Time per Domain: {average_inference_time:.4f} seconds")


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file ./results/final_model_finetuned\config.json
Model config GPT2Config {
  "_name_or_path": "./results/final_model_finetuned",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_la

Model Accuracy: 0.00
Average Inference Time per Domain: 0.1890 seconds


In [6]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
from sklearn.metrics import accuracy_score

# Load the trained model and tokenizer
model_path = "./results/final_model_finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")

# Define test samples and expected subdomains
test_domains = ["example.com", "testdomain.com", "mycompany.com"]
ground_truth_subdomains = [["www", "support", "blog"], ["dev", "api", "mail"], ["portal", "sales", "helpdesk"]]

# Initialize results
total_predictions = 0
correct_predictions = 0
inference_times = []

# Loop over each test domain to generate predictions and calculate inference time
for base_domain, correct_subdomains in zip(test_domains, ground_truth_subdomains):
    input_text = f"{base_domain} [SEP]"
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding=True
    ).to(model.device)

    # Measure inference time
    start_time = time.time()
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # Explicitly passing attention_mask
        max_length=20,
        num_return_sequences=5,
        do_sample=True,
        top_k=5,  # Reduced to limit randomness
        top_p=0.7,  # Focus on more probable tokens
        temperature=0.7,  # Lower temperature for less diversity
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    end_time = time.time()

    # Calculate inference time
    inference_times.append(end_time - start_time)

    # Process generated outputs
    generated_subdomains = []
    for output in outputs:
        text = tokenizer.decode(output, skip_special_tokens=True)
        subdomain = text.split('[SEP]')[-1].strip()
        if subdomain:
            generated_subdomains.append(subdomain)

    # Debug: Print generated subdomains for inspection
    print(f"Generated subdomains for {base_domain}: {generated_subdomains}")

    # Calculate accuracy for this base domain
    correct_predictions += sum(1 for subdomain in generated_subdomains if subdomain in correct_subdomains)
    total_predictions += len(generated_subdomains)

# Calculate overall accuracy and average inference time per domain
accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
average_inference_time = sum(inference_times) / len(inference_times)

print(f"Model Accuracy: {accuracy:.2f}")
print(f"Average Inference Time per Domain: {average_inference_time:.4f} seconds")


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file ./results/final_model_finetuned\config.json
Model config GPT2Config {
  "_name_or_path": "./results/final_model_finetuned",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_la

Generated subdomains for example.com: ['example.com  cb-dev-eastus-1a9e8c7', 'example.com  bst-c5c9f9d6-c8e', 'example.com  bst-b8c5b9d4-a7f', 'example.com  bst-b8b7b6b4-f5b', 'example.com  c9c8c9a9d6c8c9a']
Generated subdomains for testdomain.com: ['testdomain.com  bst-c8c6b6b6-b8', 'testdomain.com  bst-c8c7e5f2-c9', 'testdomain.com  bst-c9e7a7e0-b8', 'testdomain.com  bst-a5f8a5b5-b8', 'testdomain.com  bst-f9a5f6a1-d8']
Generated subdomains for mycompany.com: ['mycompany.com  bst-f5c7f9b5-a8', 'mycompany.com  bst-c9c8b7d7-f8', 'mycompany.com  bst-f8b7c7c5-a7', 'mycompany.com  bst-a5f6c6a1-d8', 'mycompany.com  vps-7c7f6b8c-4d']
Model Accuracy: 0.00
Average Inference Time per Domain: 0.1983 seconds


In [7]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
from sklearn.metrics import accuracy_score

# Load the trained model and tokenizer
model_path = "./results/final_model_finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")

# Define test samples and expected subdomains
test_domains = ["example.com", "testdomain.com", "mycompany.com"]
ground_truth_subdomains = [["www", "support", "blog"], ["dev", "api", "mail"], ["portal", "sales", "helpdesk"]]

# Initialize results
total_predictions = 0
correct_predictions = 0
inference_times = []

# Loop over each test domain to generate predictions and calculate inference time
for base_domain, correct_subdomains in zip(test_domains, ground_truth_subdomains):
    input_text = f"{base_domain} [SEP]"
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding=True
    ).to(model.device)

    # Measure inference time
    start_time = time.time()
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # Explicitly passing attention_mask
        max_length=20,
        num_return_sequences=5,
        do_sample=True,
        top_k=5,  # Reduced to limit randomness
        top_p=0.7,  # Focus on more probable tokens
        temperature=0.7,  # Lower temperature for less diversity
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    end_time = time.time()

    # Calculate inference time
    inference_times.append(end_time - start_time)

    # Process generated outputs
    generated_subdomains = []
    for output in outputs:
        text = tokenizer.decode(output, skip_special_tokens=True)
        subdomain = text.split('[SEP]')[-1].strip()
        if subdomain:
            generated_subdomains.append(subdomain)

    # Debug: Print generated subdomains for inspection
    print(f"Generated subdomains for {base_domain}: {generated_subdomains}")

    # Calculate accuracy for this base domain
    correct_predictions += sum(1 for subdomain in generated_subdomains if subdomain in correct_subdomains)
    total_predictions += len(generated_subdomains)

# Calculate overall accuracy and average inference time per domain
accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
average_inference_time = sum(inference_times) / len(inference_times)

print(f"Model Accuracy: {accuracy:.2f}")
print(f"Average Inference Time per Domain: {average_inference_time:.4f} seconds")


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file ./results/final_model_finetuned\config.json
Model config GPT2Config {
  "_name_or_path": "./results/final_model_finetuned",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_la

Generated subdomains for example.com: ['example.com  bst-b5b9c7b1-b5e', 'example.com  bst-c5b7f9d5-c8b', 'example.com  bst-b8b9a5c8-c5c', 'example.com  bst-c8e6b7b5-a6b', 'example.com  bst-c9b7c8c0-c8c']
Generated subdomains for testdomain.com: ['testdomain.com  vpn-c-uw2-2-dns-', 'testdomain.com  bst-b5c6a8a5-c8', 'testdomain.com  bst-f9e6b8a6-c9', 'testdomain.com  vpn-sj-c1-01-dns-', 'testdomain.com  bst-c9e8b9c8-c8']
Generated subdomains for mycompany.com: ['mycompany.com  vps-7e8c7f8a-b8', 'mycompany.com  vpn-us-west-2-a1-prod', 'mycompany.com  bst-a9a8a5f5-f5', 'mycompany.com  vpn-s-b-01-extranet-us', 'mycompany.com  bst-f5a8c7a6-b8']
Model Accuracy: 0.00
Average Inference Time per Domain: 0.2097 seconds


In [9]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
from sklearn.metrics import accuracy_score

# Load the trained model and tokenizer
model_path = "./results/final_model_finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")

# Define test samples and expected subdomains
test_domains = ["example.com", "testdomain.com", "mycompany.com"]
ground_truth_subdomains = [["www", "support", "blog"], ["dev", "api", "mail"], ["portal", "sales", "helpdesk"]]

# Initialize results
total_predictions = 0
correct_predictions = 0
inference_times = []

def is_valid_subdomain(subdomain):
    # Basic validation rules for subdomains
    return all(char.isalnum() or char in ['-', '_'] for char in subdomain) and \
           1 <= len(subdomain) <= 63 and \
           not subdomain.startswith('-') and not subdomain.endswith('-')

# Loop over each test domain to generate predictions and calculate inference time
for base_domain, correct_subdomains in zip(test_domains, ground_truth_subdomains):
    input_text = f"{base_domain} [SEP]"
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding=True
    ).to(model.device)

    # Measure inference time
    start_time = time.time()
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=20,
        num_return_sequences=5,
        do_sample=True,
        top_k=5,
        top_p=0.7,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    end_time = time.time()

    # Calculate inference time
    inference_times.append(end_time - start_time)

    # Process generated outputs
    generated_subdomains = []
    for output in outputs:
        text = tokenizer.decode(output, skip_special_tokens=True)
        subdomain = text.split('[SEP]')[-1].strip()
        if subdomain and is_valid_subdomain(subdomain):
            generated_subdomains.append(subdomain)

    # Debug: Print generated subdomains for inspection
    print(f"Generated subdomains for {base_domain}: {generated_subdomains}")

    # Calculate accuracy for this base domain
    correct_predictions += sum(1 for subdomain in generated_subdomains if subdomain in correct_subdomains)
    total_predictions += len(generated_subdomains)

# Calculate overall accuracy and average inference time per domain
accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
average_inference_time = sum(inference_times) / len(inference_times)

print(f"Model Accuracy: {accuracy:.2f}")
print(f"Average Inference Time per Domain: {average_inference_time:.4f} seconds")


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file ./results/final_model_finetuned\config.json
Model config GPT2Config {
  "_name_or_path": "./results/final_model_finetuned",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_la

Generated subdomains for example.com: []
Generated subdomains for testdomain.com: []
Generated subdomains for mycompany.com: []
Model Accuracy: 0.00
Average Inference Time per Domain: 0.1795 seconds


In [10]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
from sklearn.metrics import accuracy_score

# Load the trained model and tokenizer
model_path = "./results/final_model_finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")

# Define test samples and expected subdomains
test_domains = ["example.com", "testdomain.com", "mycompany.com"]
ground_truth_subdomains = [["www", "support", "blog"], ["dev", "api", "mail"], ["portal", "sales", "helpdesk"]]

# Initialize results
total_predictions = 0
correct_predictions = 0
inference_times = []

def is_valid_subdomain(subdomain):
    # Basic validation rules for subdomains
    return all(char.isalnum() or char in ['-', '_'] for char in subdomain) and \
           1 <= len(subdomain) <= 63 and \
           not subdomain.startswith('-') and not subdomain.endswith('-')

# Loop over each test domain to generate predictions and calculate inference time
for base_domain, correct_subdomains in zip(test_domains, ground_truth_subdomains):
    input_text = f"{base_domain} [SEP]"
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding=True
    ).to(model.device)

    # Measure inference time
    start_time = time.time()
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=20,
        num_return_sequences=5,
        do_sample=True,
        top_k=5,
        top_p=0.7,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    end_time = time.time()

    # Calculate inference time
    inference_times.append(end_time - start_time)

    # Process generated outputs
    generated_subdomains = []
    for output in outputs:
        text = tokenizer.decode(output, skip_special_tokens=True)
        subdomain = text.split('[SEP]')[-1].strip()
        if subdomain and is_valid_subdomain(subdomain):
            generated_subdomains.append(subdomain)

    # Debug: Print generated subdomains for inspection
    print(f"Generated subdomains for {base_domain}: {generated_subdomains}")

    # Calculate accuracy for this base domain
    correct_predictions += sum(1 for subdomain in generated_subdomains if subdomain in correct_subdomains)
    total_predictions += len(generated_subdomains)

# Calculate overall accuracy and average inference time per domain
accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
average_inference_time = sum(inference_times) / len(inference_times)

print(f"Model Accuracy: {accuracy:.2f}")
print(f"Average Inference Time per Domain: {average_inference_time:.4f} seconds")


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file ./results/final_model_finetuned\config.json
Model config GPT2Config {
  "_name_or_path": "./results/final_model_finetuned",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_la

Generated subdomains for example.com: []
Generated subdomains for testdomain.com: []
Generated subdomains for mycompany.com: []
Model Accuracy: 0.00
Average Inference Time per Domain: 0.1884 seconds


In [11]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_path = "./results/final_model_finetuned"

# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Load model
model = GPT2LMHeadModel.from_pretrained(model_path)


loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file ./results/final_model_finetuned\config.json
Model config GPT2Config {
  "_name_or_path": "./results/final_model",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": 

In [12]:
# Sample input
input_text = "Generate subdomains for example.com"

# Tokenize the input
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate output
output = model.generate(input_ids, max_length=50, num_return_sequences=1)

# Decode the output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generate subdomains for example.com  bst-c9b9b7d-b9e0-8a3a-b9c8b8b8b8b8c-0a8b


In [13]:
# Sample input
input_text = "Generate subdomains for example.com"

# Tokenize the input and create attention mask
input_ids = tokenizer.encode(input_text, return_tensors='pt')
attention_mask = (input_ids != tokenizer.pad_token_id).long()  # Create an attention mask

# Generate output with specified pad_token_id
output = model.generate(input_ids, attention_mask=attention_mask, max_length=50, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

# Decode the output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_text)


Generate subdomains for example.com  bst-c9b9b7d-b9e0-8a3a-b9c8b8b8b8b8c-0a8b


In [15]:
# List of domains for which you want to generate subdomains
domains = ["example.com", "testdomain.com", "mycompany.com"]

# Generate subdomains for each domain
for domain in domains:
    # Tokenize the input
    input_ids = tokenizer.encode(domain, return_tensors='pt')

    # Generate subdomains
    outputs = model.generate(input_ids, max_length=50, num_return_sequences=5)

    # Decode and print the generated subdomains
    generated_subdomains = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    print(f"Generated subdomains for {domain}: {generated_subdomains}")



ValueError: Greedy methods without beam search do not support `num_return_sequences` different than 1 (got 5).

In [16]:
# List of domains for which you want to generate subdomains
domains = ["example.com", "testdomain.com", "mycompany.com"]

# Generate subdomains for each domain
for domain in domains:
    # Tokenize the input
    input_ids = tokenizer.encode(domain, return_tensors='pt')

    # Generate subdomains
    outputs = model.generate(input_ids, max_length=50, num_return_sequences=5)

    # Decode and print the generated subdomains
    generated_subdomains = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    print(f"Generated subdomains for {domain}: {generated_subdomains}")


ValueError: Greedy methods without beam search do not support `num_return_sequences` different than 1 (got 5).

In [18]:
import torch

# Generate subdomains for each domain
for domain in domains:
    # Tokenize the input and create attention mask
    input_ids = tokenizer.encode(domain, return_tensors='pt')
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

    # Generate subdomains using sampling
    outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=50, num_return_sequences=5, do_sample=True, pad_token_id=tokenizer.eos_token_id)

    # Decode and print the generated subdomains
    generated_subdomains = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    print(f"Generated subdomains for {domain}: {generated_subdomains}")


Generated subdomains for example.com: ['example.com  ip-10-0-104-133-13099-66e9d49c5b9db6e-0af9d3a0ee2b5f5-1bb49c00', 'example.com  hfzhw5k1kjyt7n0qo2v3vyb6a6i5cff-0000-0000-0000-0000-0b0f05c1', 'example.com  waws-prod-am2-341.publish.azurewebsites-0c867b7-5b5820f8f6c66-0f5ddd5b', 'example.com  usnccrm01-us-east-1-prg-aks-0c5cc5db2a5a1b4f2f2cb2-1ab7d7b7816', 'example.com  waws-prod-bn1-025--9f5dcb49-c2f4f6ee3c6d-0c26f2c0f69f11-1f9']
Generated subdomains for testdomain.com: ['testdomain.com  t1-j-2-mrsvc001-wpprod-rkp1a-b8f6898c1cf6aa5c7ce7e4e-0000-0000', 'testdomain.com  ec2mail1-1-2-13-218-6678c6af97d76660c3de2-0000-0000-0000-0000-1e3c5a8ea', 'testdomain.com  mybillingbox-stage-eastus2-aks-2aaa3c0ccad3d9ba8-b4a5ffb36d5df6c9-0078d5', 'testdomain.com  f5bdfb7d77dccc16fb1bc0d1abb3f8fd5b2d3d5c5-824a6b0f0d9', 'testdomain.com  sslvpn-mh-oobisinessatw-2a1f-9a826f4ac4f3-0000-1c9f5f0065f094']
Generated subdomains for mycompany.com: ['mycompany.com  srv202101036863840c9ddpossoap-east-rr-7d1e26d

In [21]:
# List of domains to test
domains = ['example.com', 'testdomain.com', 'mycompany.com']

# Function to generate and evaluate subdomains
def test_model(domains):
    for domain in domains:
        # Generate subdomains
        input_ids = tokenizer.encode(domain, return_tensors='pt')
        outputs = model.generate(input_ids, max_length=50, num_return_sequences=1)  # Set to 1
        
        # Decode and print the generated subdomains
        generated_subdomains = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        
        print(f"Generated subdomains for {domain}:")
        for subdomain in generated_subdomains:
            print(subdomain)
            # Here you can add checks against the characteristics

# Run the test
test_model(domains)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated subdomains for example.com:
example.com  bst-c9b9b7b5-b9b4-4c1b-b9b0-c8b8b8b8b8b8d-0f8b


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated subdomains for testdomain.com:
testdomain.com  bst-c9b9b7b5-b9b4-4c1b-b9b8-c8b8b8b8b8b8b-0a8
Generated subdomains for mycompany.com:
mycompany.com  bst-c9b9b9b5-b9b0-4c1b-b9b0-b9b8b8b8b8b8b-0a8


In [20]:
# List of domains to test
domains = ['example.com', 'testdomain.com', 'mycompany.com']

# Function to generate and evaluate subdomains
def test_model(domains):
    for domain in domains:
        # Generate subdomains
        input_ids = tokenizer.encode(domain, return_tensors='pt')
        outputs = model.generate(input_ids, max_length=50, num_return_sequences=5)
        
        # Decode and print the generated subdomains
        generated_subdomains = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        
        print(f"Generated subdomains for {domain}:")
        for subdomain in generated_subdomains:
            print(subdomain)
            # Here you can add checks against the characteristics
            # e.g., check format, length, allowed characters, etc.

# Run the test
test_model(domains)


ValueError: Greedy methods without beam search do not support `num_return_sequences` different than 1 (got 5).

In [25]:
# List of domains to test
domains = ['example.com', 'testdomain.com', 'mycompany.com']

# Function to generate and evaluate subdomains
def test_model(domains):
    for domain in domains:
        # Generate subdomains
        input_ids = tokenizer.encode(domain, return_tensors='pt')
        
        # Create attention mask
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

        # Use beam search and specify the number of return sequences
        outputs = model.generate(input_ids, 
                         attention_mask=attention_mask,
                         max_length=50, 
                         num_return_sequences=5,
                         do_sample=True,  # Enable sampling
                         top_k=50,        # Use top-k sampling
                         top_p=0.95,      # Use nucleus sampling
                         pad_token_id=tokenizer.eos_token_id)
               # Set pad_token_id

        # Decode and print the generated subdomains
        generated_subdomains = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        
        print(f"Generated subdomains for {domain}:")
        for subdomain in generated_subdomains:
            print(subdomain)
            # Here you can add checks against the characteristics

# Run the test
test_model(domains)


Generated subdomains for example.com:
example.com  wt-admin-stage-lwc1-ap-northeast-2-test-b0bc69e3-7a4d5a8e7c076-1f38
example.com  azuregateway-a4e8c0075-a7b9-4d28-88b6-a5a9f9fccd5e1-0afb7fb4
example.com  bst-1c9b6b26-f7d7-4dd0-9b72-7d09cb3f099-0daa8b1dfd0b0c
example.com  g2r-exch03-2-ch2-01p-web-01pwg-3f6c8f18e17f6e7-0c5employed4f0a
example.com  ip-10-0-123-143-92-218479866413 mascothub-1ce9b8f1d3a-0b6d60a1d60e sincerely-0
Generated subdomains for testdomain.com:
testdomain.com  zenztjhqnqkqqc9w8wxf7xlqjd8v6jv7r9b4x-a1f6f7e
testdomain.com  nginx11f15e8c1b5a4c98eb4a0aac3de1b6a9f2-0000-0000-0000-0000-0000-00-0000
testdomain.com  bst-2f8bb59d-5a6b-450b-b4bc-8f4c3b09f66-0c3f0045d9bf1f5
testdomain.com  gcp3-stg-sgt-wcf-02-dns-e7f29fc6e5f983-7c1d1baf8b66ef5-1
testdomain.com  ewg3-test-nordpci-aks-6cd6ef9ce0-9dbf6c6c3d3c-1dc4a3a8c0aa
Generated subdomains for mycompany.com:
mycompany.com  bst-4d2a1f75-d8a1-4c75-b987-9e7e6b8d28d6-1f49c738a07
mycompany.com  wc1-int5-h3-ext-1-sm100-wgbzt-1c6c5f

In [28]:
import os
import torch
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling, EarlyStoppingCallback
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import random

# Ensure main block for Windows multiprocessing
if __name__ == '__main__':
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load the existing fine-tuned model and tokenizer
    model_path = "./results/final_model_finetuned"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    model.to(device)

    # Load and prepare the new dataset
    with open("new_subdomains.txt", "r") as f:
        new_subdomain_prefixes = f.read().splitlines()
    new_subdomain_prefixes = [sub.strip().lower() for sub in new_subdomain_prefixes if sub.strip()]
    base_domains = ["example.com", "testdomain.com", "mycompany.com"]
    
    # Curriculum Learning: Separate common and complex subdomains
    common_subdomains = ["www", "api", "mail"]
    complex_subdomains = [sub for sub in new_subdomain_prefixes if sub not in common_subdomains]
    
    # Prepare the new dataset with Curriculum Learning in mind
    def prepare_dataset(subdomains):
        data = []
        for domain in base_domains:
            for sub in subdomains:
                data.append({"input": domain, "output": sub})
        return data

    # Start with common subdomains, add complex ones gradually
    dataset_stage1 = prepare_dataset(common_subdomains)
    dataset_stage2 = prepare_dataset(complex_subdomains)
    
    # Combine data with a focus on complex patterns after initial learning
    combined_data = dataset_stage1 + dataset_stage2
    
    # Split data into training and validation sets
    train_data, val_data = train_test_split(combined_data, test_size=0.1, random_state=42)

    # Dataset class
    class SubdomainDataset(Dataset):
        def __init__(self, tokenizer, dataset, max_length=128):
            self.tokenizer = tokenizer
            self.dataset = dataset
            self.max_length = max_length

        def __len__(self):
            return len(self.dataset)

        def __getitem__(self, idx):
            base_domain = self.dataset[idx]["input"]
            subdomain_prefix = self.dataset[idx]["output"]
            combined_text = f"{base_domain} [SEP] {subdomain_prefix}"

            encoding = self.tokenizer(
                combined_text,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            )

            input_ids = encoding["input_ids"].squeeze()
            attention_mask = encoding["attention_mask"].squeeze()
            labels = input_ids.clone()

            if self.tokenizer.sep_token_id is None:
                self.tokenizer.sep_token_id = self.tokenizer.convert_tokens_to_ids('[SEP]')

            sep_index = (input_ids == self.tokenizer.sep_token_id).nonzero(as_tuple=True)
            if sep_index[0].numel() > 0:
                sep_idx = sep_index[0][0]
                labels[:sep_idx + 1] = -100
            else:
                labels[:] = -100

            return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

    # Prepare the datasets
    train_dataset = SubdomainDataset(tokenizer, train_data)
    val_dataset = SubdomainDataset(tokenizer, val_data)

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=32,
        num_train_epochs=5,
        learning_rate=5e-6,
        warmup_steps=500,
        logging_dir='./logs',
        logging_strategy="steps",
        logging_steps=100,
        save_steps=1000,
        eval_steps=1000,
        eval_strategy="steps",
        save_total_limit=2,
        fp16=torch.cuda.is_available(),
        load_best_model_at_end=True,
        report_to="all",
        run_name="subdomain_prediction_run_v2_with_curriculum",
    )

    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Initialize Trainer with Early Stopping and Data Reweighting
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    # Training with curriculum learning stages
    trainer.train()

    # Save the fine-tuned model as version 2
    trainer.save_model("./results/final_model_v2")

    # Evaluate the updated model
    model = AutoModelForCausalLM.from_pretrained("./results/final_model_v2")
    model.to(device)
    model.eval()

    # Test the model with some examples
    base_domain = "example.com"
    input_text = f"{base_domain} [SEP]"

    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=128,
        truncation=True,
    ).to(device)

    # Generate subdomains with diverse sampling
    outputs = model.generate(
        inputs["input_ids"],
        max_length=20,
        num_return_sequences=5,
        do_sample=True,
        top_k=10,
        top_p=0.8,
        temperature=1.0,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Display generated subdomains
    print("Generated Subdomains for v2 with Curriculum Learning:")
    for output in outputs:
        text = tokenizer.decode(output, skip_special_tokens=True)
        subdomain_prefix = text.split('[SEP]')[-1].strip()
        if subdomain_prefix:
            full_subdomain = f"{subdomain_prefix}.{base_domain}"
            print(full_subdomain)


Using device: cuda


loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file ./results/final_model_finetuned\config.json
Model config GPT2Config {
  "_name_or_path": "./results/final_model_finetuned",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_la

Step,Training Loss,Validation Loss
1000,2.8689,2.82986
2000,2.8746,2.830192
3000,2.8834,2.828952
4000,2.9039,2.829655
5000,2.8657,2.828202
6000,2.8714,2.827327
7000,2.8826,2.826838
8000,2.8823,2.825295
9000,2.9055,2.825892
10000,2.859,2.82478



***** Running Evaluation *****
  Num examples = 899998
  Batch size = 8
Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json
Configuration saved in ./results\checkpoint-1000\generation_config.json
Model weights saved in ./results\checkpoint-1000\model.safetensors
tokenizer config file saved in ./results\checkpoint-1000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-1000\special_tokens_map.json
Deleting older checkpoint [results\checkpoint-15000] due to args.save_total_limit

***** Running Evaluation *****
  Num examples = 899998
  Batch size = 8
Saving model checkpoint to ./results\checkpoint-2000
Configuration saved in ./results\checkpoint-2000\config.json
Configuration saved in ./results\checkpoint-2000\generation_config.json
Model weights saved in ./results\checkpoint-2000\model.safetensors
tokenizer config file saved in ./results\checkpoint-2000\tokenizer_config.json
Special tokens file saved in 

Generated Subdomains for v2 with Curriculum Learning:
example.com  ssl1066337-1e2f-4b4f.example.com
example.com  ssl1063678-0e5c-40a7-.example.com
example.com  bst-a2b8a1b7-8e6.example.com
example.com  bst-f5f9e6e4-e8e.example.com
example.com  bst-a9c9d6d1-f7a.example.com
