# Load Pretrained Model and Tokenizer
Load the pretrained T5 model and tokenizer using the `get_pretrained_model` function.

In [1]:
# Importing Libraries
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import warnings
warnings.filterwarnings('ignore', category=FutureWarning, module='tensorflow')

device = "cuda" if torch.cuda.is_available() else "cpu"

def get_pretrained_model(model_name="google/flan-t5-base"):
    """
    Load the pretrained T5 model and tokenizer.

    Parameters:
    - model_name: str, default "google/flan-t5-base", the name of the pretrained model to load

    Returns:
    - tokenizer: Pretrained tokenizer
    - model: Pretrained T5 model
    """
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    return tokenizer, model

# Load the pretrained model and tokenizer
model_name = "google/flan-t5-base"
tokenizer, model = get_pretrained_model()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# Preprocess Data
Define a function `preprocess_function` to tokenize the input and output examples, and prepare labels.

In [7]:
# Preprocess Data

def preprocess_function(examples, tokenizer):
    """
    Tokenize the dataset for input and output.

    Parameters:
    - examples: Dictionary with "input" and "output" keys
    - tokenizer: Pretrained tokenizer

    Returns:
    - Dictionary with tokenized inputs and labels
    """
    prefix = ""
    if model_name in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", "google/flan-t5-base"]:
        prefix = "Rephrase the following technical explanation into a natural language sentence:"


    inputs = tokenizer(
        [prefix + text for text in examples["input"]],
        padding="max_length", truncation=True, max_length=256
    )
    outputs = tokenizer(
        examples["output"],
        padding="max_length", truncation=True, max_length=128
    )
    inputs["labels"] = outputs["input_ids"]
    return inputs

# Prepare Dataset
Convert the raw data into a tokenized dataset using the `prepare_dataset` function.

In [8]:
from datasets import Dataset

def prepare_dataset(raw_data, tokenizer):
    """
    Convert raw data into a tokenized dataset.

    Parameters:
    - raw_data: List of dictionaries with "input" and "output" keys
    - tokenizer: Pretrained tokenizer

    Returns:
    - tokenized_datasets: Tokenized dataset
    """
    dataset = Dataset.from_list(raw_data)
    tokenized_datasets = dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    return tokenized_datasets

# Sample dataset
raw_data = [
    # --- Simple Condition Rules ---
    {"input": "C: getKitchenCoffee requires [staffCardAvailable]", "output": "You can get coffee from the kitchen if a staff card is available."},
    {"input": "C: enterOffice requires [securityBadge]", "output": "You need a security badge to enter the office."},
    {"input": "C: accessServerRoom requires [adminClearance, keyCard]", "output": "To access the server room, you must have admin clearance and a key card."},
    {"input": "C: driveCompanyCar requires [validLicense, authorization]", "output": "Driving the company car requires both a valid license and authorization."},
    {"input": "C: openSafe requires [correctPinCode]", "output": "The safe can only be opened with the correct PIN code."},
    {"input": "C: borrowLaptop requires [managerApproval]", "output": "You need manager approval to borrow a laptop."},
    {"input": "C: accessConfidentialFile requires [encryptionKey, clearanceLevel3]", "output": "To access the confidential file, you must have an encryption key and level 3 clearance."},

    # --- Complex Condition Rules ---
    {"input": "C: attendMeeting requires [calendarInvite]", "output": "To attend the meeting, you must have a calendar invite."},
    {"input": "C: accessProjectFolder requires [vpnAccess, teamMembership]", "output": "To access the project folder, you need VPN access and team membership."},
    {"input": "C: printDocuments requires [printerConnected, paperAvailable]", "output": "You can print documents only if the printer is connected and has paper available."},
    {"input": "C: unlockDevice requires [correctPassword, biometricScan]", "output": "You can unlock the device only with the correct password and a biometric scan."},
    {"input": "C: approveExpense requires [budgetApproval, financeSignature]", "output": "An expense can be approved only with budget approval and a finance department signature."},
    {"input": "C: deploySoftware requires [codeReview, securityCheck]", "output": "Software can be deployed only after a code review and a security check."},
    {"input": "C: bookFlight requires [travelApproval, availableBudget]", "output": "Booking a flight requires travel approval and an available budget."},
    {"input": "C: purchaseItem requires [sufficientBalance]", "output": "You can purchase the item only if you have sufficient balance."},
    {"input": "C: leaveBuilding requires [exitPass]", "output": "To leave the building, you need an exit pass."},
    {"input": "C: sendEmail requires [internetConnection]", "output": "You need an internet connection to send an email."},
    {"input": "C: startProject requires [projectApproval, assignedTeam]", "output": "A project can only start with approval and an assigned team."},

    # --- Simple Preference Rules ---
    {"input": "V: getOwnCard (0,0,0) > getOthersCard (0,0,2)", "output": "It is preferable to get your own card rather than using someone else's."},
    {"input": "V: useCompanyLaptop (1,1,1) > usePersonalLaptop (0,0,1)", "output": "Using a company laptop is preferred over using a personal laptop."},
    {"input": "V: takeElevator (3,0,1) > takeStairs (2,1,2)", "output": "Taking the elevator is preferred over taking the stairs in this situation."},
    {"input": "V: buyNewPrinter (5,2,1) > repairOldPrinter (3,3,3)", "output": "Buying a new printer is preferable to repairing the old one."},
    {"input": "V: rentCar (4,0,2) > usePublicTransport (3,1,4)", "output": "Renting a car is a better choice than using public transportation."},
    {"input": "V: installSoftwareUpdate (2,0,1) > postponeUpdate (1,1,3)", "output": "Installing the software update now is better than postponing it."},

    # --- Complex Preference Rules ---
    {"input": "V: attendMeetingRemotely (1,1,0) > attendInPerson (0,0,2)", "output": "It is preferable to attend the meeting remotely rather than in person."},
    {"input": "V: useCloudStorage (2,0,0) > useLocalStorage (1,1,3)", "output": "Using cloud storage is preferable to using local storage."},
    {"input": "V: printInBlackWhite (0,0,0) > printInColor (2,0,3)", "output": "Printing in black and white is preferred over printing in color."},
    {"input": "V: enableTwoFactorAuth (3,0,1) > relyOnPasswordOnly (1,1,3)", "output": "Enabling two-factor authentication is better than relying only on a password."},
    {"input": "V: hireExperiencedCandidate (5,1,0) > hireNewGraduate (3,2,4)", "output": "Hiring an experienced candidate is preferable to hiring a new graduate."},
    {"input": "V: deployStableVersion (2,0,0) > deployBetaVersion (1,1,3)", "output": "Deploying the stable version is preferable to deploying the beta version."},

    # --- Real-world Inspired Condition Rules ---
    {"input": "C: applyForLoan requires [creditScoreAbove600, steadyIncome]", "output": "To apply for a loan, you must have a credit score above 600 and a steady income."},
    {"input": "C: registerForExam requires [courseEnrollment, feePaid]", "output": "You can register for the exam only if you are enrolled in the course and have paid the fee."},
    {"input": "C: enterGym requires [membershipCard]", "output": "You need a membership card to enter the gym."},
    {"input": "C: submitAssignment requires [beforeDeadline, correctFormat]", "output": "Assignments must be submitted before the deadline and in the correct format."},
    {"input": "C: boardFlight requires [passport, boardingPass]", "output": "You must have a passport and boarding pass to board the flight."},
    {"input": "C: withdrawCash requires [debitCard, sufficientBalance]", "output": "You can withdraw cash only if you have a debit card and sufficient balance."},

    # --- Real-world Inspired Preference Rules ---
    {"input": "V: workFromHome (4,1,0) > workFromOffice (3,2,2)", "output": "Working from home is preferable to working from the office in this situation."},
    {"input": "V: useEmailForOfficialCommunication (3,0,1) > useMessagingApps (1,1,2)", "output": "Using email for official communication is better than using messaging apps."},
    {"input": "V: driveElectricCar (5,0,1) > drivePetrolCar (3,2,3)", "output": "Driving an electric car is preferable to driving a petrol car."},
    {"input": "V: buyOrganicFood (4,1,0) > buyNonOrganicFood (3,2,2)", "output": "Buying organic food is preferable to buying non-organic food."},
    {"input": "V: useRecyclableMaterials (3,0,0) > usePlasticMaterials (1,2,4)", "output": "Using recyclable materials is preferred over using plastic materials."}
]

# Prepare dataset
tokenized_datasets = prepare_dataset(raw_data, tokenizer)

# Split the dataset into training and evaluation sets
train_size = int(0.8 * len(tokenized_datasets))
train_dataset = tokenized_datasets.select(range(train_size))
eval_dataset = tokenized_datasets.select(range(train_size, len(tokenized_datasets)))

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

In [9]:
tokenized_datasets[0]

{'input': 'C: getKitchenCoffee requires [staffCardAvailable]',
 'output': 'You can get coffee from the kitchen if a staff card is available.',
 'input_ids': [419,
  27111,
  8,
  826,
  2268,
  7295,
  139,
  3,
  9,
  793,
  1612,
  7142,
  10,
  254,
  10,
  129,
  439,
  7059,
  35,
  3881,
  7398,
  15,
  2311,
  784,
  26416,
  6936,
  26,
  188,
  900,
  173,
  179,
  908,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

In [10]:
test_text = "C: getKitchenCoffee requires [staffCardAvailable]"
tokenized = tokenizer(test_text, return_tensors="pt")

print("Tokenized Input:", tokenized)
decoded = tokenizer.decode(tokenized["input_ids"][0])
print("Decoded Back:", decoded)

Tokenized Input: {'input_ids': tensor([[  205,    10,   129,   439,  7059,    35,  3881,  7398,    15,  2311,
           784, 26416,  6936,    26,   188,   900,   173,   179,   908,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Decoded Back: C: getKitchenCoffee requires [staffCardAvailable]</s>


# Train Model
Fine-tune the pretrained T5 model using the tokenized dataset with the `train_model` function.

In [12]:
from transformers import TrainingArguments, Trainer

def train_model(train_dataset, eval_dataset, model):
    """Fine-tune the T5 model and ensure loss is decreasing."""
    training_args = TrainingArguments(
        output_dir="./t5_nle",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=20,
        learning_rate=2e-5,
        weight_decay=0.01,
        evaluation_strategy="steps",
        save_strategy="steps",
        save_steps=100,
        logging_steps=50,  # Log loss every 50 steps
        logging_dir="./logs",
        report_to="none",
        fp16=True,
        do_eval=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    print("Starting training...")
    trainer.train()
    print("Training complete.")

    return model


# Train the model
trained_model = train_model(train_dataset, eval_dataset, model)

Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
50,0.0,


Training complete.


# Generate Explanation
Define a function `generate_explanation` to generate natural language explanations using the trained T5 model.

In [13]:
# Generate Explanation

def generate_explanation(model, tokenizer, input_text):
    """Generate a natural explanation using the trained T5 model."""
    input_text = "translate Formal to Natural: " + input_text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(model.device)

    print("Tokenized input:", inputs["input_ids"])  # Debugging

    output_ids = model.generate(
        **inputs,
        max_length=5000,
        num_beams=10,  # Improve quality with beam search
        early_stopping=True
    )

    print("Raw output (token IDs):", output_ids)  # Debugging

    decoded_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return decoded_text


# Example usage
input_text = "C: getKitchenCoffee requires [staffCardAvailable]"
generated_explanation = generate_explanation(trained_model, tokenizer, input_text)
print(f"Input: {input_text}\nGenerated Explanation: {generated_explanation}")

Tokenized input: tensor([[13959,  3025,   138,    12,  6869,    10,   205,    10,   129,   439,
          7059,    35,  3881,  7398,    15,  2311,   784, 26416,  6936,    26,
           188,   900,   173,   179,   908,     1]], device='cuda:0')
Raw output (token IDs): tensor([[    0,   784, 26416,  6936,    26,   188,   900,   173,   908,     1,
             1,     1]], device='cuda:0')
Input: C: getKitchenCoffee requires [staffCardAvailable]
Generated Explanation: [staffCardAvail]


# Test Model
Define a function `test_model` to test the model on a list of inputs and print the generated explanations.

In [None]:
def test_model(model, tokenizer, test_inputs):
    """
    Test the model on a list of inputs.

    Parameters:
    - model: Trained T5 model
    - tokenizer: Tokenizer corresponding to the model
    - test_inputs: List of formal explanations

    Prints:
    - Generated explanations
    """
    for test_input in test_inputs:
        explanation = generate_explanation(model, tokenizer, test_input)
        print(f"Input: {test_input}\nGenerated Explanation: {explanation}\n")

# Test the model
test_inputs = [
    "C: getKitchenCoffee requires [staffCardAvailable]",
    "V: getOwnCard (0,0,0) > getOthersCard (0,0,2)"
]
test_model(trained_model, tokenizer, test_inputs)

# Example Usage
Demonstrate the usage of the functions with sample data, including loading the model, preparing the dataset, training the model, and testing it with example inputs.

In [None]:
# Example Usage

# Load the pretrained model and tokenizer
tokenizer, model = get_pretrained_model()

# Sample dataset
raw_data = [
    {
        "input": "C: getKitchenCoffee requires [staffCardAvailable]",
        "output": "You can get coffee from the kitchen if a staff card is available."
    },
    {
        "input": "V: getOwnCard (0,0,0) > getOthersCard (0,0,2)",
        "output": "It is preferable to get your own card rather than using someone else's."
    }
]

# Prepare dataset
tokenized_datasets = prepare_dataset(raw_data, tokenizer)

# Split the dataset into training and evaluation sets
train_size = int(0.8 * len(tokenized_datasets))
train_dataset = tokenized_datasets.select(range(train_size))
eval_dataset = tokenized_datasets.select(range(train_size, len(tokenized_datasets)))

# Train the model
trained_model = train_model(train_dataset, eval_dataset, model)

# Test the model
test_inputs = [
    "C: getKitchenCoffee requires [staffCardAvailable]",
    "V: getOwnCard (0,0,0) > getOthersCard (0,0,2)"
]
test_model(trained_model, tokenizer, test_inputs)