# Fine-tuning

In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2TokenizerFast, TrainingArguments, Trainer
from torch.utils.data import Dataset
import torch
import pandas as pd
import math

## Loading dataset for fine-tuning

In [55]:
# Load the CSV
file_path = "graphviz_dataset.csv"
df = pd.read_csv(file_path)

# Combine data into training format
train_data = []
for index, row in df.iterrows():
    command = row["sentence"]
    graph = row["graph"]
    formatted_text = f"Command: {command}\nGraph: {graph}"
    train_data.append(formatted_text)

## Loading gpt2-small

In [56]:
# Load model and tokenizer
model_name = "gpt2"
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = 512

model = GPT2LMHeadModel.from_pretrained(model_name)

# Resize model embeddings (required after setting pad_token)
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

## Checking max_length to avoid unfinished generations due to it's limit

In [39]:
lengths = [len(tokenizer(f"Command: {s}\nGraph: {g}")["input_ids"]) for s, g in zip(df["sentence"], df["graph"])]
print("Max length:", max(lengths))
print("Avg length:", sum(lengths) / len(lengths))

Max length: 456
Avg length: 276.2458


## Preparing data for fine-tuning

In [57]:
class CommandDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tokenized = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": tokenized["input_ids"].squeeze(0),
            "attention_mask": tokenized["attention_mask"].squeeze(0),
            "labels": tokenized["input_ids"].squeeze(0),  # Labels for causal LM training
        }

# Create the dataset
dataset = CommandDataset(train_data, tokenizer)

In [58]:
from sklearn.model_selection import train_test_split

# Splitting dataset 80/10/10
train_texts, temp_texts = train_test_split(train_data, test_size=0.2, random_state=42)
val_texts, test_texts = train_test_split(temp_texts, test_size=0.5, random_state=42)

# Creating CommandDatasets
train_dataset = CommandDataset(train_texts, tokenizer)
val_dataset = CommandDataset(val_texts, tokenizer)
test_dataset = CommandDataset(test_texts, tokenizer)

## Saving test dataset {"sentence": ..., "graph": ...} for fine-tuned model evaluation

In [59]:
df_test = pd.DataFrame(test_texts)
df_test.to_csv("test_set_fixed.csv", index=False)

## Fine-tuning
#### Stopped after 2 epochs as there was no significant improvement

In [60]:
training_args = TrainingArguments(
    output_dir="./results",             # Where to save the model
    overwrite_output_dir=True,
    num_train_epochs=3,                 # Number of epochs
    per_device_train_batch_size=1,      # Batch size
    gradient_accumulation_steps=4,
    save_steps=500,                     # Save checkpoint every 500 steps
    save_total_limit=2,                 # Only keep the 2 most recent checkpoints
    eval_strategy="epoch",              # Evaluate after each epoch
    logging_dir="./logs",               # Logging directory
    logging_steps=500,
    do_train=True,
    do_eval=True,
    fp16=False,                         # Disable mixed precision (use full precision)
)

In [61]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Start fine-tuning
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0678,0.061239
2,0.0594,0.056563


KeyboardInterrupt: 

## Saving fine-tuned model

In [63]:
model.save_pretrained("./fine_tuned_gpt2_graphs2")
tokenizer.save_pretrained("./fine_tuned_gpt2_graphs2")

('./fine_tuned_gpt2_graphs2\\tokenizer_config.json',
 './fine_tuned_gpt2_graphs2\\special_tokens_map.json',
 './fine_tuned_gpt2_graphs2\\vocab.json',
 './fine_tuned_gpt2_graphs2\\merges.txt',
 './fine_tuned_gpt2_graphs2\\added_tokens.json',
 './fine_tuned_gpt2_graphs2\\tokenizer.json')

## Loading non-fine-tuned and fine-tuned models

In [2]:
# Load the non-fine-tuned GPT-2 model
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
non_fine_tuned_model = GPT2LMHeadModel.from_pretrained("gpt2")

# Load the fine-tuned GPT-2 model
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2_graphs2")


In [4]:
def generate_text(model, tokenizer, prompt, max_length=50):
    inputs = tokenizer(prompt, return_tensors="pt").input_ids
    output = model.generate(inputs, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def output_comparison(prompt):
    # Generate text from non-fine-tuned GPT-2
    non_fine_tuned_output = generate_text(non_fine_tuned_model, tokenizer, prompt)

    # Generate text from fine-tuned GPT-2
    fine_tuned_output = generate_text(fine_tuned_model, tokenizer, prompt, max_length=512)

    print("Non-Fine-Tuned Output:\n", non_fine_tuned_output)
    print("Fine-Tuned Output:\n", fine_tuned_output)

# def calculate_perplexity(model, tokenizer, prompt):
#     inputs = tokenizer(prompt, return_tensors="pt")
#     labels = inputs["input_ids"]
#     outputs = model(**inputs, labels=labels)
#     loss = outputs.loss
#     perplexity = math.exp(loss.item())
#     return perplexity

# def perplexity_comparison(prompt):
#     # Calculate perplexity for both models
#     non_fine_tuned_perplexity = calculate_perplexity(non_fine_tuned_model, tokenizer, prompt)
#     fine_tuned_perplexity = calculate_perplexity(fine_tuned_model, tokenizer, prompt)

#     print("Non-Fine-Tuned Perplexity:", non_fine_tuned_perplexity, "\n")
#     print("Fine-Tuned Perplexity:", fine_tuned_perplexity, "\n")

Lower perplexity indicates that the model's predictions are more accurate and that it is better at understanding and predicting the text.
Higher perplexity means the model's predictions are less accurate, implying it struggles to model the language or predict the next token.

## Two prompts, just to show the huge gap between non-fine-tuned and fine-tuned models.

In [5]:
prompts = [
    "Command: Pick up the soft remote from the wooden couch and place it on the wooden kitchen table.",
    "Command: Take the pink shirt from the round table and hook it on the hanger."
]


In [6]:
for prompt in prompts:
    print("Prompt:\n", prompt, '\n')
    output_comparison(prompt)
    # perplexity_comparison(prompt)
    print("_______________________________________________________________________ \n")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Prompt:
 Command: Pick up the soft remote from the wooden couch and place it on the wooden kitchen table. 

Non-Fine-Tuned Output:
 Command: Pick up the soft remote from the wooden couch and place it on the wooden kitchen table.

The soft remote will be placed on the table and the soft remote will be placed on the table.

The soft remote will be placed
Fine-Tuned Output:
 Command: Pick up the soft remote from the wooden couch and place it on the wooden kitchen table.
Graph: digraph SemanticGraph {
    node [shape=ellipse, style=filled, fillcolor=white];
    pick [label="Pick", shape=box, fillcolor=lightblue];
    remote [label="remote", shape=ellipse, fillcolor=white];
    kitchen_table [label="kitchen table", shape=ellipse, fillcolor=white];
    place [label="Place", shape=box, fillcolor=lightgreen];
    soft [label="soft", shape=ellipse, fillcolor=white];
    wooden [label="wooden", shape=ellipse, fillcolor=white];
    couch [label="couch", shape=ellipse, fillcolor=white];
    soft -

#### Non-fine-tuned model
Non-fine-tuned gpt2-small struggles to generate anything that would make any sense. It often just repeats the command until it reaches max_length limit. That's why the limit is set to 50.

#### Fine-tuned model
Fine-tuned gpt2-small generated semantic graphs with correct graphviz syntax. The perplexity was lower comparing to non-fine-tuned model.

## Non-fine-tuned model few-shot prompting

In [8]:
few_shot_prompt = """Command: Pick the red book from the shelf and place it on the table.
Graph: digraph SemanticGraph {
    node [shape=ellipse, style=filled, fillcolor=white];
    pick [label="Pick", shape=box, fillcolor=lightblue];
    book [label="book", shape=ellipse, fillcolor=white];
    shelf [label="shelf", shape=ellipse, fillcolor=white];
    table [label="table", shape=ellipse, fillcolor=white];
    red [label="red", shape=ellipse, fillcolor=white];
    place [label="Place", shape=box, fillcolor=lightgreen];
    red -> book [label="attribute"];
    pick -> book [label="object"];
    pick -> shelf [label="from"];
    place -> book [label="object"];
    place -> table [label="on"];
}

Command: Take the green bottle from the drawer and put it on the counter.
Graph: digraph SemanticGraph {
    node [shape=ellipse, style=filled, fillcolor=white];
    take [label="Take", shape=box, fillcolor=lightblue];
    bottle [label="bottle", shape=ellipse, fillcolor=white];
    drawer [label="drawer", shape=ellipse, fillcolor=white];
    counter [label="counter", shape=ellipse, fillcolor=white];
    green [label="green", shape=ellipse, fillcolor=white];
    put [label="Put", shape=box, fillcolor=lightgreen];
    green -> bottle [label="attribute"];
    take -> bottle [label="object"];
    take -> drawer [label="from"];
    put -> bottle [label="object"];
    put -> counter [label="on"];
}

Command: Pick up the soft remote from the wooden couch and place it on the kitchen table.
Graph:
"""

print(generate_text(non_fine_tuned_model, tokenizer, few_shot_prompt, max_length=1024))

Command: Pick the red book from the shelf and place it on the table.
Graph: digraph SemanticGraph {
    node [shape=ellipse, style=filled, fillcolor=white];
    pick [label="Pick", shape=box, fillcolor=lightblue];
    book [label="book", shape=ellipse, fillcolor=white];
    shelf [label="shelf", shape=ellipse, fillcolor=white];
    table [label="table", shape=ellipse, fillcolor=white];
    red [label="red", shape=ellipse, fillcolor=white];
    place [label="Place", shape=box, fillcolor=lightgreen];
    red -> book [label="attribute"];
    pick -> book [label="object"];
    pick -> shelf [label="from"];
    place -> book [label="object"];
    place -> table [label="on"];
}

Command: Take the green bottle from the drawer and put it on the counter.
Graph: digraph SemanticGraph {
    node [shape=ellipse, style=filled, fillcolor=white];
    take [label="Take", shape=box, fillcolor=lightblue];
    bottle [label="bottle", shape=ellipse, fillcolor=white];
    drawer [label="drawer", shape=elli

# Fine-tuned model evaluation

## Generating predictions on the test dataset

In [6]:
import evaluate
import torch
from tqdm import tqdm
import pandas as pd
import json

# Loading test dataset (sentence and graph are already separated into different columns)
test_df = pd.read_csv("test_set_v2.csv")

predictions = []
references = []
inputs_list = []

# Generating predictions for all sentences in the test dataset
for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    input_text = row["sentence"]
    expected_output = row["graph"]

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to(fine_tuned_model.device)
    with torch.no_grad():
        outputs = fine_tuned_model.generate(**inputs, max_length=512)

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    
    inputs_list.append(input_text)
    predictions.append(decoded)
    references.append(expected_output.strip())

# Saving results to csv file
output_df = pd.DataFrame({
    "input": inputs_list,
    "expected": references,
    "predicted": predictions
})
output_df.to_csv("predictions_fine_tuned_gpt2.csv", index=False)

  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|                                                                               | 1/1000 [00:06<1:44:45,  6.29s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|▏                                                                              | 2/1000 [00:13<1:54:18,  6.87s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|▏                                                                              | 3/1000 [00:22<2:08:30,  7.73s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|▎                                                                              | 4/1000 [00:27<1:49:23,  6.59s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|▍                                       

In [7]:
# Load CSV with columns: input, expected, predicted
df = pd.read_csv("predictions_fine_tuned_gpt2.csv")

# Remove the input from the beginning of predicted, if present
def remove_input_from_prediction(row):
    input_text = row["input"].strip()
    predicted_text = row["predicted"].strip()

    if predicted_text.startswith(input_text):
        predicted_text = predicted_text[len(input_text):].strip()

    return predicted_text

# Apply the cleaning function
df["predicted_clean"] = df.apply(remove_input_from_prediction, axis=1)

# Save the updated DataFrame to a new CSV
df.to_csv("predictions_cleaned.csv", index=False)

## Evaluation

In [31]:
import pydot
from graphviz import Source

df = pd.read_csv("predictions_cleaned.csv")

predictions = df["predicted_clean"].tolist()
references = df["expected"].tolist()

# Loading bleu and rouge metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

bleu_result = bleu.compute(predictions=predictions, references=[[r] for r in references])
rouge_result = rouge.compute(predictions=predictions, references=references)

# Valid graphviz syntax count
def is_valid_dot(dot_string):
    try:
        # Try to parse the DOT string
        Source(dot_string)
        return True
    except Exception as e:
        return False

valid_graphs = 0

for prediction in predictions:
    if is_valid_dot(prediction):
        valid_graphs += 1

# Exact Match
exact_matches = sum(p.strip() == r.strip() for p, r in zip(predictions, references))
em_score = exact_matches / len(predictions)

# Fuzzy Exact Match
def extract_edges(dot_string):
    try:
        graphs = pydot.graph_from_dot_data(dot_string)
        if not graphs:
            return set()
        graph = graphs[0]
        return set((e.get_source(), e.get_destination()) for e in graph.get_edges())
    except Exception:
        return set()

def fuzzy_dot_match(pred, ref):
    return extract_edges(pred) == extract_edges(ref)

fuzzy_matches = [fuzzy_dot_match(p, r) for p, r in zip(predictions, references)]
fuzzy_accuracy = sum(fuzzy_matches) / len(fuzzy_matches)

results = {
    "BLEU": bleu_result["bleu"],
    "ROUGE-L": rouge_result["rougeL"],
    "Valid_graphs": valid_graphs / len(predictions),
    "Exact_Match": em_score,
    "Fuzzy_Exact_Match": fuzzy_accuracy
}

print(json.dumps(results, indent=2))

{
  "BLEU": 0.9929166994743918,
  "ROUGE-L": 0.9374676548622919,
  "Valid_graphs": 1.0,
  "Exact_Match": 0.336,
  "Fuzzy_Exact_Match": 0.935
}


In [3]:
import sys
import transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2TokenizerFast, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
import pandas as pd
import evaluate
import tqdm
import pydot
import graphviz

print(f"Python version: {sys.version.split()[0]}")
print("transformers:", transformers.__version__)
print("torch:", torch.__version__)
print("pandas:", pd.__version__)
print("evaluate:", evaluate.__version__)
print("tqdm:", tqdm.__version__)
print("pydot:", pydot.__version__)
print("graphviz:", graphviz.__version__)

Python version: 3.12.7
transformers: 4.49.0
torch: 2.6.0+cpu
pandas: 2.2.2
evaluate: 0.4.3
tqdm: 4.66.5
pydot: 3.0.4
graphviz: 0.20.3
