In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Every path used below ("queries/...", "stratisfimal_results/...") is relative to
# the parent of the notebook's directory. The sentinel makes this cell idempotent:
# re-running it in a live kernel must not climb one directory higher each time.
if "_cwd_moved_to_parent" not in globals():
    os.chdir("..")
    _cwd_moved_to_parent = True

In [3]:
import ast
import json
import numpy as np
import pandas as pd

from util import bfs

# In context learning prompts

In this notebook we create a set of prompts to test the in-context learning capabilities of ChatGPT when applied to graph visualization problems.

In [4]:
# Seed NumPy's legacy global RNG; the np.random.choice calls further down draw
# from it, so the order in which those cells run determines the samples.
seed = 42
np.random.seed(seed)

## Rank assignment prompts

We will start with the problem of rank assignment. We will generate the new prompts by randomly sampling `k_samples` graphs (3 in the configuration below) from files other than the current one, and prepending them together with their correct solutions to the prompt asking for the solution of the current graph.

In [5]:
# Task instructions placed at the start of every rank-assignment prompt, before
# the solved examples. The string is not raw, so the trailing "\n" escape on the
# last instruction line is an actual newline character.
prompt_base = """
Perform a rank assignment on the graph. Use node 0 as a source for the graph. Each node must be assigned to a rank that is equal to the shortest path between that node and the source. Thus, node 0 will be assigned to rank 0, and the neighbors of node 0 will be assigned to rank 1. 
Write no explanations, only respond with the id of each node and the rank it has been assigned to in a format <id> - <rank>.\n
"""

In [6]:
# Directory the rank-assignment prompt files are read from. The file names are
# kept as a set so the current file can later be excluded with a set difference.
rank_prompt_query_dir = "queries/rank_prompts"
all_rank_prompt_files = set(os.listdir(rank_prompt_query_dir))

In [7]:
def read_prompt_edge_list(query: str) -> list:
    """Extract the edge list embedded in a rank-assignment prompt.

    The edge list is the text that follows the "edge connections:" marker and
    precedes the "Perform a rank assignment" instruction. That text is parsed
    as a Python literal and returned as a list.

    Raises:
        IndexError: if the prompt does not contain "edge connections:".
    """
    after_marker = query.split("edge connections:")[1]
    literal_text = after_marker.split("Perform a rank assignment")[0].strip()
    return list(ast.literal_eval(literal_text))

def rank_assignment_to_formatted_str(rank_assignment: dict) -> str:
    """Render a rank assignment as "<id> - <rank>" lines.

    Args:
        rank_assignment: mapping from a rank to the iterable of node ids on it.

    Returns:
        One newline-terminated "<node> - <rank>" line per node, in the mapping's
        iteration order; the empty string when there are no nodes.
    """
    return "".join(
        f"{node} - {rank}\n"
        for rank, nodes in rank_assignment.items()
        for node in nodes
    )

def minimize_prompt(query: str) -> str:
    """Return `query` without its last two newline-separated lines.

    A query with two lines or fewer collapses to the empty string.
    """
    kept_lines = query.split("\n")[:-2]
    return "\n".join(kept_lines)

In [8]:
# k_samples: number of solved examples prepended to each prompt.
# sample_size: number of prompts to build.
k_samples = 3
sample_size = 100

# Sample `sample_size` distinct files from all_rank_prompt_files. The candidates
# are sorted first: iteration order of a set of strings changes between
# interpreter runs (hash randomization), so sampling from `list(set)` would not
# be reproducible even with a fixed seed.
sampled_rank_prompt_files = np.random.choice(
    sorted(all_rank_prompt_files), sample_size, replace=False
)
print(f"Sampled {len(sampled_rank_prompt_files)} queries")

Sampled 100 queries


In [9]:
# Maps each sampled prompt file name to its in-context-learning prompt:
# prompt_base, then `k_samples` solved examples, then the unsolved current query.
rank_prompts_icl = {}

for rank_prompt_file in sampled_rank_prompt_files:
    # Sample `k_samples` other prompt files, never the current one. Sorting the
    # candidates keeps the seeded draw reproducible (set order is not stable
    # across interpreter runs).
    other_rank_prompt_files = np.random.choice(
        sorted(all_rank_prompt_files - {rank_prompt_file}), k_samples, replace=False
    )
    assert rank_prompt_file not in other_rank_prompt_files

    # Read the current prompt and the example prompts, closing every file handle
    with open(os.path.join(rank_prompt_query_dir, rank_prompt_file)) as f:
        rank_prompt = f.read()
    other_rank_prompts = []
    for other_rank_prompt_file in other_rank_prompt_files:
        with open(os.path.join(rank_prompt_query_dir, other_rank_prompt_file)) as f:
            other_rank_prompts.append(f.read())

    # Extract the edge list for both the current prompt and the other prompts.
    # The current prompt's edge list is not used in the prompt text; parsing it
    # still fails early if the file does not have the expected layout.
    rank_prompt_edge_list = read_prompt_edge_list(rank_prompt)
    other_rank_prompt_edge_lists = [
        read_prompt_edge_list(other_rank_prompt)
        for other_rank_prompt in other_rank_prompts
    ]

    # Compute the correct rank assignment for the other prompts, using node 0
    # as the source (as the instructions in the prompt require)
    other_rank_prompt_rank_assignments = [
        bfs(other_rank_prompt_edge_list, 0)
        for other_rank_prompt_edge_list in other_rank_prompt_edge_lists
    ]

    # Convert the rank assignments in the expected format, i.e. <id> - <rank> one per row
    other_rank_prompt_rank_assignments_str = [
        rank_assignment_to_formatted_str(other_rank_prompt_rank_assignment)
        for other_rank_prompt_rank_assignment in other_rank_prompt_rank_assignments
    ]

    # Build the prompt: one "Input/Answer" pair per example, then the open query
    prompt = [
        "Input:\n{}\nAnswer:\n{}\n".format(
            minimize_prompt(example_prompt), example_answer
        )
        for example_prompt, example_answer in zip(
            other_rank_prompts, other_rank_prompt_rank_assignments_str
        )
    ]
    prompt = prompt_base + "".join(prompt)
    prompt += "Input:\n{}\nAnswer:\n".format(minimize_prompt(rank_prompt))

    rank_prompts_icl[rank_prompt_file] = prompt.strip()

    del rank_prompt, other_rank_prompts, prompt


In [10]:
# Output directory for the rank-assignment ICL prompts; created if it is missing.
rank_prompts_icl_query_dir = "queries/rank_prompts_icl"
os.makedirs(rank_prompts_icl_query_dir, exist_ok=True)

In [11]:
# Write the prompts to disk. Each ICL prompt is stored under the same file name
# as the original prompt file it was built for, overwriting any existing file.
for rank_prompt_file, prompt in rank_prompts_icl.items():
    with open(os.path.join(rank_prompts_icl_query_dir, rank_prompt_file), "w") as f:
        f.write(prompt)

## Transpose prompts

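We can repeat a similar process for the crossing minimization (transpose) prompts. The difference is that the example answers are not computed here: they are read from a file of precomputed orderings with the minimal number of crossings.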

In [5]:
# Task instructions placed at the start of every crossing-minimization prompt.
# NOTE: this rebinds `prompt_base`, which the rank-assignment section also uses.
prompt_base = """We want to reduce crossings on a graph drawing.
We want to order the nodes in the layers so that there are few crossings in the graph. 
- visit every rank once, starting from layer 0
- try different transpositions of the nodes in that layer
- count the crossings for every transposition. There is a crossing between two edges e1 and e2 if the source of e1 comes before the source of e2, and the target of e1 comes after the target of e2 in the order of nodes in a layer.
- record the transposition that produces the least amount of crossings, and sort the nodes accordingly.
Nodes can NOT be moved to a different layer. You can only reorder nodes within layers.
Write no code and no explanation.
Return the layers dictionary with the nodes ordered, in a code block. I want it formatted like this: {<layer_id>:[<list of ordered nodes>]}"""

In [6]:
# Directory the crossing-minimization query files are read from. The file names
# are kept as a set so the current file can later be excluded with a set difference.
transpose_query_dir = "queries/transpose_prompts3"
all_transpose_files = set(os.listdir(transpose_query_dir))

In [7]:
# k_samples: number of solved examples prepended to each prompt.
# sample_size: number of prompts to build.
k_samples = 3
sample_size = 100

# Sample `sample_size` distinct files from all_transpose_files. The candidates
# are sorted first: iteration order of a set of strings changes between
# interpreter runs (hash randomization), so sampling from `list(set)` would not
# be reproducible even with a fixed seed.
sampled_transpose_files = np.random.choice(
    sorted(all_transpose_files), sample_size, replace=False
)
print(f"Sampled {len(sampled_transpose_files)} queries")

Sampled 100 queries


In [8]:
# Maps each query file name to {"edges": <parsed edge list>, "ranks": <dict layer -> nodes>}.
transpose_queries = {}

for query_file in sorted(os.listdir(transpose_query_dir)):
    query_file_path = os.path.join(transpose_query_dir, query_file)

    # Use a context manager so the handle is closed right after reading
    with open(query_file_path, "r") as f:
        query_str = f.read().strip()

    # The edge list is the Python literal on the remainder of the "edges = " line
    query_edges = query_str.split("edges = ")[1]
    query_edges = query_edges.split("\n")[0].strip()
    query_edges = ast.literal_eval(query_edges)

    # The layer description starts after "ranks = " and ends at the first blank line
    query_ranks = query_str.split("ranks = ")[1]
    query_ranks = query_ranks.split("\n\n")[0].strip()
    # Drop the first 6 characters of each line (the "Layer " prefix), which
    # leaves "<id>: [<nodes>]"; joined with commas and wrapped in braces the
    # lines form a dict literal.
    layer_entries = [line[6:].strip() for line in query_ranks.split("\n")]
    query_ranks = ast.literal_eval("{" + ",".join(layer_entries) + "}")

    transpose_queries[query_file] = {"edges": query_edges, "ranks": query_ranks}

print("Number of queries: ", len(transpose_queries))

Number of queries:  134


In [9]:
def format_query_to_str(edges: list, ranks: dict) -> str:
    """Render an edge list and a layer assignment as the text of a query.

    Args:
        edges: the edge list, written out after "edges = ".
        ranks: mapping from a layer id to the nodes in that layer. The entry
            whose key compares equal to 0 gets the "ranks = " prefix.

    Returns:
        The query lines joined with newlines, without a trailing newline.
    """
    lines = [
        "This is the list of edges. Every edge has [<source_id>, <target_id>]:",
        "edges = {}".format(edges),
        "This is the description of what nodes are contained in what layer: ",
    ]
    for layer, nodes in ranks.items():
        template = "ranks = Layer {}: {}" if layer == 0 else "Layer {}: {}"
        lines.append(template.format(layer, nodes))
    return "\n".join(lines)

def format_ground_truth(ranks: dict) -> str:
    """Format a ground-truth layer assignment as a multi-line dict literal.

    Args:
        ranks: mapping from a layer id to the ordered nodes of that layer. Keys
            are converted with `int`, so string keys such as "0" are accepted.

    Returns:
        "{", one "<layer>: <nodes>" line per layer separated by commas, then "}".
        An empty mapping yields an empty pair of braces ("{\\n\\n}").

    Raises:
        ValueError: if a key cannot be converted to an integer.
    """
    gt = {int(k): v for k, v in ranks.items()}

    # Joining the entries (instead of appending ",\n" to each one and slicing
    # off the last two characters) keeps the opening brace for an empty mapping.
    body = ",\n".join(f"{layer}: {nodes}" for layer, nodes in gt.items())
    return "{\n" + body + "\n}"
    

In [10]:
# Let's load the ground truth answers for the minimal number of crossings for each query.
# The context manager closes the file handle as soon as the JSON has been parsed.
with open("stratisfimal_results/optimal_ranks_10_11.json", "r") as f:
    strasifimal_results = json.load(f)
print("Number of queries with ground truth: ", len(strasifimal_results))

Number of queries with ground truth:  207


In [11]:
# Maps each sampled query file name to its in-context-learning prompt:
# prompt_base, then `k_samples` solved examples, then the unsolved current query.
transpose_icl = {}

for transpose_file in sampled_transpose_files:
    # Sample `k_samples` other query files, never the current one. Sorting the
    # candidates keeps the seeded draw reproducible (set order is not stable
    # across interpreter runs).
    other_transpose_files = np.random.choice(
        sorted(all_transpose_files - {transpose_file}), k_samples, replace=False
    )
    assert transpose_file not in other_transpose_files

    cur_edges = transpose_queries[transpose_file]["edges"]
    cur_ranks = transpose_queries[transpose_file]["ranks"]
    other_edges = [
        transpose_queries[other_file]["edges"] for other_file in other_transpose_files
    ]
    other_ranks = [
        transpose_queries[other_file]["ranks"] for other_file in other_transpose_files
    ]

    # Ground truth entries are keyed by the file name without its ".txt" suffix.
    # The current query's solution is not written into the prompt; the lookup
    # still raises KeyError if no stored solution exists for it.
    ground_truth = strasifimal_results[transpose_file.split(".txt")[0]]
    other_ground_truth = [
        strasifimal_results[other_file.split(".txt")[0]]
        for other_file in other_transpose_files
    ]

    cur_query = format_query_to_str(cur_edges, cur_ranks)
    other_queries = [
        format_query_to_str(edges, ranks)
        for edges, ranks in zip(other_edges, other_ranks)
    ]
    other_ground_truth_str = [format_ground_truth(gt) for gt in other_ground_truth]

    # Build the prompt: one "Input/Answer" pair per example, then the open query
    prompt = [
        "\nInput:\n{}\nAnswer:\n\n{}\n".format(example_query, example_answer)
        for example_query, example_answer in zip(other_queries, other_ground_truth_str)
    ]
    prompt = prompt_base + "\n" + "".join(prompt)
    prompt += "\nInput:\n{}\n\nAnswer:\n".format(cur_query)

    transpose_icl[transpose_file] = prompt.strip()
    del (
        cur_edges,
        cur_ranks,
        other_edges,
        other_ranks,
        ground_truth,
        other_ground_truth,
        prompt,
    )


In [12]:
# tojs = [i.split(".txt")[0].strip() for i in sorted(list(all_transpose_files))]
# json.dump(tojs, open("transpose_prompts3_files.json", "w"), indent=2)

In [13]:
# Output directory for the crossing-minimization ICL prompts; created if it is missing.
transpose_prompts_icl_query_dir = "queries/transpose_prompts_icl"
os.makedirs(transpose_prompts_icl_query_dir, exist_ok=True)

In [14]:
# Write the prompts to disk. Each ICL prompt is stored under the same file name
# as the query file it was built for, overwriting any existing file.
for query_file, prompt in transpose_icl.items():
    query_file_path = os.path.join(transpose_prompts_icl_query_dir, query_file)
    with open(query_file_path, "w") as f:
        f.write(prompt)