# Generate exploration problem set (Chat history)

Find a file, write a question, and create a path

AutoGen Version 0.2.2

In [1]:
import autogen
from autogen import AssistantAgent, UserProxyAgent

In [2]:

import glob
import json
import os
import pdb
import random
import re
import sys
from typing import List

from termcolor import colored

from gpt_utils import num_tokens, reply

sys.path.append('../')
from utils import list_files  # SPM's utils

MODEL_NAME = "gpt-35-turbo"
MAX_COMMAND_STEPS = 100
NEED_VERIFY = False
REPOS_ROOT = os.path.expanduser("~/data/repos")
OUT_DIR = "rst_search"
DISALLOWED_TEXT = ["<|endoftext|>", ]

In [3]:
FILES_SUFFIX = [
    '.txt', '.md', '.py', '.ipynb',  '.js', '.html',
    '.java', '.c', '.cpp', '.h', '.hpp', '.sh', '.bash'
]


In [4]:
config_list_gpt3 = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={
        "model": ["gpt-35-turbo", "gpt-3.5-turbo"],
    },
)

config_list_gpt4 = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={
        "model": ["gpt-4", "gpt-4-0314", "gpt4", "gpt-4-32k", "gpt-4-32k-0314", "gpt-4-32k-v0314"],
    },
)

gpt3_llm_config = {"config_list": config_list_gpt3, "cache_seed": 42}
gpt4_llm_config = {"config_list": config_list_gpt4, "cache_seed": 42}

In [5]:
def _random_file(root: str, suffix: list, ignore_regex: list) -> str:
    # Initialize an empty list to store the filenames that
    # match the suffix criteria
    matching_files = []

    # Traverse through the directory tree starting from the root
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            # Check if the file has one of the defined suffixes
            if any(filename.lower().endswith(s.lower()) for s in suffix):
                # The file extension matches one of the suffixes
                dirpath = dirpath.replace("\\", "/")
                folder_paths = dirpath.split("/")
                if any(f.startswith(".") for f in folder_paths):
                    # Skip: the file is in a hidden folder
                    continue
                
                fname = os.path.join(dirpath, filename)
                
                # We search "ignore_regex" with the path name and file name, but not
                # the repo name.
                rel_name = os.path.relpath(fname, root)
                if any(re.search(regex, rel_name) for regex in ignore_regex):
                    # The file name matches one of the ignore regexes
                    continue
                # Append the full path of the file to the list
                matching_files.append(fname)
                
                
    if len(matching_files) == 0:
        pdb.set_trace()
    assert len(matching_files), "There is no matching file. Strange! Please take a look!"

    # Randomly select a file from the list of matching files
    return random.choice(matching_files)


In [6]:
def _random_block(filename: str, n_max_words: int) -> str:
    """
    Given a large file, return a random chunk of text (split by lines) 
    containing up to n_max_words words.
    
    Parameters:
        filename (str): The path to the file from which to read.
        n_max_words (int): The maximum number of words that the random block of 
            text should contain.
        
    Returns:
        str: A string containing the random block of text from the file.
    """
    # Initialize an empty list to store all lines from the file
    all_lines = []

    # Read all lines from the file and store them in the list
    with open(filename, 'r') as f:
        all_lines = f.readlines()

    # If the file is empty or contains no lines, return an empty string
    if not all_lines:
        return ""

    # Randomly choose a starting line index
    start_idx = random.randint(0, len(all_lines) - 1)

    # Initialize variables to keep track of the number of words and the
    # selected lines
    n_words = 0
    selected_lines = []

    # Loop to collect lines until n_max_words is reached or the end of
    # the file is reached
    for line in all_lines[start_idx:]:
        # line_words = line.split()
        n_words += num_tokens(line)

        if n_words > n_max_words:
            break

        selected_lines.append(line)

    return '\n'.join(selected_lines)



In [7]:
def norm_path(filename):
    # Check if the string contains any non-alphanumeric characters
    if any(not ch.isalnum() and ch not in [".", "_", "-", "/"]
           for ch in filename):
        return f'"{filename}"'
    return filename


def unnorm_path(filename):
    if filename[0] in ["\'", "\""] and filename[0] == filename[-1]:
        return unnorm_path(filename[1:-1])
    return filename


In [8]:
def commands_to_reach_destination(start: str,
                                  destination: str,
                                  folder_find_acc: float = 0.8) -> List[str]:
    """Use Linux's "ls", "cd", and "cat command to explore the path, 
    from `start` to the `destination`.

    Note that you are unfamiliar with the path, so you may need to "ls" to see
    the content inside a folder.

    It is guaranteed that the start and destination exist. 

    Args:
        start (str): a path
        destination (str): filename, a file which we want to find. 
        folder_find_acc (float, optional): the probability of finding a 
            correct folder. Defaults to 0.8.

    Returns:
        commands (list): a list of commands
    """
    assert os.path.isdir(start)
    assert os.path.isfile(destination)

    # Initialize an empty list to store the commands
    commands = []

    curr = start
    while True:
        commands.append("ls")

        # List the contents in the current directory
        contents = os.listdir(curr)

        # Separate the contents into files and folders
        files = [f for f in contents if os.path.isfile(os.path.join(curr, f))]
        folders = [f for f in contents if os.path.isdir(os.path.join(curr, f))]

        # Roll a dice to find the correct file
        if os.path.realpath(destination) in [
                os.path.realpath(os.path.join(curr, fname)) for fname in files
        ]:
            correct_command = f"cat {norm_path(os.path.basename(destination))}"
        else:
            # If we haven't found the file, we should try folders
            correct_path = os.path.relpath(destination, curr)
            cd_to = correct_path.split('/')[0]
            correct_command = f"cd {norm_path(cd_to)}"

        # List all possible actions
        all_possible_commands = [
            f"cat {norm_path(os.path.basename(fname))}" for fname in files
        ] + [f"cd {norm_path(os.path.basename(folder))}" for folder in folders]
        if curr != start:
            all_possible_commands += ["cd .."]

        if random.random() <= folder_find_acc:
            commands.append(correct_command)
        else:
            commands.append(random.choice(all_possible_commands))

        if commands[-1].startswith("cd "):
            dirname = unnorm_path(commands[-1][3:])
            curr = os.path.join(curr, dirname)
            curr = os.path.realpath(curr)
        else:
            cat_fname = unnorm_path(commands[-1].split()[-1])
            if os.path.realpath(os.path.join(
                    curr, cat_fname)) == os.path.realpath(destination):
                break

        if len(commands) > MAX_COMMAND_STEPS:
            raise RuntimeError("Too many commands")

    # Remove redundant "ls" command.
    # iterate through the commands, we don't need two adjacent "ls" commands,
    # even if there are some "cat" inside.
    rst = []
    need_ls = True
    for cmd in commands:
        if cmd.startswith("ls"):
            if need_ls:
                rst.append(cmd)
                need_ls = False
        else:
            rst.append(cmd)
            if cmd.startswith("cd"):
                need_ls = True

    print(colored(destination, "green"))
    print(commands)

    return commands


In [9]:

def optimal_path(start: str, destination: str) -> List[str]:
    """Use Linux's "ls", "cd", and "cat command to explore the path, 
    from `start` to the `destination`.

    Note that you are unfamiliar with the path, so you may need to "ls" to see
    the content inside a folder.

    It is guaranteed that the start and destination exist. 

    Args:
        start (str): a path
        destination (str): filename, a file which we want to find. 
        folder_find_acc (float, optional): the probability of finding a 
            correct folder. Defaults to 0.8.

    Returns:
        commands (list): a list of commands
    """
    assert os.path.isdir(start)
    assert os.path.isfile(destination)

    # folders = os.path.relpath(destination, start).split("/")
    # commands = []
    # for dirname in folders[:-1]:
    #     commands += ["ls", f"cd {dirname}"]
    # commands += ["ls", f"cat {folders[-1]}"]

    folders = os.path.relpath(destination, start).split("/")
    commands = [
        cmd for dirname in folders[:-1]
        for cmd in ["ls", f"cd {norm_path(dirname)}"]
    ] + ["ls", f"cat {norm_path(folders[-1])}"]

    return commands

In [10]:
def _random_question(root: str, filename: str, block: str, question_asker: AssistantAgent, user: UserProxyAgent) -> str:

    
    user.initiate_chat(question_asker, message=f"""
Now, ask questions regarding the following file.

--- Here are a part of the content from the file {filename} ---
{block}
------""")
    
    ans = user.last_message()["content"]
    
    pairs = re.findall(r'QUESTION:\s*(.*?)\nANSWER:\s*(.*?)\n\n', ans, re.DOTALL)
   
    if len(pairs) == 0:
        # Maybe just the answer is not generated. try again with "question" only.
        questions = re.findall(r'QUESTION:\s*(.*?)', ans, re.DOTALL)

        if len(questions) == 0:
            print(colored("NO questions generated by GPT!", "red"), filename)
            print(colored(ans, "yellow"))
            return []
        pairs = [(_ques, "") for _ques in questions]
        
    
    if not NEED_VERIFY:
        return pairs
    
    # Verify the generated question
    verified_pairs = []
    for question, answer in pairs:
        verify_prompt = f"""
Is the following question and answer specific to the given content? Or, is it a general question?

--- Here are the content ---
{block}

Question: {question}
Answer: {answer}

--- Reply with one word: SPECIFIC or GENERAL ---
"""
        ans_v = reply(verify_prompt, model_name=MODEL_NAME)

        if ans_v.upper().find("SPECIFIC") >= 0:
            verified_pairs.append((question, answer))
        else:
            print(colored("Skip the question:", "red"), question, "\t", answer)

    return verified_pairs


In [11]:
def gen(root: str, n_files: int = 10, outname: str = "out.json"):
    if os.path.exists(outname):
        print(f"Skip {outname}, because it already exists~")
        return

    rst = []
    num_success_attempts = 0
    
    architecture = list_files(root, file_suffixes=FILES_SUFFIX)
    architecture = architecture[:100] # use the first 100 files if there are too many.
    system_prompt = f"""
You are responsible to ask questions regarding a specific file inside a repository.

--- Repository Architecture ---
{architecture}
------


You need generate reading comprehension questions and answers based on the content.
You question should be very specific to the given file and the given content.
If the question is too general, don't even bother generating it.
The question should NOT be relevant to other files within the repository.

Use the format:

QUESTION: a question goes here
ANSWER: the answer to the question goes here

--- Good question example ---
QUESTION: How to generate an agent in the package?
QUESTION: Write code for "optimizer".
QUESTION: Show me suffix that are ignored from the pre-processing step.


--- Bad question example ---
QUESTION: What is the purpose of the code?
QUESTION: How to use the help function?
QUESTION: What is the purpose of @staticmethod?

"""
    open("prompt.tmp", "w").write(system_prompt)

    question_asker = AssistantAgent(
            name="Asker",
            human_input_mode="NEVER",
            max_consecutive_auto_reply=1,
            system_message=system_prompt,
            llm_config=gpt3_llm_config,
        )
    
    user = autogen.UserProxyAgent(
        name="User",
        human_input_mode="NEVER",
        max_consecutive_auto_reply=0
    )
    
    num_failed_attempts = 0
    while num_success_attempts < n_files and num_failed_attempts < 100:
        filename = _random_file(
            root=root,
            suffix=FILES_SUFFIX,
            ignore_regex=[".*out.*", ".*\.git.*", ".*test.*"])
        
        try:
            block = _random_block(filename, 3000)
        except Exception as e:
            print(e)
            num_failed_attempts += 1
            continue
        
        if num_tokens(block) < 100:
            print("A small block ignored in:", filename)
            num_failed_attempts += 1
            continue
        for disallowed in DISALLOWED_TEXT:
            if block.find(disallowed) >= 0:
                print(f"The string `{disallowed}` is in the block, which is disallowed!")
                num_failed_attempts += 1
                continue
        pairs = _random_question(root, filename, block, question_asker, user)

        if len(pairs) == 0:
            print(colored("NO questions generated by GPT!", "red"), filename)
        else:
            num_success_attempts += 1

        optimal = optimal_path(root, filename)  # the optimal path
        n_level = len([cmd for cmd in optimal if cmd.startswith("cd ")])

        for question, answer in pairs:
            try:
                commands = commands_to_reach_destination(root,
                                                         filename,
                                                         folder_find_acc=0.8)
            except Exception as e:
                print(e)
                continue

            print(colored(optimal, "red"))

            rst.append({
                "question": question,
                "answer": answer,
                "commands": commands,
                "optimal_path": optimal,
                "filename": os.path.relpath(filename, root),
                "root": os.path.relpath(root, REPOS_ROOT),
                "n_level": n_level
            })

    if len(rst) == 0:
        pdb.set_trace()

    # dump `rst` to json
    with open(outname, "w") as f:
        json.dump(rst, f, indent=2)



In [None]:
if __name__ == "__main__":

    random.seed(1)

    # print(cmds)

    os.makedirs(OUT_DIR, exist_ok=True)
    for dirname in os.listdir(REPOS_ROOT):
        print(f"Processing {dirname}...")
        gen(os.path.join(REPOS_ROOT, dirname),
            n_files=10,
            outname=os.path.join(OUT_DIR, f"{dirname}.json"))

        # pdb.set_trace()

    # Combine all /*.json into one json file
    all_data = []
    for filename in glob.glob(os.path.join(OUT_DIR, "*.json")):
        all_data += json.load(open(filename, "r"))

    json.dump(all_data, open("file_search.json", "w"), indent=2)

Processing chatbot-ui-main...
Skip rst_search/chatbot-ui-main.json, because it already exists~
Processing consistency_models-main...
Skip rst_search/consistency_models-main.json, because it already exists~
Processing LongLoRA-main...
Skip rst_search/LongLoRA-main.json, because it already exists~
Processing 3x-ui-main...
Skip rst_search/3x-ui-main.json, because it already exists~
Processing sqlcoder-main...
Skip rst_search/sqlcoder-main.json, because it already exists~
Processing floatui-main...
Skip rst_search/floatui-main.json, because it already exists~
Processing ai-main...
Skip rst_search/ai-main.json, because it already exists~
Processing gpt-migrate-main...
Skip rst_search/gpt-migrate-main.json, because it already exists~
Processing baize-chatbot-main...
Skip rst_search/baize-chatbot-main.json, because it already exists~
Processing conc-main...
Skip rst_search/conc-main.json, because it already exists~
Processing lit-llama-main...
Skip rst_search/lit-llama-main.json, because it a


Now, ask questions regarding the following file.

--- Here are a part of the content from the file /home/beibinli/data/repos/Coffee_Company/public/newsletter/2021 May/2.md ---


Opti Coffee's support extends beyond monetary assistance; it embodies a shared vision of a brighter, more equitable future for the coffee industry. By nurturing the growth of these young farmers, Opti Coffee fosters a cycle of positive change that uplifts families, communities, and the coffee landscape at large.



As the news of Opti Coffee's generous donation spreads, it serves as a source of inspiration and a call to action for the broader coffee community. The ripple effect of this contribution reminds us all of the immense potential that exists within the realm of coffee â€“ not only as a beloved beverage but as a vehicle for social progress and meaningful change.



With every sip of Opti Coffee's exceptional blends, coffee enthusiasts around the world become integral participants in this transformative 


--------------------------------------------------------------------------------
[33mAsker[0m (to User):

QUESTION: What are the key strategies for Opti Coffee's marketing success?
ANSWER: The key strategies for Opti Coffee's marketing success include developing a strong and consistent brand identity, crafting a compelling unique selling proposition, expanding its product line, prioritizing packaging and presentation, establishing a competitive yet value-based pricing strategy, leveraging both online sales and retail partnerships for distribution, and utilizing a mix of traditional and digital marketing channels.

--------------------------------------------------------------------------------
Too many commands
A small block ignored in: /home/beibinli/data/repos/Coffee_Company/solver/src/__init__.py
[33mUser[0m (to Asker):


Now, ask questions regarding the following file.

--- Here are a part of the content from the file /home/beibinli/data/repos/Coffee_Company/public/newsletter/


--------------------------------------------------------------------------------
[33mAsker[0m (to User):

QUESTION: How does the fulfill_demand method of the Cafe class handle the situation where the fulfilled quantity exceeds the demand for a specific coffee type?
ANSWER: If the fulfilled quantity exceeds the demand for a specific coffee type, the method raises a ValueError with the message "Fulfilled quantity exceeds demand".

--------------------------------------------------------------------------------
[32m/home/beibinli/data/repos/Coffee_Company/solver/src/cafe.py[0m
['ls', 'cat "Employee Benefits.md"', 'ls', 'cd solver', 'ls', 'cd src', 'ls', 'cat cafe.py']
[31m['ls', 'cd solver', 'ls', 'cd src', 'ls', 'cat cafe.py'][0m
A small block ignored in: /home/beibinli/data/repos/Coffee_Company/solver/cpp/CoffeeDistributionOptimizer.h
[33mUser[0m (to Asker):


Now, ask questions regarding the following file.

--- Here are a part of the content from the file /home/beibinli/data/


--------------------------------------------------------------------------------
[33mAsker[0m (to User):

QUESTION: What happens if the filtered data has no rows after the filtering process?
ANSWER: A ValueError will be raised with the message 'No bean with name {args.name} found.'

QUESTION: How is the datetime column generated from the 'month' and 'year' columns in the filtered data?
ANSWER: The 'month' and 'year' columns are combined and converted into a single datetime column using pd.to_datetime.

QUESTION: What is the purpose of the plot_data function?
ANSWER: The plot_data function is used to group the data by date and calculate the minimum, maximum, and mean price per unit of the bean, and then plot the price range (min to max) and mean price over time for the specified bean.

--------------------------------------------------------------------------------
[32m/home/beibinli/data/repos/Coffee_Company/visualization/bean_price.py[0m
['ls', 'cd visualization', 'ls', 'cat bean


--------------------------------------------------------------------------------
[33mAsker[0m (to User):

QUESTION: What does the 25% increase in revenue reflect about Opti Coffee?
ANSWER: The 25% increase in revenue reflects a thriving customer base that is drawn to Opti Coffee's offerings, a testament to the brand's ability to connect with individuals on a profound level.

QUESTION: What does the 15% boost in profit demonstrate about Opti Coffee?
ANSWER: The 15% boost in profit is a testament to Opti Coffee's deft financial management and operational efficiency, showcasing the company's ability to balance growth with responsible business practices.

QUESTION: What does Opti Coffee's success highlight beyond the numbers?
ANSWER: Opti Coffee's success highlights its holistic approach to business, including the dedication of its employees, partnerships forged, and the meaningful connections established with coffee-growing communities and coffee enthusiasts.

-------------------------


--------------------------------------------------------------------------------
[33mAsker[0m (to User):

QUESTION: What coordinates are used as the location for the folium map?
ANSWER: The coordinates used as the location for the folium map are provided by the `get_coordinates` function, which takes the city and country as input and returns the latitude and longitude.

QUESTION: What is the purpose of the folium.Circle function in the plot_map method?
ANSWER: The folium.Circle function in the plot_map method is used to create circles on the map, with each circle representing a city and its corresponding supplier count. The radius of the circle is based on the supplier count, and the popup provides information about the city, country, and the number of suppliers.

--------------------------------------------------------------------------------
[32m/home/beibinli/data/repos/Coffee_Company/visualization/cpp/plot_supplier_map.py[0m
['ls', 'cd visualization', 'ls', 'cat salary.py', 'l

QUESTION: How can you set the roasting cost for a specific coffee type in the Roastery class?
ANSWER: You can set the roasting cost for a specific coffee type in the Roastery class by using the method "setRoastingCost", which takes the coffee type, cost, and constraints as parameters.

QUESTION: What is the purpose of the "validateRoastingConstraints" method in the Roastery class?
ANSWER: The "validateRoastingConstraints" method in the Roastery class is used to validate the roasting constraints for a specific coffee type, ensuring that the provided constraints match the required constraints for roasting the coffee type.

QUESTION: In the Roastery class, how is the information about roasting and shipping costs displayed?
ANSWER: In the Roastery class, the information about roasting and shipping costs is displayed using the "displayInfo" method, which generates a formatted string containing the name, location, contact, roasting costs, and shipping costs.

--------------------------------


--------------------------------------------------------------------------------
[32m/home/beibinli/data/repos/Coffee_Company/public/our_coffee.md[0m
['ls', 'cd public', 'ls', 'cat our_coffee.md']
[31m['ls', 'cd public', 'ls', 'cat our_coffee.md'][0m
[32m/home/beibinli/data/repos/Coffee_Company/public/our_coffee.md[0m
['ls', 'cat "Manual%3A How to Negotiate with Coffee Bean Suppliers and Cafe Shops.md"', 'ls', 'cd public', 'ls', 'cat our_coffee.md']
[31m['ls', 'cd public', 'ls', 'cat our_coffee.md'][0m
A small block ignored in: /home/beibinli/data/repos/Coffee_Company/solver/src/java/Supplier.java
[33mUser[0m (to Asker):


Now, ask questions regarding the following file.

--- Here are a part of the content from the file /home/beibinli/data/repos/Coffee_Company/visualization/cpp/plot_supplier_map.py ---
        if coords:

            folium.Circle(

                location=coords,

                radius=int(row['supplier_count']*10000),

                color='blue',

    


--------------------------------------------------------------------------------
[33mAsker[0m (to User):

QUESTION: What is the purpose of the method "add_cold_brew_constraint" in the plan_optimizer.py file?
ANSWER: The purpose of the method "add_cold_brew_constraint" is to add a constraint that "cold brew coffee" should only use coffee beans from a supplier named "Vietnam".

--------------------------------------------------------------------------------
[32m/home/beibinli/data/repos/Coffee_Company/solver/src/plan_optimizer.py[0m
['ls', 'cat "roast manual.md"', 'ls', 'cd solver', 'ls', 'cd src', 'ls', 'cat distribution_network.py', 'ls', 'cat plan_optimizer.py']
[31m['ls', 'cd solver', 'ls', 'cd src', 'ls', 'cat plan_optimizer.py'][0m
[33mUser[0m (to Asker):


Now, ask questions regarding the following file.

--- Here are a part of the content from the file /home/beibinli/data/repos/Coffee_Company/visualization/supplier_price.py ---
    

    return parser.parse_args()



def


--------------------------------------------------------------------------------
[33mAsker[0m (to User):

QUESTION: What are the two primary types of blending discussed in the file "Roast Manual.md"?
ANSWER: The two primary types of blending discussed are pre-roast blending and post-roast blending.

QUESTION: What are some advantages of pre-roast blending in coffee roasting, as mentioned in the file "Roast Manual.md"?
ANSWER: Some advantages of pre-roast blending include the potential for a more harmonious and integrated flavor profile, consistent final product, and cost-effectiveness due to the requirement of less time and effort compared to post-roast blending.

QUESTION: How can a coffee roasting business maintain consistency in their signature blend, according to the content in "Roast Manual.md"?
ANSWER: A coffee roasting business can maintain consistency in their signature blend by sourcing high-quality beans, following a consistent roasting process, and regularly cupping the b


Now, ask questions regarding the following file.

--- Here are a part of the content from the file /home/beibinli/data/repos/Coffee_Company/public/merged.md ---


**Join the Coffee Revolution**:

Be it a cafÃ© owner, a coffee aficionado, or someone who finds solace in a comforting cup of joe, Opti Coffee is your passport to a global coffee community. Dive deep into a world teeming with flavors, textures, and stories. Visit us online or in-store today. Your perfect cup awaits with Opti Coffee - uniting the world with optimization, one cup at a time.



Thank you for choosing us and for letting Opti Coffee be a part of your daily brew. Share the love of coffee, and let every sip be an adventure! ðŸŽ‰

------

--------------------------------------------------------------------------------
