In [2]:
from dotenv import load_dotenv
from openai import OpenAI
from pathlib import Path
import os

# Called before the client is built; presumably loads variables from a local .env file
# into the process environment (python-dotenv) — confirm.
load_dotenv()
# openai.api_key = os.getenv("OPENAI_API_KEY")
# NOTE(review): the client is built without arguments; presumably it reads
# OPENAI_API_KEY from the environment — confirm.
client = OpenAI()

# System message sent as the first message of every chat completion request
# made by the step functions in this notebook.
SYSTEM_PROMPT = """
You are a helpful assistant. 
We want to build the family tree of the Rougon-Macquart family based on the text of the 1st book.
We have split the task into various steps. With that in mind, help me do each task.
"""

In [None]:
# Step 1
def split_text_into_overlapping_chunks(
    input_text_file_path: Path,
    split_length: int = 30000,
    split_overlap: int = 2000,
) -> None:
    """
    Split a UTF-8 text file into fixed-size character chunks that overlap.

    Chunk ``i`` is written to ``<parent of the input file>/splits/<i>/text.txt``.
    Every chunk except the first starts ``split_overlap`` characters before its
    nominal boundary, i.e. chunk ``i`` covers the character range
    ``[max(i * split_length - split_overlap, 0), min((i + 1) * split_length, len(text)))``.

    This is a deliberately bare-metal splitter: cuts happen at fixed character
    offsets and are NOT aligned on the closest newline.

    Args:
        input_text_file_path: Path of the text file to split.
        split_length: Nominal chunk length, in characters (overlap excluded).
        split_overlap: Number of characters shared with the previous chunk.

    Raises:
        ValueError: If ``split_length`` is not positive, or ``split_overlap`` is
            negative or not strictly smaller than ``split_length`` (a larger
            overlap would produce negative slice starts).
    """
    if split_length <= 0:
        raise ValueError(f"split_length must be positive, got {split_length}")
    if not 0 <= split_overlap < split_length:
        raise ValueError(
            f"split_overlap must be in [0, split_length), got {split_overlap}"
        )

    text = input_text_file_path.read_text(encoding="utf-8")
    text_length = len(text)

    splits_dir = input_text_file_path.parent / "splits"
    splits_dir.mkdir(exist_ok=True)

    for index, boundary in enumerate(range(0, text_length, split_length)):
        # The first chunk has no predecessor to overlap with, hence the clamp at 0.
        start = max(boundary - split_overlap, 0)
        end = min(boundary + split_length, text_length)
        chunk_dir = splits_dir / str(index)
        chunk_dir.mkdir(exist_ok=True)
        (chunk_dir / "text.txt").write_text(text[start:end], encoding="utf-8")


# Run step 1: chunk the book text into data/splits/<i>/text.txt.
split_text_into_overlapping_chunks(Path("data/La_Fortune_des_Rougon.txt"))

In [None]:
# Step 2
def list_character_names(split_directory: Path, overwrite: bool = True) -> None:
    """
    List all characters that are mentioned in each text chunk.

    For every sub-directory ``d`` of ``split_directory`` (visited in numeric order
    of their names), reads ``d/text.txt``, sends it with the extraction prompt to
    the chat completions API through the module-level ``client`` and writes the
    reply to ``d/extracted_names.txt``.

    Args:
        split_directory: Directory holding one sub-directory per chunk.
        overwrite: When True (default) every chunk is processed and any existing
            output is replaced. When False, chunks that already have an
            ``extracted_names.txt`` are skipped, so an interrupted run can be
            resumed without calling the API again for finished chunks.

    Raises:
        FileNotFoundError: If a chunk directory has no ``text.txt``.
        ValueError: If the model reply carries no text content.
    """

    # NOTE(review): the template ends with a closing ``` fence that has no matching
    # opening fence before {chunk_text}. Left untouched because it is runtime text.
    prompt = """
    At this stage, we want to build a list of all characters that are mentionned in the text.

    The output will be a text file (do not include any other information).

    Bear in mind the following guidelines:

    - List all character names, with known nicknames. For example: "Pierre Rougon -> [Pierre, le capitaine,  Pierrot, etc.]". Do not include references that could refer to several characters (there could be several M. Rougon). We will assume that there is only one character per typle (firstname, lastname)
    - If there are various surnames known in the paragraph, list them all. For example: "Pierre Rougon -> [Pierre, le capitaine,  Pierrot, etc.]
    - There should be one line per character (and one character per line)
    - The character name put first should be the most relevant (ideally "firstname lastname") if this is not known in the pragraph, then use the most relevant nickname.
    - If the member is a Rougon family members, prefix with *. A Rougon-Macquart may not bear the familly name and yet be related (for example through wedding)
    - Do not invent nickanmes or assume a character is a Rougon-Macquart if there is no eveidence of it in the text.
    - Do not include the full name to the left of the arrow in the list of names. If there is no known other name, just leave the list empty.
    - Do not put family relationships in the list of names except if this is the only reference to the character. for example "grand-père" could be a nickname but if his name is Jean then it should not write "Jean ->[grand-père de Jacques]"


    Example expected output:
    ```txt
    * Pierre Rougon -> [Pierre, Pierrot, le marchand d'huile, etc.]
    * François Mouret -> [etc.]
    Boutigny -> [etc.]
    ```
    
    ############################################################
    Provided text:
    ############################################################
    {chunk_text}
    ```
    """

    # iterdir() yields entries in arbitrary order; sort numerically named chunk
    # directories by number (book order), any other directory after them by name.
    chunk_dirs = sorted(
        (d for d in split_directory.iterdir() if d.is_dir()),
        key=lambda d: (0, int(d.name), d.name) if d.name.isdecimal() else (1, 0, d.name),
    )

    for chunk_dir in chunk_dirs:
        extracted_name_file = chunk_dir / "extracted_names.txt"
        if not overwrite and extracted_name_file.exists():
            continue

        chunk_text = (chunk_dir / "text.txt").read_text(encoding="utf-8")
        response = client.chat.completions.create(
            model="gpt-4.1",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt.format(chunk_text=chunk_text)},
            ],
            temperature=0.7,
        )

        # The message content is optional in the API response; fail with a
        # message naming the chunk instead of a TypeError inside write_text.
        content = response.choices[0].message.content
        if content is None:
            raise ValueError(f"The model returned no text content for chunk {chunk_dir}")
        extracted_name_file.write_text(content, encoding="utf-8")

# Run step 2 on every chunk directory under data/splits (one API call per chunk).
list_character_names(Path("data/splits"))

KeyboardInterrupt: 

In [5]:
# Step 3
def consolidate_character_names(split_directory: Path) -> None:
    """
    Consolidate the character names from all splits into a single file.

    Reads ``extracted_names.txt`` from every sub-directory of ``split_directory``
    (in numeric order of the directory names), concatenates the lists, asks the
    model (through the module-level ``client``) to merge them and writes the
    reply to ``split_directory/consolidated_names.txt``.

    NB: all per-chunk lists are merged in a single request, which the original
    author judged acceptable for the 23 splits of this book. A truly scalable
    solution would need some sort of map-reduce over the lists.

    Args:
        split_directory: Directory holding one sub-directory per chunk.

    Raises:
        FileNotFoundError: If a chunk directory has no ``extracted_names.txt``.
        ValueError: If the model reply carries no text content.
    """

    # NOTE(review): the template ends with a closing ``` fence that has no matching
    # opening fence, and the "(for example through wedding" parenthesis is never
    # closed. Both are left untouched because they are runtime text.
    prompt = """
    At this stage, we have a split the main text into smaller chunks.

    For each chunk, we have built a list of names as follows:
    "known name -> [nickname1, nickname2, etc.]"
    Ideally the known name is the character's full name (firstname lastname), but it can also be a nickname (like "tante Dide")

    We want to consolidate all the lists of names (each being built on a separate chunk) into a single list of names.

    The output format should be similar to individual lists of names that is:
    ```txt
    * Pierre Rougon -> [Pierre, Pierrot, le marchand d'huile, etc.]
    * François Mouret -> [etc.]
    Boutigny -> [etc.]
    ```

    The criteria for the merge are as follows:
    - There should be one line per character and one line only.
        - If 2 lines or more refer to the same character, keep the most accurate designation (ideally full name) as name and include in the list the merge of all nicknames from the combined lines
        - If 2 lines refer to the same character with different names, merge the 2 lines either way following the directive above.
        - If 1 line in the merge has an asterisk, keep it in the merge as a prefix.
    - If the member is a Rougon family members, prefix with *. A Rougon-Macquart may not bear the familly name and yet be related (for example through wedding
    - Do not include the full name to the left of the arrow in the list of names. If there is no known other name, just leave the list empty.
    - If a name is too vague and can refer to various characters without a clear explanation, and that the list does not help with the diambiguation, then drop it. But if the name is actually the family name of a specific character precised in the list, then use it in the merge

    Example expected output:
    ```txt
    * Pierre Rougon -> [Pierre, Pierrot, le marchand d'huile, etc.]
    * François Mouret -> [etc.]
    Boutigny -> [etc.]
    ```

    Do not include any other information in the output (no explanation or header)

    The lists of names per chunk are separated by a line jump    
    ############################################################
    Provided lists of names:
    ############################################################
    {name_list}
    ```
    """

    # iterdir() yields entries in arbitrary order; sort so the merged prompt is
    # deterministic and follows the order of the chunks in the book.
    chunk_dirs = sorted(
        (d for d in split_directory.iterdir() if d.is_dir()),
        key=lambda d: (0, int(d.name), d.name) if d.name.isdecimal() else (1, 0, d.name),
    )

    # Each list is preceded by a newline, which is the "line jump" separator
    # announced in the prompt.
    name_list = "".join(
        "\n" + (chunk_dir / "extracted_names.txt").read_text(encoding="utf-8")
        for chunk_dir in chunk_dirs
    )

    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt.format(name_list=name_list)},
        ],
        temperature=0.7,
    )

    # The message content is optional in the API response; fail explicitly
    # instead of with a TypeError inside write_text.
    content = response.choices[0].message.content
    if content is None:
        raise ValueError("The model returned no text content for the consolidated names")

    consolidated_names_file = split_directory / "consolidated_names.txt"
    consolidated_names_file.write_text(content, encoding="utf-8")

# Run step 3: merge every chunk's extracted_names.txt into data/splits/consolidated_names.txt.
consolidate_character_names(Path("data/splits"))

In [6]:
# Step 4
def desambiguate_characters(split_directory: Path, overwrite: bool = True) -> None:
    """
    Rewrite each chunk with explicit character names (disambiguate pronouns and nicknames).

    Reads ``split_directory/consolidated_names.txt`` once, then for every
    sub-directory ``d`` of ``split_directory`` (visited in numeric order of their
    names) sends ``d/text.txt`` with that list to the chat completions API
    through the module-level ``client`` and writes the reply to
    ``d/desambiguated_text.txt``.

    Args:
        split_directory: Directory holding ``consolidated_names.txt`` and one
            sub-directory per chunk.
        overwrite: When True (default) every chunk is processed and any existing
            output is replaced. When False, chunks that already have a
            ``desambiguated_text.txt`` are skipped, so an interrupted run can be
            resumed without calling the API again for finished chunks.

    Raises:
        FileNotFoundError: If ``consolidated_names.txt`` or a chunk's ``text.txt``
            is missing.
        ValueError: If the model reply carries no text content.
    """

    prompt = """
    At this stage, we have a list of characters that are mentionned in the text and we have identified those that are Rougon-Macquart.

    For further extraction of relationships between characters in a later stage, we want to rewrite an excerpt of the book with the appropriate name as identified per the list.
    We want to replace any character reference (nickname, pronoun, etc.) with the character identified in the list between brackets.
    We want to  perform this change only for members of the Rougon-Macquart family identified by the asterisk (*) that prefixes their name in the list.

    ##### EXAMPLE LIST OF NAMES #####

    * Pierre Rougon -> [Pierre, le marchand d'huile, le jeune Rougon, le mari]
    * Félicité Rougon -> [Félicité, sa femme, la petite, la vieille gueuse, la petite mère]
    Chantegreil -> [le braconnier Chantegreil, le père de Miette]
    Eulalie Chantegreil -> [Eulalie, la tante Eulalie]

    ##### EXAMPLE TEXT INPUT #####

    À ce moment, on vit circuler Aristide parmi les groupes. Le cher garçon, devant ce soulèvement formidable, avait pensé qu’il était imprudent de ne pas rester l’ami des républicains ; mais comme, d’un autre côté, il ne voulait pas trop se compromettre avec eux, il était venu leur faire ses adieux, le bras en écharpe, en se plaignant amèrement de cette maudite blessure qui l’empêchait de tenir une arme.
    Alors Pierre s’arrêta un instant sur le trottoir désert. Il poussa un gros soupir de soulagement et de triomphe. Ces gueux de républicains lui abandonnaient donc Plassans. La ville lui appartenait, à cette heure : elle dormait comme une sotte ; elle était là, noire et paisible, muette et confiante, et il n’avait qu’à étendre la main pour la prendre. Cette courte halte, ce regard d’homme supérieur jeté sur le sommeil de toute une sous-préfecture, lui causèrent des jouissances ineffables. Il resta là, croisant les bras, prenant, seul dans la nuit, une pose de grand capitaine à la veille d’une victoire. Au loin, il n’entendait que le chant des fontaines du cours, dont les filets d’eau sonores tombaient dans les bassins.

    ##### EXAMPLE OUTPUT #####

    À ce moment, on vit circuler Aristide parmi les groupes. Le cher garçon, devant ce soulèvement formidable, avait pensé qu’il était imprudent de ne pas rester l’ami des républicains ; mais comme, d’un autre côté, il ne voulait pas trop se compromettre avec eux, il était venu leur faire ses adieux, le bras en écharpe, en se plaignant amèrement de cette maudite blessure qui l’empêchait de tenir une arme.
    Alors [Pierre Rougon] s’arrêta un instant sur le trottoir désert. Il poussa un gros soupir de soulagement et de triomphe. Ces gueux de républicains lui ([Pierre Rougon]) abandonnaient donc Plassans. La ville lui ([Pierre Rougon]) appartenait, à cette heure : elle dormait comme une sotte ; elle était là, noire et paisible, muette et confiante, et il ([Pierre Rougon]) n’avait qu’à étendre la main pour la prendre. Cette courte halte, ce regard d’homme supérieur jeté sur le sommeil de toute une sous-préfecture, lui ([Pierre Rougon]) causèrent des jouissances ineffables. Il([Pierre Rougon]) resta là, croisant les bras, prenant, seul dans la nuit, une pose de grand capitaine à la veille d’une victoire. Au loin, il ([Pierre Rougon]) n’entendait que le chant des fontaines du cours, dont les filets d’eau sonores tombaient dans les bassins.

    
    ############################################################
    Provided list of names:
    ############################################################
    {names_list}
    ############################################################
    Provided intput text:
    ############################################################
    {input_text}

    ############################################################
    Expected output text (complete here)
    ############################################################



    
    """

    consolidated_name_list = (split_directory / "consolidated_names.txt").read_text(encoding="utf-8")

    # iterdir() yields entries in arbitrary order; sort numerically named chunk
    # directories by number (book order), any other directory after them by name.
    chunk_dirs = sorted(
        (d for d in split_directory.iterdir() if d.is_dir()),
        key=lambda d: (0, int(d.name), d.name) if d.name.isdecimal() else (1, 0, d.name),
    )

    for chunk_dir in chunk_dirs:
        desambiguated_text_file = chunk_dir / "desambiguated_text.txt"
        if not overwrite and desambiguated_text_file.exists():
            continue

        chunk_text = (chunk_dir / "text.txt").read_text(encoding="utf-8")
        response = client.chat.completions.create(
            model="gpt-4.1",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": prompt.format(names_list=consolidated_name_list, input_text=chunk_text),
                },
            ],
            temperature=0.7,
        )

        # The message content is optional in the API response; fail with a
        # message naming the chunk instead of a TypeError inside write_text.
        content = response.choices[0].message.content
        if content is None:
            raise ValueError(f"The model returned no text content for chunk {chunk_dir}")
        desambiguated_text_file.write_text(content, encoding="utf-8")

# Run step 4: writes desambiguated_text.txt in each chunk directory under data/splits (one API call per chunk).
desambiguate_characters(Path("data/splits"))

In [8]:
# Step 5
def find_relationships(split_directory: Path, overwrite: bool = True) -> None:
    """
    List the relationships between the members of the Rougon-Macquart family, per chunk.

    Reads ``split_directory/consolidated_names.txt`` once, then for every
    sub-directory ``d`` of ``split_directory`` (visited in numeric order of their
    names) sends ``d/desambiguated_text.txt`` with that list to the chat
    completions API through the module-level ``client`` and writes the reply to
    ``d/relationships.txt``.

    Args:
        split_directory: Directory holding ``consolidated_names.txt`` and one
            sub-directory per chunk.
        overwrite: When True (default) every chunk is processed and any existing
            output is replaced. When False, chunks that already have a
            ``relationships.txt`` are skipped, so an interrupted run can be
            resumed without calling the API again for finished chunks.

    Raises:
        FileNotFoundError: If ``consolidated_names.txt`` or a chunk's
            ``desambiguated_text.txt`` is missing.
        ValueError: If the model reply carries no text content.
    """

    prompt = """
    At this stage, we have a list of characters that are mentionned in the text and we have identified those that are Rougon-Macquart. (an asterisk (*) prefixes their name in the list).
    We also have text chunks with clear mentions of characters that are members of this family (they are written between brackets).

    We want to start extrating the relationships between the members of the Rougon-Macquart family based on the text.

    We will provide the list of names and a chunk of the book as input.

    You need to output a list of relationships between the members of the Rougon-Macquart family (only those) as follows:

    Example expected output:
    ```txt
    * A -> B [married]
    * C -> D [parent]
    * A -> C [parent]
    ```

    Where A,B,C,D are the names of the characters as they appear in the list of names (just the main full name, not the nickname). The relationship is written between brackets.

    The goal will be to create a family tree later. Consequently we will build a DAG (using DOT language most likely)
    The rules for this are as follows:
    - Between 2 characters there should only be one relationship. If there is a choice to make, we pick downards when possible (i.ie parent insteado of child)
    - Choose direct relationships and do not add exta relationships (example if A is parent of B and B is parent of C, do not add A -> C [grandparent]. Seeminlingly if grandparent relationship is mentioned in the text, try to break it down into 2 relationships (A -> B [parent] and B -> C [parent]).
    - When possible try to write only "parent" and "married" relationships (no mother/father/son/sibling/ nephew or whatever whenever it can be avoided). Should some members of the familly tree be missing, add them as "unknown" (example: A -> unknwon1 [parent] and unknown1 -> C [parent]). Be careful to not mix up various unknwonws (always add the right numberas a suffix to refer to the appropriate one).
    - if a non direct relationship is indicated in the text, then add it as is.


    
    ############################################################
    Provided list of names:
    ############################################################
    {names_list}

    ############################################################
    Provided intput text:
    ############################################################
    {input_text}

    ############################################################
    Expected relationships:  (complete here)
    ############################################################

    
    """

    consolidated_name_list = (split_directory / "consolidated_names.txt").read_text(encoding="utf-8")

    # iterdir() yields entries in arbitrary order; sort numerically named chunk
    # directories by number (book order), any other directory after them by name.
    chunk_dirs = sorted(
        (d for d in split_directory.iterdir() if d.is_dir()),
        key=lambda d: (0, int(d.name), d.name) if d.name.isdecimal() else (1, 0, d.name),
    )

    for chunk_dir in chunk_dirs:
        relationships_file = chunk_dir / "relationships.txt"
        if not overwrite and relationships_file.exists():
            continue

        chunk_text = (chunk_dir / "desambiguated_text.txt").read_text(encoding="utf-8")
        response = client.chat.completions.create(
            model="gpt-4.1",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    "content": prompt.format(names_list=consolidated_name_list, input_text=chunk_text),
                },
            ],
            temperature=0.7,
        )

        # The message content is optional in the API response; fail with a
        # message naming the chunk instead of a TypeError inside write_text.
        content = response.choices[0].message.content
        if content is None:
            raise ValueError(f"The model returned no text content for chunk {chunk_dir}")
        relationships_file.write_text(content, encoding="utf-8")

# Run step 5: writes relationships.txt in each chunk directory under data/splits; reads the desambiguated_text.txt files written by step 4.
find_relationships(Path("data/splits"))

In [10]:
# Step 6
def consolidate_relationships(split_directory: Path) -> None:
    """
    Consolidate the relationships from all splits into a single file.

    Reads ``relationships.txt`` from every sub-directory of ``split_directory``
    (in numeric order of the directory names), concatenates the lists, asks the
    model (through the module-level ``client``) to merge them into a single list
    and writes the reply to ``split_directory/consolidated_relationships.txt``.

    Args:
        split_directory: Directory holding one sub-directory per chunk.

    Raises:
        FileNotFoundError: If a chunk directory has no ``relationships.txt``.
        ValueError: If the model reply carries no text content.
    """

    # NOTE(review): the sentence "If there happen to be incompatibilities (clear
    # contract)" in the template is unfinished, so the model gets no instruction
    # for conflicting relationships. Left untouched pending the intended wording.
    prompt = """
    At this stage, we have now extracted for several chunks of text from the book the relationships between the characters.
    Based upon them, we want to consolidate results into a single list of relationships.

    The output format should be similar to individual lists of relationships that is:
    ```txt
    * A -> B [married]
    * C -> D [parent]
    * A -> C [parent]
    ```

    The criteria for the merge are as follows:
    - Try to use only "parent" and "married" relationships whenever possible
    - If there are some "holes" in the family tree relationships, add them as "unknown" (example: A -> unknwon1 [parent] and unknown1 -> C [parent]). Be careful to not mix up various unknwonws (always add the right numberas a suffix to refer to the appropriate one).
    - There should be one line per relationship maximum.
        - If 2 written family ties or more indicate the same relationship, then keep only downards relationships (i.e. parent instead of child)
    - The relationships in the end should be akin to a DAG (there should not be cycles, or duplicate nodes).

    If there happen to be incompatibilities (clear contract)


    Do not include any other information in the output

    The lists of relationships found per chunk are separated by a line jump    
    
    ############################################################
    Provided list of relationships (lists are separated by line jumps):
    ############################################################
    {relationships_lists}

    ############################################################
    Expected relationships (complete here without comments or additional text or header)
    ############################################################
    
    """

    # iterdir() yields entries in arbitrary order; sort so the merged prompt is
    # deterministic and follows the order of the chunks in the book.
    chunk_dirs = sorted(
        (d for d in split_directory.iterdir() if d.is_dir()),
        key=lambda d: (0, int(d.name), d.name) if d.name.isdecimal() else (1, 0, d.name),
    )

    # Each list is preceded by a newline, which is the "line jump" separator
    # announced in the prompt.
    relationships_list = "".join(
        "\n" + (chunk_dir / "relationships.txt").read_text(encoding="utf-8")
        for chunk_dir in chunk_dirs
    )

    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt.format(relationships_lists=relationships_list)},
        ],
        temperature=0.7,
    )

    # The message content is optional in the API response; fail explicitly
    # instead of with a TypeError inside write_text.
    content = response.choices[0].message.content
    if content is None:
        raise ValueError("The model returned no text content for the consolidated relationships")

    consolidated_relationships_file = split_directory / "consolidated_relationships.txt"
    consolidated_relationships_file.write_text(content, encoding="utf-8")

# Run step 6: merge every chunk's relationships.txt into data/splits/consolidated_relationships.txt.
consolidate_relationships(Path("data/splits"))

In [4]:
# Step 7
def _extract_dot_source(reply: str) -> str:
    """
    Return the DOT source contained in a model reply.

    When the reply contains a Markdown code fence, the content of the first
    fenced block is returned (opening-fence line dropped, surrounding whitespace
    stripped, terminated by a single newline). A reply without any fence, or
    whose first fenced block is empty, is returned unchanged.
    """
    _, fence, after_fence = reply.partition("```")
    if not fence:
        return reply
    # Whatever follows the opening fence on the same line is the optional
    # language tag (e.g. "dot"); the fenced content starts on the next line.
    _, _, fenced = after_fence.partition("\n")
    dot_source = fenced.partition("```")[0].strip()
    # Prefer the untouched reply over writing an empty file.
    return dot_source + "\n" if dot_source else reply


def build_graph(split_directory: Path) -> None:
    """
    Build a Graphviz compatible graph based on a list of relationships.

    Reads ``split_directory/consolidated_relationships.txt``, asks the model
    (through the module-level ``client``) to turn it into a DOT graph and writes
    the result to ``split_directory/graphviz_output.dot``. If the reply wraps the
    graph in a Markdown code fence, only the fenced content is written so that
    the file holds DOT source rather than Markdown.

    Args:
        split_directory: Directory holding ``consolidated_relationships.txt``.

    Raises:
        FileNotFoundError: If ``consolidated_relationships.txt`` is missing.
        ValueError: If the model reply carries no text content.
    """

    # Literal braces of the DOT record label are doubled ({{ }}) because the
    # template goes through str.format.
    prompt = """
    At this stage, we have built a list of relationships between the members of the Rougon-Macquart family.
    We now want to build a graphviz compatible graph (.DOT language) based on these relationships.

    The goal is to reconstruct the family tree of the Rougon-Macquart family.
    Consequently the final output should respect the following rules:

    - The graph should be a DAG with no loops
    - There should only be direct relationships (i.e. no grandparent, uncle, etc.) : only "parent" and "married" relationships should be used.
    - If need be "unknown_i" can be created to fill in the gaps in the family tree when the name of an intermediary parent is not known.
    - The arrows should NOT be labeled with the type of relationship (i.e. "parent" or "married"), and the nodes be the name of the characters.
    - Each character should be represented by a single node (i.e. no duplicates).

    - "married" relationship representing married couples should be grouped with an intermediary node "m1 [label = "{{A Rougon | B Rougon}}";];" and then the corresponding node has the children: "m1 -> "D Rougon"; m1 -> "E Rougon"; So there should not be individual nodes for each member of the married couple.
    - "parent" relationshipis the only relationships between the intermediary nodes of couples and the children. This is represented by the standard arrow (without label)

    ############################################################
    Provided lists of relationships:
    ############################################################
    {relationships_lists}
    """

    consolidated_relationships_file = split_directory / "consolidated_relationships.txt"
    relationships_list = consolidated_relationships_file.read_text(encoding="utf-8")

    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt.format(relationships_lists=relationships_list)},
        ],
        temperature=0.7,
    )

    # The message content is optional in the API response; fail explicitly
    # instead of with a TypeError inside write_text.
    content = response.choices[0].message.content
    if content is None:
        raise ValueError("The model returned no text content for the graph")

    graphviz_output_file = split_directory / "graphviz_output.dot"
    graphviz_output_file.write_text(_extract_dot_source(content), encoding="utf-8")

# Run step 7: writes data/splits/graphviz_output.dot from data/splits/consolidated_relationships.txt.
build_graph(Path("data/splits"))