In [1]:
import re
import pandas as pd

# Column layout shared by every DataFrame this cell returns (including empty ones).
_CHUNK_COLUMNS = ['chapter', 'chunk', 'chunk_order_number']

# A paragraph break is a newline, optional whitespace, then one or more newlines.
_PARAGRAPH_BREAK = re.compile(r'\n\s*\n+')

# Project Gutenberg banner lines; older files say "THIS" instead of "THE".
_GUTENBERG_START = re.compile(r"\*\*\* START OF TH(?:E|IS) PROJECT GUTENBERG EBOOK [^*]+\*\*\*")
_GUTENBERG_END = re.compile(r"\*\*\* END OF TH(?:E|IS) PROJECT GUTENBERG EBOOK [^*]+\*\*\*")

# Chapter headings such as "CHAPTER I." / "CHAPTER XLII." at the start of a line.
_CHAPTER_HEADING = re.compile(r"^(CHAPTER [IVXLCDM]+\.)", re.MULTILINE)


def _split_paragraphs(text: str) -> list:
    """Return the non-empty, whitespace-trimmed paragraphs of *text*, in order."""
    trimmed = (piece.strip() for piece in _PARAGRAPH_BREAK.split(text))
    return [piece for piece in trimmed if piece]


def novel_to_dataframe(novel_text: str) -> pd.DataFrame:
    """
    Split the plain text of a novel into paragraph chunks labelled by chapter.

    Processing steps:

    1. If Project Gutenberg START/END banner lines are present, everything up to
       and including the START banner and everything from the END banner onward
       is discarded.
    2. Chapter headings of the form ``CHAPTER <roman numeral>.`` at the start of
       a line delimit chapters. Text before the first heading is labelled
       ``'Preamble'``; if no heading is found at all, the whole text is labelled
       ``'Unknown'``.
    3. Each section is split into paragraphs on blank lines; empty paragraphs
       are dropped and the rest are numbered 1, 2, 3, ... across the whole book.

    Note that a chapter's content starts immediately after the matched heading
    text, so anything following the heading on the same line becomes part of
    that chapter's first chunk.

    Args:
        novel_text: A string containing the full plain text of the novel.

    Returns:
        A pandas DataFrame with columns 'chapter', 'chunk', and
        'chunk_order_number' (one row per paragraph). The three columns are
        always present, even when no chunks were found.
    """
    # Trim the Gutenberg header first so the END search only sees the remainder.
    start_match = _GUTENBERG_START.search(novel_text)
    if start_match:
        novel_text = novel_text[start_match.end():]

    end_match = _GUTENBERG_END.search(novel_text)
    if end_match:
        novel_text = novel_text[:end_match.start()]

    novel_text = novel_text.strip()

    # Build an ordered list of (chapter label, raw section text) pairs.
    headings = list(_CHAPTER_HEADING.finditer(novel_text))
    if not headings:
        sections = [('Unknown', novel_text)]
    else:
        # Each chapter runs up to the next heading, or to the end of the text.
        section_ends = [nxt.start() for nxt in headings[1:]] + [len(novel_text)]
        sections = [('Preamble', novel_text[:headings[0].start()])]
        sections.extend(
            (heading.group(1), novel_text[heading.end():section_end])
            for heading, section_end in zip(headings, section_ends)
        )

    rows = []
    for chapter_label, section_text in sections:
        for paragraph in _split_paragraphs(section_text):
            rows.append({
                'chapter': chapter_label,
                'chunk': paragraph,
                # Running 1-based position across the whole book, not per chapter.
                'chunk_order_number': len(rows) + 1,
            })

    # Passing the column list keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=_CHUNK_COLUMNS)

if __name__ == '__main__':
    # Location of the plain-text novel. Replace with the actual path to your .txt file.
    file_path = "/Users/davidspencer/Downloads/memgraph_import/Adventures of Huckleberry Finn.txt"

    # Decode explicitly instead of relying on the platform default encoding:
    # the text contains typographic quotes and dashes. 'utf-8-sig' reads plain
    # UTF-8 and also drops a leading byte-order mark if the file has one.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        huckleberry_finn_text = file.read()

    # Parse the full novel into one row per paragraph chunk.
    # df_novel is deliberately left at module level; later cells use it.
    df_novel = novel_to_dataframe(huckleberry_finn_text)

    # Display the DataFrame (or parts of it)
    print(f"Successfully processed {len(df_novel)} chunks.")
    print("\nFirst 10 chunks:")
    print(df_novel.head(10))

    print("\nLast 10 chunks:")
    print(df_novel.tail(10))

    # Example: Show all chunks from a specific chapter
    if not df_novel.empty and 'CHAPTER II.' in df_novel['chapter'].unique():
        print("\nAll chunks from CHAPTER II.:")
        print(df_novel[df_novel['chapter'] == 'CHAPTER II.'])
    elif not df_novel.empty:
        print(f"\nChunks found, but 'CHAPTER II.' not present in the sample used. Available chapters: {df_novel['chapter'].unique()}")

Successfully processed 2187 chunks.

First 10 chunks:
    chapter                                              chunk  \
0  Preamble                   ADVENTURES\nOF\nHUCKLEBERRY FINN   
1  Preamble                             (Tom Sawyer’s Comrade)   
2  Preamble                                      By Mark Twain   
3  Preamble                                            NOTICE.   
4  Preamble  Persons attempting to find a motive in this na...   
5  Preamble  BY ORDER OF THE AUTHOR\nPER G. G., CHIEF OF OR...   
6  Preamble                                        EXPLANATORY   
7  Preamble  In this book a number of dialects are used, to...   
8  Preamble  I make this explanation for the reason that wi...   
9  Preamble                                        THE AUTHOR.   

   chunk_order_number  
0                   1  
1                   2  
2                   3  
3                   4  
4                   5  
5                   6  
6                   7  
7                   8  
8  

In [2]:
# Show the chunk table built in the first cell (last expression in a notebook cell is displayed).
df_novel

Unnamed: 0,chapter,chunk,chunk_order_number
0,Preamble,ADVENTURES\nOF\nHUCKLEBERRY FINN,1
1,Preamble,(Tom Sawyer’s Comrade),2
2,Preamble,By Mark Twain,3
3,Preamble,NOTICE.,4
4,Preamble,Persons attempting to find a motive in this na...,5
...,...,...,...
2182,CHAPTER XLII.,"“Nemmine why, Huck—but he ain’t comin’ back no...",2183
2183,CHAPTER XLII.,But I kept at him; so at last he says:,2184
2184,CHAPTER XLII.,“Doan’ you ’member de house dat was float’n do...,2185
2185,CHAPTER XLII.,"Tom’s most well now, and got his bullet around...",2186


In [3]:
# Book-level metadata shared by every chunk of this novel.
author_name, book_title = "Mark Twain", "ADVENTURES OF HUCKLEBERRY FINN"

# Assigning a scalar to a column fills every row with it. The targets are
# assigned left to right, so 'author' is added before 'book'.
df_novel["author"], df_novel["book"] = author_name, book_title

In [4]:
# Show the chunk table again, now including the 'author' and 'book' columns.
df_novel

Unnamed: 0,chapter,chunk,chunk_order_number,author,book
0,Preamble,ADVENTURES\nOF\nHUCKLEBERRY FINN,1,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
1,Preamble,(Tom Sawyer’s Comrade),2,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
2,Preamble,By Mark Twain,3,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
3,Preamble,NOTICE.,4,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
4,Preamble,Persons attempting to find a motive in this na...,5,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
...,...,...,...,...,...
2182,CHAPTER XLII.,"“Nemmine why, Huck—but he ain’t comin’ back no...",2183,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
2183,CHAPTER XLII.,But I kept at him; so at last he says:,2184,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
2184,CHAPTER XLII.,“Doan’ you ’member de house dat was float’n do...,2185,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
2185,CHAPTER XLII.,"Tom’s most well now, and got his bullet around...",2186,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN


In [17]:
# System prompt (Markdown) instructing an LLM to turn novel chunks into a single
# JSON knowledge graph for Memgraph. It lists the nine entity types, the allowed
# relationship schema per source entity type, the required JSON shape with a
# worked example, formatting rules, and sample Cypher import commands.
# NOTE(review): the visible part of generate() embeds its own copy of this text
#   (with extra author/book/datetime inputs) rather than referencing this
#   variable - confirm which copy is authoritative.
# NOTE(review): the sample Cypher calls json.load_from_file and apoc.create.*;
#   confirm these procedures exist in the target Memgraph instance.
# NOTE(review): "80+ nodes" / "150+ relationships" sits alongside the
#   chunk-by-chunk processing context - confirm the intended per-call scale.
system_prompt = """
# Memgraph Literary Knowledge Graph Generator

Generate a single structured JSON file containing both nodes and relationships for Memgraph import. This file will contain all entities and their connections in one unified format optimized for literary text analysis with the following knowledge graph schema:

**Processing Context:** This prompt is designed for chunk-by-chunk processing of literary texts, where each chunk represents a segment of a chapter with specific ordering. The input data contains:

- `chapter`: Chapter identifier (e.g., "CHAPTER II")  
- `chunk`: Text content passage
- `chunk_order_number`: Sequential position within the book

## Entity Types

- **Actor**: People, organizations, characters, agents
- **Object**: Physical items, tools, documents, artifacts  
- **Location**: Places, addresses, geographic areas
- **Event**: Actions, incidents, occurrences, processes
- **Intangible**: Knowledge, concepts, ideas, beliefs
- **Book**: Literary works, novels, publications
- **Author**: Writers, creators of literary works
- **Chapter**: Sections or divisions within books
- **Chunk**: Text segments or passages within chapters

## Relationship Schema

### Actor Relationships

- `(Actor)-[INTERACTED_WITH]->(Actor)`
- `(Actor)-[MENTIONED]->(Actor)`
- `(Actor)-[ASSOCIATED_WITH]->(Actor)`
- `(Actor)-[USED]->(Object)`
- `(Actor)-[ACQUIRED]->(Object)`
- `(Actor)-[DE_ACQUIRED]->(Object)`
- `(Actor)-[CREATED]->(Object)`
- `(Actor)-[DESTROYED]->(Object)`
- `(Actor)-[MODIFIED]->(Object)`
- `(Actor)-[ARRIVED_AT]->(Location)`
- `(Actor)-[DEPARTED_FROM]->(Location)`
- `(Actor)-[LOCATED_AT]->(Location)`
- `(Actor)-[PARTICIPATED_IN]->(Event)`
- `(Actor)-[LEARNED]->(Intangible)`
- `(Actor)-[FORGOT]->(Intangible)`
- `(Actor)-[CLAIMED]->(Intangible)`
- `(Actor)-[REFUTED]->(Intangible)`

### Object Relationships

- `(Object)-[BELONGS_TO]->(Actor)`
- `(Object)-[INFLUENCES]->(Actor)`
- `(Object)-[ATTRACTS]->(Actor)`
- `(Object)-[CONTAINS]->(Object)`
- `(Object)-[PART_OF]->(Object)`
- `(Object)-[CONNECTED_TO]->(Object)`
- `(Object)-[SIMILAR_TO]->(Object)`
- `(Object)-[LOCATED_IN]->(Location)`
- `(Object)-[ORIGINATED_FROM]->(Location)`
- `(Object)-[INVOLVED_IN]->(Event)`
- `(Object)-[CAUSED]->(Event)`
- `(Object)-[RESULTED_FROM]->(Event)`
- `(Object)-[REPRESENTS]->(Intangible)`
- `(Object)-[EMBODIES]->(Intangible)`

### Location Relationships

- `(Location)-[HOSTS]->(Actor)`
- `(Location)-[EXCLUDES]->(Actor)`
- `(Location)-[CONTAINS]->(Object)`
- `(Location)-[HOUSES]->(Object)`
- `(Location)-[CONTAINS]->(Location)`
- `(Location)-[ADJACENT_TO]->(Location)`
- `(Location)-[PART_OF]->(Location)`
- `(Location)-[CONNECTED_TO]->(Location)`
- `(Location)-[HOSTED]->(Event)`
- `(Location)-[WITNESSED]->(Event)`
- `(Location)-[ASSOCIATED_WITH]->(Intangible)`
- `(Location)-[SYMBOLIZES]->(Intangible)`

### Event Relationships

- `(Event)-[AFFECTED]->(Actor)`
- `(Event)-[CAUSED_BY]->(Actor)`
- `(Event)-[INVOLVED]->(Object)`
- `(Event)-[PRODUCED]->(Object)`
- `(Event)-[CONSUMED]->(Object)`
- `(Event)-[OCCURRED_AT]->(Location)`
- `(Event)-[MOVED_FROM]->(Location)`
- `(Event)-[MOVED_TO]->(Location)`
- `(Event)-[PRECEDED]->(Event)`
- `(Event)-[FOLLOWED]->(Event)`
- `(Event)-[CAUSED]->(Event)`
- `(Event)-[CONCURRENT_WITH]->(Event)`
- `(Event)-[REVEALED]->(Intangible)`
- `(Event)-[DEMONSTRATED]->(Intangible)`
- `(Event)-[RESULTED_IN]->(Intangible)`

### Intangible Relationships

- `(Intangible)-[INFLUENCED]->(Actor)`
- `(Intangible)-[POSSESSED_BY]->(Actor)`
- `(Intangible)-[KNOWN_BY]->(Actor)`
- `(Intangible)-[APPLIED_TO]->(Object)`
- `(Intangible)-[MANIFESTED_IN]->(Object)`
- `(Intangible)-[ORIGINATED_FROM]->(Location)`
- `(Intangible)-[ASSOCIATED_WITH]->(Location)`
- `(Intangible)-[DEMONSTRATED_IN]->(Event)`
- `(Intangible)-[REVEALED_BY]->(Event)`
- `(Intangible)-[CONTRADICTS]->(Intangible)`
- `(Intangible)-[SUPPORTS]->(Intangible)`
- `(Intangible)-[DERIVED_FROM]->(Intangible)`
- `(Intangible)-[RELATED_TO]->(Intangible)`

### Book Relationships

- `(Book)-[WRITTEN_BY]->(Author)`
- `(Book)-[CONTAINS]->(Chapter)`
- `(Book)-[FEATURES]->(Actor)`
- `(Book)-[MENTIONS]->(Object)`
- `(Book)-[SET_IN]->(Location)`
- `(Book)-[DESCRIBES]->(Event)`
- `(Book)-[EXPLORES]->(Intangible)`
- `(Book)-[PUBLISHED_AT]->(Location)`
- `(Book)-[PUBLISHED_IN]->(Event)`

### Author Relationships

- `(Author)-[WROTE]->(Book)`
- `(Author)-[CREATED]->(Object)`
- `(Author)-[LIVED_IN]->(Location)`
- `(Author)-[BORN_IN]->(Location)`
- `(Author)-[PARTICIPATED_IN]->(Event)`
- `(Author)-[INFLUENCED_BY]->(Intangible)`
- `(Author)-[INFLUENCED]->(Actor)`
- `(Author)-[CONTEMPORARY_OF]->(Author)`
- `(Author)-[INSPIRED_BY]->(Author)`

### Chapter Relationships

- `(Chapter)-[PART_OF]->(Book)`
- `(Chapter)-[CONTAINS]->(Chunk)`
- `(Chapter)-[PRECEDED_BY]->(Chapter)`
- `(Chapter)-[FOLLOWED_BY]->(Chapter)`
- `(Chapter)-[FEATURES]->(Actor)`
- `(Chapter)-[MENTIONS]->(Object)`
- `(Chapter)-[SET_IN]->(Location)`
- `(Chapter)-[DESCRIBES]->(Event)`
- `(Chapter)-[CONVEYS]->(Intangible)`

### Chunk Relationships

- `(Chunk)-[PART_OF]->(Chapter)`
- `(Chunk)-[PRECEDED_BY]->(Chunk)`
- `(Chunk)-[FOLLOWED_BY]->(Chunk)`
- `(Chunk)-[MENTIONS]->(Actor)`
- `(Chunk)-[REFERENCES]->(Object)`
- `(Chunk)-[DESCRIBES]->(Location)`
- `(Chunk)-[DEPICTS]->(Event)`
- `(Chunk)-[CONVEYS]->(Intangible)`
- `(Chunk)-[CONTAINS_DIALOGUE_BY]->(Actor)`

## Output Requirements

**Generate a single structured JSON file containing both nodes and relationships.**

### JSON Structure: **knowledge_graph.json**

```json
{
  "metadata": {
    "generated_at": "[datetime]",
    "total_nodes": 85,
    "total_relationships": 157,
    "entity_types": ["Actor", "Object", "Location", "Event", "Intangible", "Book", "Author", "Chapter", "Chunk"]
  },
  "nodes": [
    {
      "id": 1,
      "label": "Book",
      "name": "The Adventures of Huckleberry Finn",
      "description": "Classic American novel by Mark Twain",
      "properties": {
        "genre": "Adventure Fiction",
        "publication_year": 1884
      },
      "timestamp": "[datetime]"
    }
  ],
  "relationships": [
    {
      "start_id": 1,
      "end_id": 2,
      "relationship_type": "WRITTEN_BY",
      "weight": 1.0,
      "properties": {
        "relationship_strength": "primary"
      },
      "timestamp": "[datetime]"
    }
  ]
}
```

### Node Object Structure:

- **id**: Unique sequential identifier (integer)
- **label**: Entity type (Actor/Object/Location/Event/Intangible/Book/Author/Chapter/Chunk)
- **name**: Display name for the entity
- **description**: Detailed description (include chunk_order_number for Chunk entities)
- **properties**: Additional key-value pairs specific to the entity type (optional)
- **timestamp**: ISO format timestamp reflecting narrative sequence

### Relationship Object Structure:

- **start_id**: Source node ID (integer)
- **end_id**: Target node ID (integer)
- **relationship_type**: Relationship name from the schema
- **weight**: Relationship strength (0.1-1.0)
- **properties**: Additional relationship metadata (optional)
- **timestamp**: ISO format timestamp

### Content Requirements:

1. **Create 80+ diverse nodes** representing all 9 entity types (including Book, Author, Chapter, Chunk)
2. **Create 150+ relationships** covering all relationship types from the schema
3. **Literary focus** - process book content with characters, locations, events, and themes
4. **Hierarchical structure** - Book → Chapter → Chunk relationships with proper ordering
5. **Character analysis** - identify and connect characters (Actors) mentioned in chunks
6. **Temporal progression** - logical sequence of timestamps reflecting narrative flow
7. **Complete schema coverage** - demonstrate every relationship type at least once
8. **Single file output** - structured JSON with metadata, nodes, and relationships sections

### Processing Context:

**Input Data Structure:**

- `chapter`: Chapter identifier (e.g., "CHAPTER II")
- `chunk`: Text content of the passage
- `chunk_order_number`: Sequential ordering within the book

**Entity Extraction Guidelines:**

- **Book**: The overall literary work being processed
- **Author**: Creator of the literary work
- **Chapter**: Each distinct chapter/section
- **Chunk**: Individual text passages with order numbers
- **Actor**: Characters, people mentioned in the text
- **Location**: Places described or referenced
- **Event**: Actions, scenes, incidents in the narrative
- **Object**: Items, artifacts mentioned in the text
- **Intangible**: Themes, concepts, emotions, ideas conveyed

### Example JSON Structure:

```json
{
  "metadata": {
    "generated_at": "[datetime]",
    "total_nodes": 8,
    "total_relationships": 6,
    "entity_types": ["Book", "Author", "Chapter", "Chunk", "Actor", "Location", "Event"]
  },
  "nodes": [
    {
      "id": 1,
      "label": "Book",
      "name": "The Adventures of Huckleberry Finn",
      "description": "Classic American novel by Mark Twain exploring themes of friendship and freedom",
      "properties": {
        "genre": "Adventure Fiction",
        "publication_year": 1884,
        "country": "United States"
      },
      "timestamp": "[datetime]"
    },
    {
      "id": 2,
      "label": "Author",
      "name": "Mark Twain",
      "description": "American writer and humorist; pen name of Samuel Clemens",
      "properties": {
        "birth_year": 1835,
        "nationality": "American"
      },
      "timestamp": "[datetime]"
    },
    {
      "id": 3,
      "label": "Chapter",
      "name": "Chapter II",
      "description": "Second chapter of the novel where Tom forms his robber band",
      "properties": {
        "chapter_number": 2,
        "chapter_title": "Our Gang's Dark Oath"
      },
      "timestamp": "[datetime]"
    },
    {
      "id": 4,
      "label": "Chunk",
      "name": "Chunk 22",
      "description": "Text passage about Tom and Huck tiptoeing through the woods",
      "properties": {
        "chunk_order_number": 22,
        "word_count": 245
      },
      "timestamp": "[datetime]"
    },
    {
      "id": 5,
      "label": "Actor",
      "name": "Tom Sawyer",
      "description": "Young adventurous character who leads the group of boys",
      "properties": {
        "age_range": "child",
        "role": "leader"
      },
      "timestamp": "[datetime]"
    },
    {
      "id": 6,
      "label": "Actor",
      "name": "Huck Finn",
      "description": "Main protagonist and narrator of the story",
      "properties": {
        "age_range": "child",
        "role": "protagonist"
      },
      "timestamp": "[datetime]"
    },
    {
      "id": 7,
      "label": "Location",
      "name": "The woods",
      "description": "Forest setting where the boys meet secretly",
      "properties": {
        "location_type": "natural",
        "accessibility": "secluded"
      },
      "timestamp": "[datetime]"
    },
    {
      "id": 8,
      "label": "Event",
      "name": "Forming the robber band",
      "description": "Boys organizing their secret group with oaths and ceremonies",
      "properties": {
        "event_type": "social_formation",
        "duration": "evening"
      },
      "timestamp": "[datetime]"
    }
  ],
  "relationships": [
    {
      "start_id": 1,
      "end_id": 2,
      "relationship_type": "WRITTEN_BY",
      "weight": 1.0,
      "properties": {
        "relationship_strength": "primary",
        "creation_period": "1884"
      },
      "timestamp": "[datetime]"
    },
    {
      "start_id": 1,
      "end_id": 3,
      "relationship_type": "CONTAINS",
      "weight": 0.9,
      "properties": {
        "containment_type": "structural"
      },
      "timestamp": "[datetime]"
    },
    {
      "start_id": 3,
      "end_id": 4,
      "relationship_type": "CONTAINS",
      "weight": 0.9,
      "properties": {
        "sequence_order": 22
      },
      "timestamp": "[datetime]"
    },
    {
      "start_id": 4,
      "end_id": 5,
      "relationship_type": "MENTIONS",
      "weight": 0.8,
      "properties": {
        "mention_frequency": "multiple"
      },
      "timestamp": "[datetime]"
    },
    {
      "start_id": 4,
      "end_id": 6,
      "relationship_type": "MENTIONS",
      "weight": 0.8,
      "properties": {
        "mention_frequency": "multiple"
      },
      "timestamp": "[datetime]"
    },
    {
      "start_id": 8,
      "end_id": 7,
      "relationship_type": "OCCURRED_AT",
      "weight": 0.9,
      "properties": {
        "location_significance": "primary_setting"
      },
      "timestamp": "[datetime]"
    }
  ]
}
```

## Instructions:

1. **Output a single structured JSON file** with metadata, nodes, and relationships
2. **Include proper metadata section** with generation timestamp and counts
3. **Structure nodes and relationships** in separate arrays within the JSON
4. **Process literary content** - extract entities from book chapters and text chunks
5. **Maintain hierarchical structure** - ensure Book→Chapter→Chunk relationships
6. **Identify literary elements** - characters, settings, events, themes from text
7. **Use sequential timestamps** to reflect narrative progression and chunk ordering
8. **Reference node IDs correctly** - relationships must reference existing node IDs
9. **Handle chunk ordering** - use chunk_order_number in properties for temporal sequencing
10. **Include meaningful properties** - add relevant metadata to both nodes and relationships

### Critical JSON Formatting Requirements:

- **Use proper JSON syntax** - valid JSON with correct escaping
- **Escape special characters** - quotes, backslashes, and control characters
- **Use consistent data types** - integers for IDs, strings for text, floats for weights
- **Include all required fields** - id, label, name, description for nodes; start_id, end_id, relationship_type, weight for relationships
- **Maintain valid structure** - proper nesting and array formatting
- **Use ISO timestamps** - consistent datetime format throughout

### Literary Processing Guidelines:

- **Extract characters** (Actors) mentioned in dialogue and narrative
- **Identify locations** (Locations) described or referenced in chunks
- **Capture events** (Events) - actions, scenes, plot developments
- **Note objects** (Objects) - items, artifacts mentioned in text
- **Extract themes** (Intangibles) - concepts, emotions, ideas conveyed
- **Maintain narrative flow** - ensure chunk order reflects story progression
- **Add contextual properties** - enrich entities with relevant metadata
- **Create meaningful relationships** - connect entities based on textual evidence

### Memgraph Import Commands:

The JSON output can be imported into Memgraph using:

```cypher
// Import nodes from JSON
CALL json.load_from_file('knowledge_graph.json') YIELD value
UNWIND value.nodes AS node
CREATE (n)
SET n = node.properties
SET n.id = node.id
SET n.name = node.name
SET n.description = node.description
SET n.timestamp = node.timestamp
CALL apoc.create.addLabels(n, [node.label]) YIELD node AS labeled_node
RETURN labeled_node;

// Import relationships from JSON
CALL json.load_from_file('knowledge_graph.json') YIELD value
UNWIND value.relationships AS rel
MATCH (start {id: rel.start_id}), (end {id: rel.end_id})
CALL apoc.create.relationship(start, rel.relationship_type, 
  {weight: rel.weight, timestamp: rel.timestamp} + coalesce(rel.properties, {}), end)
YIELD rel AS created_rel
RETURN created_rel;
```

**Output Format:** Single structured JSON file optimized for literary text analysis with complete schema coverage including Book, Author, Chapter, and Chunk entities.
"""


In [None]:
# To run this code you need to install the following dependencies:
# pip install google-genai

# import base64
import os
from google import genai
from google.genai import types
import datetime
import pytz


def generate(dataframe: pd.DataFrame, chunk_number: int):
    client = genai.Client(
        api_key=os.environ.get("GEMINI_API_KEY"),
    )

    model = "gemini-2.0-flash-lite"
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(
                    text=f"""
                    Author: {dataframe['author'].iloc[chunk_number]}
                    Book: {dataframe['book'].iloc[chunk_number]}
                    Chapter: {dataframe['chapter'].iloc[chunk_number]}
                    chunk_order_number: {dataframe['chunk_order_number'].iloc[chunk_number]}
                    Chunk: {dataframe['chunk'].iloc[chunk_number]}
                    Datetime: {datetime.datetime.now(pytz.utc)}
                    """
                ),
            ],
        ),
    ]
    generate_content_config = types.GenerateContentConfig(
        response_mime_type="application/json",
        system_instruction=[
            types.Part.from_text(
                text="""# Memgraph Literary Knowledge Graph Generator

Generate a single structured JSON file containing both nodes and relationships for Memgraph import. This file will contain all entities and their connections in one unified format optimized for literary text analysis with the following knowledge graph schema:

**Processing Context:** This prompt is designed for chunk-by-chunk processing of literary texts, where each chunk represents a segment of a chapter with specific ordering. The input data contains:

- `author`: Author name
- `book`: Book title
- `chapter`: Chapter identifier (e.g., \"CHAPTER II\")  
- `chunk`: Text content passage
- `chunk_order_number`: Sequential position within the book
- `datetime`: Current date and time in UTC at time of processing/generation

## Entity Types

- **Actor**: People, organizations, characters, agents
- **Object**: Physical items, tools, documents, artifacts  
- **Location**: Places, addresses, geographic areas
- **Event**: Actions, incidents, occurrences, processes
- **Intangible**: Knowledge, concepts, ideas, beliefs
- **Book**: Literary works, novels, publications
- **Author**: Writers, creators of literary works
- **Chapter**: Sections or divisions within books
- **Chunk**: Text segments or passages within chapters

## Relationship Schema

### Actor Relationships

- `(Actor)-[INTERACTED_WITH]->(Actor)`
- `(Actor)-[MENTIONED]->(Actor)`
- `(Actor)-[ASSOCIATED_WITH]->(Actor)`
- `(Actor)-[USED]->(Object)`
- `(Actor)-[ACQUIRED]->(Object)`
- `(Actor)-[DE_ACQUIRED]->(Object)`
- `(Actor)-[CREATED]->(Object)`
- `(Actor)-[DESTROYED]->(Object)`
- `(Actor)-[MODIFIED]->(Object)`
- `(Actor)-[ARRIVED_AT]->(Location)`
- `(Actor)-[DEPARTED_FROM]->(Location)`
- `(Actor)-[LOCATED_AT]->(Location)`
- `(Actor)-[PARTICIPATED_IN]->(Event)`
- `(Actor)-[LEARNED]->(Intangible)`
- `(Actor)-[FORGOT]->(Intangible)`
- `(Actor)-[CLAIMED]->(Intangible)`
- `(Actor)-[REFUTED]->(Intangible)`

### Object Relationships

- `(Object)-[BELONGS_TO]->(Actor)`
- `(Object)-[INFLUENCES]->(Actor)`
- `(Object)-[ATTRACTS]->(Actor)`
- `(Object)-[CONTAINS]->(Object)`
- `(Object)-[PART_OF]->(Object)`
- `(Object)-[CONNECTED_TO]->(Object)`
- `(Object)-[SIMILAR_TO]->(Object)`
- `(Object)-[LOCATED_IN]->(Location)`
- `(Object)-[ORIGINATED_FROM]->(Location)`
- `(Object)-[INVOLVED_IN]->(Event)`
- `(Object)-[CAUSED]->(Event)`
- `(Object)-[RESULTED_FROM]->(Event)`
- `(Object)-[REPRESENTS]->(Intangible)`
- `(Object)-[EMBODIES]->(Intangible)`

### Location Relationships

- `(Location)-[HOSTS]->(Actor)`
- `(Location)-[EXCLUDES]->(Actor)`
- `(Location)-[CONTAINS]->(Object)`
- `(Location)-[HOUSES]->(Object)`
- `(Location)-[CONTAINS]->(Location)`
- `(Location)-[ADJACENT_TO]->(Location)`
- `(Location)-[PART_OF]->(Location)`
- `(Location)-[CONNECTED_TO]->(Location)`
- `(Location)-[HOSTED]->(Event)`
- `(Location)-[WITNESSED]->(Event)`
- `(Location)-[ASSOCIATED_WITH]->(Intangible)`
- `(Location)-[SYMBOLIZES]->(Intangible)`

### Event Relationships

- `(Event)-[AFFECTED]->(Actor)`
- `(Event)-[CAUSED_BY]->(Actor)`
- `(Event)-[INVOLVED]->(Object)`
- `(Event)-[PRODUCED]->(Object)`
- `(Event)-[CONSUMED]->(Object)`
- `(Event)-[OCCURRED_AT]->(Location)`
- `(Event)-[MOVED_FROM]->(Location)`
- `(Event)-[MOVED_TO]->(Location)`
- `(Event)-[PRECEDED]->(Event)`
- `(Event)-[FOLLOWED]->(Event)`
- `(Event)-[CAUSED]->(Event)`
- `(Event)-[CONCURRENT_WITH]->(Event)`
- `(Event)-[REVEALED]->(Intangible)`
- `(Event)-[DEMONSTRATED]->(Intangible)`
- `(Event)-[RESULTED_IN]->(Intangible)`

### Intangible Relationships

- `(Intangible)-[INFLUENCED]->(Actor)`
- `(Intangible)-[POSSESSED_BY]->(Actor)`
- `(Intangible)-[KNOWN_BY]->(Actor)`
- `(Intangible)-[APPLIED_TO]->(Object)`
- `(Intangible)-[MANIFESTED_IN]->(Object)`
- `(Intangible)-[ORIGINATED_FROM]->(Location)`
- `(Intangible)-[ASSOCIATED_WITH]->(Location)`
- `(Intangible)-[DEMONSTRATED_IN]->(Event)`
- `(Intangible)-[REVEALED_BY]->(Event)`
- `(Intangible)-[CONTRADICTS]->(Intangible)`
- `(Intangible)-[SUPPORTS]->(Intangible)`
- `(Intangible)-[DERIVED_FROM]->(Intangible)`
- `(Intangible)-[RELATED_TO]->(Intangible)`

### Book Relationships

- `(Book)-[WRITTEN_BY]->(Author)`
- `(Book)-[CONTAINS]->(Chapter)`
- `(Book)-[FEATURES]->(Actor)`
- `(Book)-[MENTIONS]->(Object)`
- `(Book)-[SET_IN]->(Location)`
- `(Book)-[DESCRIBES]->(Event)`
- `(Book)-[EXPLORES]->(Intangible)`
- `(Book)-[PUBLISHED_AT]->(Location)`
- `(Book)-[PUBLISHED_IN]->(Event)`

### Author Relationships

- `(Author)-[WROTE]->(Book)`
- `(Author)-[CREATED]->(Object)`
- `(Author)-[LIVED_IN]->(Location)`
- `(Author)-[BORN_IN]->(Location)`
- `(Author)-[PARTICIPATED_IN]->(Event)`
- `(Author)-[INFLUENCED_BY]->(Intangible)`
- `(Author)-[INFLUENCED]->(Actor)`
- `(Author)-[CONTEMPORARY_OF]->(Author)`
- `(Author)-[INSPIRED_BY]->(Author)`

### Chapter Relationships

- `(Chapter)-[PART_OF]->(Book)`
- `(Chapter)-[CONTAINS]->(Chunk)`
- `(Chapter)-[PRECEDED_BY]->(Chapter)`
- `(Chapter)-[FOLLOWED_BY]->(Chapter)`
- `(Chapter)-[FEATURES]->(Actor)`
- `(Chapter)-[MENTIONS]->(Object)`
- `(Chapter)-[SET_IN]->(Location)`
- `(Chapter)-[DESCRIBES]->(Event)`
- `(Chapter)-[CONVEYS]->(Intangible)`

### Chunk Relationships

- `(Chunk)-[PART_OF]->(Chapter)`
- `(Chunk)-[PRECEDED_BY]->(Chunk)`
- `(Chunk)-[FOLLOWED_BY]->(Chunk)`
- `(Chunk)-[MENTIONS]->(Actor)`
- `(Chunk)-[REFERENCES]->(Object)`
- `(Chunk)-[DESCRIBES]->(Location)`
- `(Chunk)-[DEPICTS]->(Event)`
- `(Chunk)-[CONVEYS]->(Intangible)`
- `(Chunk)-[CONTAINS_DIALOGUE_BY]->(Actor)`

## Output Requirements

**Generate a single structured JSON file containing both nodes and relationships.**

### JSON Structure: **knowledge_graph.json**

```json
{
  \"metadata\": {
    \"generated_at\": \"[datetime]\",
    \"total_nodes\": 85,
    \"total_relationships\": 157,
    \"entity_types\": [\"Actor\", \"Object\", \"Location\", \"Event\", \"Intangible\", \"Book\", \"Author\", \"Chapter\", \"Chunk\"]
  },
  \"nodes\": [
    {
      \"id\": 1,
      \"label\": \"Book\",
      \"name\": \"The Adventures of Huckleberry Finn\",
      \"description\": \"Classic American novel by Mark Twain\",
      \"properties\": {
        \"genre\": \"Adventure Fiction\",
        \"publication_year\": 1884
      },
      \"timestamp\": \"[datetime]\"
    }
  ],
  \"relationships\": [
    {
      \"start_id\": 1,
      \"end_id\": 2,
      \"relationship_type\": \"WRITTEN_BY\",
      \"weight\": 1.0,
      \"properties\": {
        \"relationship_strength\": \"primary\"
      },
      \"timestamp\": \"[datetime]\"
    }
  ]
}
```

### Node Object Structure:

- **id**: Unique sequential identifier (integer)
- **label**: Entity type (Actor/Object/Location/Event/Intangible/Book/Author/Chapter/Chunk)
- **name**: Display name for the entity
- **description**: Detailed description (include chunk_order_number for Chunk entities)
- **properties**: Additional key-value pairs specific to the entity type (optional)
- **timestamp**: ISO format timestamp reflecting narrative sequence

### Relationship Object Structure:

- **start_id**: Source node ID (integer)
- **end_id**: Target node ID (integer)
- **relationship_type**: Relationship name from the schema
- **weight**: Relationship strength (0.1-1.0)
- **properties**: Additional relationship metadata (optional)
- **timestamp**: ISO format timestamp

### Content Requirements:

1. **Create 80+ diverse nodes** representing all 9 entity types (including Book, Author, Chapter, Chunk)
2. **Create 150+ relationships** covering all relationship types from the schema
3. **Literary focus** - process book content with characters, locations, events, and themes
4. **Hierarchical structure** - Book → Chapter → Chunk relationships with proper ordering
5. **Character analysis** - identify and connect characters (Actors) mentioned in chunks
6. **Temporal progression** - logical sequence of timestamps reflecting narrative flow
7. **Complete schema coverage** - demonstrate every relationship type at least once
8. **Single file output** - structured JSON with metadata, nodes, and relationships sections

### Processing Context:

**Input Data Structure:**

- `chapter`: Chapter identifier (e.g., \"CHAPTER II\")
- `chunk`: Text content of the passage
- `chunk_order_number`: Sequential ordering within the book

**Entity Extraction Guidelines:**

- **Book**: The overall literary work being processed
- **Author**: Creator of the literary work
- **Chapter**: Each distinct chapter/section
- **Chunk**: Individual text passages with order numbers
- **Actor**: Characters, people mentioned in the text
- **Location**: Places described or referenced
- **Event**: Actions, scenes, incidents in the narrative
- **Object**: Items, artifacts mentioned in the text
- **Intangible**: Themes, concepts, emotions, ideas conveyed

### Example JSON Structure:

```json
{
  \"metadata\": {
    \"generated_at\": \"[datetime]\",
    \"total_nodes\": 8,
    \"total_relationships\": 6,
    \"entity_types\": [\"Book\", \"Author\", \"Chapter\", \"Chunk\", \"Actor\", \"Location\", \"Event\"]
  },
  \"nodes\": [
    {
      \"id\": 1,
      \"label\": \"Book\",
      \"name\": \"The Adventures of Huckleberry Finn\",
      \"description\": \"Classic American novel by Mark Twain exploring themes of friendship and freedom\",
      \"properties\": {
        \"genre\": \"Adventure Fiction\",
        \"publication_year\": 1884,
        \"country\": \"United States\"
      },
      \"timestamp\": \"[datetime]\"
    },
    {
      \"id\": 2,
      \"label\": \"Author\",
      \"name\": \"Mark Twain\",
      \"description\": \"American writer and humorist; pen name of Samuel Clemens\",
      \"properties\": {
        \"birth_year\": 1835,
        \"nationality\": \"American\"
      },
      \"timestamp\": \"[datetime]\"
    },
    {
      \"id\": 3,
      \"label\": \"Chapter\",
      \"name\": \"Chapter II\",
      \"description\": \"Second chapter of the novel where Tom forms his robber band\",
      \"properties\": {
        \"chapter_number\": 2,
        \"chapter_title\": \"Our Gang's Dark Oath\"
      },
      \"timestamp\": \"[datetime]\"
    },
    {
      \"id\": 4,
      \"label\": \"Chunk\",
      \"name\": \"Chunk 22\",
      \"description\": \"Text passage about Tom and Huck tiptoeing through the woods\",
      \"properties\": {
        \"chunk_order_number\": 22,
        \"word_count\": 245
      },
      \"timestamp\": \"[datetime]\"
    },
    {
      \"id\": 5,
      \"label\": \"Actor\",
      \"name\": \"Tom Sawyer\",
      \"description\": \"Young adventurous character who leads the group of boys\",
      \"properties\": {
        \"age_range\": \"child\",
        \"role\": \"leader\"
      },
      \"timestamp\": \"[datetime]\"
    },
    {
      \"id\": 6,
      \"label\": \"Actor\",
      \"name\": \"Huck Finn\",
      \"description\": \"Main protagonist and narrator of the story\",
      \"properties\": {
        \"age_range\": \"child\",
        \"role\": \"protagonist\"
      },
      \"timestamp\": \"[datetime]\"
    },
    {
      \"id\": 7,
      \"label\": \"Location\",
      \"name\": \"The woods\",
      \"description\": \"Forest setting where the boys meet secretly\",
      \"properties\": {
        \"location_type\": \"natural\",
        \"accessibility\": \"secluded\"
      },
      \"timestamp\": \"[datetime]\"
    },
    {
      \"id\": 8,
      \"label\": \"Event\",
      \"name\": \"Forming the robber band\",
      \"description\": \"Boys organizing their secret group with oaths and ceremonies\",
      \"properties\": {
        \"event_type\": \"social_formation\",
        \"duration\": \"evening\"
      },
      \"timestamp\": \"[datetime]\"
    }
  ],
  \"relationships\": [
    {
      \"start_id\": 1,
      \"end_id\": 2,
      \"relationship_type\": \"WRITTEN_BY\",
      \"weight\": 1.0,
      \"properties\": {
        \"relationship_strength\": \"primary\",
        \"creation_period\": \"1884\"
      },
      \"timestamp\": \"[datetime]\"
    },
    {
      \"start_id\": 1,
      \"end_id\": 3,
      \"relationship_type\": \"CONTAINS\",
      \"weight\": 0.9,
      \"properties\": {
        \"containment_type\": \"structural\"
      },
      \"timestamp\": \"[datetime]\"
    },
    {
      \"start_id\": 3,
      \"end_id\": 4,
      \"relationship_type\": \"CONTAINS\",
      \"weight\": 0.9,
      \"properties\": {
        \"sequence_order\": 22
      },
      \"timestamp\": \"[datetime]\"
    },
    {
      \"start_id\": 4,
      \"end_id\": 5,
      \"relationship_type\": \"MENTIONS\",
      \"weight\": 0.8,
      \"properties\": {
        \"mention_frequency\": \"multiple\"
      },
      \"timestamp\": \"[datetime]\"
    },
    {
      \"start_id\": 4,
      \"end_id\": 6,
      \"relationship_type\": \"MENTIONS\",
      \"weight\": 0.8,
      \"properties\": {
        \"mention_frequency\": \"multiple\"
      },
      \"timestamp\": \"[datetime]\"
    },
    {
      \"start_id\": 8,
      \"end_id\": 7,
      \"relationship_type\": \"OCCURRED_AT\",
      \"weight\": 0.9,
      \"properties\": {
        \"location_significance\": \"primary_setting\"
      },
      \"timestamp\": \"[datetime]\"
    }
  ]
}
```

## Instructions:

1. **Output a single structured JSON file** with metadata, nodes, and relationships
2. **Include proper metadata section** with generation timestamp and counts
3. **Structure nodes and relationships** in separate arrays within the JSON
4. **Process literary content** - extract entities from book chapters and text chunks
5. **Maintain hierarchical structure** - ensure Book→Chapter→Chunk relationships
6. **Identify literary elements** - characters, settings, events, themes from text
7. **Use sequential timestamps** to reflect narrative progression and chunk ordering
8. **Reference node IDs correctly** - relationships must reference existing node IDs
9. **Handle chunk ordering** - use chunk_order_number in properties for temporal sequencing
10. **Include meaningful properties** - add relevant metadata to both nodes and relationships

### Critical JSON Formatting Requirements:

- **Use proper JSON syntax** - valid JSON with correct escaping
- **Escape special characters** - quotes, backslashes, and control characters
- **Use consistent data types** - integers for IDs, strings for text, floats for weights
- **Include all required fields** - id, label, name, description for nodes; start_id, end_id, relationship_type, weight for relationships
- **Maintain valid structure** - proper nesting and array formatting
- **Use ISO timestamps** - consistent datetime format throughout

### Literary Processing Guidelines:

- **Extract characters** (Actors) mentioned in dialogue and narrative
- **Identify locations** (Locations) described or referenced in chunks
- **Capture events** (Events) - actions, scenes, plot developments
- **Note objects** (Objects) - items, artifacts mentioned in text
- **Extract themes** (Intangibles) - concepts, emotions, ideas conveyed
- **Maintain narrative flow** - ensure chunk order reflects story progression
- **Add contextual properties** - enrich entities with relevant metadata
- **Create meaningful relationships** - connect entities based on textual evidence

**Output Format:** Single structured JSON file optimized for literary text analysis with complete schema coverage including Book, Author, Chapter, and Chunk entities.
"""
            ),
        ],
    )

    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        print(chunk.text, end="")


# Script entry point: runs only when this file is executed directly, not on import.
if __name__ == "__main__":
    # generate() streams the model's response and prints it as it arrives.
    # NOTE(review): df_novel is defined outside this section — presumably the
    # DataFrame returned by novel_to_dataframe(); confirm. The meaning of the
    # second argument (15) depends on generate()'s signature, which is not
    # visible here — verify before changing it.
    generate(df_novel, 15)


{
  "metadata": {
    "generated_at": "2024-10-27T12:00:00Z",
    "total_nodes": 10,
    "total_relationships": 11,
    "entity_types": ["Book", "Author", "Chapter", "Chunk", "Actor", "Object", "Intangible"]
  },
  "nodes": [
    {
      "id": 1,
      "label": "Book",
      "name": "ADVENTURES OF HUCKLEBERRY FINN",
      "description": "Classic American novel by Mark Twain",
      "properties": {
        "genre": "Adventure Fiction",
        "publication_year": 1884,
        "country": "United States"
      },
      "timestamp": "2025-06-01T02:58:40.860055Z"
    },
    {
      "id": 2,
      "label": "Author",
      "name": "Mark Twain",
      "description": "American writer and humorist",
      "properties": {
        "birth_year": 1835,
        "nationality": "American"
      },
      "timestamp": "2025-06-01T02:58:40.860055Z"
    },
    {
      "id": 3,
      "label": "Chapter",
      "name": "CHAPTER I.",
      "description": "First chapter of the novel",
      "properties": {
   