In [1]:
import re
import pandas as pd

def _split_paragraphs(text: str) -> list:
    """Return the non-empty, whitespace-stripped paragraphs of ``text``.

    Paragraphs are separated by one or more blank (whitespace-only) lines.
    """
    stripped_parts = (part.strip() for part in re.split(r'\n\s*\n+', text.strip()))
    return [part for part in stripped_parts if part]


def novel_to_dataframe(
    novel_text: str,
    chapter_regex: str = r"^(CHAPTER [IVXLCDM]+\.)",
) -> pd.DataFrame:
    """
    Split the plain text of a novel into one row per paragraph, labelled by chapter.

    Project Gutenberg "*** START OF ..." / "*** END OF ..." markers, when present,
    are used to trim the license header and footer before any other processing.

    Args:
        novel_text: A string containing the full plain text of the novel.
        chapter_regex: Pattern (compiled with ``re.MULTILINE``) that matches a
            chapter heading. If it has a capture group, group 1 is used as the
            chapter label; otherwise the whole match is used. The default only
            recognizes headings of the form "CHAPTER <roman numeral>."; a heading
            such as "CHAPTER THE LAST" is not matched and its text therefore
            stays attached to the preceding chapter.

    Returns:
        A pandas DataFrame with columns 'chapter', 'chunk', and
        'chunk_order_number' (1-based, running across the whole book). Text
        before the first heading is labelled 'Preamble'; if no heading matches
        at all, every paragraph is labelled 'Unknown'. The three columns are
        always present, even when no paragraphs are found.
    """
    columns = ['chapter', 'chunk', 'chunk_order_number']

    # Optional: remove Project Gutenberg headers/footers.
    start_marker_pattern = r"\*\*\* START OF THE PROJECT GUTENBERG EBOOK [^*]+\*\*\*"
    end_marker_pattern = r"\*\*\* END OF THE PROJECT GUTENBERG EBOOK [^*]+\*\*\*"

    start_match = re.search(start_marker_pattern, novel_text)
    if start_match:
        novel_text = novel_text[start_match.end():]

    end_match = re.search(end_marker_pattern, novel_text)
    if end_match:
        novel_text = novel_text[:end_match.start()]

    novel_text = novel_text.strip()

    chapter_pattern = re.compile(chapter_regex, re.MULTILINE)
    matches = list(chapter_pattern.finditer(novel_text))

    # Build an ordered list of (label, raw text) sections, then chunk them all
    # through the same paragraph splitter.
    sections = []
    if not matches:
        sections.append(('Unknown', novel_text))
    else:
        sections.append(('Preamble', novel_text[:matches[0].start()]))
        for i, match in enumerate(matches):
            chapter_title = match.group(1) if chapter_pattern.groups else match.group(0)
            # A chapter's content runs from just after its heading up to the
            # next heading (or the end of the text for the final chapter).
            if i + 1 < len(matches):
                content_end_index = matches[i + 1].start()
            else:
                content_end_index = len(novel_text)
            sections.append((chapter_title, novel_text[match.end():content_end_index]))

    chapters_data = []
    for chapter_label, section_text in sections:
        for paragraph in _split_paragraphs(section_text):
            chapters_data.append({
                'chapter': chapter_label,
                'chunk': paragraph,
                'chunk_order_number': len(chapters_data) + 1,
            })

    # Passing `columns` keeps the schema stable even when there are no rows.
    return pd.DataFrame(chapters_data, columns=columns)

if __name__ == '__main__':
    # NOTE(review): machine-specific absolute path; point it at your own copy of the text.
    file_path = "/Users/davidspencer/Downloads/memgraph_import/Adventures of Huckleberry Finn.txt" # Replace with the actual path to your .txt file

    # Decode explicitly instead of relying on the platform default encoding:
    # the text contains non-ASCII punctuation (curly quotes, dashes).
    # 'utf-8-sig' also drops a leading byte-order mark if the file has one.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        huckleberry_finn_text = file.read()

    # Process the text
    df_novel = novel_to_dataframe(huckleberry_finn_text)

    # Display the DataFrame (or parts of it)
    print(f"Successfully processed {len(df_novel)} chunks.")
    print("\nFirst 10 chunks:")
    print(df_novel.head(10))

    print("\nLast 10 chunks:")
    print(df_novel.tail(10))

    # Example: Show all chunks from a specific chapter
    if not df_novel.empty and 'CHAPTER II.' in df_novel['chapter'].unique():
        print("\nAll chunks from CHAPTER II.:")
        print(df_novel[df_novel['chapter'] == 'CHAPTER II.'])
    elif not df_novel.empty:
        print(f"\nChunks found, but 'CHAPTER II.' not present in the sample used. Available chapters: {df_novel['chapter'].unique()}")

Successfully processed 2187 chunks.

First 10 chunks:
    chapter                                              chunk  \
0  Preamble                   ADVENTURES\nOF\nHUCKLEBERRY FINN   
1  Preamble                             (Tom Sawyer’s Comrade)   
2  Preamble                                      By Mark Twain   
3  Preamble                                            NOTICE.   
4  Preamble  Persons attempting to find a motive in this na...   
5  Preamble  BY ORDER OF THE AUTHOR\nPER G. G., CHIEF OF OR...   
6  Preamble                                        EXPLANATORY   
7  Preamble  In this book a number of dialects are used, to...   
8  Preamble  I make this explanation for the reason that wi...   
9  Preamble                                        THE AUTHOR.   

   chunk_order_number  
0                   1  
1                   2  
2                   3  
3                   4  
4                   5  
5                   6  
6                   7  
7                   8  
8  

In [2]:
df_novel

Unnamed: 0,chapter,chunk,chunk_order_number
0,Preamble,ADVENTURES\nOF\nHUCKLEBERRY FINN,1
1,Preamble,(Tom Sawyer’s Comrade),2
2,Preamble,By Mark Twain,3
3,Preamble,NOTICE.,4
4,Preamble,Persons attempting to find a motive in this na...,5
...,...,...,...
2182,CHAPTER XLII.,"“Nemmine why, Huck—but he ain’t comin’ back no...",2183
2183,CHAPTER XLII.,But I kept at him; so at last he says:,2184
2184,CHAPTER XLII.,“Doan’ you ’member de house dat was float’n do...,2185
2185,CHAPTER XLII.,"Tom’s most well now, and got his bullet around...",2186


In [3]:
# Book-level metadata: the same two values apply to every chunk of this novel.
author_name, book_title = "Mark Twain", "ADVENTURES OF HUCKLEBERRY FINN"

# Assigning a scalar to a new column broadcasts it to all rows.
df_novel["author"] = author_name
df_novel["book"] = book_title

In [4]:
df_novel

Unnamed: 0,chapter,chunk,chunk_order_number,author,book
0,Preamble,ADVENTURES\nOF\nHUCKLEBERRY FINN,1,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
1,Preamble,(Tom Sawyer’s Comrade),2,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
2,Preamble,By Mark Twain,3,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
3,Preamble,NOTICE.,4,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
4,Preamble,Persons attempting to find a motive in this na...,5,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
...,...,...,...,...,...
2182,CHAPTER XLII.,"“Nemmine why, Huck—but he ain’t comin’ back no...",2183,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
2183,CHAPTER XLII.,But I kept at him; so at last he says:,2184,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
2184,CHAPTER XLII.,“Doan’ you ’member de house dat was float’n do...,2185,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN
2185,CHAPTER XLII.,"Tom’s most well now, and got his bullet around...",2186,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN


In [35]:
updated_df

Unnamed: 0,chapter,chunk,chunk_order_number,author,book,kg_json
0,Preamble,ADVENTURES\nOF\nHUCKLEBERRY FINN,1,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
1,Preamble,(Tom Sawyer’s Comrade),2,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
2,Preamble,By Mark Twain,3,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
3,Preamble,NOTICE.,4,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
4,Preamble,Persons attempting to find a motive in this na...,5,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
...,...,...,...,...,...,...
2182,CHAPTER XLII.,"“Nemmine why, Huck—but he ain’t comin’ back no...",2183,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
2183,CHAPTER XLII.,But I kept at him; so at last he says:,2184,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
2184,CHAPTER XLII.,“Doan’ you ’member de house dat was float’n do...,2185,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
2185,CHAPTER XLII.,"Tom’s most well now, and got his bullet around...",2186,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."


In [17]:
import os
import datetime
import pytz
import asyncio
import pandas as pd
from google import genai
from google.genai import types

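# ----------------------------------------------------------------------
# 1) Static prompt text, built once at module level instead of per request.
#    NOTE: it is written as a str.format() template: literal JSON braces are
#    doubled ({{ and }}) and {timestamp} is a placeholder.
# ----------------------------------------------------------------------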
SYSTEM_TEXT = """# Memgraph Literary Knowledge Graph Generator

Generate a single structured JSON file containing both nodes and relationships for Memgraph import. This file will contain all entities and their connections in one unified format optimized for literary text analysis with the following knowledge graph schema:

**Processing Context:** This prompt is designed for chunk-by-chunk processing of literary texts, where each chunk represents a segment of a chapter with specific ordering. The input data contains:

- `author`: Author name
- `book`: Book title
- `chapter`: Chapter identifier (e.g., "CHAPTER II")  
- `chunk`: Text content passage
- `chunk_order_number`: Sequential position within the book
- `datetime`: Current date and time in UTC at time of processing/generation

## Entity Types

- **Actor**: People, organizations, characters, agents
- **Object**: Physical items, tools, documents, artifacts  
- **Location**: Places, addresses, geographic areas
- **Event**: Actions, incidents, occurrences, processes
- **Intangible**: Knowledge, concepts, ideas, beliefs
- **Book**: Literary works, novels, publications
- **Author**: Writers, creators of literary works
- **Chapter**: Sections or divisions within books
- **Chunk**: Text segments or passages within chapters

## Relationship Schema

### Actor Relationships

- `(Actor)-[INTERACTED_WITH]->(Actor)`
- `(Actor)-[MENTIONED]->(Actor)`
- `(Actor)-[ASSOCIATED_WITH]->(Actor)`
- `(Actor)-[USED]->(Object)`
- `(Actor)-[ACQUIRED]->(Object)`
- `(Actor)-[DE_ACQUIRED]->(Object)`
- `(Actor)-[CREATED]->(Object)`
- `(Actor)-[DESTROYED]->(Object)`
- `(Actor)-[MODIFIED]->(Object)`
- `(Actor)-[ARRIVED_AT]->(Location)`
- `(Actor)-[DEPARTED_FROM]->(Location)`
- `(Actor)-[LOCATED_AT]->(Location)`
- `(Actor)-[PARTICIPATED_IN]->(Event)`
- `(Actor)-[LEARNED]->(Intangible)`
- `(Actor)-[FORGOT]->(Intangible)`
- `(Actor)-[CLAIMED]->(Intangible)`
- `(Actor)-[REFUTED]->(Intangible)`

### Object Relationships

- `(Object)-[BELONGS_TO]->(Actor)`
- `(Object)-[INFLUENCES]->(Actor)`
- `(Object)-[ATTRACTS]->(Actor)`
- `(Object)-[CONTAINS]->(Object)`
- `(Object)-[PART_OF]->(Object)`
- `(Object)-[CONNECTED_TO]->(Object)`
- `(Object)-[SIMILAR_TO]->(Object)`
- `(Object)-[LOCATED_IN]->(Location)`
- `(Object)-[ORIGINATED_FROM]->(Location)`
- `(Object)-[INVOLVED_IN]->(Event)`
- `(Object)-[CAUSED]->(Event)`
- `(Object)-[RESULTED_FROM]->(Event)`
- `(Object)-[REPRESENTS]->(Intangible)`
- `(Object)-[EMBODIES]->(Intangible)`

### Location Relationships

- `(Location)-[HOSTS]->(Actor)`
- `(Location)-[EXCLUDES]->(Actor)`
- `(Location)-[CONTAINS]->(Object)`
- `(Location)-[HOUSES]->(Object)`
- `(Location)-[CONTAINS]->(Location)`
- `(Location)-[ADJACENT_TO]->(Location)`
- `(Location)-[PART_OF]->(Location)`
- `(Location)-[CONNECTED_TO]->(Location)`
- `(Location)-[HOSTED]->(Event)`
- `(Location)-[WITNESSED]->(Event)`
- `(Location)-[ASSOCIATED_WITH]->(Intangible)`
- `(Location)-[SYMBOLIZES]->(Intangible)`

### Event Relationships

- `(Event)-[AFFECTED]->(Actor)`
- `(Event)-[CAUSED_BY]->(Actor)`
- `(Event)-[INVOLVED]->(Object)`
- `(Event)-[PRODUCED]->(Object)`
- `(Event)-[CONSUMED]->(Object)`
- `(Event)-[OCCURRED_AT]->(Location)`
- `(Event)-[MOVED_FROM]->(Location)`
- `(Event)-[MOVED_TO]->(Location)`
- `(Event)-[PRECEDED]->(Event)`
- `(Event)-[FOLLOWED]->(Event)`
- `(Event)-[CAUSED]->(Event)`
- `(Event)-[CONCURRENT_WITH]->(Event)`
- `(Event)-[REVEALED]->(Intangible)`
- `(Event)-[DEMONSTRATED]->(Intangible)`
- `(Event)-[RESULTED_IN]->(Intangible)`

### Intangible Relationships

- `(Intangible)-[INFLUENCED]->(Actor)`
- `(Intangible)-[POSSESSED_BY]->(Actor)`
- `(Intangible)-[KNOWN_BY]->(Actor)`
- `(Intangible)-[APPLIED_TO]->(Object)`
- `(Intangible)-[MANIFESTED_IN]->(Object)`
- `(Intangible)-[ORIGINATED_FROM]->(Location)`
- `(Intangible)-[ASSOCIATED_WITH]->(Location)`
- `(Intangible)-[DEMONSTRATED_IN]->(Event)`
- `(Intangible)-[REVEALED_BY]->(Event)`
- `(Intangible)-[CONTRADICTS]->(Intangible)`
- `(Intangible)-[SUPPORTS]->(Intangible)`
- `(Intangible)-[DERIVED_FROM]->(Intangible)`
- `(Intangible)-[RELATED_TO]->(Intangible)`

### Book Relationships

- `(Book)-[WRITTEN_BY]->(Author)`
- `(Book)-[CONTAINS]->(Chapter)`
- `(Book)-[FEATURES]->(Actor)`
- `(Book)-[MENTIONS]->(Object)`
- `(Book)-[SET_IN]->(Location)`
- `(Book)-[DESCRIBES]->(Event)`
- `(Book)-[EXPLORES]->(Intangible)`
- `(Book)-[PUBLISHED_AT]->(Location)`
- `(Book)-[PUBLISHED_IN]->(Event)`

### Author Relationships

- `(Author)-[WROTE]->(Book)`
- `(Author)-[CREATED]->(Object)`
- `(Author)-[LIVED_IN]->(Location)`
- `(Author)-[BORN_IN]->(Location)`
- `(Author)-[PARTICIPATED_IN]->(Event)`
- `(Author)-[INFLUENCED_BY]->(Intangible)`
- `(Author)-[INFLUENCED]->(Actor)`
- `(Author)-[CONTEMPORARY_OF]->(Author)`
- `(Author)-[INSPIRED_BY]->(Author)`

### Chapter Relationships

- `(Chapter)-[PART_OF]->(Book)`
- `(Chapter)-[CONTAINS]->(Chunk)`
- `(Chapter)-[PRECEDED_BY]->(Chapter)`
- `(Chapter)-[FOLLOWED_BY]->(Chapter)`
- `(Chapter)-[FEATURES]->(Actor)`
- `(Chapter)-[MENTIONS]->(Object)`
- `(Chapter)-[SET_IN]->(Location)`
- `(Chapter)-[DESCRIBES]->(Event)`
- `(Chapter)-[CONVEYS]->(Intangible)`

### Chunk Relationships

- `(Chunk)-[PART_OF]->(Chapter)`
- `(Chunk)-[PRECEDED_BY]->(Chunk)`
- `(Chunk)-[FOLLOWED_BY]->(Chunk)`
- `(Chunk)-[MENTIONS]->(Actor)`
- `(Chunk)-[REFERENCES]->(Object)`
- `(Chunk)-[DESCRIBES]->(Location)`
- `(Chunk)-[DEPICTS]->(Event)`
- `(Chunk)-[CONVEYS]->(Intangible)`
- `(Chunk)-[CONTAINS_DIALOGUE_BY]->(Actor)`

## Output Requirements

**Generate a single structured JSON file containing both nodes and relationships.**

### JSON Structure: **knowledge_graph.json**

```json
{{
  "metadata": {{
    "generated_at": "{timestamp}",
    "total_nodes": 85,
    "total_relationships": 157,
    "entity_types": ["Actor", "Object", "Location", "Event", "Intangible", "Book", "Author", "Chapter", "Chunk"]
  }},
  "nodes": [
    {{
      "id": 1,
      "label": "Book",
      "name": "The Adventures of Huckleberry Finn",
      "description": "Classic American novel by Mark Twain",
      "properties": {{
        "genre": "Adventure Fiction",
        "publication_year": 1884
      }},
      "timestamp": "{timestamp}"
    }}
  ],
  "relationships": [
    {{
      "start_id": 1,
      "end_id": 2,
      "relationship_type": "WRITTEN_BY",
      "weight": 1.0,
      "properties": {{
        "relationship_strength": "primary"
      }},
      "timestamp": "{timestamp}"
    }}
  ]
}}```
"""


# ----------------------------------------------------------------------
# 2) Per-chunk request: builds the prompt for one DataFrame row (accessed as
#    `row[...]`) and returns the model's response text.
# ----------------------------------------------------------------------
async def generate_chunk_json_simple(client: genai.Client, row: pd.Series) -> str:
    """
    Ask the model for the knowledge-graph JSON of a single text chunk.

    Args:
        client: Gemini client whose async interface (`client.aio`) is used.
        row: One DataFrame row; the keys 'author', 'book', 'chapter',
            'chunk_order_number' and 'chunk' are read.

    Returns:
        The response text with surrounding whitespace stripped. A JSON MIME
        type is requested, but the text is returned as-is without being parsed
        or validated here.

    Raises:
        Whatever the client call raises; also AttributeError if the response
        carries no text (``response.text`` is None).
    """
    now_iso = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    model = "gemini-2.0-flash-lite"

    # SYSTEM_TEXT is a str.format template (doubled braces, {timestamp}
    # placeholder). It must be formatted before sending, otherwise the model
    # sees the literal "{{" / "{timestamp}" text in the example JSON.
    system_text = SYSTEM_TEXT.format(timestamp=now_iso)

    user_text = (
        f"Author: {row['author']}\n"
        f"Book: {row['book']}\n"
        f"Chapter: {row['chapter']}\n"
        f"chunk_order_number: {row['chunk_order_number']}\n"
        f"Chunk: {row['chunk']}\n"
        f"Datetime: {now_iso}\n"
    )

    # Instructions and per-chunk data travel together in one user turn.
    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=system_text + "\n" + user_text)],
        ),
    ]

    generate_content_config = types.GenerateContentConfig(
        response_mime_type="application/json",
        # (You can optionally set max_output_token_count here if you want to cap response length)
        # max_output_token_count=256
    )

    response = await client.aio.models.generate_content(
        model=model,
        contents=contents,
        config=generate_content_config,
    )
    return response.text.strip()


# ----------------------------------------------------------------------
# 3) New “process_dataframe_concurrent” that:
#    • launches all tasks immediately (no asyncio.sleep)
#    • uses a Semaphore(50) to cap concurrency at 50
#    • iterates with df.iterrows() so row[...] works
# ----------------------------------------------------------------------
async def process_dataframe_concurrent(df: pd.DataFrame, max_concurrency: int = 50) -> pd.DataFrame:
    """
    Run `generate_chunk_json_simple` for every row of ``df`` concurrently.

    Args:
        df: Input frame; each row is passed to `generate_chunk_json_simple`.
        max_concurrency: Maximum number of requests allowed in flight at once
            (defaults to 50).

    Returns:
        A copy of ``df`` with an added 'kg_json' column holding, per row, either
        the model's response text or a JSON object ``{"error": "<message>"}``
        if that row's request raised.

    Raises:
        KeyError: if the GEMINI_API_KEY environment variable is not set.
    """
    import json  # local import: the import block of this cell does not provide json

    client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
    semaphore = asyncio.Semaphore(max_concurrency)  # cap on in-flight calls

    async def generate_with_semaphore(row: pd.Series) -> str:
        async with semaphore:
            try:
                return await generate_chunk_json_simple(client, row)
            except Exception as e:
                # Deliberately best-effort: one failed chunk must not abort the
                # whole batch. json.dumps keeps the fallback parseable even when
                # the message contains quotes, backslashes or newlines.
                return json.dumps({"error": str(e)})

    # Kick off one task per row, but each call to generate will wait its turn under the semaphore
    tasks = [
        asyncio.create_task(generate_with_semaphore(row)) for _, row in df.iterrows()
    ]

    # gather() returns results in task order, i.e. aligned with the rows of df.
    results = await asyncio.gather(*tasks)
    df2 = df.copy()
    df2["kg_json"] = results
    return df2


# ===== JUPYTER NOTEBOOK USAGE =====
# In your Jupyter cell, just call:
# updated_df = await process_dataframe_concurrent(df_novel)


In [18]:
# Run the extraction over every chunk; the result is df_novel plus a 'kg_json' column.
# (Top-level `await` is used directly in the notebook cell.)
updated_df = await process_dataframe_concurrent(df_novel)

In [38]:
# Parse and display the kg_json of the row at position 15 (the 16th row, not the first)
import json
json.loads(updated_df["kg_json"].iloc[15])


[{'metadata': {'generated_at': '2024-02-29T12:00:00Z',
   'total_nodes': 6,
   'total_relationships': 5,
   'entity_types': ['Actor', 'Object', 'Book', 'Author', 'Chapter', 'Chunk']},
  'nodes': [{'id': 1,
    'label': 'Author',
    'name': 'Mark Twain',
    'description': 'Author of ADVENTURES OF HUCKLEBERRY FINN',
    'properties': {},
    'timestamp': '2025-06-01T16:36:15Z'},
   {'id': 2,
    'label': 'Book',
    'name': 'ADVENTURES OF HUCKLEBERRY FINN',
    'description': 'Book by Mark Twain',
    'properties': {},
    'timestamp': '2025-06-01T16:36:15Z'},
   {'id': 3,
    'label': 'Chapter',
    'name': 'CHAPTER I.',
    'description': 'Chapter 1 of ADVENTURES OF HUCKLEBERRY FINN',
    'properties': {},
    'timestamp': '2025-06-01T16:36:15Z'},
   {'id': 4,
    'label': 'Chunk',
    'name': 'Chunk 16',
    'description': 'Text passage from CHAPTER I.',
    'properties': {'chunk_order_number': 16},
    'timestamp': '2025-06-01T16:36:15Z'},
   {'id': 5,
    'label': 'Actor',
    'na

Unnamed: 0,chapter,chunk,chunk_order_number,author,book,kg_json
0,Preamble,ADVENTURES\nOF\nHUCKLEBERRY FINN,1,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
1,Preamble,(Tom Sawyer’s Comrade),2,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
2,Preamble,By Mark Twain,3,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
3,Preamble,NOTICE.,4,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
4,Preamble,Persons attempting to find a motive in this na...,5,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
...,...,...,...,...,...,...
2182,CHAPTER XLII.,"“Nemmine why, Huck—but he ain’t comin’ back no...",2183,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
2183,CHAPTER XLII.,But I kept at him; so at last he says:,2184,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
2184,CHAPTER XLII.,“Doan’ you ’member de house dat was float’n do...,2185,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."
2185,CHAPTER XLII.,"Tom’s most well now, and got his bullet around...",2186,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,"[\n {\n ""metadata"": {\n ""generated_at..."


In [39]:
import uuid
# import hashlib
import json
from typing import Dict, List, Any


def generate_deterministic_uuid(label: str, name: str) -> str:
    """
    Return a stable UUID string for the entity identified by (label, name).

    The value is a version-5 (name-based) UUID of the text "<label>:<name>"
    computed in the standard DNS namespace
    (6ba7b810-9dad-11d1-80b4-00c04fd430c8), so equal inputs always yield the
    same UUID.
    """
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{label}:{name}"))


def convert_nodes_to_uuid(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Replace per-chunk node ids with deterministic UUIDs and re-point relationships.

    For each chunk, every node's 'id' becomes the UUID derived from its
    'label' and 'name', and each relationship's 'start_id' / 'end_id' is
    translated through that chunk's old-id -> UUID table. The input is not
    modified: chunks, nodes and relationships are shallow-copied.

    Raises:
        KeyError: if a chunk, node or relationship lacks a required key, or a
            relationship refers to an id that no node in the same chunk has.
    """
    converted_data = []

    for chunk in data:
        uuid_by_old_id = {}
        rewritten_nodes = []

        for node in chunk["nodes"]:
            original_id = node["id"]
            node_uuid = generate_deterministic_uuid(node["label"], node["name"])
            uuid_by_old_id[original_id] = node_uuid
            # Shallow copy with only the id swapped out.
            rewritten_nodes.append({**node, "id": node_uuid})

        # Ids are only meaningful within their own chunk, so the lookup table
        # built above is the one used for this chunk's relationships.
        rewritten_relationships = [
            {
                **rel,
                "start_id": uuid_by_old_id[rel["start_id"]],
                "end_id": uuid_by_old_id[rel["end_id"]],
            }
            for rel in chunk["relationships"]
        ]

        converted_data.append(
            {**chunk, "nodes": rewritten_nodes, "relationships": rewritten_relationships}
        )

    return converted_data


def print_uuid_mapping(data: List[Dict[str, Any]]) -> None:
    """
    Print, chunk by chunk, which UUID each node id maps to.

    Intended as a visual check: for every node the line shows its current
    'id', the UUID derived from its 'label' and 'name', and the label/name pair.
    """
    print("ID to UUID Mapping:")
    print("-" * 50)

    for chunk_number, chunk in enumerate(data, start=1):
        print(f"Chunk {chunk_number}:")
        for node in chunk["nodes"]:
            label, name, old_id = node["label"], node["name"], node["id"]
            print(f"  {old_id} -> {generate_deterministic_uuid(label, name)} ({label}: {name})")
        print()


# Sample input for the demo below: a single-chunk list whose field values match the model output displayed in an earlier cell
sample_data = [
    {
        "metadata": {
            "generated_at": "2024-02-29T12:00:00Z",
            "total_nodes": 6,
            "total_relationships": 5,
            "entity_types": ["Actor", "Object", "Book", "Author", "Chapter", "Chunk"],
        },
        "nodes": [
            {
                "id": 1,
                "label": "Author",
                "name": "Mark Twain",
                "description": "Author of ADVENTURES OF HUCKLEBERRY FINN",
                "properties": {},
                "timestamp": "2025-06-01T16:36:15Z",
            },
            {
                "id": 2,
                "label": "Book",
                "name": "ADVENTURES OF HUCKLEBERRY FINN",
                "description": "Book by Mark Twain",
                "properties": {},
                "timestamp": "2025-06-01T16:36:15Z",
            },
            {
                "id": 3,
                "label": "Chapter",
                "name": "CHAPTER I.",
                "description": "Chapter 1 of ADVENTURES OF HUCKLEBERRY FINN",
                "properties": {},
                "timestamp": "2025-06-01T16:36:15Z",
            },
            {
                "id": 4,
                "label": "Chunk",
                "name": "Chunk 16",
                "description": "Text passage from CHAPTER I.",
                "properties": {"chunk_order_number": 16},
                "timestamp": "2025-06-01T16:36:15Z",
            },
            {
                "id": 5,
                "label": "Actor",
                "name": "Moses",
                "description": "Biblical figure",
                "properties": {},
                "timestamp": "2025-06-01T16:36:15Z",
            },
            {
                "id": 6,
                "label": "Object",
                "name": "book",
                "description": "a book",
                "properties": {},
                "timestamp": "2025-06-01T16:36:15Z",
            },
        ],
        "relationships": [
            {
                "start_id": 2,
                "end_id": 1,
                "relationship_type": "WRITTEN_BY",
                "weight": 1.0,
                "properties": {},
                "timestamp": "2025-06-01T16:36:15Z",
            },
            {
                "start_id": 3,
                "end_id": 2,
                "relationship_type": "PART_OF",
                "weight": 1.0,
                "properties": {},
                "timestamp": "2025-06-01T16:36:15Z",
            },
            {
                "start_id": 4,
                "end_id": 3,
                "relationship_type": "PART_OF",
                "weight": 1.0,
                "properties": {},
                "timestamp": "2025-06-01T16:36:15Z",
            },
            {
                "start_id": 4,
                "end_id": 5,
                "relationship_type": "MENTIONS",
                "weight": 1.0,
                "properties": {},
                "timestamp": "2025-06-01T16:36:15Z",
            },
            {
                "start_id": 4,
                "end_id": 6,
                "relationship_type": "REFERENCES",
                "weight": 1.0,
                "properties": {},
                "timestamp": "2025-06-01T16:36:15Z",
            },
        ],
    }
]

if __name__ == "__main__":
    # Print the old-id -> UUID table first so the conversion can be checked by eye.
    print_uuid_mapping(sample_data)

    converted_data = convert_nodes_to_uuid(sample_data)

    # Dump the converted structure under a header and rule.
    print("Converted Data:", "=" * 50, sep="\n")
    print(json.dumps(converted_data, indent=2))

    # Determinism check: the same (label, name) pair must give the same UUID twice.
    print("\nTesting consistency - Mark Twain should always get the same UUID:")
    uuid1, uuid2 = (
        generate_deterministic_uuid("Author", "Mark Twain") for _ in range(2)
    )
    print(f"UUID 1: {uuid1}")
    print(f"UUID 2: {uuid2}")
    print(f"Same UUID: {uuid1 == uuid2}")


ID to UUID Mapping:
--------------------------------------------------
Chunk 1:
  1 -> f1912760-8749-51e0-908a-70e6ce194ca4 (Author: Mark Twain)
  2 -> f32bd123-42c0-5ba0-a943-797dc52d7e1a (Book: ADVENTURES OF HUCKLEBERRY FINN)
  3 -> d3cdfa5b-eeb7-59fb-afef-09fee86196ea (Chapter: CHAPTER I.)
  4 -> 56d34d02-4232-5a49-900e-363974bc6986 (Chunk: Chunk 16)
  5 -> 169b7f92-46f8-587d-8944-68f47c0f918a (Actor: Moses)
  6 -> c1e8b72b-ee1d-5fe8-8e33-2b28bee79ed4 (Object: book)

Converted Data:
[
  {
    "metadata": {
      "generated_at": "2024-02-29T12:00:00Z",
      "total_nodes": 6,
      "total_relationships": 5,
      "entity_types": [
        "Actor",
        "Object",
        "Book",
        "Author",
        "Chapter",
        "Chunk"
      ]
    },
    "nodes": [
      {
        "id": "f1912760-8749-51e0-908a-70e6ce194ca4",
        "label": "Author",
        "name": "Mark Twain",
        "description": "Author of ADVENTURES OF HUCKLEBERRY FINN",
        "properties": {},
        "time

In [31]:
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2187 entries, 0 to 2186
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   chapter             2187 non-null   object
 1   chunk               2187 non-null   object
 2   chunk_order_number  2187 non-null   int64 
 3   author              2187 non-null   object
 4   book                2187 non-null   object
 5   kg_json             2187 non-null   object
dtypes: int64(1), object(5)
memory usage: 102.6+ KB


In [32]:
# Prerequisite: pip install pandas pyarrow
# Write the frame, including the raw kg_json strings, to a Feather file.

updated_df.to_feather("huckleberry_finn_kg_data_unnormalized.feather")


In [49]:
import pandas as pd
import json
import uuid
import time
import logging
import re
from typing import Dict, List, Any, Optional

# Set up logging: configure the root logger at INFO level and create a logger named after this module
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def generate_deterministic_uuid(label: str, name: Optional[str] = None, node_id: Optional[int] = None) -> str:
    """
    Return a version-5 UUID string for a node, derived from its label plus an identifier.

    The identifier is chosen in this order:
      1. ``name``, when it is truthy;
      2. ``node_<node_id>``, when ``node_id`` is not None;
      3. ``unnamed_<8 random hex chars>`` otherwise. This last case is NOT
         deterministic, since a fresh random component is drawn on every call.

    The UUID is computed in the standard DNS namespace
    (6ba7b810-9dad-11d1-80b4-00c04fd430c8).
    """
    if name:
        identifier = name
    elif node_id is not None:
        # Fallback: the node's original id stands in for the missing name.
        identifier = f"node_{node_id}"
    else:
        # Last resort: nothing stable to key on, so add a random component.
        identifier = f"unnamed_{uuid.uuid4().hex[:8]}"

    return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{label}:{identifier}"))

def clean_json_string(json_str: str) -> str:
    """
    Repair invalid backslash escapes in a JSON string.

    A backslash that does not start a valid JSON escape (one of
    \\" \\\\ \\/ \\b \\f \\n \\r \\t or \\uXXXX) is doubled so that it becomes a
    literal backslash, e.g. an invalid ``\\'`` turns into ``\\\\'``. Valid escapes
    are left untouched.

    Args:
        json_str: The raw JSON text.

    Returns:
        The cleaned text. If ``json_str`` is not a string it is returned
        unchanged.
    """
    # Scan left to right and consume each valid escape as a unit. Examining
    # every backslash in isolation would treat the second half of an already
    # escaped backslash ("\\\\" followed by, say, "x") as a stray one and
    # corrupt valid JSON.
    escape_or_stray = r'\\(["\\/bfnrt]|u[0-9a-fA-F]{4})|\\'

    def _fix(match):
        # Group 1 is set only when a complete valid escape was matched.
        return match.group(0) if match.group(1) else '\\\\'

    try:
        return re.sub(escape_or_stray, _fix, json_str)
    except TypeError:
        # Non-string input: hand it back untouched, as before.
        return json_str

def convert_nodes_to_uuid(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Replace node ids with UUIDs and remap relationship endpoints to match.

    Each chunk is handled independently: an old-id -> UUID mapping is built
    from that chunk's nodes and applied to that chunk's relationships only.
    Input chunks, nodes and relationships are shallow-copied, not mutated.

    Args:
        data: List of chunk dicts, each optionally holding ``'nodes'`` and
            ``'relationships'`` lists of dicts.

    Returns:
        A new list of chunk dicts. Every node has a UUID ``'id'`` and a
        non-None ``'name'`` (a ``"<label>_<old id>"`` or ``"<label>_unnamed"``
        placeholder is filled in when the name is missing). Relationship
        ``'start_id'`` / ``'end_id'`` values are replaced when they refer to
        a node of the same chunk and left untouched otherwise.
    """
    converted_data = []

    for chunk in data:
        # Old node id -> new UUID, scoped to this chunk.
        id_to_uuid_mapping = {}

        new_nodes = []
        for node in chunk.get('nodes', []):
            old_id = node.get('id')
            label = node.get('label', 'Unknown')
            name = node.get('name')  # may be None/missing

            new_uuid = generate_deterministic_uuid(label, name, old_id)

            if old_id is not None:
                id_to_uuid_mapping[old_id] = new_uuid

            new_node = node.copy()
            new_node['id'] = new_uuid

            # Fill in a placeholder name for consistency. Test against None
            # (not truthiness) so a legitimate id of 0 yields "<label>_0"
            # rather than "<label>_unnamed".
            if new_node.get('name') is None:
                if old_id is not None:
                    new_node['name'] = f"{label}_{old_id}"
                else:
                    new_node['name'] = f"{label}_unnamed"

            new_nodes.append(new_node)

        new_relationships = []
        for rel in chunk.get('relationships', []):
            new_rel = rel.copy()
            start_id = rel.get('start_id')
            end_id = rel.get('end_id')

            # Endpoints without a known node in this chunk keep their original id.
            if start_id in id_to_uuid_mapping:
                new_rel['start_id'] = id_to_uuid_mapping[start_id]
            if end_id in id_to_uuid_mapping:
                new_rel['end_id'] = id_to_uuid_mapping[end_id]

            new_relationships.append(new_rel)

        new_chunk = chunk.copy()
        new_chunk['nodes'] = new_nodes
        new_chunk['relationships'] = new_relationships

        converted_data.append(new_chunk)

    return converted_data

def process_kg_json_row(kg_json_str: str, row_index: int) -> tuple[str, bool, str]:
    """
    Convert the node ids in one ``kg_json`` string to UUIDs.

    The string is cleaned, parsed, converted and re-serialised. The output
    keeps the input's top-level shape: a single chunk dict stays a dict, a
    list of chunks stays a list.

    Args:
        kg_json_str: JSON text holding one chunk dict or a list of them.
        row_index: Accepted for the caller's convenience; not used here.

    Returns:
        ``(processed_json, success, error_message)``. On failure the
        original string is returned unchanged together with ``False`` and a
        message describing the problem; on success the message is empty.
    """
    try:
        parsed = json.loads(clean_json_string(kg_json_str))

        single_chunk = isinstance(parsed, dict)
        if not single_chunk and not isinstance(parsed, list):
            return kg_json_str, False, f"Unexpected data type: {type(parsed)}"

        # Normalise to a list for conversion, then restore the input shape.
        chunk_list = [parsed] if single_chunk else parsed
        converted = convert_nodes_to_uuid(chunk_list)
        payload = converted[0] if single_chunk else converted

        return json.dumps(payload), True, ""

    except json.JSONDecodeError as exc:
        return kg_json_str, False, f"JSON decode error: {str(exc)}"
    except KeyError as exc:
        return kg_json_str, False, f"Missing key: {str(exc)}"
    except Exception as exc:
        return kg_json_str, False, f"Processing error: {str(exc)}"

def analyze_errors(df: pd.DataFrame, sample_error_rows: List[int] = None) -> None:
    """
    Print a short diagnosis for up to three failing rows.

    For each inspected row the ``kg_json`` value is cleaned and parsed; if
    that works, the node count and the first two nodes are printed along
    with any of ``name`` / ``label`` / ``id`` that are missing. Parsing
    problems are reported together with the first 200 characters of the raw
    value.

    Args:
        df: Frame containing a ``kg_json`` column.
        sample_error_rows: Row positions (used with ``.iloc``) to inspect;
            only the first three are looked at. ``None`` means no rows.
    """
    print("\n🔍 Analyzing error patterns...")

    rows_to_inspect = [] if sample_error_rows is None else sample_error_rows[:3]

    for row_idx in rows_to_inspect:
        try:
            print(f"\n--- Row {row_idx} Analysis ---")
            kg_json = df.iloc[row_idx]['kg_json']

            try:
                data = json.loads(clean_json_string(kg_json))

                # For a list of chunks, only the first chunk is examined.
                if isinstance(data, list) and data:
                    data = data[0]

                if not isinstance(data, dict):
                    continue

                nodes = data.get('nodes', [])
                print(f"  Nodes found: {len(nodes)}")

                for i, node in enumerate(nodes[:2]):
                    print(f"    Node {i}: {node}")
                    field_checks = (
                        ('name', 'name' not in node or node.get('name') is None),
                        ('label', 'label' not in node),
                        ('id', 'id' not in node),
                    )
                    missing_fields = [field for field, absent in field_checks if absent]
                    if missing_fields:
                        print(f"      Missing fields: {missing_fields}")

            except Exception as e:
                print(f"  Analysis error: {e}")
                print(f"  First 200 chars: {kg_json[:200]}")

        except Exception as e:
            print(f"Row {row_idx}: Could not analyze - {e}")

def process_dataframe_kg_json(df: pd.DataFrame, batch_size: int = 100) -> tuple[pd.DataFrame, dict]:
    """
    Convert the ``kg_json`` column of every row to UUID-based ids.

    Rows are addressed by their 0-based position, not by index label, so the
    function works for any index (filtered frames, string labels, ...) and
    the positions in ``error_rows`` can be used directly with ``.iloc``.
    For a default ``RangeIndex`` positions and labels coincide.

    Args:
        df: Frame with a ``kg_json`` column of JSON strings. Not modified.
        batch_size: Print a progress line every ``batch_size`` rows; values
            <= 0 disable progress output.

    Returns:
        ``(result_df, error_stats)`` where ``result_df`` is a copy of ``df``
        whose ``kg_json`` column holds the converted JSON (rows that failed
        keep their original value), and ``error_stats`` has the keys
        ``total_errors``, ``json_decode_errors``, ``missing_name_errors``,
        ``other_errors`` and ``error_rows`` (list of row positions).
    """
    total_rows = len(df)
    logger.info(f"Starting processing of {total_rows} rows...")

    # Work on a copy so the caller's frame is left untouched.
    result_df = df.copy()

    error_stats = {
        'total_errors': 0,
        'json_decode_errors': 0,
        'missing_name_errors': 0,
        'other_errors': 0,
        'error_rows': []
    }

    converted_json = []

    # Iterate the column directly: only kg_json is needed, and enumerate()
    # yields a true positional counter regardless of the frame's index.
    for position, kg_json in enumerate(df['kg_json']):
        processed_json, success, error_msg = process_kg_json_row(kg_json, position)
        converted_json.append(processed_json)

        if not success:
            error_stats['total_errors'] += 1
            error_stats['error_rows'].append(position)

            # Classify by the message prefix produced by process_kg_json_row.
            if 'JSON decode error' in error_msg:
                error_stats['json_decode_errors'] += 1
            elif 'name' in error_msg.lower():
                error_stats['missing_name_errors'] += 1
            else:
                error_stats['other_errors'] += 1

            logger.error(f"Row {position}: {error_msg}")

        if batch_size > 0 and (position + 1) % batch_size == 0:
            print(f"Processed {position + 1}/{total_rows} rows... (Errors so far: {error_stats['total_errors']})")

    # One entry was appended per row, in order, so this aligns positionally.
    result_df['kg_json'] = converted_json

    print(f"Processing complete! Processed {total_rows} rows with {error_stats['total_errors']} errors.")

    return result_df, error_stats

def quick_process_updated_df(updated_df):
    """
    Run the UUID conversion over ``updated_df`` and print timing and error
    statistics.

    An empty frame is handled without raising: the per-row average is
    reported as 0 ms and the success rate as 100% (no rows failed).

    Args:
        updated_df: Frame with a ``kg_json`` column.

    Returns:
        ``(processed_df, error_stats)`` exactly as returned by
        ``process_dataframe_kg_json``.
    """
    row_count = len(updated_df)

    print(f"📊 Processing dataframe with {row_count} rows...")
    print(f"📋 Columns: {list(updated_df.columns)}")

    start_time = time.time()

    processed_df, error_stats = process_dataframe_kg_json(updated_df, batch_size=200)

    processing_time = time.time() - start_time

    # Guard the per-row figures: dividing by len() would raise on an empty frame.
    if row_count:
        avg_ms = processing_time / row_count * 1000
        success_rate = (row_count - error_stats['total_errors']) / row_count * 100
    else:
        avg_ms = 0.0
        success_rate = 100.0

    print(f"⏱️  Processing completed in {processing_time:.2f} seconds")
    print(f"📈 Average time per row: {avg_ms:.2f} ms")

    print(f"\n📊 Error Statistics:")
    print(f"   Total errors: {error_stats['total_errors']}")
    print(f"   JSON decode errors: {error_stats['json_decode_errors']}")
    print(f"   Missing name errors: {error_stats['missing_name_errors']}")
    print(f"   Other errors: {error_stats['other_errors']}")
    print(f"   Success rate: {success_rate:.1f}%")

    # Only dig into individual rows when something actually failed.
    if error_stats['error_rows']:
        analyze_errors(updated_df, error_stats['error_rows'])

    return processed_df, error_stats

def validate_conversion(original_df: pd.DataFrame, converted_df: pd.DataFrame):
    """
    Print a sanity check comparing the first row before and after conversion.

    Reports the node counts of both versions, a sample id from each, whether
    the converted id parses as a UUID, and whether the first converted node
    carries a non-None ``name``. Any exception is caught and printed as a
    validation error rather than raised.

    Args:
        original_df: Frame holding the unconverted ``kg_json`` strings.
        converted_df: Frame holding the converted ``kg_json`` strings.
    """
    print("\n🔍 Validating conversion...")

    try:
        original = json.loads(original_df['kg_json'].iloc[0])
        converted = json.loads(converted_df['kg_json'].iloc[0])

        # The shape of the ORIGINAL payload decides how both are unpacked.
        original_is_dict = isinstance(original, dict)

        def nodes_of(payload):
            if original_is_dict:
                return payload.get('nodes', [])
            return payload[0].get('nodes', []) if payload else []

        orig_nodes = nodes_of(original)
        conv_nodes = nodes_of(converted)

        print(f"   Original nodes: {len(orig_nodes)}")
        print(f"   Converted nodes: {len(conv_nodes)}")

        if orig_nodes and conv_nodes:
            print(f"   Sample original ID: {orig_nodes[0]['id']}")
            print(f"   Sample converted ID: {conv_nodes[0]['id']}")

            # uuid.UUID raises ValueError for a malformed UUID string.
            try:
                uuid.UUID(conv_nodes[0]['id'])
            except ValueError:
                print(f"   ❌ Invalid UUID format")
            else:
                print(f"   ✅ Valid UUID format")

        if conv_nodes:
            first_converted = conv_nodes[0]
            has_name = 'name' in first_converted and first_converted['name'] is not None
            print(f"   ✅ Name field present: {has_name}")

    except Exception as e:
        print(f"   ⚠️  Validation error: {e}")

# MAIN EXECUTION
def process_updated_df_complete(updated_df):
    """
    End-to-end driver: convert ``kg_json`` ids to UUIDs, validate and report.

    Prints a sample of the first row before and after conversion, runs the
    conversion via ``quick_process_updated_df``, validates the first row and
    prints summary statistics. For an empty frame the first-row samples and
    the validation step are skipped and the success rate is reported as
    100%, instead of failing on ``.iloc[0]`` or a division by zero.

    Args:
        updated_df: Frame with a ``kg_json`` column of JSON strings.

    Returns:
        ``(processed_df, error_stats)`` as produced by
        ``quick_process_updated_df``.
    """
    print("🚀 Starting KG JSON processing with UUID conversion...")
    print("🛡️  Enhanced error handling for missing names and malformed JSON")

    row_count = len(updated_df)

    # Show sample of original data (needs at least one row).
    if row_count:
        print(f"\n📝 Sample original kg_json (first 150 characters):")
        print(updated_df['kg_json'].iloc[0][:150] + "...")

    processed_df, error_stats = quick_process_updated_df(updated_df)

    if row_count:
        print(f"\n✅ Sample processed kg_json (first 150 characters):")
        print(processed_df['kg_json'].iloc[0][:150] + "...")

        # Validation inspects the first row only.
        validate_conversion(updated_df, processed_df)

    successful = row_count - error_stats['total_errors']
    success_rate = (successful / row_count * 100) if row_count else 100.0

    print(f"\n📊 Final Statistics:")
    print(f"   Original rows: {row_count}")
    print(f"   Processed rows: {len(processed_df)}")
    print(f"   Successful conversions: {successful}")
    print(f"   Rows with errors (kept original): {error_stats['total_errors']}")
    print(f"   Overall success rate: {success_rate:.1f}%")

    print(f"\n🎉 Processing complete! Use the returned dataframe.")
    print(f"💡 Rows with errors kept their original JSON format")

    return processed_df, error_stats

# Call this function with your dataframe:
# processed_df, error_stats = process_updated_df_complete(updated_df)

# Confirmation banner: running this cell only defines the functions above;
# nothing is processed until process_updated_df_complete() is called.
print("✅ Enhanced functions loaded! Now handles missing names and JSON errors.")
print("📞 Call: processed_df, error_stats = process_updated_df_complete(updated_df)")

✅ Enhanced functions loaded! Now handles missing names and JSON errors.
📞 Call: processed_df, error_stats = process_updated_df_complete(updated_df)


In [50]:
# Run the UUID conversion over the whole frame; `processed_df` feeds the
# extraction cells below and `error_stats` summarises any rows that failed.
processed_df, error_stats = process_updated_df_complete(updated_df)


INFO:__main__:Starting processing of 2187 rows...


🚀 Starting KG JSON processing with UUID conversion...
🛡️  Enhanced error handling for missing names and malformed JSON

📝 Sample original kg_json (first 150 characters):
[
  {
    "metadata": {
      "generated_at": "2024-11-02T10:00:00Z",
      "total_nodes": 3,
      "total_relationships": 1,
      "entity_types": [
...
📊 Processing dataframe with 2187 rows...
📋 Columns: ['chapter', 'chunk', 'chunk_order_number', 'author', 'book', 'kg_json']
Processed 200/2187 rows... (Errors so far: 0)
Processed 400/2187 rows... (Errors so far: 0)
Processed 600/2187 rows... (Errors so far: 0)
Processed 800/2187 rows... (Errors so far: 0)
Processed 1000/2187 rows... (Errors so far: 0)
Processed 1200/2187 rows... (Errors so far: 0)
Processed 1400/2187 rows... (Errors so far: 0)
Processed 1600/2187 rows... (Errors so far: 0)
Processed 1800/2187 rows... (Errors so far: 0)
Processed 2000/2187 rows... (Errors so far: 0)
Processing complete! Processed 2187 rows with 0 errors.
⏱️  Processing completed in 0.

In [54]:

# Spot-check: parse one converted row's kg_json back into Python objects for display.
json.loads(processed_df["kg_json"].iloc[1411])

[{'metadata': {'generated_at': '2025-06-01T16:39:21Z',
   'total_nodes': 6,
   'total_relationships': 7,
   'entity_types': ['Actor',
    'Book',
    'Author',
    'Chapter',
    'Chunk',
    'Intangible']},
  'nodes': [{'id': 'f1912760-8749-51e0-908a-70e6ce194ca4',
    'label': 'Author',
    'name': 'Mark Twain',
    'properties': {'nationality': 'American'},
    'timestamp': '2025-06-01T16:39:21Z'},
   {'id': 'f32bd123-42c0-5ba0-a943-797dc52d7e1a',
    'label': 'Book',
    'name': 'ADVENTURES OF HUCKLEBERRY FINN',
    'properties': {'genre': 'Adventure Fiction'},
    'timestamp': '2025-06-01T16:39:21Z'},
   {'id': 'a3df4c5c-9289-59f5-91c2-d52144590f46',
    'label': 'Chapter',
    'name': 'CHAPTER XXIX.',
    'properties': {},
    'timestamp': '2025-06-01T16:39:21Z'},
   {'id': '74e4287e-f952-5693-8bd0-db23bc6e4832',
    'label': 'Chunk',
    'text': '“He _can’t_ write with his left hand,” says the old gentleman. “If he\ncould use his right hand, you would see that he wrote his own l

In [52]:
# Confirm the converted frame still has the expected rows, columns and dtypes.
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2187 entries, 0 to 2186
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   chapter             2187 non-null   object
 1   chunk               2187 non-null   object
 2   chunk_order_number  2187 non-null   int64 
 3   author              2187 non-null   object
 4   book                2187 non-null   object
 5   kg_json             2187 non-null   object
dtypes: int64(1), object(5)
memory usage: 102.6+ KB


In [55]:
import pandas as pd
import json
import logging
from typing import Dict, List, Any, Tuple

# Set up logging for the extraction helpers in this cell.
# Note: logging.basicConfig() does nothing if the root logger already has
# handlers, so this is only effective when no earlier cell configured logging.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _node_record(row, chunk_idx, node):
    """Flatten one node and its source row into a single record dict."""
    record = {
        column: row[column]
        for column in ("chapter", "chunk", "chunk_order_number", "author", "book")
    }
    record.update(
        {
            # Node-specific columns
            "entity_type": "node",
            "chunk_index": chunk_idx,
            "node_id": node.get("id", ""),
            "node_label": node.get("label", ""),
            "node_name": node.get("name", ""),
            "node_description": node.get("description", ""),
            "node_properties": json.dumps(node.get("properties", {})),
            "node_timestamp": node.get("timestamp", ""),
            # Relationship columns are present but blank, so node rows share
            # a schema with relationship rows.
            "relationship_type": "",
            "start_node_id": "",
            "end_node_id": "",
            "relationship_weight": None,
            "relationship_properties": "{}",
            "relationship_timestamp": "",
        }
    )
    return record


def extract_nodes_from_kg_json(processed_df: pd.DataFrame) -> pd.DataFrame:
    """
    Explode the nodes stored in ``kg_json`` into one DataFrame row per node.

    Every output row repeats the source row's ``chapter``, ``chunk``,
    ``chunk_order_number``, ``author`` and ``book`` values and adds the node
    fields plus blank relationship fields. ``kg_json`` may hold a single
    chunk dict or a list of chunks; ``chunk_index`` records the position of
    the chunk within that list.

    Rows whose ``kg_json`` cannot be processed are logged, counted and
    skipped; nodes already collected from such a row are kept.

    Args:
        processed_df: Frame with the five source columns and ``kg_json``.

    Returns:
        A DataFrame with one row per extracted node (no columns at all when
        nothing was extracted).
    """
    print("📊 Extracting nodes from kg_json...")

    records = []
    failed_rows = 0

    for idx, row in processed_df.iterrows():
        try:
            parsed = json.loads(row["kg_json"])

            # A lone chunk dict is treated as a one-element list of chunks.
            chunk_list = [parsed] if isinstance(parsed, dict) else parsed

            for chunk_idx, chunk in enumerate(chunk_list):
                for node in chunk.get("nodes", []):
                    records.append(_node_record(row, chunk_idx, node))

        except Exception as e:
            logger.error(f"Row {idx}: Error extracting nodes - {e}")
            failed_rows += 1

    print(f"✅ Extracted {len(records)} nodes with {failed_rows} errors")
    return pd.DataFrame(records)


def _relationship_record(row, chunk_idx, rel):
    """Flatten one relationship and its source row into a single record dict."""
    record = {
        column: row[column]
        for column in ("chapter", "chunk", "chunk_order_number", "author", "book")
    }
    record.update(
        {
            # Relationship-specific columns
            "entity_type": "relationship",
            "chunk_index": chunk_idx,
            "relationship_type": rel.get("relationship_type", ""),
            "start_node_id": rel.get("start_id", ""),
            "end_node_id": rel.get("end_id", ""),
            "relationship_weight": rel.get("weight"),
            "relationship_properties": json.dumps(rel.get("properties", {})),
            "relationship_timestamp": rel.get("timestamp", ""),
            # Node columns are present but blank, so relationship rows share
            # a schema with node rows.
            "node_id": "",
            "node_label": "",
            "node_name": "",
            "node_description": "",
            "node_properties": "{}",
            "node_timestamp": "",
        }
    )
    return record


def extract_relationships_from_kg_json(processed_df: pd.DataFrame) -> pd.DataFrame:
    """
    Explode the relationships stored in ``kg_json`` into one row each.

    Every output row repeats the source row's ``chapter``, ``chunk``,
    ``chunk_order_number``, ``author`` and ``book`` values and adds the
    relationship fields plus blank node fields. ``kg_json`` may hold a
    single chunk dict or a list of chunks; ``chunk_index`` records the
    position of the chunk within that list.

    Rows whose ``kg_json`` cannot be processed are logged, counted and
    skipped; relationships already collected from such a row are kept.

    Args:
        processed_df: Frame with the five source columns and ``kg_json``.

    Returns:
        A DataFrame with one row per extracted relationship (no columns at
        all when nothing was extracted).
    """
    print("🔗 Extracting relationships from kg_json...")

    records = []
    failed_rows = 0

    for idx, row in processed_df.iterrows():
        try:
            parsed = json.loads(row["kg_json"])

            # A lone chunk dict is treated as a one-element list of chunks.
            chunk_list = [parsed] if isinstance(parsed, dict) else parsed

            for chunk_idx, chunk in enumerate(chunk_list):
                for rel in chunk.get("relationships", []):
                    records.append(_relationship_record(row, chunk_idx, rel))

        except Exception as e:
            logger.error(f"Row {idx}: Error extracting relationships - {e}")
            failed_rows += 1

    print(f"✅ Extracted {len(records)} relationships with {failed_rows} errors")
    return pd.DataFrame(records)


def extract_all_entities(processed_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build one DataFrame holding every node row followed by every
    relationship row.

    Nodes and relationships are extracted separately, then stacked with a
    fresh 0..n-1 index. The ``entity_type`` column distinguishes the two.

    Args:
        processed_df: Frame with the source columns and ``kg_json``.

    Returns:
        The concatenated DataFrame (nodes first, relationships second).
    """
    print("🎯 Extracting all entities (nodes + relationships) from kg_json...")

    node_frame = extract_nodes_from_kg_json(processed_df)
    relationship_frame = extract_relationships_from_kg_json(processed_df)

    stacked = pd.concat([node_frame, relationship_frame], ignore_index=True)

    print("🎉 Combined extraction complete!")
    print(f"   Total nodes: {len(node_frame)}")
    print(f"   Total relationships: {len(relationship_frame)}")
    print(f"   Total entities: {len(stacked)}")

    return stacked


def create_separate_dataframes(
    processed_df: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Create separate, trimmed DataFrames for nodes and relationships.

    Each frame keeps the shared source columns plus only the columns that
    are meaningful for its entity type (the blank "compatibility" columns
    added by the extractors are dropped).

    When an extractor finds nothing it returns a frame without any columns;
    in that case an empty frame carrying the expected columns is returned
    instead of failing on the column selection.

    Args:
        processed_df: Frame with the source columns and ``kg_json``.

    Returns:
        ``(nodes_df, relationships_df)``.
    """
    print("📋 Creating separate DataFrames for nodes and relationships...")

    nodes_df = extract_nodes_from_kg_json(processed_df)
    relationships_df = extract_relationships_from_kg_json(processed_df)

    shared_columns = [
        "chapter",
        "chunk",
        "chunk_order_number",
        "author",
        "book",
        "entity_type",
        "chunk_index",
    ]

    nodes_columns = shared_columns + [
        "node_id",
        "node_label",
        "node_name",
        "node_description",
        "node_properties",
        "node_timestamp",
    ]

    relationships_columns = shared_columns + [
        "relationship_type",
        "start_node_id",
        "end_node_id",
        "relationship_weight",
        "relationship_properties",
        "relationship_timestamp",
    ]

    def _select(frame: pd.DataFrame, columns: list) -> pd.DataFrame:
        # A frame built from zero records has no columns, so indexing it
        # with the expected names would raise KeyError.
        if len(frame) == 0:
            return pd.DataFrame(columns=columns)
        return frame[columns].copy()

    clean_nodes_df = _select(nodes_df, nodes_columns)
    clean_relationships_df = _select(relationships_df, relationships_columns)

    return clean_nodes_df, clean_relationships_df


def analyze_extraction_results(
    nodes_df: pd.DataFrame, relationships_df: pd.DataFrame, original_df: pd.DataFrame
):
    """
    Print a report on the extracted node and relationship frames.

    The report lists the row count and columns of each frame, the ten most
    frequent node labels and relationship types (only for non-empty
    frames), and a few fields of the first node and first relationship.

    Args:
        nodes_df: Extracted nodes; needs ``node_label``, ``chapter``,
            ``node_name`` and ``node_id`` when non-empty.
        relationships_df: Extracted relationships; needs
            ``relationship_type``, ``chapter``, ``start_node_id`` and
            ``end_node_id`` when non-empty.
        original_df: The frame the entities were extracted from.
    """
    has_nodes = len(nodes_df) > 0
    has_relationships = len(relationships_df) > 0

    print("\n📊 Extraction Analysis:")
    print("=" * 50)

    # Size and schema of each frame.
    for heading, frame in (
        ("Original DataFrame:", original_df),
        ("\nNodes DataFrame:", nodes_df),
        ("\nRelationships DataFrame:", relationships_df),
    ):
        print(heading)
        print(f"  Rows: {len(frame)}")
        print(f"  Columns: {list(frame.columns)}")

    # Ten most common values of the type column of each non-empty frame.
    if has_nodes:
        print("\nNode Label Distribution:")
        top_labels = nodes_df["node_label"].value_counts().head(10)
        for label, count in top_labels.items():
            print(f"  {label}: {count}")

    if has_relationships:
        print("\nRelationship Type Distribution:")
        top_types = relationships_df["relationship_type"].value_counts().head(10)
        for rel_type, count in top_types.items():
            print(f"  {rel_type}: {count}")

    # First row of each frame as a sample; the heading is printed even when
    # the frame is empty.
    print("\nSample Node Row:")
    if has_nodes:
        first_node = nodes_df.iloc[0]
        for col in ("chapter", "node_label", "node_name", "node_id"):
            print(f"  {col}: {first_node[col]}")

    print("\nSample Relationship Row:")
    if has_relationships:
        first_rel = relationships_df.iloc[0]
        for col in ("chapter", "relationship_type", "start_node_id", "end_node_id"):
            print(f"  {col}: {first_rel[col]}")


def get_unique_entities_summary(nodes_df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse node rows into one row per ``node_id`` and print the top ten.

    For each id the first seen label, name, description, book and author
    are kept, ``chapter`` becomes the list of distinct chapters the id
    occurs in (in no particular order), and ``appearance_count`` is the
    number of node rows carrying that id.

    Args:
        nodes_df: Node rows with ``node_id``, ``node_label``, ``node_name``,
            ``node_description``, ``chapter``, ``book`` and ``author``.

    Returns:
        One row per unique ``node_id``, sorted by ``appearance_count``
        descending.
    """
    print("\n🔍 Analyzing unique entities...")

    per_column_aggregation = {
        "node_label": "first",
        "node_name": "first",
        "node_description": "first",
        # Distinct chapters in which this entity appears.
        "chapter": lambda x: list(set(x)),
        "book": "first",
        "author": "first",
    }
    grouped = nodes_df.groupby("node_id").agg(per_column_aggregation)
    unique_entities = grouped.reset_index()

    # Number of node rows per id, attached by lookup on node_id.
    rows_per_id = nodes_df["node_id"].value_counts()
    unique_entities["appearance_count"] = unique_entities["node_id"].map(rows_per_id)

    unique_entities = unique_entities.sort_values("appearance_count", ascending=False)

    print(f"✅ Found {len(unique_entities)} unique entities")
    print(f"📈 Top entities by appearance:")

    for _, entity in unique_entities.head(10).iterrows():
        chapter_value = entity["chapter"]
        chapters = chapter_value if isinstance(chapter_value, list) else [chapter_value]

        # Entities confined to one chapter show its name; others show a count.
        if len(chapters) > 1:
            chapter_str = f"({len(chapters)} chapters)"
        else:
            chapter_str = f"Chapter: {chapters[0]}"

        print(
            f"  {entity['node_label']}: {entity['node_name']} - {entity['appearance_count']} times {chapter_str}"
        )

    return unique_entities


# MAIN EXECUTION FUNCTIONS
def extract_entities_combined(processed_df: pd.DataFrame) -> pd.DataFrame:
    """
    Main function: extract all entities into a single DataFrame and print a
    short summary.

    The summary shows input rows, output rows and their ratio. For an empty
    input the ratio is reported as "n/a" instead of dividing by zero.

    Args:
        processed_df: Frame with the source columns and ``kg_json``.

    Returns:
        The combined nodes + relationships DataFrame produced by
        ``extract_all_entities``.
    """
    print("🚀 Starting entity extraction (combined)...")

    combined_df = extract_all_entities(processed_df)

    input_rows = len(processed_df)
    output_rows = len(combined_df)

    print(f"\n📊 Extraction Summary:")
    print(f"   Input rows: {input_rows}")
    print(f"   Output entity rows: {output_rows}")
    if input_rows:
        print(f"   Expansion ratio: {output_rows/input_rows:.1f}x")
    else:
        # No input rows: the ratio is undefined.
        print("   Expansion ratio: n/a")

    return combined_df


def extract_entities_separate(
    processed_df: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Main function: extract entities into separate DataFrames.

    Builds the trimmed node and relationship frames, derives the
    unique-entity summary from the nodes, then prints the extraction report.

    Args:
        processed_df: Frame with the source columns and ``kg_json``.

    Returns:
        ``(nodes_df, relationships_df, unique_entities_df)``.
    """
    print("🚀 Starting entity extraction (separate DataFrames)...")

    node_frame, relationship_frame = create_separate_dataframes(processed_df)
    entity_summary = get_unique_entities_summary(node_frame)

    # Report last, so it follows the unique-entity listing in the output.
    analyze_extraction_results(node_frame, relationship_frame, processed_df)

    return node_frame, relationship_frame, entity_summary


# Usage examples: this cell only defines the extraction helpers; the banner
# below reminds the reader of the two entry points to call next.
print("✅ Entity extraction functions loaded!")
print("\n📞 Usage options:")
print("1. Combined: entities_df = extract_entities_combined(processed_df)")
print(
    "2. Separate: nodes_df, relationships_df, unique_entities_df = extract_entities_separate(processed_df)"
)


✅ Entity extraction functions loaded!

📞 Usage options:
1. Combined: entities_df = extract_entities_combined(processed_df)
2. Separate: nodes_df, relationships_df, unique_entities_df = extract_entities_separate(processed_df)


In [57]:
# Prerequisite: `processed_df` must already exist, i.e. the UUID conversion has run:
# processed_df, error_stats = process_updated_df_complete(updated_df)

# Extract entities (choose one approach):

# Option 1: Everything together - nodes and relationships in one frame,
# distinguished by the `entity_type` column.
entities_df = extract_entities_combined(processed_df)

# Option 2: Separate analysis
# nodes_df, relationships_df, unique_entities_df = extract_entities_separate(processed_df)

# Now you have individual rows for each entity with full metadata!


🚀 Starting entity extraction (combined)...
🎯 Extracting all entities (nodes + relationships) from kg_json...
📊 Extracting nodes from kg_json...
✅ Extracted 16090 nodes with 0 errors
🔗 Extracting relationships from kg_json...
✅ Extracted 18014 relationships with 0 errors
🎉 Combined extraction complete!
   Total nodes: 16090
   Total relationships: 18014
   Total entities: 34104

📊 Extraction Summary:
   Input rows: 2187
   Output entity rows: 34104
   Expansion ratio: 15.6x


  combined_df = pd.concat([nodes_df, relationships_df], ignore_index=True)


In [58]:
# Display the combined entity frame (node rows first, then relationship rows).
entities_df

Unnamed: 0,chapter,chunk,chunk_order_number,author,book,entity_type,chunk_index,node_id,node_label,node_name,node_description,node_properties,node_timestamp,relationship_type,start_node_id,end_node_id,relationship_weight,relationship_properties,relationship_timestamp
0,Preamble,ADVENTURES\nOF\nHUCKLEBERRY FINN,1,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,node,0,f32bd123-42c0-5ba0-a943-797dc52d7e1a,Book,ADVENTURES OF HUCKLEBERRY FINN,Classic American novel by Mark Twain,{},2025-06-01T16:36:15Z,,,,,{},
1,Preamble,ADVENTURES\nOF\nHUCKLEBERRY FINN,1,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,node,0,f1912760-8749-51e0-908a-70e6ce194ca4,Author,Mark Twain,American author,{},2025-06-01T16:36:15Z,,,,,{},
2,Preamble,ADVENTURES\nOF\nHUCKLEBERRY FINN,1,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,node,0,1bcd403f-f184-58af-a991-abee627ceca8,Chapter,Preamble,The Preamble of the book.,{},2025-06-01T16:36:15Z,,,,,{},
3,Preamble,(Tom Sawyer’s Comrade),2,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,node,0,f32bd123-42c0-5ba0-a943-797dc52d7e1a,Book,ADVENTURES OF HUCKLEBERRY FINN,Classic American novel,"{""author"": ""Mark Twain""}",2025-06-01T16:36:15Z,,,,,{},
4,Preamble,(Tom Sawyer’s Comrade),2,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,node,0,f1912760-8749-51e0-908a-70e6ce194ca4,Author,Mark Twain,Author of ADVENTURES OF HUCKLEBERRY FINN,{},2025-06-01T16:36:15Z,,,,,{},
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34099,CHAPTER XLII.,"Tom’s most well now, and got his bullet around...",2186,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,relationship,0,,,,,{},,CONVEYS,c4242b73-9976-53ae-8fb8-da9b6de337f2,99c5329c-d511-53f1-af10-bb903c7649bb,,{},
34100,CHAPTER XLII.,"Tom’s most well now, and got his bullet around...",2186,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,relationship,0,,,,,{},,CONTAINS_DIALOGUE_BY,c4242b73-9976-53ae-8fb8-da9b6de337f2,51a17aa9-af7b-5ec4-896c-4ee7ce8eb1c0,,{},
34101,CHAPTER XLII.,"Tom’s most well now, and got his bullet around...",2186,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,relationship,0,,,,,{},,CONTAINS,f32bd123-42c0-5ba0-a943-797dc52d7e1a,b518aab7-c3e5-5a46-864b-ba4178ccd28b,,{},
34102,CHAPTER XLII.,"THE END. YOURS TRULY, _HUCK FINN_.",2187,Mark Twain,ADVENTURES OF HUCKLEBERRY FINN,relationship,0,,,,,{},,WRITTEN_BY,f32bd123-42c0-5ba0-a943-797dc52d7e1a,f1912760-8749-51e0-908a-70e6ce194ca4,1.0,{},2024-11-02T16:40:58Z


In [56]:
# Prerequisite: `processed_df` must already exist, i.e. the UUID conversion has run:
# processed_df, error_stats = process_updated_df_complete(updated_df)
# NOTE(review): the execution counts show this cell was run before the
# combined-extraction cell that appears above it; re-run top to bottom to confirm.

# Extract entities (choose one approach):

# Option 1: Everything together
# entities_df = extract_entities_combined(processed_df)

# Option 2: Separate analysis - trimmed node/relationship frames plus a
# per-entity summary.
nodes_df, relationships_df, unique_entities_df = extract_entities_separate(processed_df)

# Now you have individual rows for each entity with full metadata!


🚀 Starting entity extraction (separate DataFrames)...
📋 Creating separate DataFrames for nodes and relationships...
📊 Extracting nodes from kg_json...
✅ Extracted 16090 nodes with 0 errors
🔗 Extracting relationships from kg_json...
✅ Extracted 18014 relationships with 0 errors

🔍 Analyzing unique entities...
✅ Found 5305 unique entities
📈 Top entities by appearance:
  Book: ADVENTURES OF HUCKLEBERRY FINN - 2185 times (43 chapters)
  Author: Mark Twain - 2131 times (43 chapters)
  Actor: I - 327 times (40 chapters)
  Actor: Jim - 271 times (32 chapters)
  Actor: Tom - 122 times (17 chapters)
  Chapter: CHAPTER XXVI. - 100 times Chapter: CHAPTER XXVI.
  Actor: Narrator - 95 times (36 chapters)
  Chapter: CHAPTER XXVIII. - 95 times Chapter: CHAPTER XXVIII.
  Chapter: CHAPTER XXIX. - 89 times Chapter: CHAPTER XXIX.
  Chapter: CHAPTER XLII. - 86 times Chapter: CHAPTER XLII.

📊 Extraction Analysis:
Original DataFrame:
  Rows: 2187
  Columns: ['chapter', 'chunk', 'chunk_order_number', 'author

In [62]:
import pandas as pd
import json
import csv
import os
from typing import Dict, List, Any, Tuple
import re

def clean_for_csv(value: str) -> str:
    """Turn a single value into a one-line string that is safe to place in a CSV cell.

    Missing scalars (``None``, ``NaN``, ``pd.NA``, ``NaT``) become ``""``. Any other
    value is converted with ``str()`` and then:

    * each line feed / carriage return is replaced by a backslash followed by
      ``n`` / ``r`` (two characters);
    * each double quote is prefixed with a backslash;
    * each tab is replaced by one space;
    * a result longer than 1000 characters is cut to 997 characters plus ``"..."``.

    Args:
        value: The value to clean. Annotated as ``str``, but any object is accepted;
            non-scalar objects (lists, dicts, ...) are simply stringified.

    Returns:
        The cleaned string, at most 1000 characters long.
    """
    # pd.isna() returns an element-wise array for list-like input, and using that
    # array as an `if` condition raises ValueError. Only scalars can be "missing".
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ""

    text = str(value)
    text = text.replace('\n', '\\n').replace('\r', '\\r')
    text = text.replace('"', '\\"')
    text = text.replace('\t', ' ')

    # Cap the length after escaping so the stored cell never exceeds 1000 characters.
    if len(text) > 1000:
        text = text[:997] + "..."

    return text

def format_properties_for_memgraph(properties_json: str, additional_props: Dict[str, Any] = None) -> str:
    """Merge stored and extra properties and serialise them as one JSON object string.

    Args:
        properties_json: Properties stored on the entity, normally a JSON object
            string. An already-decoded ``dict`` is accepted as well. Anything else
            (``None``, ``NaN``, blank strings, ``'{}'``) and payloads that do not
            parse into a JSON object are treated as "no stored properties".
        additional_props: Extra key/value pairs merged on top of the stored ones;
            they win on key collisions.

    Returns:
        A JSON object string. Every key character outside ``[a-zA-Z0-9_]`` is
        replaced by ``_``. ``None`` values, empty strings and non-finite floats are
        dropped; booleans and numbers are kept as they are; every other value is
        stored as ``clean_for_csv(str(value))``. ``"{}"`` is returned when no
        property is left.
    """
    import math  # function-scope import: only needed for the finiteness check below

    props = {}
    if isinstance(properties_json, dict):
        props = dict(properties_json)  # copy so the caller's dict is never mutated
    elif isinstance(properties_json, str) and properties_json.strip() not in ("", "{}"):
        # Parse in isolation: a malformed payload must not discard additional_props.
        try:
            parsed = json.loads(properties_json)
        except ValueError as e:
            print(f"Warning: Error formatting properties: {e}")
        else:
            if isinstance(parsed, dict):
                props = parsed
            else:
                print(
                    "Warning: Error formatting properties: "
                    f"expected a JSON object, got {type(parsed).__name__}"
                )

    if additional_props:
        props.update(additional_props)

    cleaned_props = {}
    for key, value in props.items():
        if value is None or (isinstance(value, str) and value == ""):
            continue

        clean_key = re.sub(r'[^a-zA-Z0-9_]', '_', str(key))

        # bool is a subclass of int, so it has to be tested first to be reachable.
        if isinstance(value, bool):
            cleaned_props[clean_key] = value
        elif isinstance(value, (int, float)):
            # json.dumps would emit NaN/Infinity as bare tokens, which are not valid JSON.
            if isinstance(value, float) and not math.isfinite(value):
                continue
            cleaned_props[clean_key] = value
        else:
            cleaned_props[clean_key] = clean_for_csv(str(value))

    return json.dumps(cleaned_props) if cleaned_props else "{}"

def prepare_nodes_for_memgraph(entities_df: pd.DataFrame) -> pd.DataFrame:
    """Build the node table (``id``, ``labels``, ``properties``) for the CSV export.

    Rows with ``entity_type == 'node'`` are selected and de-duplicated on
    ``node_id`` (first occurrence wins). For each remaining row the stored
    ``node_properties`` are merged with the row's name, description, chapter,
    chunk, chunk order number, author, book, chunk index and timestamp, and
    serialised by ``format_properties_for_memgraph``.

    Args:
        entities_df: Combined entity frame containing the columns read below.

    Returns:
        A DataFrame with columns ``id``, ``labels`` and ``properties`` (one row per
        unique node id), or an empty DataFrame when there are no node rows.
    """
    print("📊 Preparing nodes for Memgraph import...")

    nodes_df = entities_df[entities_df['entity_type'] == 'node']

    if len(nodes_df) == 0:
        print("⚠️  No nodes found in entities_df")
        return pd.DataFrame()

    # Drop duplicate ids *before* the per-row formatting so that properties are
    # only built for rows that are actually exported.
    initial_count = len(nodes_df)
    nodes_df = nodes_df.drop_duplicates(subset=['node_id'], keep='first')
    final_count = len(nodes_df)

    def _text(raw):
        """Return str(raw), or None for a missing scalar (None values are dropped by the formatter)."""
        if raw is None or (pd.api.types.is_scalar(raw) and pd.isna(raw)):
            return None
        return str(raw)

    properties_list = []
    for _, row in nodes_df.iterrows():
        # Text values are handed over raw: format_properties_for_memgraph already
        # runs clean_for_csv on every string, and cleaning here as well would
        # escape quotes twice.
        additional_props = {
            'name': _text(row['node_name']),
            'description': _text(row['node_description']),
            'chapter': _text(row['chapter']),
            'chunk': _text(row['chunk']),
            'chunk_order_number': row['chunk_order_number'],
            'author': _text(row['author']),
            'book': _text(row['book']),
            'chunk_index': row.get('chunk_index', 0),
            'timestamp': _text(row['node_timestamp'])
        }
        properties_list.append(
            format_properties_for_memgraph(row['node_properties'], additional_props)
        )

    memgraph_nodes = pd.DataFrame({
        'id': nodes_df['node_id'],
        'labels': nodes_df['node_label'],
        'properties': properties_list,
    })

    if initial_count > final_count:
        print(f"🔍 Removed {initial_count - final_count} duplicate nodes")

    print(f"✅ Prepared {len(memgraph_nodes)} unique nodes")
    return memgraph_nodes

def prepare_relationships_for_memgraph(entities_df: pd.DataFrame) -> pd.DataFrame:
    """Build the relationship table (``start_id``, ``end_id``, ``type``, ``properties``).

    Rows with ``entity_type == 'relationship'`` are selected; rows whose
    ``start_node_id`` or ``end_node_id`` is missing or an empty string are removed.
    For each remaining row the stored ``relationship_properties`` are merged with
    the weight (``1.0`` when missing), chapter, chunk, chunk order number, author,
    book, chunk index and timestamp, and serialised by
    ``format_properties_for_memgraph``.

    Args:
        entities_df: Combined entity frame containing the columns read below.

    Returns:
        A DataFrame with columns ``start_id``, ``end_id``, ``type`` and
        ``properties``, or an empty DataFrame when there are no relationship rows.
    """
    print("🔗 Preparing relationships for Memgraph import...")

    rels_df = entities_df[entities_df['entity_type'] == 'relationship']

    if len(rels_df) == 0:
        print("⚠️  No relationships found in entities_df")
        return pd.DataFrame()

    # Discard relationships without usable endpoints *before* formatting, so no
    # properties are built for rows that would be thrown away anyway.
    initial_count = len(rels_df)
    rels_df = rels_df.dropna(subset=['start_node_id', 'end_node_id'])
    rels_df = rels_df[(rels_df['start_node_id'] != '') & (rels_df['end_node_id'] != '')]
    final_count = len(rels_df)

    def _text(raw):
        """Return str(raw), or None for a missing scalar (None values are dropped by the formatter)."""
        if raw is None or (pd.api.types.is_scalar(raw) and pd.isna(raw)):
            return None
        return str(raw)

    properties_list = []
    for _, row in rels_df.iterrows():
        # Text values are handed over raw: format_properties_for_memgraph already
        # runs clean_for_csv on every string, and cleaning here as well would
        # escape quotes twice.
        additional_props = {
            'weight': row['relationship_weight'] if pd.notna(row['relationship_weight']) else 1.0,
            'chapter': _text(row['chapter']),
            'chunk': _text(row['chunk']),
            'chunk_order_number': row['chunk_order_number'],
            'author': _text(row['author']),
            'book': _text(row['book']),
            'chunk_index': row.get('chunk_index', 0),
            'timestamp': _text(row['relationship_timestamp'])
        }
        properties_list.append(
            format_properties_for_memgraph(row['relationship_properties'], additional_props)
        )

    memgraph_rels = pd.DataFrame({
        'start_id': rels_df['start_node_id'],
        'end_id': rels_df['end_node_id'],
        'type': rels_df['relationship_type'],
        'properties': properties_list,
    })

    if initial_count > final_count:
        print(f"🔍 Removed {initial_count - final_count} relationships with missing node references")

    print(f"✅ Prepared {len(memgraph_rels)} relationships")
    return memgraph_rels

def export_to_csv(nodes_df: pd.DataFrame, relationships_df: pd.DataFrame, output_dir: str = "memgraph_import") -> Tuple[str, str]:
    """Write the node and relationship frames to ``nodes.csv`` / ``relationships.csv``.

    The output directory is created when needed. A frame with zero rows is not
    written (a notice is printed instead), but its target path is still returned.

    Args:
        nodes_df: Node table to export.
        relationships_df: Relationship table to export.
        output_dir: Directory that receives both CSV files.

    Returns:
        ``(nodes_path, relationships_path)`` inside ``output_dir``.
    """
    def _write_quoted(frame: pd.DataFrame, path: str) -> None:
        # Quote every field; the index is not part of the import format.
        frame.to_csv(path, index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')

    print(f"💾 Exporting to CSV files in '{output_dir}' directory...")
    os.makedirs(output_dir, exist_ok=True)

    nodes_file, relationships_file = (
        os.path.join(output_dir, filename)
        for filename in ("nodes.csv", "relationships.csv")
    )

    node_rows = len(nodes_df)
    if node_rows == 0:
        print("⚠️  No nodes to export")
    else:
        _write_quoted(nodes_df, nodes_file)
        print(f"✅ Exported {node_rows} nodes to {nodes_file}")

    relationship_rows = len(relationships_df)
    if relationship_rows == 0:
        print("⚠️  No relationships to export")
    else:
        _write_quoted(relationships_df, relationships_file)
        print(f"✅ Exported {relationship_rows} relationships to {relationships_file}")

    return nodes_file, relationships_file

def generate_cypher_import_script(nodes_file: str, relationships_file: str, output_dir: str = "memgraph_import") -> str:
    """Write two Cypher import scripts into ``output_dir`` and return the first one's path.

    Two files are produced:

    * ``import_script.cypher`` – a batch script that creates labels and relationship
      types dynamically;
    * ``import_script_manual.cypher`` – one statement per known label / relationship
      type, meant to be executed step by step.

    Args:
        nodes_file: Path of the exported node CSV. Currently unused: the scripts
            always reference ``file:///nodes.csv``.
        relationships_file: Path of the exported relationship CSV. Currently unused:
            the scripts always reference ``file:///relationships.csv``.
        output_dir: Directory that receives both scripts; created when missing.

    Returns:
        Path of ``import_script.cypher`` (the manual script's path is not returned).

    NOTE(review): the script text relies on ``LOAD CSV WITH HEADERS FROM``,
    ``CALL {...} IN TRANSACTIONS``, ``json.loads(...)`` and ``query_module.call(...)``.
    Verify that the target Memgraph version supports each of these before relying on
    the generated files.
    """
    print("📝 Generating Memgraph Cypher import script...")

    # The relationship verification query uses a directed pattern: an undirected
    # `()-[r]-()` match returns every relationship once per direction and would
    # therefore report doubled counts.
    script_content = f"""// Memgraph Import Script
// Generated from entities_df conversion
// Compatible with Memgraph (no APOC dependencies)

// Clear existing data (CAREFUL!)
// MATCH (n) DETACH DELETE n;

// Create index for faster lookups
CREATE INDEX ON :Author(id);
CREATE INDEX ON :Book(id);
CREATE INDEX ON :Chapter(id);
CREATE INDEX ON :Chunk(id);
CREATE INDEX ON :Actor(id);
CREATE INDEX ON :Object(id);

// Import Nodes
LOAD CSV WITH HEADERS FROM "file:///nodes.csv" AS row
CALL {{
  WITH row
  WITH row.id as nodeId, row.labels as nodeLabel, row.properties as propsJson
  WITH nodeId, nodeLabel, 
       CASE 
         WHEN propsJson IS NULL OR propsJson = '' OR propsJson = '{{}}' 
         THEN {{}} 
         ELSE json.loads(propsJson) 
       END as props
  // Create node with dynamic label
  CALL {{ 
    WITH nodeId, nodeLabel, props
    WITH "CREATE (n:" + nodeLabel + " {{id: $nodeId}}) SET n += $props RETURN n" as query
    CALL query_module.call(query, {{nodeId: nodeId, props: props}}) YIELD result
    RETURN result
  }}
}} IN TRANSACTIONS OF 1000 ROWS;

// Import Relationships  
LOAD CSV WITH HEADERS FROM "file:///relationships.csv" AS row
CALL {{
  WITH row
  WITH row.start_id as startId, row.end_id as endId, row.type as relType, row.properties as propsJson
  WITH startId, endId, relType,
       CASE 
         WHEN propsJson IS NULL OR propsJson = '' OR propsJson = '{{}}' 
         THEN {{}} 
         ELSE json.loads(propsJson) 
       END as props
  MATCH (start {{id: startId}})
  MATCH (end {{id: endId}})
  // Create relationship with dynamic type
  CALL {{
    WITH start, end, relType, props
    WITH "CREATE (start)-[r:" + relType + "]->(end) SET r += $props RETURN r" as query
    CALL query_module.call(query, {{props: props}}) YIELD result
    RETURN result
  }}
}} IN TRANSACTIONS OF 1000 ROWS;

// Verification queries
MATCH (n) RETURN labels(n) as label, count(n) as count ORDER BY count DESC;
MATCH ()-[r]->() RETURN type(r) as relationship_type, count(r) as count ORDER BY count DESC;
"""

    # Alternative simpler script for manual execution
    simple_script = f"""// Simple Memgraph Import Script
// Execute these queries one by one in Memgraph Lab

// Clear existing data (CAREFUL!)
// MATCH (n) DETACH DELETE n;

// Create indexes for performance
CREATE INDEX ON :Author(id);
CREATE INDEX ON :Book(id); 
CREATE INDEX ON :Chapter(id);
CREATE INDEX ON :Chunk(id);
CREATE INDEX ON :Actor(id);
CREATE INDEX ON :Object(id);

// Import Authors
LOAD CSV WITH HEADERS FROM "file:///nodes.csv" AS row
WHERE row.labels = 'Author'
WITH row, json.loads(row.properties) as props
CREATE (n:Author {{id: row.id}})
SET n += props;

// Import Books
LOAD CSV WITH HEADERS FROM "file:///nodes.csv" AS row
WHERE row.labels = 'Book'
WITH row, json.loads(row.properties) as props
CREATE (n:Book {{id: row.id}})
SET n += props;

// Import Chapters
LOAD CSV WITH HEADERS FROM "file:///nodes.csv" AS row
WHERE row.labels = 'Chapter'
WITH row, json.loads(row.properties) as props
CREATE (n:Chapter {{id: row.id}})
SET n += props;

// Import Chunks
LOAD CSV WITH HEADERS FROM "file:///nodes.csv" AS row
WHERE row.labels = 'Chunk'
WITH row, json.loads(row.properties) as props
CREATE (n:Chunk {{id: row.id}})
SET n += props;

// Import Actors
LOAD CSV WITH HEADERS FROM "file:///nodes.csv" AS row
WHERE row.labels = 'Actor'
WITH row, json.loads(row.properties) as props
CREATE (n:Actor {{id: row.id}})
SET n += props;

// Import Objects
LOAD CSV WITH HEADERS FROM "file:///nodes.csv" AS row
WHERE row.labels = 'Object'
WITH row, json.loads(row.properties) as props
CREATE (n:Object {{id: row.id}})
SET n += props;

// Import other node types (add as needed)
LOAD CSV WITH HEADERS FROM "file:///nodes.csv" AS row
WHERE row.labels NOT IN ['Author', 'Book', 'Chapter', 'Chunk', 'Actor', 'Object']
WITH row, json.loads(row.properties) as props, row.labels as label
CALL {{
  WITH row, props, label
  WITH "CREATE (n:" + label + " {{id: $id}}) SET n += $props" as query
  CALL query_module.call(query, {{id: row.id, props: props}}) YIELD result
  RETURN result
}};

// Import WRITTEN_BY relationships
LOAD CSV WITH HEADERS FROM "file:///relationships.csv" AS row
WHERE row.type = 'WRITTEN_BY'
WITH row, json.loads(row.properties) as props
MATCH (start {{id: row.start_id}})
MATCH (end {{id: row.end_id}})
CREATE (start)-[r:WRITTEN_BY]->(end)
SET r += props;

// Import PART_OF relationships
LOAD CSV WITH HEADERS FROM "file:///relationships.csv" AS row
WHERE row.type = 'PART_OF'
WITH row, json.loads(row.properties) as props
MATCH (start {{id: row.start_id}})
MATCH (end {{id: row.end_id}})
CREATE (start)-[r:PART_OF]->(end)
SET r += props;

// Import MENTIONS relationships
LOAD CSV WITH HEADERS FROM "file:///relationships.csv" AS row
WHERE row.type = 'MENTIONS'
WITH row, json.loads(row.properties) as props
MATCH (start {{id: row.start_id}})
MATCH (end {{id: row.end_id}})
CREATE (start)-[r:MENTIONS]->(end)
SET r += props;

// Import REFERENCES relationships
LOAD CSV WITH HEADERS FROM "file:///relationships.csv" AS row
WHERE row.type = 'REFERENCES'
WITH row, json.loads(row.properties) as props
MATCH (start {{id: row.start_id}})
MATCH (end {{id: row.end_id}})
CREATE (start)-[r:REFERENCES]->(end)
SET r += props;

// Import other relationship types
LOAD CSV WITH HEADERS FROM "file:///relationships.csv" AS row
WHERE row.type NOT IN ['WRITTEN_BY', 'PART_OF', 'MENTIONS', 'REFERENCES']
WITH row, json.loads(row.properties) as props, row.type as relType
MATCH (start {{id: row.start_id}})
MATCH (end {{id: row.end_id}})
CALL {{
  WITH start, end, props, relType
  WITH "CREATE (start)-[r:" + relType + "]->(end) SET r += $props" as query
  CALL query_module.call(query, {{props: props}}) YIELD result
  RETURN result
}};

// Verification and sample queries
MATCH (n) RETURN labels(n) as label, count(n) as count ORDER BY count DESC;
MATCH ()-[r]->() RETURN type(r) as relationship_type, count(r) as count ORDER BY count DESC;
MATCH (a:Author) RETURN a.name, a.book LIMIT 5;
MATCH (b:Book)-[r:WRITTEN_BY]->(a:Author) RETURN b.name, a.name LIMIT 5;
MATCH (c:Chapter)-[r:PART_OF]->(b:Book) RETURN c.name, b.name LIMIT 10;
"""

    # Make the function usable on its own: without this, open() fails when the
    # directory has not been created by an earlier export step.
    os.makedirs(output_dir, exist_ok=True)

    script_file = os.path.join(output_dir, "import_script.cypher")
    simple_script_file = os.path.join(output_dir, "import_script_manual.cypher")

    with open(script_file, 'w', encoding='utf-8') as f:
        f.write(script_content)

    with open(simple_script_file, 'w', encoding='utf-8') as f:
        f.write(simple_script)

    print(f"✅ Generated Memgraph import script: {script_file}")
    print(f"✅ Generated manual execution script: {simple_script_file}")

    return script_file

def analyze_memgraph_data(nodes_df: pd.DataFrame, relationships_df: pd.DataFrame):
    """Print a short summary of the prepared node and relationship tables.

    For each non-empty frame the total row count is printed, followed by the ten
    most frequent values of its ``labels`` (nodes) or ``type`` (relationships)
    column. Nothing is returned.
    """
    def _top_ten(frame: pd.DataFrame, column: str):
        # value_counts() is already sorted by descending frequency.
        return frame[column].value_counts().head(10).items()

    print("\n📊 Memgraph Import Analysis:", "=" * 50, sep="\n")

    node_total = len(nodes_df)
    if node_total > 0:
        print(
            "Nodes Summary:",
            f"  Total nodes: {node_total}",
            "  Node labels distribution:",
            sep="\n",
        )
        for node_label, occurrences in _top_ten(nodes_df, 'labels'):
            print(f"    {node_label}: {occurrences}")

    relationship_total = len(relationships_df)
    if relationship_total > 0:
        print(
            "\nRelationships Summary:",
            f"  Total relationships: {relationship_total}",
            "  Relationship types distribution:",
            sep="\n",
        )
        for relationship_type, occurrences in _top_ten(relationships_df, 'type'):
            print(f"    {relationship_type}: {occurrences}")

def convert_entities_to_memgraph(entities_df: pd.DataFrame, output_dir: str = "memgraph_import") -> Dict[str, str]:
    """
    Main function to convert entities_df to Memgraph import format.

    Runs the full CSV pipeline: prepare the node and relationship tables, print
    an analysis of both, export them as CSV and generate the Cypher import scripts.

    Args:
        entities_df: Combined entity frame (node and relationship rows).
        output_dir: Directory that receives the CSV files and the scripts.

    Returns:
        Dict with the keys ``nodes_file``, ``relationships_file``, ``script_file``
        and ``output_dir``.
    """
    print("🚀 Converting entities_df to Memgraph import format...")

    nodes_df = prepare_nodes_for_memgraph(entities_df)
    relationships_df = prepare_relationships_for_memgraph(entities_df)

    analyze_memgraph_data(nodes_df, relationships_df)

    nodes_file, relationships_file = export_to_csv(nodes_df, relationships_df, output_dir)
    script_file = generate_cypher_import_script(nodes_file, relationships_file, output_dir)

    print("\n🎉 Conversion complete!")
    print(f"📁 Files created in '{output_dir}':")
    print(f"   - nodes.csv: {len(nodes_df)} nodes")
    print(f"   - relationships.csv: {len(relationships_df)} relationships")
    print("   - import_script.cypher: Cypher import commands")
    # The second script is written as import_script_manual.cypher; the summary
    # previously advertised a file name that is never created.
    print("   - import_script_manual.cypher: Simple version without APOC")

    return {
        'nodes_file': nodes_file,
        'relationships_file': relationships_file,
        'script_file': script_file,
        'output_dir': output_dir
    }

# MAIN EXECUTION FUNCTION
def convert_to_memgraph_complete(entities_df, output_dir: str = "memgraph_import"):
    """
    Complete conversion function - call this with your entities_df!

    Prints a summary of the input, runs ``convert_entities_to_memgraph`` and then
    prints step-by-step import instructions.

    Args:
        entities_df: Combined entity frame with at least the columns
            ``entity_type``, ``book`` and ``author``.
        output_dir: Directory that receives the generated files (defaults to the
            previously hard-coded ``"memgraph_import"``).

    Returns:
        The dict returned by ``convert_entities_to_memgraph``.
    """
    print("🚀 Starting complete Memgraph conversion...")

    # Count both entity kinds from the column directly instead of materialising
    # two filtered copies of the frame.
    entity_types = entities_df['entity_type']
    node_count = int((entity_types == 'node').sum())
    relationship_count = int((entity_types == 'relationship').sum())

    # Show input summary
    print("\n📊 Input Summary:")
    print(f"   Total entities: {len(entities_df)}")
    print(f"   Nodes: {node_count}")
    print(f"   Relationships: {relationship_count}")
    print(f"   Books: {entities_df['book'].nunique()}")
    print(f"   Authors: {entities_df['author'].nunique()}")

    # Convert to Memgraph format
    memgraph_files = convert_entities_to_memgraph(entities_df, output_dir)

    # Show instructions. The relationship count uses a directed pattern because an
    # undirected `()-[r]-()` match reports every relationship twice.
    print("""
🎯 MEMGRAPH IMPORT INSTRUCTIONS:

1. Copy files to Memgraph import directory:
   - Usually: /var/lib/memgraph/import/ (Linux)
   - Or configure import path in Memgraph

2. Open Memgraph Lab (usually http://localhost:3000)

3. Upload or copy the import_script.cypher content

4. Run the script to import your knowledge graph

5. Verify the import:
   MATCH (n) RETURN count(n);
   MATCH ()-[r]->() RETURN count(r);

6. Explore your data:
   MATCH (a:Author) RETURN a.name, a.book LIMIT 10;
   MATCH (b:Book)-[r:WRITTEN_BY]->(a:Author) RETURN b.name, a.name;
   MATCH (c:Chapter)-[r:PART_OF]->(b:Book) RETURN c.name, b.name LIMIT 10;
""")

    return memgraph_files

# Usage hint shown once the conversion helpers above have been defined
print(
    "✅ Complete Memgraph conversion functions loaded!",
    "📞 Call: memgraph_files = convert_to_memgraph_complete(entities_df)",
    sep="\n",
)

✅ Complete Memgraph conversion functions loaded!
📞 Call: memgraph_files = convert_to_memgraph_complete(entities_df)


In [63]:
# Run the CSV/Cypher conversion defined in the previous cell.
# NOTE(review): in the visible cells `entities_df` is only assigned by a
# commented-out line ("Option 1" of the extraction cell) — confirm it exists in
# the kernel before running this cell.
memgraph_files = convert_to_memgraph_complete(entities_df)


🚀 Starting complete Memgraph conversion...

📊 Input Summary:
   Total entities: 34104
   Nodes: 16090
   Relationships: 18014
   Books: 1
   Authors: 1
🚀 Converting entities_df to Memgraph import format...
📊 Preparing nodes for Memgraph import...
🔍 Removed 10785 duplicate nodes
✅ Prepared 5305 unique nodes
🔗 Preparing relationships for Memgraph import...
✅ Prepared 18014 relationships

📊 Memgraph Import Analysis:
Nodes Summary:
  Total nodes: 5305
  Node labels distribution:
    Chunk: 2105
    Object: 909
    Actor: 818
    Intangible: 648
    Event: 448
    Location: 328
    Chapter: 45
    Book: 2
    Author: 2

Relationships Summary:
  Total relationships: 18014
  Relationship types distribution:
    CONTAINS: 2622
    MENTIONS: 2407
    PART_OF: 2064
    CONTAINS_DIALOGUE_BY: 1486
    WRITTEN_BY: 1220
    WROTE: 1204
    REFERENCES: 994
    INTERACTED_WITH: 660
    USED: 602
    CONVEYS: 542
💾 Exporting to CSV files in 'memgraph_import' directory...
✅ Exported 5305 nodes to memgra

In [64]:
import pandas as pd
import json
from neo4j import GraphDatabase
from typing import Dict, List, Any, Optional
import time
import logging

# Set up logging: configure the logging system at INFO level and create the
# module-level `logger` that MemgraphImporter uses to report property-cleaning
# warnings and per-batch import errors.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class MemgraphImporter:
    """Direct importer for Memgraph from entities_df.

    Wraps a Bolt driver created through ``GraphDatabase.driver`` and offers
    helpers that turn the combined ``entities_df`` (rows tagged with
    ``entity_type`` ``"node"`` or ``"relationship"``) into parameterised
    ``UNWIND`` batches, one query per label / relationship type.
    """

    def __init__(
        self, uri: str = "bolt://localhost:7687", user: str = "", password: str = ""
    ):
        """
        Initialize connection to Memgraph.

        No connection is opened here; call ``connect()`` first.

        Args:
            uri: Memgraph connection URI (default for Docker)
            user: Username (empty for default Memgraph)
            password: Password (empty for default Memgraph)
        """
        self.uri = uri
        self.user = user
        self.password = password
        self.driver = None

    def connect(self):
        """Establish connection to Memgraph.

        Returns:
            True when the driver was created and a ``RETURN 1`` probe round-tripped,
            False otherwise (a hint on how to start Memgraph is printed on errors).
        """
        try:
            self.driver = GraphDatabase.driver(
                self.uri, auth=(self.user, self.password)
            )

            # Test connection
            with self.driver.session() as session:
                result = session.run("RETURN 1 as test")
                test_value = result.single()["test"]

            if test_value == 1:
                print("✅ Successfully connected to Memgraph!")
                return True

            # Previously this path fell through and returned None implicitly.
            print(f"❌ Failed to connect to Memgraph: unexpected probe result {test_value!r}")
            return False
        except Exception as e:
            print(f"❌ Failed to connect to Memgraph: {e}")
            print("💡 Make sure Memgraph Docker is running:")
            print("   docker run -p 7687:7687 -p 7444:7444 memgraph/memgraph")
            return False

    def close(self):
        """Close connection to Memgraph (no-op when no driver was created)."""
        if self.driver:
            self.driver.close()
            print("🔌 Disconnected from Memgraph")

    def clear_database(self, confirm: bool = False):
        """Clear all data from Memgraph.

        Args:
            confirm: Safety switch; nothing is deleted unless this is True.
        """
        if not confirm:
            print("⚠️  To clear database, call with confirm=True")
            return

        with self.driver.session() as session:
            result = session.run("MATCH (n) DETACH DELETE n RETURN count(n) as deleted")
            deleted = result.single()["deleted"]
            print(f"🗑️  Cleared {deleted} nodes from database")

    def create_indexes(self):
        """Create ``id`` indexes for a fixed set of labels.

        Failures are reported as warnings (except "already exists" messages) and
        do not abort the remaining index statements.
        """
        print("📊 Creating indexes...")

        indexes = [
            "CREATE INDEX ON :Author(id);",
            "CREATE INDEX ON :Book(id);",
            "CREATE INDEX ON :Chapter(id);",
            "CREATE INDEX ON :Chunk(id);",
            "CREATE INDEX ON :Actor(id);",
            "CREATE INDEX ON :Object(id);",
        ]

        with self.driver.session() as session:
            for index_query in indexes:
                try:
                    session.run(index_query)
                except Exception as e:
                    if "already exists" not in str(e).lower():
                        print(f"⚠️  Index creation warning: {e}")

        print("✅ Indexes created")

    @staticmethod
    def _safe_identifier(name: Any, fallback: str) -> str:
        """Return ``name`` reduced to letters, digits and underscores.

        Labels and relationship types cannot be passed as query parameters, so they
        are spliced into the Cypher text. Replacing every other character with
        ``_`` keeps a value such as ``"Bad Label"`` from breaking (or altering) the
        statement. ``fallback`` is used when nothing usable remains.
        """
        cleaned = "".join(
            c if (c.isalnum() or c == "_") else "_" for c in str(name).strip()
        )
        if not cleaned.strip("_"):
            return fallback
        if cleaned[0].isdigit():
            cleaned = "_" + cleaned
        return cleaned

    def clean_properties(
        self, properties_json: str, additional_props: Dict[str, Any] = None
    ) -> Dict[str, Any]:
        """Clean and prepare properties for Memgraph.

        Args:
            properties_json: Stored properties, normally a JSON object string. An
                already-decoded ``dict`` is accepted too. Anything else (``None``,
                ``NaN``, blank strings, ``"{}"``) and payloads that do not parse
                into a JSON object count as "no stored properties".
            additional_props: Extra key/value pairs merged on top of the stored ones.

        Returns:
            A dict whose keys contain only letters, digits and underscores.
            ``None`` and empty-string values are dropped, numbers and booleans are
            kept, and every other value is stringified and cut to 500 characters
            (497 plus ``"..."``).
        """
        props = {}
        if isinstance(properties_json, dict):
            props = dict(properties_json)  # copy so the caller's dict is not mutated
        elif isinstance(properties_json, str) and properties_json.strip() not in ("", "{}"):
            # Parse in isolation: a malformed payload must not discard additional_props.
            try:
                parsed = json.loads(properties_json)
            except ValueError as e:
                logger.warning(f"Error cleaning properties: {e}")
            else:
                if isinstance(parsed, dict):
                    props = parsed
                else:
                    logger.warning(
                        f"Error cleaning properties: expected a JSON object, got {type(parsed).__name__}"
                    )

        if additional_props:
            props.update(additional_props)

        # Clean properties for Cypher
        cleaned_props = {}
        for key, value in props.items():
            if value is None or (isinstance(value, str) and value == ""):
                continue

            # Clean key name for Cypher
            clean_key = (
                str(key).replace(" ", "_").replace("-", "_").replace(".", "_")
            )
            clean_key = "".join(c for c in clean_key if c.isalnum() or c == "_")

            # Format value
            if isinstance(value, (int, float, bool)):
                cleaned_props[clean_key] = value
            else:
                # Values travel as query parameters, so no escaping is needed;
                # only the length is limited.
                str_value = str(value)
                if len(str_value) > 500:
                    str_value = str_value[:497] + "..."
                cleaned_props[clean_key] = str_value

        return cleaned_props

    def import_nodes_batch(self, nodes_data: List[Dict], batch_size: int = 1000):
        """Import nodes in batches.

        Each batch is split by label and sent as one ``UNWIND ... CREATE`` query per
        label. A failing query is logged and skipped; the remaining ones still run.

        Args:
            nodes_data: Dicts with the keys ``id``, ``label`` and ``props``.
            batch_size: Number of nodes handled per batch.

        Returns:
            Number of nodes the server reported as created.
        """
        print(f"📦 Importing {len(nodes_data)} nodes in batches of {batch_size}...")

        total_imported = 0

        with self.driver.session() as session:
            for i in range(0, len(nodes_data), batch_size):
                batch = nodes_data[i : i + batch_size]

                # Group by (sanitised) label for efficiency
                by_label = {}
                for node in batch:
                    label = self._safe_identifier(node["label"], "Entity")
                    by_label.setdefault(label, []).append(node)

                # Import each label group
                for label, label_nodes in by_label.items():
                    query = f"""
                    UNWIND $nodes as node
                    CREATE (n:{label} {{id: node.id}})
                    SET n += node.props
                    """

                    params = {
                        "nodes": [
                            {"id": node["id"], "props": node["props"]}
                            for node in label_nodes
                        ]
                    }

                    try:
                        result = session.run(query, params)
                        summary = result.consume()
                        total_imported += summary.counters.nodes_created

                    except Exception as e:
                        logger.error(f"Error importing {label} nodes: {e}")

                # Progress update
                print(
                    f"   Processed {min(i + batch_size, len(nodes_data))}/{len(nodes_data)} nodes..."
                )

        print(f"✅ Imported {total_imported} nodes")
        return total_imported

    def import_relationships_batch(
        self, relationships_data: List[Dict], batch_size: int = 1000
    ):
        """Import relationships in batches.

        Each batch is split by relationship type and sent as one
        ``UNWIND ... MATCH ... CREATE`` query per type. A failing query is logged
        and skipped. Relationships whose endpoints are not found create nothing.

        NOTE(review): the MATCH clauses look nodes up by ``id`` without a label, so
        the label-specific indexes from ``create_indexes`` presumably cannot be
        used here — confirm if import speed matters.

        Args:
            relationships_data: Dicts with the keys ``start_id``, ``end_id``,
                ``type`` and ``props``.
            batch_size: Number of relationships handled per batch.

        Returns:
            Number of relationships the server reported as created.
        """
        print(
            f"🔗 Importing {len(relationships_data)} relationships in batches of {batch_size}..."
        )

        total_imported = 0

        with self.driver.session() as session:
            for i in range(0, len(relationships_data), batch_size):
                batch = relationships_data[i : i + batch_size]

                # Group by (sanitised) relationship type
                by_type = {}
                for rel in batch:
                    rel_type = self._safe_identifier(rel["type"], "RELATED_TO")
                    by_type.setdefault(rel_type, []).append(rel)

                # Import each type group
                for rel_type, type_rels in by_type.items():
                    query = f"""
                    UNWIND $rels as rel
                    MATCH (start {{id: rel.start_id}})
                    MATCH (end {{id: rel.end_id}})
                    CREATE (start)-[r:{rel_type}]->(end)
                    SET r += rel.props
                    """

                    params = {
                        "rels": [
                            {
                                "start_id": rel["start_id"],
                                "end_id": rel["end_id"],
                                "props": rel["props"],
                            }
                            for rel in type_rels
                        ]
                    }

                    try:
                        result = session.run(query, params)
                        summary = result.consume()
                        total_imported += summary.counters.relationships_created

                    except Exception as e:
                        logger.error(f"Error importing {rel_type} relationships: {e}")

                print(
                    f"   Processed {min(i + batch_size, len(relationships_data))}/{len(relationships_data)} relationships..."
                )

        print(f"✅ Imported {total_imported} relationships")
        return total_imported

    def prepare_nodes_data(self, entities_df: pd.DataFrame) -> List[Dict]:
        """Prepare node data from entities_df.

        Selects the rows with ``entity_type == "node"`` and keeps the first row per
        ``node_id``.

        Returns:
            One dict per unique node with the keys ``id``, ``label`` and ``props``.
        """
        print("📊 Preparing nodes data...")

        nodes_df = entities_df[entities_df["entity_type"] == "node"]
        total_rows = len(nodes_df)

        seen_ids = set()
        unique_nodes = []

        for _, row in nodes_df.iterrows():
            node_id = row["node_id"]
            # First occurrence wins. Skipping duplicates up front avoids cleaning
            # properties for rows that would be thrown away afterwards.
            if node_id in seen_ids:
                continue
            seen_ids.add(node_id)

            additional_props = {
                "name": str(row["node_name"]) if pd.notna(row["node_name"]) else "",
                "description": str(row["node_description"])
                if pd.notna(row["node_description"])
                else "",
                "chapter": str(row["chapter"]),
                "chunk": str(row["chunk"]),
                "chunk_order_number": int(row["chunk_order_number"]),
                "author": str(row["author"]),
                "book": str(row["book"]),
                "chunk_index": int(row.get("chunk_index", 0)),
                "timestamp": str(row["node_timestamp"])
                if pd.notna(row["node_timestamp"])
                else "",
            }

            cleaned_props = self.clean_properties(
                row["node_properties"], additional_props
            )

            unique_nodes.append(
                {
                    "id": node_id,
                    "label": row["node_label"],
                    "props": cleaned_props,
                }
            )

        print(
            f"✅ Prepared {len(unique_nodes)} unique nodes (removed {total_rows - len(unique_nodes)} duplicates)"
        )
        return unique_nodes

    def prepare_relationships_data(self, entities_df: pd.DataFrame) -> List[Dict]:
        """Prepare relationship data from entities_df.

        Selects the rows with ``entity_type == "relationship"`` and drops those
        whose start or end node id is empty or NaN.

        Returns:
            One dict per valid relationship with the keys ``start_id``, ``end_id``,
            ``type`` and ``props``.
        """
        print("🔗 Preparing relationships data...")

        rels_df = entities_df[entities_df["entity_type"] == "relationship"]
        total_rows = len(rels_df)
        valid_rels = []

        for _, row in rels_df.iterrows():
            start_id = row["start_node_id"]
            end_id = row["end_node_id"]

            # Filter out relationships with missing nodes before doing any
            # property cleaning for them.
            if not (
                start_id
                and end_id
                and str(start_id) != "nan"
                and str(end_id) != "nan"
            ):
                continue

            additional_props = {
                "weight": float(row["relationship_weight"])
                if pd.notna(row["relationship_weight"])
                else 1.0,
                "chapter": str(row["chapter"]),
                "chunk": str(row["chunk"]),
                "chunk_order_number": int(row["chunk_order_number"]),
                "author": str(row["author"]),
                "book": str(row["book"]),
                "chunk_index": int(row.get("chunk_index", 0)),
                "timestamp": str(row["relationship_timestamp"])
                if pd.notna(row["relationship_timestamp"])
                else "",
            }

            cleaned_props = self.clean_properties(
                row["relationship_properties"], additional_props
            )

            valid_rels.append(
                {
                    "start_id": start_id,
                    "end_id": end_id,
                    "type": row["relationship_type"],
                    "props": cleaned_props,
                }
            )

        print(
            f"✅ Prepared {len(valid_rels)} relationships (filtered {total_rows - len(valid_rels)} invalid)"
        )
        return valid_rels

    def verify_import(self):
        """Verify the imported data by printing counts and a few sample rows."""
        print("\n🔍 Verifying import...")

        with self.driver.session() as session:
            # Count nodes
            result = session.run(
                "MATCH (n) RETURN labels(n) as label, count(n) as count ORDER BY count DESC"
            )
            print("📊 Node counts by label:")
            for record in result:
                label = record["label"][0] if record["label"] else "Unknown"
                count = record["count"]
                print(f"   {label}: {count}")

            # Count relationships. The pattern is directed on purpose: an
            # undirected `()-[r]-()` match returns every relationship once per
            # direction and would report doubled counts.
            result = session.run(
                "MATCH ()-[r]->() RETURN type(r) as rel_type, count(r) as count ORDER BY count DESC"
            )
            print("\n🔗 Relationship counts by type:")
            for record in result:
                rel_type = record["rel_type"]
                count = record["count"]
                print(f"   {rel_type}: {count}")

            # Sample queries
            print("\n📖 Sample data:")

            # Authors
            result = session.run("MATCH (a:Author) RETURN a.name, a.book LIMIT 3")
            for record in result:
                print(f"   Author: {record['a.name']} - Book: {record['a.book']}")

            # Books and authors
            result = session.run(
                "MATCH (b:Book)-[r:WRITTEN_BY]->(a:Author) RETURN b.name, a.name LIMIT 3"
            )
            for record in result:
                print(f"   Book: {record['b.name']} by {record['a.name']}")


def import_entities_to_memgraph(
    entities_df: pd.DataFrame,
    uri: str = "bolt://localhost:7687",
    clear_db: bool = False,
    batch_size: int = 1000,
) -> bool:
    """
    Main function to import entities_df directly to Memgraph.

    Runs the full pipeline on a fresh ``MemgraphImporter``: connect,
    optionally clear the database, create indexes, prepare node and
    relationship payloads from ``entities_df``, import both in batches,
    then print a verification summary and timing.

    Args:
        entities_df: DataFrame with nodes and relationships
        uri: Memgraph connection URI
        clear_db: Whether to clear existing data
        batch_size: Batch size for imports

    Returns:
        True if successful, False otherwise. Returns False when the
        connection cannot be established or when any step raises; the
        exception is reported on stdout rather than propagated.
    """
    print("🚀 Starting direct import to Memgraph...")

    importer = MemgraphImporter(uri)

    try:
        # Connect
        if not importer.connect():
            return False

        # Clear database if requested
        if clear_db:
            importer.clear_database(confirm=True)

        # Create indexes
        importer.create_indexes()

        # Prepare data
        nodes_data = importer.prepare_nodes_data(entities_df)
        relationships_data = importer.prepare_relationships_data(entities_df)

        # Import data. Use a monotonic clock for the elapsed-time measurement
        # so a system clock adjustment mid-import cannot skew (or negate) it.
        start_time = time.perf_counter()

        nodes_imported = importer.import_nodes_batch(nodes_data, batch_size)
        relationships_imported = importer.import_relationships_batch(
            relationships_data, batch_size
        )

        end_time = time.perf_counter()

        # Verify
        importer.verify_import()

        print("\n🎉 Import completed successfully!")
        print(f"   ⏱️  Total time: {end_time - start_time:.2f} seconds")
        print(f"   📊 Nodes imported: {nodes_imported}")
        print(f"   🔗 Relationships imported: {relationships_imported}")

        return True

    except Exception as e:
        print(f"❌ Import failed: {e}")
        return False

    finally:
        # Runs on every exit path, including the early return above.
        importer.close()


# Usage functions
# Emit the "loaded" banner and example invocations in one call; each argument
# is one output line, and embedded leading "\n" characters produce the blank
# separator lines.
print(
    "✅ Memgraph direct import functions loaded!",
    "\n📞 Usage:",
    "# Basic import:",
    "success = import_entities_to_memgraph(entities_df)",
    "\n# Clear database and import:",
    "success = import_entities_to_memgraph(entities_df, clear_db=True)",
    "\n# Custom connection:",
    "success = import_entities_to_memgraph(entities_df, uri='bolt://localhost:7687')",
    sep="\n",
)


✅ Memgraph direct import functions loaded!

📞 Usage:
# Basic import:
success = import_entities_to_memgraph(entities_df)

# Clear database and import:
success = import_entities_to_memgraph(entities_df, clear_db=True)

# Custom connection:
success = import_entities_to_memgraph(entities_df, uri='bolt://localhost:7687')


In [65]:
# Run the import against the default URI (bolt://localhost:7687).
# clear_db=True requests that existing data be cleared before importing.
# `success` is True if the import completed, False otherwise.
success = import_entities_to_memgraph(entities_df, clear_db=True)


🚀 Starting direct import to Memgraph...
✅ Successfully connected to Memgraph!
🗑️  Cleared 9 nodes from database
📊 Creating indexes...
✅ Indexes created
📊 Preparing nodes data...
✅ Prepared 5305 unique nodes (removed 10785 duplicates)
🔗 Preparing relationships data...
✅ Prepared 18014 relationships (filtered 0 invalid)
📦 Importing 5305 nodes in batches of 1000...
   Processed 1000/5305 nodes...
   Processed 2000/5305 nodes...
   Processed 3000/5305 nodes...
   Processed 4000/5305 nodes...
   Processed 5000/5305 nodes...
   Processed 5305/5305 nodes...
✅ Imported 5305 nodes
🔗 Importing 18014 relationships in batches of 1000...
   Processed 1000/18014 relationships...
   Processed 2000/18014 relationships...
   Processed 3000/18014 relationships...
   Processed 4000/18014 relationships...
   Processed 5000/18014 relationships...
   Processed 6000/18014 relationships...
   Processed 7000/18014 relationships...
   Processed 8000/18014 relationships...
   Processed 9000/18014 relationships..