# YouTube Podcast GraphRAG with Neo4j

This notebook demonstrates a Graph-based Question Answering system using Neo4j and LangChain for YouTube podcast data. It creates a knowledge graph from podcast transcripts, insights, and product mentions to enable complex queries about the content.

In [1]:
%pip install -qU langchain-groq neo4j langchain langchain-community langchain-openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv
import os
import json
from datetime import datetime
import uuid

load_dotenv()

# API Keys
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Neo4j Connection
NEO4J_URI = os.getenv('NEO4J_URI') 
NEO4J_USER = os.getenv('NEO4J_USER')
NEO4J_PW = os.getenv('NEO4J_PW')

# Local LLM (Ollama)
LLM_BASE_URL = os.getenv('LLM_BASE_URL')
LLM_MODEL = os.getenv('LLM_MODEL')
LLM_TEMPERATURE = float(os.getenv('LLM_TEMPERATURE', '0.0'))

# Data directory
DATA_DIR = os.path.join(os.getcwd(), 'local_data')

print("Environment loaded successfully!")

Environment loaded successfully!


In [3]:
from langchain.chains import GraphCypherQAChain
from langchain_community.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI
from langchain_community.chat_models import ChatOllama
from langchain_groq import ChatGroq
from neo4j import GraphDatabase
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load pipeline configuration
config_path = os.path.join(os.getcwd(), 'pipeline_config.json')
with open(config_path, 'r') as f:
    config = json.load(f)

# Define data file paths
json_files = {
    'episodes': os.path.join(DATA_DIR, config['data_files']['episodes']),
    'segments': os.path.join(DATA_DIR, config['data_files']['segments']),
    'insights': os.path.join(DATA_DIR, config['data_files']['insights']),
    'products': os.path.join(DATA_DIR, config['data_files']['products']),
    'links': os.path.join(DATA_DIR, config['data_files']['links'])
}

# Load all data
data = {}
for key, path in json_files.items():
    if os.path.exists(path):
        with open(path, 'r') as f:
            data[key] = json.load(f)
            print(f"Loaded {len(data[key])} {key}")
    else:
        data[key] = []
        print(f"Warning: {key} file not found at {path}")

Loaded 1 episodes
Loaded 19 segments
Loaded 0 insights
Loaded 30 products
Loaded 0 links


In [5]:
# Initialize Neo4j driver for data population
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PW))

# Test connection
try:
    with driver.session() as session:
        result = session.run("RETURN 1")
        print("Neo4j connection successful!")
except Exception as e:
    print(f"Neo4j connection failed: {e}")
    raise

Neo4j connection successful!


## Clear Existing Data (Optional)

In [6]:
# Clear existing data - uncomment to reset database
with driver.session() as session:
     session.run("MATCH (n) DETACH DELETE n")
     print("Database cleared")

Database cleared


## Create Graph Schema

In [7]:
# Create constraints and indexes for better performance
with driver.session() as session:
    # Create constraints
    constraints = [
        "CREATE CONSTRAINT IF NOT EXISTS FOR (e:Episode) REQUIRE e.id IS UNIQUE",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (s:Segment) REQUIRE s.id IS UNIQUE",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (sp:Speaker) REQUIRE sp.name IS UNIQUE",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (i:Insight) REQUIRE i.id IS UNIQUE",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (p:Product) REQUIRE p.name IS UNIQUE",
        "CREATE CONSTRAINT IF NOT EXISTS FOR (t:Topic) REQUIRE t.name IS UNIQUE"
    ]
    
    for constraint in constraints:
        try:
            session.run(constraint)
            print(f"Created: {constraint.split('FOR')[1].split('REQUIRE')[0].strip()}")
        except Exception as e:
            if "already exists" not in str(e):
                print(f"Error creating constraint: {e}")

Created: (e:Episode)
Created: (s:Segment)
Created: (sp:Speaker)
Created: (i:Insight)
Created: (p:Product)
Created: (t:Topic)


## Populate Graph Database

In [8]:
def populate_episodes(session, episodes):
    """Create Episode nodes"""
    query = """
    UNWIND $episodes AS episode
    MERGE (e:Episode {id: episode.id})
    SET e.title = episode.title,
        e.youtube_video_id = episode.youtube_video_id,
        e.youtube_url = episode.youtube_url,
        e.duration = episode.duration,
        e.published_at = episode.published_at,
        e.summary = episode.summary
    """
    session.run(query, episodes=episodes)
    print(f"Created {len(episodes)} Episode nodes")

def populate_segments_and_speakers(session, segments):
    """Create Segment nodes and Speaker nodes, with relationships"""
    # Create segments
    segment_query = """
    UNWIND $segments AS segment
    MERGE (s:Segment {id: segment.id})
    SET s.start_time = segment.start_time,
        s.end_time = segment.end_time,
        s.text = segment.display_text,
        s.confidence = segment.confidence,
        s.duration = segment.duration,
        s.episode_id = segment.episode_id
    WITH s, segment
    MATCH (e:Episode {id: segment.episode_id})
    MERGE (e)-[:HAS_SEGMENT]->(s)
    """
    session.run(segment_query, segments=segments)
    print(f"Created {len(segments)} Segment nodes")
    
    # Create speakers and relationships
    speaker_query = """
    UNWIND $segments AS segment
    MERGE (sp:Speaker {name: segment.speaker})
    WITH sp, segment
    MATCH (s:Segment {id: segment.id})
    MERGE (sp)-[:SPEAKS_IN]->(s)
    """
    session.run(speaker_query, segments=segments)
    
    # Create FOLLOWED_BY relationships for temporal flow
    temporal_query = """
    MATCH (s1:Segment), (s2:Segment)
    WHERE s1.episode_id = s2.episode_id 
      AND s1.end_time = s2.start_time
    MERGE (s1)-[:FOLLOWED_BY]->(s2)
    """
    session.run(temporal_query)
    print("Created temporal relationships between segments")

def populate_products(session, products):
    """Create Product nodes"""
    query = """
    UNWIND $products AS product
    MERGE (p:Product {name: product.name})
    SET p.id = product.id,
        p.mention_count = product.mention_count
    """
    session.run(query, products=products)
    print(f"Created {len(products)} Product nodes")

def populate_insights(session, insights):
    """Create Insight nodes and relationships"""
    query = """
    UNWIND $insights AS insight
    MERGE (i:Insight {id: insight.id})
    SET i.content = insight.content,
        i.category = insight.category,
        i.confidence_score = insight.confidence_score,
        i.segment_start = insight.segment_start,
        i.segment_end = insight.segment_end
    WITH i, insight
    MATCH (e:Episode {id: insight.episode_id})
    MERGE (e)-[:CONTAINS_INSIGHT]->(i)
    """
    session.run(query, insights=insights)
    print(f"Created {len(insights)} Insight nodes")
    
    # Create Topic nodes from insight categories
    topic_query = """
    MATCH (i:Insight)
    WITH DISTINCT i.category AS category
    MERGE (t:Topic {name: category})
    SET t.category = 'insight_category'
    """
    session.run(topic_query)
    
    # Link insights to topics
    link_query = """
    MATCH (i:Insight)
    MATCH (t:Topic {name: i.category})
    MERGE (i)-[:RELATED_TO]->(t)
    """
    session.run(link_query)
    print("Created Topic nodes and relationships")

def link_products_to_content(session, segments, products, insights):
    """Create relationships between products and content"""
    # Get product names for matching
    product_names = [p['name'] for p in products]
    
    # Link segments to products they mention
    for segment in segments[:100]:  # Limit for performance
        for product_name in product_names:
            if product_name.lower() in segment.get('display_text', '').lower():
                query = """
                MATCH (s:Segment {id: $segment_id})
                MATCH (p:Product {name: $product_name})
                MERGE (s)-[:MENTIONS]->(p)
                """
                session.run(query, segment_id=segment['id'], product_name=product_name)
    
    # Link insights to products they discuss
    for insight in insights:
        for product_name in product_names:
            if product_name.lower() in insight.get('content', '').lower():
                query = """
                MATCH (i:Insight {id: $insight_id})
                MATCH (p:Product {name: $product_name})
                MERGE (i)-[:ABOUT]->(p)
                """
                session.run(query, insight_id=insight['id'], product_name=product_name)
    
    print("Created product relationships")

In [9]:
# Populate the graph database
with driver.session() as session:
    if data['episodes']:
        populate_episodes(session, data['episodes'])
    
    if data['segments']:
        populate_segments_and_speakers(session, data['segments'])
    
    if data['products']:
        populate_products(session, data['products'])
    
    if data['insights']:
        populate_insights(session, data['insights'])
    
    if data['segments'] and data['products'] and data['insights']:
        link_products_to_content(session, data['segments'], data['products'], data['insights'])

print("\nGraph database population complete!")

Created 1 Episode nodes
Created 19 Segment nodes
Created temporal relationships between segments
Created 30 Product nodes

Graph database population complete!


## Print Graph Schema

In [10]:
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PW)
graph.refresh_schema()
print(graph.schema)

Node properties:
Episode {id: STRING, title: STRING, youtube_video_id: STRING, youtube_url: STRING, duration: INTEGER, published_at: STRING, summary: STRING}
Segment {id: STRING, duration: FLOAT, start_time: FLOAT, end_time: FLOAT, text: STRING, confidence: FLOAT, episode_id: STRING}
Speaker {name: STRING}
Product {id: STRING, name: STRING, mention_count: INTEGER}
Relationship properties:

The relationships:
(:Episode)-[:HAS_SEGMENT]->(:Segment)
(:Segment)-[:FOLLOWED_BY]->(:Segment)
(:Speaker)-[:SPEAKS_IN]->(:Segment)


## Print Enhanced Graph Schema

In [11]:
enhanced_graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USER,
    password=NEO4J_PW,
    enhanced_schema=True,
)
print(enhanced_graph.schema)



Node properties:
- **Episode**
  - `id`: STRING Available options: ['890caf28-4dfd-461b-861a-5fe95034183a']
  - `title`: STRING Available options: ['The Build - Agents as MCP Tools']
  - `youtube_video_id`: STRING Available options: ['9I_WxAMQ9_0']
  - `youtube_url`: STRING Available options: ['https://www.youtube.com/watch?v=9I_WxAMQ9_0&t=6s']
  - `duration`: INTEGER Min: 733, Max: 733
  - `published_at`: STRING Available options: ['2025-07-24T10:27:52.186400']
  - `summary`: STRING Available options: ['']
- **Segment**
  - `id`: STRING Example: "bf4d13ff-f34e-40c0-9c27-d3e9e54d3bc4"
  - `duration`: FLOAT Min: 1.04, Max: 110.87
  - `start_time`: FLOAT Min: 0.16, Max: 712.57
  - `end_time`: FLOAT Min: 3.04, Max: 732.57
  - `text`: STRING Example: "Did you just say that applications as agents?"
  - `confidence`: FLOAT Min: 0.93134767, Max: 0.99505615
  - `episode_id`: STRING Available options: ['890caf28-4dfd-461b-861a-5fe95034183a']
- **Speaker**
  - `name`: STRING Available options: [

## Query the Graph

### Example Queries for YouTube Podcast Data

In [12]:
query = "Which products were discussed most frequently in the podcast? Show top 10 with their mention counts."

In [13]:
chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(model="o4-mini"), 
    graph=enhanced_graph, 
    verbose=True, 
    allow_dangerous_requests=True, 
    return_intermediate_steps=True,
)
result = chain.invoke({"query": query})
print(f"Intermediate steps: {result['intermediate_steps']}")
print(f"Final answer: {result['result']}")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Product)
RETURN p.name AS product, p.mention_count AS mentions
ORDER BY p.mention_count DESC
LIMIT 10[0m
Full Context:
[32;1m[1;3m[{'product': 'IO', 'mentions': 11}, {'product': 'MCP', 'mentions': 6}, {'product': 'Tool Swarm', 'mentions': 4}, {'product': 'MCP Tool', 'mentions': 4}, {'product': 'Claude', 'mentions': 3}, {'product': 'LangChain', 'mentions': 3}, {'product': 'CLAUDE Desktop', 'mentions': 2}, {'product': 'Claude Desktop', 'mentions': 2}, {'product': 'Claude desktop', 'mentions': 2}, {'product': 'Flow', 'mentions': 2}][0m

[1m> Finished chain.[0m
Intermediate steps: [{'query': 'MATCH (p:Product)\nRETURN p.name AS product, p.mention_count AS mentions\nORDER BY p.mention_count DESC\nLIMIT 10'}, {'context': [{'product': 'IO', 'mentions': 11}, {'product': 'MCP', 'mentions': 6}, {'product': 'Tool Swarm', 'mentions': 4}, {'product': 'MCP Tool', 'mentions': 4}, {'product': 'Claude', '

# GROQ MODELS LIST
--- Production ---  
distil-whisper-large-v3-en  
gemma2-9b-it  
llama-3.1-8b-instant  
llama-3.3-70b-versatile  
meta-llama/llama-guard-4-12b  
whisper-large-v3  
whisper-large-v3-turbo  

--- Preview ---  
deepseek-r1-distill-llama-70b  
meta-llama/llama-4-maverick-17b-128e-instruct  
meta-llama/llama-4-scout-17b-16e-instruct  
meta-llama/llama-prompt-guard-2-22m  
meta-llama/llama-prompt-guard-2-86m  
mistral-saba-24b  
moonshotai/kimi-k2-instruct  
playai-tts 
playai-tts-arabic   
qwen/qwen3-32b  


In [14]:
model = "moonshotai/kimi-k2-instruct"

chain = GraphCypherQAChain.from_llm(
    ChatGroq(model=model, temperature=0, max_tokens=None, timeout=None, max_retries=2,), 
    graph=enhanced_graph, 
    verbose=True, 
    allow_dangerous_requests=True, 
    return_intermediate_steps=True,
)

result = chain.invoke({"query": query})
print(f"Intermediate steps: {result['intermediate_steps']}")
print(f"Final answer: {result['result']}")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Product)
RETURN p.name AS product, p.mention_count AS mentions
ORDER BY p.mention_count DESC
LIMIT 10[0m
Full Context:
[32;1m[1;3m[{'product': 'IO', 'mentions': 11}, {'product': 'MCP', 'mentions': 6}, {'product': 'Tool Swarm', 'mentions': 4}, {'product': 'MCP Tool', 'mentions': 4}, {'product': 'Claude', 'mentions': 3}, {'product': 'LangChain', 'mentions': 3}, {'product': 'CLAUDE Desktop', 'mentions': 2}, {'product': 'Claude Desktop', 'mentions': 2}, {'product': 'Claude desktop', 'mentions': 2}, {'product': 'Flow', 'mentions': 2}][0m

[1m> Finished chain.[0m
Intermediate steps: [{'query': 'MATCH (p:Product)\nRETURN p.name AS product, p.mention_count AS mentions\nORDER BY p.mention_count DESC\nLIMIT 10'}, {'context': [{'product': 'IO', 'mentions': 11}, {'product': 'MCP', 'mentions': 6}, {'product': 'Tool Swarm', 'mentions': 4}, {'product': 'MCP Tool', 'mentions': 4}, {'product': 'Claude', '

# OLLAMA MODELS LIST  
gemma3n:e4b  
deepseek-r1:latest  
devstral:latest  
gemma3:12b  
gemma3:4b  
gemma3:latest  
llama3.2:latest  
llama3.1:latest  
nomic-embed-text:latest  
phi4-mini:latest  
qwen2.5vl:latest  
qwen3:14b  
qwen3:4b  
qwen3:1.7b  
qwen3:0.6b  
qwen3:latest  

In [15]:
model = "qwen3:latest"

chain = GraphCypherQAChain.from_llm(
    ChatOllama(base_url=LLM_BASE_URL, model=model), 
    graph=enhanced_graph, 
    verbose=True, 
    allow_dangerous_requests=True, 
    return_intermediate_steps=True,
)

result = chain.invoke({"query": query})
print(f"Intermediate steps: {result['intermediate_steps']}")
print(f"Final answer: {result['result']}")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3m<think>
Okay, let's see. The user wants to find out which products were discussed most frequently in the podcast and show the top 10 with their mention counts. 

First, I need to look at the schema provided. The nodes involved here are Product and Segment. The Product node has a mention_count property, but wait, the Segment node has a mention_count? Wait, no. Wait, looking back: the Product node has mention_count as a property. Wait, no. Let me check again.

Wait, the schema says:

Node properties:
Episode {id, title, youtube_video_id, youtube_url, duration, published_at, summary}
Segment {id, duration, start_time, end_time, text, confidence, episode_id}
Speaker {name}
Product {id, name, mention_count}

Oh, right! The Product node has a mention_count property. But how does that relate to the Segments? Because the question is about products discussed in the podcast. So maybe the mention_count in the Prod

CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input '<': expected 'FOREACH', 'ALTER', 'ORDER BY', 'CALL', 'USING PERIODIC COMMIT', 'CREATE', 'LOAD CSV', 'START DATABASE', 'STOP DATABASE', 'DEALLOCATE', 'DELETE', 'DENY', 'DETACH', 'DROP', 'DRYRUN', 'FINISH', 'GRANT', 'INSERT', 'LIMIT', 'MATCH', 'MERGE', 'NODETACH', 'OFFSET', 'OPTIONAL', 'REALLOCATE', 'REMOVE', 'RENAME', 'RETURN', 'REVOKE', 'ENABLE SERVER', 'SET', 'SHOW', 'SKIP', 'TERMINATE', 'UNWIND', 'USE' or 'WITH' (line 1, column 1 (offset: 0))
"<think>"
 ^}

## More Example Queries

### Query: Timeline of Topics

In [None]:
timeline_query = "Show the timeline of different insight categories discussed in the episode, ordered by when they first appear"

chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(model="gpt-4.1-mini", temperature=0), 
    graph=enhanced_graph, 
    verbose=True, 
    allow_dangerous_requests=True
)

result = chain.invoke({"query": timeline_query})
print(f"\nResult: {result['result']}")

### Query: Speaker Analysis

In [None]:
speaker_query = "Which speaker spoke the most in the podcast? Show the number of segments for each speaker."

chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(model="gpt-4.1-mini", temperature=0), 
    graph=enhanced_graph, 
    verbose=True, 
    allow_dangerous_requests=True
)

result = chain.invoke({"query": speaker_query})
print(f"\nResult: {result['result']}")

### Query: Product Insights

In [None]:
product_insights_query = "Find all insights that mention MCP or agents. Show the insight content and confidence score."

chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(model="gpt-4.1-mini", temperature=0), 
    graph=enhanced_graph, 
    verbose=True, 
    allow_dangerous_requests=True
)

result = chain.invoke({"query": product_insights_query})
print(f"\nResult: {result['result']}")

### Query: Conversation Flow

In [None]:
flow_query = "Show a sample of the conversation flow - find 5 consecutive segments and show who spoke and what they said"

chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(model="gpt-4.1-mini", temperature=0), 
    graph=enhanced_graph, 
    verbose=True, 
    allow_dangerous_requests=True
)

result = chain.invoke({"query": flow_query})
print(f"\nResult: {result['result']}")

## Visualize Graph Statistics

In [None]:
# Get node counts
with driver.session() as session:
    node_counts = {}
    for label in ['Episode', 'Segment', 'Speaker', 'Insight', 'Product', 'Topic']:
        result = session.run(f"MATCH (n:{label}) RETURN count(n) as count")
        node_counts[label] = result.single()['count']
    
    # Get relationship counts
    rel_counts = {}
    relationships = [
        'HAS_SEGMENT', 'SPEAKS_IN', 'MENTIONS', 'CONTAINS_INSIGHT', 
        'ABOUT', 'RELATED_TO', 'FOLLOWED_BY'
    ]
    for rel in relationships:
        result = session.run(f"MATCH ()-[r:{rel}]->() RETURN count(r) as count")
        rel_counts[rel] = result.single()['count']

print("Graph Statistics:")
print("\nNode Counts:")
for label, count in node_counts.items():
    print(f"  {label}: {count}")

print("\nRelationship Counts:")
for rel, count in rel_counts.items():
    print(f"  {rel}: {count}")

print(f"\nTotal Nodes: {sum(node_counts.values())}")
print(f"Total Relationships: {sum(rel_counts.values())}")

## Advanced Queries

### Find Product Discussion Patterns

In [None]:
pattern_query = """
Which products are most frequently discussed together in the same insights? 
Show pairs of products that appear in the same insights.
"""

chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(model="gpt-4.1-mini"), 
    graph=enhanced_graph, 
    verbose=True, 
    allow_dangerous_requests=True
)

result = chain.invoke({"query": pattern_query})
print(f"\nResult: {result['result']}")

### Topic Evolution Analysis

In [None]:
evolution_query = """
How does the discussion evolve over time? 
Show the progression of topics (insight categories) throughout the episode timeline.
"""

chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(model="gpt-4.1-mini", temperature=0), 
    graph=enhanced_graph, 
    verbose=True, 
    allow_dangerous_requests=True
)

result = chain.invoke({"query": evolution_query})
print(f"\nResult: {result['result']}")

## Close Neo4j Connection

In [None]:
driver.close()
print("Neo4j connection closed.")

### OLLAMA LLM