### Installing basic libraries

In [None]:
%pip install --upgrade --quiet langchain langchain-community langchain-groq langchain-experimental neo4j wikipedia tiktoken yfiles_jupyter_graphs sentence-transformers


In [None]:
from langchain_community.graphs import Neo4jGraph

In [None]:
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import ConfigurableField


In [None]:
# Graph and database imports
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase
from langchain_community.graphs import Neo4jGraph
import os


### Environment Setup

In [None]:
import os

# Connection settings are read from the environment when they are already
# defined there, so real credentials never have to be written into the
# notebook. The string fallbacks are placeholders only: replace them (or
# export the variables before starting the kernel) to actually connect.
NEO4J_URI = os.environ.get("NEO4J_URI", "NEO4J_URI")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "NEO4J_USERNAME")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "NEO4J_PASSWORD")

# Publish the resolved values so later cells can read them from os.environ.
# setdefault keeps a GROQ_API_KEY that was exported outside the notebook
# instead of clobbering it with the placeholder.
os.environ.setdefault("GROQ_API_KEY", "API_KEY")
os.environ["NEO4J_URI"] = NEO4J_URI
os.environ["NEO4J_USERNAME"] = NEO4J_USERNAME
os.environ["NEO4J_PASSWORD"] = NEO4J_PASSWORD


### Model Setup (I tried using the OpenAI API but ran out of credits, so this notebook uses Groq; I am deliberately leaving my key here)

In [None]:
# Original OpenAI implementation, kept for reference:
# from langchain_openai import ChatOpenAI
# llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-0125")

# Groq-hosted chat model shared by the rest of the notebook.
import os

from langchain_groq import ChatGroq

llm = ChatGroq(
    temperature=0,
    # Take the key from the environment (exported in the setup cell) instead
    # of embedding a key string in the constructor call.
    groq_api_key=os.environ["GROQ_API_KEY"],
    model_name="mixtral-8x7b-32768",
)


### Vector Store Setup, using Hugging Face for vector embeddings because it is open source

In [None]:
# The OpenAI-based variant this cell replaced, kept for reference:
# from langchain_openai import OpenAIEmbeddings
# vector_index = Neo4jVector.from_existing_graph(
#     OpenAIEmbeddings(),
#     search_type="hybrid",
#     node_label="Document",
#     text_node_properties=["text"],
#     embedding_node_property="embedding"
# )

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Neo4jVector

# Build the vector index over the graph that already lives in Neo4j:
# nodes labelled "Document" are indexed on their "text" property and the
# vectors are kept in the "embedding" property, using the default
# HuggingFaceEmbeddings model and the "hybrid" search type.
# NOTE(review): this cell runs before the cell that writes graph documents to
# Neo4j; on an empty database there may be no Document nodes to embed yet,
# so confirm the index is populated after ingestion.
vector_index = Neo4jVector.from_existing_graph(
    HuggingFaceEmbeddings(),
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    search_type="hybrid",
)


  HuggingFaceEmbeddings(),
  HuggingFaceEmbeddings(),
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Open the LangChain Neo4j graph wrapper used for all graph reads and writes
# below. No arguments are passed, so connection details are presumably picked
# up from the NEO4J_* environment variables exported in the setup cell —
# TODO confirm.
graph = Neo4jGraph()

  graph = Neo4jGraph()


### Data from a Wikipedia page. Choose a dataset that is rich enough in terms of content and entities (people, places, events)

In [56]:
# Load the Wikipedia articles matching the query as LangChain documents.
# WikipediaLoader is imported from langchain_community (installed above and
# already used throughout this notebook) rather than from the legacy
# `langchain.document_loaders` path.
from langchain_community.document_loaders import WikipediaLoader

raw_documents = WikipediaLoader(query="Robert_Downey_Jr.").load()

In [57]:
len(raw_documents)

25

In [58]:
raw_documents[:3]

[Document(metadata={'title': 'Robert Downey Jr.', 'summary': "Robert John Downey Jr. (born April 4, 1965) is an American actor. His films as a leading actor have grossed over $14 billion worldwide, making him one of the highest-grossing actors of all time. In 2008, Downey was named by Time magazine as one of the 100 most influential people in the world. From 2013 to 2015, he was listed by Forbes as Hollywood's highest-paid actor.\nAt the age of five, Downey made his acting debut in his father Robert Downey Sr.'s 1970 film Pound. He subsequently worked with the Brat Pack in the teen films Weird Science (1985) and Less than Zero (1987). Downey's portrayal of Charlie Chaplin in the 1992 biopic Chaplin garnered him a BAFTA Award for Best Actor and his first Academy Award nomination. Following a stint at the Corcoran Substance Abuse Treatment Facility on drug charges, he joined the TV series Ally McBeal in 2000 and won a Golden Globe Award for the role. Downey was fired from the show in 200

### Data preprocessing (could have done spell checking, lowercasing, stemming and lemmatization, but they are not needed because the source is Wikipedia)

In [59]:
from langchain.text_splitter import TokenTextSplitter

# Preprocessing knobs, named so they can be tuned in one place instead of
# being buried in the calls below.
CHUNK_SIZE = 512            # chunk_size handed to the token splitter
CHUNK_OVERLAP = 24          # chunk_overlap handed to the token splitter
MAX_SOURCE_DOCUMENTS = 3    # only the first N loaded articles are processed

# Split the selected articles into overlapping chunks for graph extraction.
text_splitter = TokenTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
documents = text_splitter.split_documents(raw_documents[:MAX_SOURCE_DOCUMENTS])

### Graph Transformer Setup

In [60]:
# Build the LLM-backed graph transformer from langchain_experimental.
# It is given the chat model `llm` created in the model-setup cell; no
# restrictions on node or relationship types are passed here.
from langchain_experimental.graph_transformers import LLMGraphTransformer
llm_transformer = LLMGraphTransformer(llm=llm)

In [61]:
# Run the transformer over the chunked `documents` to obtain graph documents
# (nodes plus relationships, as the output of the next cell shows).
# NOTE(review): this presumably sends each chunk to the LLM, so it can be
# slow and consume API quota — confirm before re-running.
graph_documents = llm_transformer.convert_to_graph_documents(documents)

In [62]:
graph_documents

[GraphDocument(nodes=[Node(id='Robert John Downey Jr.', type='Person', properties={}), Node(id='Robert Downey Sr.', type='Person', properties={}), Node(id='April 4, 1965', type='Date', properties={}), Node(id='Manhattan', type='Location', properties={}), Node(id='New York City', type='Location', properties={})], relationships=[Relationship(source=Node(id='Robert John Downey Jr.', type='Person', properties={}), target=Node(id='April 4, 1965', type='Date', properties={}), type='BIRTHDAY', properties={}), Relationship(source=Node(id='Robert John Downey Jr.', type='Person', properties={}), target=Node(id='Manhattan', type='Location', properties={}), type='BIRTHPLACE', properties={}), Relationship(source=Node(id='Robert John Downey Jr.', type='Person', properties={}), target=Node(id='New York City', type='Location', properties={}), type='BIRTHPLACE', properties={}), Relationship(source=Node(id='Robert John Downey Jr.', type='Person', properties={}), target=Node(id='Robert Downey Sr.', type=

In [63]:
# Persist the extracted graph documents to Neo4j through the `graph` wrapper.
# baseEntityLabel=True: presumably what adds the shared `__Entity__` label
#   that the full-text index created later in this notebook is defined on —
#   TODO confirm.
# include_source=True: keep the source document alongside the extracted nodes.
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)

# Default Cypher query for visualization: every relationship whose type is
# not MENTIONS, returned as (source, relationship, target) and capped at
# 50 rows.
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 50"

In [64]:
# Cypher query used by default when visualizing the graph (non-MENTIONS
# relationships, at most 50 rows). This line only defines the query string —
# nothing is displayed here — and it repeats the assignment made at the end
# of the previous cell.
default_cypher = "MATCH (s)-[r:!MENTIONS]->(t) RETURN s,r,t LIMIT 50"

In [65]:
# Import required libraries for graph visualization
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase

# Enable the custom widget manager when running inside Google Colab so the
# graph widget can render. Outside Colab the import fails and there is
# nothing to enable, so only ImportError is ignored; any other failure is
# allowed to surface instead of being silently swallowed by a bare except.
try:
    from google.colab import output
except ImportError:
    pass
else:
    output.enable_custom_widget_manager()

# Define a function to visualize the graph using yfiles_jupyter_graphs
def showGraph(cypher: str = default_cypher):
    """Run a Cypher query against Neo4j and render the result as a graph widget.

    Args:
        cypher: Query whose result graph is visualized. Defaults to the value
            ``default_cypher`` had when this function was defined.

    Returns:
        The ``GraphWidget`` that was displayed.

    Raises:
        KeyError: If NEO4J_URI, NEO4J_USERNAME or NEO4J_PASSWORD is missing
            from the environment.
    """
    # The driver and the session are both context managers; using `with`
    # closes them even when the query fails, instead of leaking a connection
    # on every call.
    with GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"]),
    ) as driver:
        with driver.session() as session:
            # Fetch the result graph and build the widget while the session
            # is still open.
            widget = GraphWidget(graph=session.run(cypher).graph())
    # Label each rendered node with its `id` property.
    widget.node_label_mapping = 'id'
    display(widget)
    return widget

In [68]:
showGraph()

GraphWidget(layout=Layout(height='800px', width='100%'))

GraphWidget(layout=Layout(height='800px', width='100%'))

In [69]:
# Import typing for type hints
from typing import Tuple, List, Optional

# Create a full-text index named `entity` over the `id` property of nodes
# labelled `__Entity__`; the structured retriever queries this index by name.
# IF NOT EXISTS lets the cell be re-run without failing on an existing index.
graph.query("CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

### Entity Extraction Setup

In [71]:
# Define a Pydantic model for entity extraction
from langchain_core.pydantic_v1 import BaseModel, Field

# Structured-output schema for entity extraction; it is passed to
# `llm.with_structured_output(Entities)` when the extraction chain is built.
# NOTE(review): the class docstring and the Field description are presumably
# forwarded to the model as part of the schema, so treat them as prompt text
# — confirm before rewording either.
class Entities(BaseModel):
    """Identifying information about entities."""
    # Required field (`...` means no default): the extracted entity names.
    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that appear in the text",
    )

In [72]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate

### Prompt Templates and RAG integration

In [73]:
# Entity-extraction prompt: a system instruction that fixes the task, then a
# human turn that carries the user's text in the {question} slot.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are extracting organization and person entities from the text."),
        (
            "human",
            "Use the given format to extract information from the following input: {question}",
        ),
    ]
)

# Pipe the prompt into the LLM configured for structured output, so invoking
# the chain yields an `Entities` object rather than free text.
entity_chain = prompt | llm.with_structured_output(Entities)

In [83]:
# Test the entity extraction chain with a sample question
# The chain extracts 'Spiderman' as an entity from the question
entity_chain.invoke({"question": "Where was Spiderman born?"}).names


['Spiderman']

In [84]:
# Import utility function to handle Lucene special characters in text search
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars

In [85]:
# Define a function to generate fuzzy text search queries
# Adds fuzzy matching (~2) to each word in the input query
def generate_full_text_query(input: str) -> str:
    """Build a fuzzy full-text search query from free text.

    Lucene special characters are stripped with ``remove_lucene_chars``; each
    remaining whitespace-separated word gets a ``~2`` fuzzy suffix and the
    words are combined with ``AND`` (for example ``a~2 AND b~2``).

    Args:
        input: Raw text, typically an entity name.

    Returns:
        The query string, or an empty string when no words remain after
        cleaning (previously this case raised IndexError on ``words[-1]``).
    """
    # str.split() with no arguments never yields empty strings, so no extra
    # filtering is needed.
    words = remove_lucene_chars(input).split()
    return " AND ".join(f"{word}~2" for word in words)

# Define a structured retriever function that uses entity extraction
# and fulltext search to find relevant information in the graph
def structured_retriever(question: str) -> str:
    """Collect graph facts about the entities mentioned in a question.

    Entities are extracted with ``entity_chain``; each one is looked up in
    the ``entity`` full-text index (top 2 matches) and the non-MENTIONS
    relationships of the matched nodes, in both directions, are rendered as
    ``source - TYPE -> target`` lines (at most 50 per entity).

    Args:
        question: The user's question.

    Returns:
        All relationship lines joined by newlines; an empty string when no
        entity is found or nothing matches.
    """
    entities = entity_chain.invoke({"question": question})
    # NOTE(review): the structured-output chain can presumably return None
    # when the model produces nothing parsable — treat that as "no entities".
    names = getattr(entities, "names", None) or []

    lines = []
    for entity in names:
        # An entity with no searchable words cannot form a full-text query;
        # skip it rather than failing or sending an empty query.
        if not remove_lucene_chars(entity).split():
            continue
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": generate_full_text_query(entity)},
        )
        lines.extend(el['output'] for el in response)
    # Join once at the end so rows belonging to different entities are also
    # newline-separated; appending per-entity blocks directly fused the last
    # row of one entity with the first row of the next.
    return "\n".join(lines)

In [87]:
print(structured_retriever("Who is Robert Downey Jr.?"))



Robert Downey Jr. - CHILD_OF -> Robert Downey Sr.
Robert Downey Jr. - BIRTH_PLACE -> Manhattan
Robert Downey Jr. - BIRTH_PLACE -> New York City
Robert Downey Jr. - PARENT -> Robert Downey Sr.
Robert Downey Jr. - PARENT -> Elsie Ann (Née Ford)
Robert Downey Jr. - BIRTH -> April 4, 1965
Robert Downey Jr. - LIVED -> Woodstock
Robert Downey Jr. - LIVED -> London
Robert Downey Jr. - LIVED -> New Mexico
Robert Downey Jr. - LIVED -> California
Robert Downey Jr. - LIVED -> Connecticut
Robert Downey Jr. - LIVED -> Greenwich Village
Robert Downey Jr. - APPEARED_IN -> Pound (1970)
Robert Downey Jr. - APPEARED_IN -> Greaser'S Palace (1972)
Robert Downey Jr. - ACTED_IN -> Pound
Robert Downey Jr. - ACTED_IN -> Weird Science
Robert Downey Jr. - ACTED_IN -> Back To School
Robert Downey Jr. - ACTED_IN -> Less Than Zero
Robert Downey Jr. - ACTED_IN -> Johnny Be Good
Robert Downey Jr. - ACTED_IN -> True Believer
Robert Downey Jr. - ACTED_IN -> Chances Are
Robert Downey Jr. - ACTED_IN -> Air America
Rober

### Retrieval function

In [88]:
# Define a combined retriever function that merges structured and unstructured data
def retriever(question: str):
    """Assemble the context passed to the answer prompt for a question.

    Prints the search query, gathers relationship lines from
    ``structured_retriever`` and passage texts from
    ``vector_index.similarity_search``, and returns both in one labelled
    text block.
    """
    print(f"Search query: {question}")
    graph_facts = structured_retriever(question)
    hits = vector_index.similarity_search(question)
    # "#Document " is placed between passages (not before the first one).
    passages = "#Document ".join(doc.page_content for doc in hits)
    return (
        "Structured data:\n"
        f"{graph_facts}\n"
        "Unstructured data:\n"
        f"{passages}\n"
        "    "
    )

In [89]:
# Prompt text used to condense a follow-up question, together with the prior
# chat history, into a single standalone question.
# Placeholders filled at run time: {chat_history} and {question}.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

In [90]:
# Wrap `_template` in a PromptTemplate; it is used by the chat-history
# branch of `_search_query` to rephrase follow-up questions.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [91]:
# Helper function to format chat history into a list of messages
def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    """Turn (human, ai) text pairs into an alternating list of messages.

    Each pair contributes a ``HumanMessage`` followed by an ``AIMessage``,
    in the order the pairs appear in ``chat_history``.
    """
    messages = []
    for user_turn, assistant_turn in chat_history:
        messages.extend(
            (HumanMessage(content=user_turn), AIMessage(content=assistant_turn))
        )
    return messages

# Resolve the query string that is handed to the retriever.
# - With chat history: the history is converted to messages and, together
#   with the follow-up question, condensed by the LLM into one standalone
#   question.
# - Without chat history: the question is used unchanged.
_search_query = RunnableBranch(
    (
        # Branch condition: true when the input has a non-empty "chat_history".
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        # Reuse the notebook's shared `llm` (same model and temperature)
        # instead of constructing a second client with its own hard-coded
        # key string, so the model is configured in exactly one place.
        | llm
        | StrOutputParser(),
    ),
    # Default branch: no chat history, pass the question text through.
    RunnableLambda(lambda x: x["question"]),
)

In [93]:
# Answer-generation prompt text: the model is told to answer from the
# retrieved {context} only, for the given {question}.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Use natural language and be concise.\n"
    "Answer:"
)

# Build the chat prompt used by the final chain. This rebinds the name
# `prompt`, which an earlier cell used for the entity-extraction prompt.
prompt = ChatPromptTemplate.from_template(template)

# Final question-answering chain. The input is a dict with a "question" key
# (and optionally "chat_history"):
#   1. in parallel, resolve the search query and retrieve context for it,
#      and pick out the question text;
#   2. render the answer prompt;
#   3. call the LLM and return its reply as a plain string.
chain = (
    RunnableParallel(
        {
            "context": _search_query | retriever,
            # Forward only the question text. A bare RunnablePassthrough()
            # would hand the whole input dict to the prompt, which would then
            # render the dict itself into the {question} slot.
            "question": RunnableLambda(lambda x: x["question"]),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

### Some Example queries

In [98]:
# Fact-based Queries
chain.invoke({"question": "When and where was Robert Downey Jr. born?"})



Search query: When and where was Robert Downey Jr. born?




'Robert Downey Jr. was born on April 4, 1965, in Manhattan, New York City.'

In [99]:
# Relationship Queries
chain.invoke({"question": "Who are Robert Downey Jr.'s parents?"})


Search query: Who are Robert Downey Jr.'s parents?




"Robert Downey Jr.'s parents are Robert Downey Sr. and Elsie Ann (Née Ford)."

In [100]:
# Relationship query about the spouse.
# NOTE(review): the answer recorded below names Elsie Ann Ford as a wife,
# while the answer recorded for the parents query lists Elsie Ann (Née Ford)
# as a parent; the two outputs contradict each other, so verify this result
# against the graph data.
chain.invoke({"question": "What is the name of Robert Downey Jr.'s wife?"})





Search query: What is the name of Robert Downey Jr.'s wife?




'Robert Downey Jr. has been married three times. His first wife was Elsie Ann Ford, his second wife was Laura Ernst, and his third wife is Rosemary Rogers.'

In [101]:
# Complex Relationship Queries
chain.invoke({"question": "What was Robert Downey Jr.'s relationship with substance abuse?"})




Search query: What was Robert Downey Jr.'s relationship with substance abuse?




'Robert Downey Jr. struggled with substance abuse and had multiple run-ins with the law due to drug charges. He spent time at the Corcoran Substance Abuse Treatment Facility and underwent a court-ordered drug treatment program. He has maintained his sobriety since 2003.'

In [102]:
chain.invoke({"question": "What is Robert Downey Jr.'s ethnicity?"})

Search query: What is Robert Downey Jr.'s ethnicity?




'Robert Downey Jr. is of American ethnicity.'

In [103]:
# Concept-based Query
chain.invoke({"question": "How did Robert Downey Jr.'s career evolve after overcoming addiction?"})

Search query: How did Robert Downey Jr.'s career evolve after overcoming addiction?




"After overcoming addiction, Robert Downey Jr. made his acting comeback in the 2003 film The Singing Detective. He then starred in several successful films, including the black comedy Kiss Kiss Bang Bang (2005), the thriller Zodiac (2007), and the action comedy Tropic Thunder (2008). Downey gained global recognition for his role as Iron Man in ten Marvel Cinematic Universe films, from Iron Man (2008) to Avengers: Endgame (2019). He also played Sherlock Holmes in Guy Ritchie's Sherlock Holmes (2009) and its 2011 sequel. Downey has taken on dramatic parts in films like The Judge (2014) and Oppenheimer (2023), earning an Academy Award, a Golden Globe, and a BAFTA Award for his role in Oppenheimer. He was also nominated for a Primetime Emmy Award for his work in the black comedy miniseries The Sympathizer (2024) and made his Broadway debut in 2024."