# Lesson 3: Preparing Text Data for RAG

<p style="background-color:#fd4a6180; padding:15px; margin-left:20px"> <b>Note:</b> This notebook takes about 30 seconds to be ready to use. Please wait until the "Kernel starting, please wait..." message clears from the top of the notebook before running any cells. You may start the video while you wait.</p>


### Import packages and set up Neo4j

In [48]:
from dotenv import load_dotenv
import os

from langchain_community.graphs import Neo4jGraph

# Warning control
import warnings
warnings.filterwarnings("ignore")

In [49]:
# # Load from environment
# load_dotenv('.env', override=True)
# NEO4J_URI = os.getenv('NEO4J_URI')
# NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
# NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
# NEO4J_DATABASE = os.getenv('NEO4J_DATABASE')
# OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# # Note the code below is unique to this course environment, and not a 
# # standard part of Neo4j's integration with OpenAI. Remove if running 
# in your own environment.
OPENAI_ENDPOINT = "https://api.openai.com/v1/embeddings"

In [50]:
# Connect to the knowledge graph instance using LangChain

NEO4J_URI="bolt://localhost:7687"
NEO4J_USERNAME="neo4j"
NEO4J_PASSWORD="12345678"
NEO4J_DATABASE="neo4j"
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

### Create a vector index 

In [None]:
# for 5.13
# CALL db.index.vector.createNodeIndex(
#   'movie_tagline_embeddingss',
#   'Movie',
#   'taglineEmbedding',  
#   1536,
#   'cosine'
# );

In [42]:
# creating vector index assocated with movie nodes and thier taglinemebdding property, even they dont exist

kg.query("""
  CREATE VECTOR INDEX movie_tagline_embeddings IF NOT EXISTS
  FOR (m:Movie) ON (m.taglineEmbedding) 
  OPTIONS { indexConfig: {
    `vector.dimensions`: 1536,
    `vector.similarity_function`: 'cosine'
  }}"""
)


[]

In [None]:
# SHOW INDEXES; 5.13

In [51]:
kg.query("""
  SHOW VECTOR INDEXES
  """
)

[{'id': 4,
  'name': 'moviePlots',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Movie'],
  'properties': ['embedding'],
  'indexProvider': 'vector-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 3,
  'name': 'movie_tagline_embeddingss',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Movie'],
  'properties': ['taglineEmbedding'],
  'indexProvider': 'vector-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0}]

In [24]:
cypher = """
  MATCH (n:Movie) 
  RETURN count(n)
  """
kg.query(cypher)

[{'count(n)': 38}]

### Populate the vector index
- Calculate vector representation for each movie tagline using OpenAI
- Add vector to the `Movie` node as `taglineEmbedding` property

In [35]:
kg.query("""
    MATCH (movie:Movie) WHERE movie.tagline IS NOT NULL
    WITH movie, genai.vector.encode(
        movie.tagline, 
        "OpenAI", 
        {
          token: $openAiApiKey,
          endpoint: $openAiEndpoint
        }) AS vector
    CALL db.create.setNodeVectorProperty(movie, "taglineEmbedding", vector)
    """, 
    params={"openAiApiKey":OPENAI_API_KEY, "openAiEndpoint": OPENAI_ENDPOINT} )

[]

In [56]:
result = kg.query("""MATCH (n)
RETURN n { .* , taglineEmbedding: null }
 """
)
print(result)

[{'n': {'released': 1999, 'taglineEmbedding': None, 'title': 'The Matrix', 'tagline': 'Welcome to the Real World'}}, {'n': {'taglineEmbedding': None, 'born': 1964, 'name': 'Keanu Reeves'}}, {'n': {'taglineEmbedding': None, 'born': 1967, 'name': 'Carrie-Anne Moss'}}, {'n': {'taglineEmbedding': None, 'born': 1961, 'name': 'Laurence Fishburne'}}, {'n': {'taglineEmbedding': None, 'born': 1960, 'name': 'Hugo Weaving'}}, {'n': {'taglineEmbedding': None, 'born': 1967, 'name': 'Andy Wachowski'}}, {'n': {'taglineEmbedding': None, 'born': 1965, 'name': 'Lana Wachowski'}}, {'n': {'taglineEmbedding': None, 'born': 1952, 'name': 'Joel Silver'}}, {'n': {'taglineEmbedding': None, 'born': 1978, 'name': 'Emil Eifrem'}}, {'n': {'released': 2003, 'taglineEmbedding': None, 'title': 'The Matrix Reloaded', 'tagline': 'Free your mind'}}, {'n': {'released': 2003, 'taglineEmbedding': None, 'title': 'The Matrix Revolutions', 'tagline': 'Everything that has a beginning has an end'}}, {'n': {'released': 1997, 'ta

In [58]:
result = kg.query("""MATCH (n:Person)
RETURN n { .* }
 """
)
print(result)

[{'n': {'born': 1964, 'name': 'Keanu Reeves'}}, {'n': {'born': 1967, 'name': 'Carrie-Anne Moss'}}, {'n': {'born': 1961, 'name': 'Laurence Fishburne'}}, {'n': {'born': 1960, 'name': 'Hugo Weaving'}}, {'n': {'born': 1967, 'name': 'Andy Wachowski'}}, {'n': {'born': 1965, 'name': 'Lana Wachowski'}}, {'n': {'born': 1952, 'name': 'Joel Silver'}}, {'n': {'born': 1978, 'name': 'Emil Eifrem'}}, {'n': {'born': 1975, 'name': 'Charlize Theron'}}, {'n': {'born': 1940, 'name': 'Al Pacino'}}, {'n': {'born': 1944, 'name': 'Taylor Hackford'}}, {'n': {'born': 1962, 'name': 'Tom Cruise'}}, {'n': {'born': 1937, 'name': 'Jack Nicholson'}}, {'n': {'born': 1962, 'name': 'Demi Moore'}}, {'n': {'born': 1958, 'name': 'Kevin Bacon'}}, {'n': {'born': 1966, 'name': 'Kiefer Sutherland'}}, {'n': {'born': 1971, 'name': 'Noah Wyle'}}, {'n': {'born': 1968, 'name': 'Cuba Gooding Jr.'}}, {'n': {'born': 1957, 'name': 'Kevin Pollak'}}, {'n': {'born': 1943, 'name': 'J.T. Walsh'}}, {'n': {'born': 1967, 'name': 'James Marshal

In [36]:
result = kg.query("""
    MATCH (m:Movie) 
    WHERE m.tagline IS NOT NULL
    RETURN m.tagline, m.taglineEmbedding
    LIMIT 1
    """
)

In [37]:
result[0]['m.tagline']

'Welcome to the Real World'

In [38]:
result[0]['m.taglineEmbedding'][:10]

[0.017445066943764687,
 -0.005481892731040716,
 -0.002013522433117032,
 -0.025571243837475777,
 -0.014404304325580597,
 0.016737302765250206,
 -0.017078077420592308,
 0.000485358847072348,
 -0.025217361748218536,
 -0.029516370967030525]

In [39]:
len(result[0]['m.taglineEmbedding'])

1536

### Similarity search
- Calculate embedding for question
- Identify matching movies based on similarity of question and `taglineEmbedding` vectors

In [40]:
question = "What movies are about love?"

In [None]:

#yield clause return output of a procedure, here the output was movie node and similarity score
#also i was right, vector index just refer to the embeddings place resides.

kg.query("""
    WITH genai.vector.encode(
        $question, 
        "OpenAI", 
        {
          token: $openAiApiKey,
          endpoint: $openAiEndpoint
        }) AS question_embedding
    CALL db.index.vector.queryNodes(
        'movie_tagline_embeddingss', 
        $top_k, 
        question_embedding
        ) YIELD node AS movie, score
    RETURN movie.title, movie.tagline, score
    """, 
    params={"openAiApiKey":OPENAI_API_KEY,
            "openAiEndpoint": OPENAI_ENDPOINT,
            "question": question,
            "top_k": 5
            })

[{'movie.title': 'Joe Versus the Volcano',
  'movie.tagline': 'A story of love, lava and burning desire.',
  'score': 0.9062913656234741},
 {'movie.title': 'As Good as It Gets',
  'movie.tagline': 'A comedy from the heart that goes for the throat.',
  'score': 0.9022629261016846},
 {'movie.title': 'Snow Falling on Cedars',
  'movie.tagline': 'First loves last. Forever.',
  'score': 0.9013131856918335},
 {'movie.title': 'Sleepless in Seattle',
  'movie.tagline': 'What if someone you never met, someone you never saw, someone you never knew was the only someone for you?',
  'score': 0.8945093750953674},
 {'movie.title': "You've Got Mail",
  'movie.tagline': 'At odds in life... in love on-line.',
  'score': 0.8920691013336182}]

### Try for yourself: ask you own question!
- Change the question below and run the graph query to find different movies

In [46]:
question = "What movies are about adventure?"

In [47]:
kg.query("""
    WITH genai.vector.encode(
        $question, 
        "OpenAI", 
        {
          token: $openAiApiKey,
          endpoint: $openAiEndpoint
        }) AS question_embedding
    CALL db.index.vector.queryNodes(
        'movie_tagline_embeddingss', 
        $top_k, 
        question_embedding
        ) YIELD node AS movie, score
    RETURN movie.title, movie.tagline, score
    """, 
    params={"openAiApiKey":OPENAI_API_KEY,
            "openAiEndpoint": OPENAI_ENDPOINT,
            "question": question,
            "top_k": 5
            })

[{'movie.title': 'RescueDawn',
  'movie.tagline': "Based on the extraordinary true story of one man's fight for freedom",
  'score': 0.8998091816902161},
 {'movie.title': 'Cast Away',
  'movie.tagline': 'At the edge of the world, his journey begins.',
  'score': 0.8982738256454468},
 {'movie.title': 'Ninja Assassin',
  'movie.tagline': 'Prepare to enter a secret world of assassins',
  'score': 0.8880560398101807},
 {'movie.title': 'Joe Versus the Volcano',
  'movie.tagline': 'A story of love, lava and burning desire.',
  'score': 0.8870126008987427},
 {'movie.title': 'As Good as It Gets',
  'movie.tagline': 'A comedy from the heart that goes for the throat.',
  'score': 0.8856381773948669}]