# Imports

In [36]:
import os
import textwrap
from dotenv import load_dotenv
import pprint

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI

# Warning control
import warnings
warnings.filterwarnings("ignore")

In [37]:
# Load from environment
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Global constants
VECTOR_INDEX_NAME = 'film_overview'
VECTOR_NODE_LABEL = 'Film'
VECTOR_SOURCE_PROPERTY = 'overview'
VECTOR_EMBEDDING_PROPERTY = 'overviewEmbedding'

In [38]:
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE
)

# Utils

## Delete all data

In [39]:
cypher = """
MATCH (n)
DETACH DELETE n
"""
kg.query(cypher)

[]

## Check for amount of total nodes

In [40]:
cypher = """
  MATCH (n) 
  RETURN count(n) AS numberOfNodes
"""
print(kg.query(cypher))

cypher = """
  MATCH (f:Film) 
  RETURN count(f) AS numberOfFilms
  """
print(kg.query(cypher))

[{'numberOfNodes': 0}]
[{'numberOfFilms': 0}]


## Remove and show indices

In [41]:
kg.query("DROP INDEX `film_keyword_index`")
kg.query("DROP INDEX `film_overview_index`")
kg.query("SHOW INDEXES")
# kg.refresh_schema()
# print(kg.schema)

[{'id': 0,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 3, 14, 20, 39, 55, 210000000, tzinfo=<UTC>),
  'readCount': 103091},
 {'id': 1,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 3, 14, 16, 35, 1, 369000000, tzinfo=<UTC>),
  'readCount': 2}]

# Creating list of films from CSV

In [42]:
import csv

all_films = []

YEARS = [2022, 2023]

for year in YEARS:
  with open(f'./data/{year}_movie_collection_data.csv', mode='r') as csv_file:
      csv_reader = csv.DictReader(csv_file)
      for row in csv_reader: # each row will be a dictionary
        all_films.append(row)

In [43]:
len(all_films)

193

In [44]:
first_film = all_films[0]
print(all_films)

[{'Title': 'Reign of Chaos', 'Runtime (minutes)': '77', 'Language': 'en', 'Overview': "When the world is gripped by a plague unleashed by the evil lord Chaos, and humans are turned into rabid creatures, mankind can only be saved by three young women, descendants of a Goddess, with the power to stop Chaos' evil.", 'Release Date': '2022-04-12', 'Genre': 'Action, Horror, Fantasy', 'Keywords': 'chaos, dystopia, warrior woman', 'Recommendation': "Krampus, Leo, Rebel Moon - Part One: A Child of Fire, Five Nights at Freddy's, Oppenheimer, The Creator, Napoleon, Killers of the Flower Moon, How to Train Your Dragon, Expend4bles, Candy Cane Lane, Trolls Band Together, WALL·E, Inception, Parasite, Deadpool, Interstellar, The Last Samurai, Barbie, Black Widow, The Whale", 'Actors': 'Ray Whelan, Peter Cosgrove, Kate Milner Evans, Mark Sears, Rebecca Finch', 'Director': 'Rebecca Matthews', 'Stream': '', 'Buy': 'Apple TV, Amazon Video, Google Play Movies, YouTube ', 'Rent': 'Apple TV, Amazon Video, G

# Create Film and Genre Node, with Relationship

In [45]:
# first_films = all_films[0:5]

for film in all_films:
    genre_list = film['Genre'].split(", ")
    actor_list = film['Actors'].split(", ")
    prod_comp_list = film['Production Companies'].split(", ")

    cypher_query = """
    MERGE (film:Film {title: $title})
    ON CREATE
        SET film.runtime = toInteger($runtime),
            film.language = $language,
            film.overview = $overview,
            film.release_date = datetime($release_date),
            film.keywords = $keywords,
            film.source = $website
    WITH film
    UNWIND $genres AS genre_type
    UNWIND $actors AS actor_name
    UNWIND $prodcution_companies AS company_name

    MERGE (genre:Genre {type: genre_type})
    MERGE (actor:Actor {name: actor_name})
    MERGE (director:Director {name: $director})
    MERGE (company:Production_Company {name: company_name})

    MERGE (film)-[:HAS_GENRE]->(genre)
    MERGE (director)-[:HAS_DIRECTED]->(film)
    MERGE (actor)-[:STARRED_IN]->(film)
    MERGE (company)-[:PRODUCED]->(film)

    """

    kg.query(cypher_query, params={
        'title': film['Title'],
        'runtime': film['Runtime (minutes)'],
        'language': film['Language'],
        'overview': film['Overview'],
        'release_date': film['Release Date'],
        'keywords': film['Keywords'],
        'website': film['Website'],
        'genres': genre_list,
        'actors': actor_list,
        'director': film['Director'],
        'prodcution_companies': prod_comp_list
    })

In [46]:
kg.refresh_schema()
print(textwrap.fill(kg.schema, 60))

Node properties are the following: Film {title: STRING,
runtime: INTEGER, language: STRING, overview: STRING,
release_date: DATE_TIME, keywords: STRING, source:
STRING},Genre {type: STRING},Actor {name: STRING},Director
{name: STRING},Production_Company {name: STRING}
Relationship properties are the following:  The
relationships are the following: (:Film)-[:HAS_GENRE]-
>(:Genre),(:Actor)-[:STARRED_IN]->(:Film),(:Director)-
[:HAS_DIRECTED]->(:Film),(:Production_Company)-[:PRODUCED]-
>(:Film)


# Create Vector Index for Overview, Keywords

In [47]:
# Creating embedding for overview property of films
kg.query("""
         CREATE VECTOR INDEX `film_overview_index` IF NOT EXISTS
          FOR (f:Film) ON (f.overviewEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
""")

kg.query("""
    MATCH (film:Film) WHERE film.overviewEmbedding IS NULL
    WITH film, genai.vector.encode(
      film.overview, 
      "OpenAI", 
      {
        token: $openAiApiKey 
      }) AS vector
    CALL db.create.setNodeVectorProperty(film, "overviewEmbedding", vector)
    """,
    params={"openAiApiKey": OPENAI_API_KEY })

# Creating embedding for keywords property of films
kg.query("""
        CREATE VECTOR INDEX `film_keyword_index` IF NOT EXISTS
        FOR (f:Film) ON (f.keywordsEmbedding) 
        OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
        }}
""")

kg.query("""
    MATCH (film:Film) WHERE film.keywordsEmbedding IS NULL
    WITH film, genai.vector.encode(
      film.keywords, 
      "OpenAI", 
      {
        token: $openAiApiKey 
      }) AS vector
    CALL db.create.setNodeVectorProperty(film, "keywordsEmbedding", vector)
    """,
         params={"openAiApiKey": OPENAI_API_KEY})

# Viewing indexes
kg.query("SHOW INDEXES")

[{'id': 3,
  'name': 'film_keyword_index',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Film'],
  'properties': ['keywordsEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 4,
  'name': 'film_overview_index',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Film'],
  'properties': ['overviewEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 0,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 3, 14, 20, 46, 14, 993000000, tzinfo=<UTC>),
  'readCount': 141095},
 {'id': 1,
  'name': 'index_f7

# Sample Queries

## Getting all actors in a film

In [48]:
cypher = """
  MATCH (a:Actor)-[r:STARRED_IN]->(f:Film)
    WHERE f.title = $title
  RETURN a.name
"""

actors_info = kg.query(cypher, params={
    'title': 'Jurassic World Dominion'
})

for item in actors_info:
    print(item)

{'a.name': 'Laura Dern'}
{'a.name': 'Bryce Dallas Howard'}
{'a.name': 'Sam Neill'}
{'a.name': 'Chris Pratt'}
{'a.name': 'Jeff Goldblum'}


## Getting all films in a genre

In [49]:
cypher = """
  MATCH (f:Film)-[r:HAS_GENRE]->(g:Genre)
    WHERE g.type = $genre
  RETURN f.title
"""

films_in_genre = kg.query(cypher, params={
    'genre': 'Horror'
})

for item in films_in_genre:
    print(item)

{'f.title': 'Reign of Chaos'}
{'f.title': 'The OctoGames'}
{'f.title': 'Before Night Falls'}
{'f.title': 'The One Hundred'}
{'f.title': 'Beast'}
{'f.title': 'Halloween Ends'}
{'f.title': 'The Exorcism of God'}
{'f.title': 'Slash/Back'}
{'f.title': 'Nope'}
{'f.title': 'Witch Trials'}
{'f.title': 'Hellraiser'}
{'f.title': 'Scream'}
{'f.title': 'Project Wolf Hunting'}
{'f.title': 'Orphan: First Kill'}
{'f.title': 'Venus'}
{'f.title': 'Wyrmwood: Apocalypse'}
{'f.title': 'Smile'}
{'f.title': 'Pearl'}
{'f.title': 'The Black Phone'}
{'f.title': 'M3GAN'}
{'f.title': 'Jeepers Creepers: Reborn'}
{'f.title': 'The Exorcism of Hannah Stevenson'}
{'f.title': 'Good Boy'}
{'f.title': 'X'}
{'f.title': 'The Menu'}
{'f.title': 'Terrifier 2'}
{'f.title': 'Deep Fear'}
{'f.title': 'Fear'}
{'f.title': 'Godzilla Minus One'}
{'f.title': 'Scream VI'}
{'f.title': 'Sleep'}
{'f.title': "Five Nights at Freddy's"}
{'f.title': 'Squealer'}
{'f.title': 'Carousel'}
{'f.title': 'Saw X'}
{'f.title': 'Skal - Fight for Surv

# Get all films withing a few nodes of a particular film

In [112]:
cypher = """
    MATCH (start:Film {title: $title})-[r:HAS_DIRECTED|STARRED_IN|PRODUCED*1..2]-(connected:Film)
    WITH start, connected, r, type(head(r)) AS related_by
    OPTIONAL MATCH (start)<-[:PRODUCED]-(p:Production_Company)-[:PRODUCED]->(connected)
    OPTIONAL MATCH (start)<-[:STARRED_IN]-(a:Actor)-[:STARRED_IN]->(connected)
    OPTIONAL MATCH (start)<-[:HAS_DIRECTED]-(d:Director)-[:HAS_DIRECTED]->(connected)
    RETURN connected.title as related_film, 
            related_by, 
            collect(distinct p.name) as production_companies, 
            collect(distinct a.name) as actors, 
            collect(distinct d.name) as directors
    """

related_films = kg.query(cypher, params={
    'title': 'Everything Everywhere All at Once'
})

pprint.pprint(related_films)

[{'actors': [],
  'directors': [],
  'production_companies': ['A24'],
  'related_by': 'PRODUCED',
  'related_film': 'Past Lives'},
 {'actors': [],
  'directors': [],
  'production_companies': ['A24'],
  'related_by': 'PRODUCED',
  'related_film': 'The Zone of Interest'},
 {'actors': [],
  'directors': [],
  'production_companies': ['A24'],
  'related_by': 'PRODUCED',
  'related_film': 'X'},
 {'actors': [],
  'directors': [],
  'production_companies': ['A24'],
  'related_by': 'PRODUCED',
  'related_film': 'Pearl'},
 {'actors': [],
  'directors': [],
  'production_companies': ['A24'],
  'related_by': 'PRODUCED',
  'related_film': 'The Whale'},
 {'actors': ['Michelle Yeoh'],
  'directors': [],
  'production_companies': [],
  'related_by': 'STARRED_IN',
  'related_film': 'Minions: The Rise of Gru'},
 {'actors': ['Jamie Lee Curtis'],
  'directors': [],
  'production_companies': [],
  'related_by': 'STARRED_IN',
  'related_film': 'Halloween Ends'}]


## Get all films directed by the director of a particular film

In [51]:
# Getting all films directed by whoever directed 'The Next 365 Days'
cypher = """
    MATCH (start:Film {title: $title})<-[:HAS_DIRECTED]-(d:Director)-[:HAS_DIRECTED]->(connected:Film)
    RETURN connected.title
"""

related_films = kg.query(cypher, params={
    'title': 'The Next 365 Days'
})

for item in related_films:
    print(item)

# Getting directors who directed more than 1 film
cypher = """
MATCH (d:Director)-[:HAS_DIRECTED]->(f:Film)
WITH d, count(f) AS numFilms
WHERE numFilms > 1
RETURN d.name
"""

multifilm_directors = kg.query(cypher)
print(multifilm_directors)

{'connected.title': '365 Days: This Day'}
[{'d.name': 'Barbara Białowąs, Tomasz Mandes'}, {'d.name': 'Matt Bettinelli-Olpin, Tyler Gillett'}, {'d.name': 'Ti West'}]


## Getting Films (Title + Overview + Keywords) Directed by a Director

In [52]:
cypher = """
  MATCH (d:Director)-[r:HAS_DIRECTED]->(f:Film)
    WHERE d.name = $director
  RETURN f.title, f.overview, f.keywords
"""

director_films = kg.query(cypher, params={
    'director': 'Aaron Mirtes'
})

for item in director_films:
    print(item)

{'f.title': 'The OctoGames', 'f.overview': 'Eight contestants compete in eight deadly, classic children\'s games. They seek fame beyond their wildest dreams, competing for the chance to take over the YouTube channel of the famous yet elusive masked content creator known only as "JaxPro".', 'f.keywords': 'None'}


## Checking to see number of comedy films

In [53]:
cypher = """
  MATCH (f:Film)-[:HAS_GENRE]->(g:Genre {type: 'Comedy'})
  RETURN count(f) AS NumberOfComedyFilms
  """
kg.query(cypher)

[{'NumberOfComedyFilms': 47}]

## Getting all films produced by a production company

In [110]:
cypher = """
  MATCH (c:Production_Company)-[r:PRODUCED]->(f:Film)
    WHERE c.name = $name
  RETURN f.title, f.overview, f.keywords
"""

director_films = kg.query(cypher, params={
    'name': 'A24'
})

for item in director_films:
    print(item)

{'f.title': 'The Whale', 'f.overview': 'A reclusive English teacher suffering from severe obesity attempts to reconnect with his estranged teenage daughter for one last chance at redemption.', 'f.keywords': 'regret, nurse, missionary, idaho, bible, redemption, overweight man, addiction, based on play or musical, teacher, grief, neighbor, obesity, religion, death of lover, election, rebellious daughter, guilt, death, lgbt, sister-in-law, eating disorder, father daughter reunion, empathy, shame, english teacher, abandonment, one location, father daughter relationship, 2010s, gay theme, apartment, essay, food addiction, religious symbolism'}
{'f.title': 'Everything Everywhere All at Once', 'f.overview': "An aging Chinese immigrant is swept up in an insane adventure, where she alone can save what's important to her by connecting with the lives she could have led in other universes.", 'f.keywords': 'mother, martial arts, kung fu, philosophy, generations conflict, chinese woman, laundromat, 

# Use Similarity Search

In [55]:
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=[VECTOR_SOURCE_PROPERTY],
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
)

retriever = neo4j_vector_store.as_retriever()

In [56]:
chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(model_name="gpt-3.5-turbo-0125", temperature=0),
    chain_type="stuff",
    retriever=retriever
)

def prettychain(question: str) -> str:
    """Pretty print the chain's response to a question"""
    response = chain({"question": question},
                     return_only_outputs=True,)
    print(textwrap.fill(response['answer'], 60))

In [57]:
question = "List the titles of any films about a man has an accident at work, and his boss asks his wife to have sex with him in order to free him from responsibility.."
prettychain(question)

There are no films about a man having an accident at work,
and his boss asking his wife to have sex with him in order
to free him from responsibility.
