In [None]:
###################################
#  Neo4j Data Import Workflow
####################################
#
# This notebook imports movie, genre, and person (actor) data into Neo4j.
# It loads the following CSV files:
# - `genre.csv`
# - `movies-master.csv`
# - `movie_ids.csv`
# - `person_ids.csv`
# - `actor-movie-ids-master.csv`
# 
# Then it creates:
# - Genre nodes (from genre.csv)
# - Movie nodes (from movies-master.csv, filtered using movie_ids.csv)
# - Person nodes (from person_ids.csv)
# - HAS_GENRE relationships (Movie → Genre)
# - ACTED_IN relationships (Person → Movie)

# Required library imports
import ast
import os

import pandas as pd
from neo4j import GraphDatabase

###################################
# Step 0: Neo4j Connection & Config
###################################

# Neo4j connection configuration.
# Each setting can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the fallbacks are the
# values this notebook used before.
BOLT_URL = os.environ.get("NEO4J_BOLT_URL", "bolt://neo4j:7687")  # neo4j bolt port
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")                # Neo4j username
NEO4J_PASS = os.environ.get("NEO4J_PASS", "")                     # Neo4j password

# Initialize the Neo4j driver.
driver = GraphDatabase.driver(BOLT_URL, auth=(NEO4J_USER, NEO4J_PASS))
# Constructing the driver is not treated as proof that the server is reachable
# or that the credentials are accepted: verify_connectivity() raises if either
# check fails, so the success message below is only printed when it is true.
driver.verify_connectivity()
print("Connected to Neo4j.")


In [None]:
###################################
# Step 1: Import Genre Nodes
###################################
# Load `genre.csv` and create Genre nodes. Use MERGE to avoid duplicates.


# Load genres and create Genre nodes.
df_genre = pd.read_csv("~/work/cinescope/data/csv-files/genre.csv")
print(f"Loaded {len(df_genre)} genres from genre.csv")

# One dict per CSV row; the query below reads the `id` and `name` keys.
genre_records = df_genre.to_dict('records')
with driver.session() as session:
    # MERGE keys on `id`, so re-running the cell does not duplicate genres;
    # `name` is only written when the node is first created.
    # consume() waits for the server to finish the statement and raises if it
    # failed. Without it the result is discarded when the session closes and a
    # late failure can go unnoticed.
    session.run(
        """
        UNWIND $genres AS genre
        MERGE (g:Genre {id: genre.id})
        ON CREATE SET g.name = genre.name
        """,
        genres=genre_records
    ).consume()
print("Genre nodes imported.")


In [None]:
###################################
# Step 2: Import Movie Nodes and Genre Relationships
###################################

# 2.1 Read `movie_ids.csv` and collect its `id` column into a set.
# The set is what later steps use to decide which movies to import.
df_movie_ids = pd.read_csv("~/work/cinescope/data/csv-files/movie_ids.csv")
valid_movie_ids = {movie_id for movie_id in df_movie_ids['id']}
print(f"Loaded {len(valid_movie_ids)} valid movie IDs from movie_ids.csv")


# 2.2 Create Movie nodes from `movies-master.csv`
# The file is read in chunks, filtered down to the valid movie IDs, and only
# the key properties (id, title, popularity, vote_average) are sent to Neo4j.

movies_file = "~/work/cinescope/data/csv-files/movies-master.csv"
chunk_size = 10000  # Adjust based on available memory

with pd.read_csv(movies_file, chunksize=chunk_size) as reader:
    for chunk in reader:
        # Filter to only include movies with IDs in our valid list
        chunk = chunk[chunk["id"].isin(valid_movie_ids)]
        if chunk.empty:
            continue

        # Replace NaN with None so missing values reach Neo4j as null.
        # The columns are cast to object first: on a float column,
        # `where(..., None)` by itself leaves NaN in place, and NaN would then
        # be stored as a property value instead of the property being absent.
        movie_props = chunk[['id', 'title', 'popularity', 'vote_average']]
        movie_props = movie_props.astype(object).where(movie_props.notna(), None)

        movie_records = movie_props.to_dict('records')
        with driver.session() as session:
            # consume() surfaces any server-side failure for this batch
            # instead of letting it be dropped when the session closes.
            session.run(
                """
                UNWIND $movies AS movie
                MERGE (m:Movie {id: movie.id})
                ON CREATE SET 
                    m.title = movie.title,
                    m.popularity = movie.popularity,
                    m.vote_average = movie.vote_average
                """,
                movies=movie_records
            ).consume()
print("Movie nodes imported.")


# 2.3 Create HAS_GENRE relationships
# Each movie's `genres` field is text holding a list of mappings that carry an
# "id" key. Parse it and link the Movie node to the matching Genre nodes.

# Process movies again in chunks to create genre relationships
skipped_genre_rows = 0  # movies whose `genres` text could not be parsed
with pd.read_csv(movies_file, chunksize=chunk_size) as reader:
    for chunk in reader:
        chunk = chunk[chunk["id"].isin(valid_movie_ids)]
        if chunk.empty:
            continue

        movie_genre_pairs = []
        for movie_id, genres_field in zip(chunk["id"], chunk["genres"]):
            # Missing cells come out of read_csv as NaN (a float), so anything
            # that is not text has no genre list to parse.
            if not isinstance(genres_field, str):
                continue
            try:
                # literal_eval only accepts Python literals. eval() would
                # execute whatever expression happens to be stored in the CSV.
                genres_list = ast.literal_eval(genres_field)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                skipped_genre_rows += 1
                continue
            if not isinstance(genres_list, (list, tuple)):
                # Parsed, but not a list of genres (e.g. a bare number).
                skipped_genre_rows += 1
                continue
            for g in genres_list:
                # Ignore entries that are not mappings with an "id" key.
                if not isinstance(g, dict) or "id" not in g:
                    continue
                movie_genre_pairs.append({"movie_id": int(movie_id), "genre_id": int(g["id"])})

        if movie_genre_pairs:
            with driver.session() as session:
                # MATCH (not MERGE) on both ends: pairs that reference a Movie
                # or Genre that does not exist are silently left out.
                # consume() surfaces any server-side failure for this batch.
                session.run(
                    """
                    UNWIND $pairs AS pair
                    MATCH (m:Movie {id: pair.movie_id})
                    MATCH (g:Genre {id: pair.genre_id})
                    MERGE (m)-[:HAS_GENRE]->(g)
                    """,
                    pairs=movie_genre_pairs
                ).consume()
print("HAS_GENRE relationships created.")
if skipped_genre_rows:
    print(f"Skipped {skipped_genre_rows} movies with an unparseable genres field.")


# 2.4 Ensure All Movie Nodes Exist
# Some movie IDs from `movie_ids.csv` might not have been in `movies-master.csv`.
# Create minimal Movie nodes (id only) for those IDs.

with driver.session() as session:
    # A single record comes back, holding the ids of every Movie node.
    result = session.run("MATCH (m:Movie) RETURN collect(m.id) AS ids")
    imported_movie_ids = set(result.single()["ids"])
missing_movie_ids = valid_movie_ids - imported_movie_ids

if missing_movie_ids:
    print(f"Adding {len(missing_movie_ids)} movie nodes with minimal info.")
    with driver.session() as session:
        # valid_movie_ids was built from a pandas column, so its members may be
        # NumPy integers; cast to plain int so the driver can always serialize
        # the parameter. consume() surfaces any server-side failure.
        session.run(
            """
            UNWIND $ids AS mid
            MERGE (m:Movie {id: mid})
            """,
            ids=[int(mid) for mid in missing_movie_ids]
        ).consume()
else:
    print("No missing movie nodes found.")


In [None]:
###################################
# Step 3: Import Person (Actor) Nodes from `person_ids.csv`
###################################

person_file = "~/work/cinescope/data/csv-files/person_ids.csv"
chunk_size = 10000

dropped_person_rows = 0  # rows left out because their id was missing or non-numeric
with pd.read_csv(person_file, chunksize=chunk_size) as reader:
    for chunk in reader:
        # Person nodes are merged on `id`. A null id makes MERGE fail, and a
        # non-integer id would never match the integer actor ids used when the
        # ACTED_IN relationships are created, so such rows are dropped and
        # counted rather than silently passed through.
        people = (
            chunk[['id', 'name', 'popularity', 'adult']]
            .assign(id=lambda frame: pd.to_numeric(frame['id'], errors='coerce'))
            .dropna(subset=['id'])
            .astype({'id': 'int64'})
        )
        dropped_person_rows += len(chunk) - len(people)
        if people.empty:
            continue

        # Replace remaining NaN with None so missing values reach Neo4j as
        # null. The cast to object is needed because `where(..., None)` leaves
        # NaN in place on float columns.
        people = people.astype(object).where(people.notna(), None)

        person_records = people.to_dict('records')
        with driver.session() as session:
            # consume() surfaces any server-side failure for this batch
            # instead of letting it be dropped when the session closes.
            session.run(
                """
                UNWIND $batch AS person
                MERGE (p:Person {id: person.id})
                ON CREATE SET 
                    p.name = person.name,
                    p.popularity = person.popularity,
                    p.adult = person.adult
                """,
                batch=person_records
            ).consume()
print("Person nodes imported.")
if dropped_person_rows:
    print(f"Skipped {dropped_person_rows} person rows without a usable id.")


In [None]:
###################################
# Step 4: Create ACTED_IN Relationships from `actor-movie-ids-master.csv`
###################################
# Link Person nodes to Movie nodes based on actor and movie IDs.

rel_file = "~/work/cinescope/data/csv-files/actor-movie-ids-master.csv"
chunk_size = 10000

with pd.read_csv(rel_file, chunksize=chunk_size) as reader:
    for chunk in reader:
        # Only the two ID columns are read by the query. Rows missing either
        # one cannot form a relationship and would make the int cast raise, so
        # they are dropped first.
        pair_frame = chunk[['actor_id', 'movie_id']].dropna().astype(int)
        if pair_frame.empty:
            continue

        rel_pairs = pair_frame.to_dict('records')
        with driver.session() as session:
            # MATCH on both ends: pairs that reference a Person or Movie that
            # was not imported are left out rather than creating new nodes.
            # consume() surfaces any server-side failure for this batch.
            session.run(
                """
                UNWIND $pairs AS pair
                MATCH (p:Person {id: pair.actor_id})
                MATCH (m:Movie {id: pair.movie_id})
                MERGE (p)-[:ACTED_IN]->(m)
                """,
                pairs=rel_pairs
            ).consume()
print("ACTED_IN relationships created.")


# The driver is deliberately left open here: the index-creation cell that
# follows still opens a session on it and closes the driver when it is done.
print("Data import completed.")


In [None]:
###################################
# Step 5: Create Indexes
###################################
# Index the name/title properties of the three node labels.
# A run() call carries exactly one Cypher statement; a semicolon-separated
# script is rejected by the server, so each index gets its own call.
# IF NOT EXISTS keeps the cell re-runnable once the indexes are in place
# (NOTE(review): needs a server that supports it, Neo4j 4.1.3+ — confirm).
with driver.session() as session:
    for index_statement in (
        "CREATE INDEX IF NOT EXISTS FOR (p:Person) ON (p.name)",
        "CREATE INDEX IF NOT EXISTS FOR (g:Genre) ON (g.name)",
        "CREATE INDEX IF NOT EXISTS FOR (m:Movie) ON (m.title)",
    ):
        # consume() raises if the server rejected the statement.
        session.run(index_statement).consume()
# Finally, close the Neo4j driver.
driver.close()
print("Indexes Created")

