In [None]:
#### Fix movie data: import Movie nodes that are in the master CSV but missing from Neo4j


import os

import pandas as pd
from neo4j import GraphDatabase

# Configuration
# The connection settings may be overridden through environment variables so
# that credentials do not have to be edited into this file; the defaults are
# the values that were previously hard-coded here.
BOLT_URL = os.environ.get("NEO4J_URI", "bolt://neo4j:7687")
NEO4J_USER = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASS = os.environ.get("NEO4J_PASSWORD", "")
# Master CSV; the import reads its "id" and "title" columns.
CSV_FILE = "~/work/cinescope/data/csv-files/movies-master.csv"
# Number of CSV rows pandas loads per chunk.
CHUNK_SIZE = 1_000_000  # Adjust as needed

# Connect to Neo4j
# The driver is created at module level and shared by every function below.
driver = GraphDatabase.driver(BOLT_URL, auth=(NEO4J_USER, NEO4J_PASS))

def get_existing_movie_ids():
    """Collect the ``id`` property of every ``Movie`` node in Neo4j.

    Opens one session on the module-level ``driver`` and runs a single
    query.

    Returns:
        set: the ``id`` value of each returned record.
    """
    cypher = "MATCH (m:Movie) RETURN m.id AS id"
    with driver.session() as neo_session:
        # Build the set while the session is still open, as the result is
        # iterated lazily.
        return {row["id"] for row in neo_session.run(cypher)}

def import_missing_movies():
    """Read the CSV in chunks and import missing Movie nodes into Neo4j.

    The set of ids already stored in Neo4j is fetched once up front. Each
    chunk of ``CHUNK_SIZE`` rows from ``CSV_FILE`` is then filtered down to
    rows whose ``id`` is not yet known, and the remaining ``id``/``title``
    pairs are written with ``batch_import_movies`` in one write transaction
    per chunk. Progress is printed after every chunk that imported something.

    Raises:
        KeyError: if the CSV has no ``id`` or ``title`` column.
        ValueError: if an ``id`` value is missing or cannot be converted to
            an integer (raised by ``astype(int)``).
    """
    existing_ids = get_existing_movie_ids()
    print("Already imported movie count:", len(existing_ids))

    total_imported = 0
    with pd.read_csv(CSV_FILE, chunksize=CHUNK_SIZE, encoding="utf-8") as reader:
        for chunk in reader:
            # Ensure the 'id' column is integer type so it compares equal to
            # the ids collected in existing_ids.
            chunk["id"] = chunk["id"].astype(int)
            # Filter rows that are not already imported
            missing_chunk = chunk[~chunk["id"].isin(existing_ids)]
            # An id repeated inside one chunk yields a single node (MERGE),
            # so keep only its first row; otherwise the counts printed below
            # overstate the number of movies actually imported. The first row
            # is also the one whose title ON CREATE SET would have applied.
            missing_chunk = missing_chunk.drop_duplicates(subset="id")
            if missing_chunk.empty:
                continue

            # For each missing movie, keep the id and title (adjust columns as
            # needed). pandas represents an empty title as float NaN; send
            # None instead so Neo4j leaves the property unset rather than
            # storing a NaN number as the title.
            records = [
                {"id": int(movie_id), "title": None if pd.isna(title) else title}
                for movie_id, title in zip(missing_chunk["id"], missing_chunk["title"])
            ]
            with driver.session() as session:
                # Use a write transaction for batch insertion
                session.execute_write(batch_import_movies, records)
            imported_count = len(records)
            total_imported += imported_count
            print(f"Imported {imported_count} movies in this chunk; total imported: {total_imported}")
            # Update existing_ids so subsequent chunks skip these movies
            existing_ids.update(record["id"] for record in records)
    print("Completed importing missing movies.")

def batch_import_movies(tx, records):
    """Write one batch of movies through the given transaction.

    A single Cypher statement UNWINDs ``$records`` and MERGEs a ``Movie``
    node per row keyed on ``row.id``; ``title`` is assigned only when the
    MERGE creates the node.

    Args:
        tx: transaction object exposing ``run(query, **parameters)``.
        records: rows providing the ``id`` and ``title`` fields that the
            query reads.
    """
    cypher = """
    UNWIND $records AS row
    MERGE (m:Movie {id: row.id})
    ON CREATE SET m.title = row.title
    """
    # The whole batch travels as one query parameter, so this is one round trip.
    tx.run(cypher, records=records)

if __name__ == "__main__":
    try:
        import_missing_movies()
    finally:
        # Close the driver even when the import fails part-way, so its
        # connections are not left open.
        driver.close()


Already imported movie count: 788417
