In [12]:
!pip install neo4j

[0m

In [24]:
import psycopg2
from neo4j import GraphDatabase

def fetch_latest_data():
    """
    Fetch movies updated in the last day from PostgreSQL, with their genres and actors.

    Connection settings default to the values this notebook has always used, but
    each one can be overridden through the matching environment variable
    (PGDATABASE, PGUSER, PGPASSWORD, PGHOST, PGPORT) so the cell can run against
    a database that is not on localhost without editing the code.

    The query uses inner joins, so a movie only appears if it has at least one
    genre row and at least one actor row. Adjust the query to match your schema.

    Returns:
        list[dict]: one dict per movie, in the order the query returns them
        (ORDER BY m.id), with keys "movie_id", "title", "release_year",
        "popularity", "genres" and "actors". "genres" and "actors" are sorted
        lists of unique, non-NULL names.

    Raises:
        Whatever psycopg2 raises when connecting or querying fails; the cursor
        and connection are closed before the exception propagates.
    """
    import os  # local import: only this function needs it

    # NOTE(review): the fallback values (including the password) are still
    # embedded in the notebook; prefer supplying them via the environment.
    pg_params = {
        "dbname": os.environ.get("PGDATABASE", "postgres"),
        "user": os.environ.get("PGUSER", "cinescope"),
        "password": os.environ.get("PGPASSWORD", "cine$c0pe"),
        "host": os.environ.get("PGHOST", "localhost"),
        "port": os.environ.get("PGPORT", "5432")
    }

    query = """
    SELECT 
        m.id AS movie_id,
        m.title,
        m.release_year,
        m.popularity,
        g.name AS genre,
        a.name AS actor
    FROM movies m
    JOIN movie_genre mg ON m.id = mg.movie_id
    JOIN genre g ON mg.genre_id = g.id
    JOIN movie_actor ma ON m.id = ma.movie_id
    JOIN actor a ON ma.actor_id = a.id
    WHERE m.updated_at > NOW() - INTERVAL '1 day'
    ORDER BY m.id;
    """

    # try/finally guarantees the cursor and connection are released even when
    # the query fails.
    conn = psycopg2.connect(**pg_params)
    try:
        cur = conn.cursor()
        try:
            cur.execute(query)
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    # The joins yield one row per (movie, genre, actor) combination, so collapse
    # them into one record per movie. Sets absorb the repeated names.
    movies = {}
    for movie_id, title, release_year, popularity, genre, actor in rows:
        record = movies.get(movie_id)
        if record is None:
            record = movies[movie_id] = {
                "movie_id": movie_id,
                "title": title,
                "release_year": release_year,
                "popularity": popularity,
                "genres": set(),
                "actors": set()
            }
        # Skip NULL names: they cannot be used as a MERGE key in Neo4j later on.
        if genre is not None:
            record["genres"].add(genre)
        if actor is not None:
            record["actors"].add(actor)

    # Sort while converting to lists so repeated runs produce identical output.
    for record in movies.values():
        record["genres"] = sorted(record["genres"])
        record["actors"] = sorted(record["actors"])

    return list(movies.values())

# --- Neo4j Part ---
def update_neo4j(movies_data):
    """
    Upsert the given movies, their genres and their actors into Neo4j.

    For every movie this creates or updates:
      - a Movie node keyed by id (title, release_year, popularity are overwritten)
      - a Genre node per genre name and a Movie-HAS_GENRE->Genre relationship
      - an Actor node per actor name and an Actor-ACTED_IN->Movie relationship
      - an undirected-match Actor-COACTED_WITH-Actor relationship for every pair
        of distinct actors that share the movie

    All statements use MERGE, so re-running with the same data does not create
    duplicates.

    Args:
        movies_data: iterable of dicts with keys "movie_id", "title",
            "release_year", "popularity", "genres" and "actors" (the shape
            returned by fetch_latest_data). Any iterable is accepted, including
            a generator.

    Raises:
        Whatever the Neo4j driver raises on connection or query failure; the
        driver is closed before the exception propagates.
    """
    # Materialise once: the data is walked twice below, which would silently
    # skip the second pass for a one-shot iterator.
    movies = list(movies_data)
    if not movies:
        return

    neo4j_uri = "bolt://neo4j:7687"
    neo4j_username = "neo4j"
    neo4j_password = ""

    driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))
    try:
        with driver.session() as session:
            for movie in movies:
                # Create/update the Movie node. The same properties are written
                # whether the node was just created or already existed, so a
                # plain SET after MERGE covers both cases.
                session.run(
                    """
                    MERGE (m:Movie {id: $movie_id})
                    SET m.title = $title, m.release_year = $release_year, m.popularity = $popularity
                    """,
                    {
                        "movie_id": movie["movie_id"],
                        "title": movie["title"],
                        "release_year": movie["release_year"],
                        "popularity": movie["popularity"]
                    }
                )
                # For each genre, create Genre node and HAS_GENRE relationship
                for genre in movie["genres"]:
                    session.run(
                        """
                        MERGE (g:Genre {name: $genre})
                        WITH g
                        MATCH (m:Movie {id: $movie_id})
                        MERGE (m)-[:HAS_GENRE]->(g)
                        """,
                        {"genre": genre, "movie_id": movie["movie_id"]}
                    )
                # For each actor, create Actor node and ACTED_IN relationship
                for actor in movie["actors"]:
                    session.run(
                        """
                        MERGE (a:Actor {name: $actor})
                        WITH a
                        MATCH (m:Movie {id: $movie_id})
                        MERGE (a)-[:ACTED_IN]->(m)
                        """,
                        {"actor": actor, "movie_id": movie["movie_id"]}
                    )

            # Second pass: link every pair of actors that appear in the same movie.
            for movie in movies:
                # De-duplicate (order-preserving) so a repeated name can never be
                # paired with itself and produce a self-relationship.
                unique_actors = list(dict.fromkeys(movie["actors"]))
                for idx, first_actor in enumerate(unique_actors):
                    for second_actor in unique_actors[idx + 1:]:
                        session.run(
                            """
                            MATCH (a:Actor {name: $actor1}), (b:Actor {name: $actor2})
                            MERGE (a)-[:COACTED_WITH]-(b)
                            """,
                            {"actor1": first_actor, "actor2": second_actor}
                        )
    finally:
        # Release the connection pool even if one of the queries above failed.
        driver.close()

# --- Run the ETL Pipeline ---
# Extract: pull the recently updated movies out of PostgreSQL.
movies_data = fetch_latest_data()
fetched_total = len(movies_data)
print("Fetched {} movies from PostgreSQL".format(fetched_total))

# Load: push the fetched movies into the Neo4j graph.
update_neo4j(movies_data)
print("Neo4j has been updated with the latest data.")


OperationalError: connection to server at "localhost" (::1), port 5432 failed: Connection refused
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 5432 failed: Connection refused
	Is the server running on that host and accepting TCP/IP connections?


In [13]:
from neo4j import GraphDatabase

# Bolt endpoint of the Neo4j server. The host name "neo4j" has to be resolvable
# from wherever this kernel runs.
# NOTE(review): an earlier comment described this as "the forwarded Bolt port";
# a forwarded port would normally be reached via localhost — confirm which applies.
neo4j_uri = "bolt://neo4j:7687"
neo4j_username = "neo4j"
neo4j_password = ""  # empty password — presumably auth is disabled on the server; confirm

# Module-level driver; test_connection() reads this global.
driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))

def test_connection():
    with driver.session() as session:
        result = session.run("RETURN 'Hello, Neo4j!' AS greeting")
        for record in result:
            print(record["greeting"])

# Run the smoke test right away so a connection problem surfaces in this cell.
test_connection()


Hello, Neo4j!


In [18]:
import pandas as pd
import ast

# Load the movie table; the path is relative to the notebook's working directory.
df = pd.read_csv("movies.csv")

# Sanity check: show which columns were read and the first few rows.
column_names = df.columns.tolist()
preview_rows = df.head()
print("Columns:", column_names)
print(preview_rows)

# Convert the 'movie_cast' column (a string representation of a list, or a
# comma-separated string) into an actual list of actor names.
def convert_movie_cast(cast_str):
    """
    Normalise one 'movie_cast' cell into a list of actor-name strings.

    Accepted inputs:
      - missing values (None / NaN)            -> []
      - an actual list / tuple / set           -> its elements
      - a Python-literal string, e.g. "['A', 'B']" or "'A'"
      - a plain comma-separated string, e.g. "A, B"

    Every name is converted to str and stripped; empty names and None entries
    are dropped, so the result never contains '' (which would otherwise become
    an Actor node with an empty name).

    Returns:
        list[str]: the cleaned actor names, in input order (sorted for sets).
    """
    # Check containers first: pd.isnull() on a list returns an array, whose
    # truth value is ambiguous.
    if isinstance(cast_str, (list, tuple)):
        candidates = list(cast_str)
    elif isinstance(cast_str, (set, frozenset)):
        candidates = sorted(cast_str, key=str)
    elif pd.isnull(cast_str):
        return []
    else:
        text = str(cast_str).strip()
        if not text:
            return []
        try:
            # Safely evaluate the string as a Python literal (no code execution).
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: assume it is a comma-separated string.
            candidates = text.split(",")
        else:
            if isinstance(parsed, (list, tuple)):
                candidates = list(parsed)
            elif isinstance(parsed, (set, frozenset)):
                candidates = sorted(parsed, key=str)
            elif isinstance(parsed, str):
                # A single quoted name, e.g. "'Tom Hanks'".
                candidates = [parsed]
            else:
                # A literal that is not a collection of names (number, dict, ...):
                # fall back to treating the raw text as comma-separated.
                candidates = text.split(",")

    cleaned = (str(name).strip() for name in candidates if name is not None)
    return [name for name in cleaned if name]

# The CSV has no genre column, so every movie is filed under one placeholder genre.
default_genre = "Unknown"
genres_list = [default_genre]

# Turn the stringified cast of each row into a real Python list.
df["actor_list"] = df["movie_cast"].apply(convert_movie_cast)

# Derive the fields the graph-loading code reads from each record:
#   title        <- the 'movie' column
#   popularity   <- 'vote_average', coerced to a number (unparseable values become NaN)
#   release_year <- not present in the CSV, so left as None
df["title"] = df["movie"]
df["popularity"] = pd.to_numeric(df["vote_average"], errors="coerce")
df["release_year"] = None

# For demonstration, keep only the five movies with the highest popularity.
top_movies = df.sort_values(by="popularity", ascending=False).head(5)

# Records grouped by genre (a single group here) ...
top_movies_by_genre = {default_genre: top_movies.to_dict(orient="records")}

# ... and the same records as a flat list, used for the COACTED_WITH relationships.
all_top_movies = top_movies.to_dict(orient="records")

# Print a summary to verify
print("Unique genres:", genres_list)
print("Top movies by genre:", top_movies_by_genre)
print("All top movies:", all_top_movies)


Columns: ['movie', 'viewer_score_classification', 'vote_average', 'vote_classification', 'vote_count', 'movie_cast', 'summary']
                                               movie  \
0                                    The Dark Knight   
1      The Lord of the Rings: The Return of the King   
2                                      Seven Samurai   
3                                          Inception   
4  The Lord of the Rings: The Fellowship of the Ring   

  viewer_score_classification  vote_average vote_classification  vote_count  \
0                        High           8.5                High   10.417119   
1                        High           8.5                High   10.114720   
2                        High           8.5                High    8.241967   
3                        High           8.4                High   10.520240   
4                        High           8.4                High   10.150855   

                                          movie_cast  \
0  [

In [22]:
from neo4j import GraphDatabase

# Neo4j connection details (using your Kubernetes setup)
neo4j_uri = "bolt://neo4j:7687"  # using service DNS name in your cluster
neo4j_username = "neo4j"
neo4j_password = ""  # no password as configured

# The driver is left open after this cell; call driver.close() once no further
# cell needs it.
driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))

with driver.session() as session:
    # Create Genre nodes up front so genres without any movie still exist.
    for genre_name in genres_list:
        session.run(
            "MERGE (g:Genre {name: $name})",
            {"name": genre_name}
        )

    # Create Movie nodes, link them to the Genre, and create Actor nodes with ACTED_IN relationships.
    for genre, movies in top_movies_by_genre.items():
        for movie in movies:
            session.run(
                """
                MERGE (m:Movie {title: $title})
                ON CREATE SET m.release_year = $year, m.popularity = $pop
                MERGE (g:Genre {name: $genre})
                MERGE (m)-[:HAS_GENRE]->(g)
                """,
                {
                    "title": movie["title"],
                    "year": movie["release_year"],
                    "pop": movie["popularity"],
                    "genre": genre
                }
            )
            # Create Actor nodes and ACTED_IN relationships for each movie.
            for actor in movie["actor_list"]:
                if not actor:
                    # An empty name would become a meaningless Actor node.
                    continue
                # Each session.run() is an independent query, so the movie must
                # be matched again here. Without the MATCH, `m` is unbound and
                # MERGE would attach the actor to a brand-new, label-less node
                # instead of the movie.
                session.run(
                    """
                    MATCH (m:Movie {title: $title})
                    MERGE (a:Actor {name: $name})
                    MERGE (a)-[:ACTED_IN]->(m)
                    """,
                    {"name": actor, "title": movie["title"]}
                )

    # Create COACTED_WITH relationships between actors in each movie
    for movie in all_top_movies:
        # De-duplicate (order-preserving) and drop empty names so an actor is
        # never paired with themselves.
        actors = [name for name in dict.fromkeys(movie["actor_list"]) if name]
        for i in range(len(actors)):
            for j in range(i+1, len(actors)):
                session.run(
                    """
                    MATCH (a:Actor {name: $name1}), (b:Actor {name: $name2})
                    MERGE (a)-[:COACTED_WITH]-(b)
                    """,
                    {"name1": actors[i], "name2": actors[j]}
                )
