# 3. Storing Data in a Graph Database
i. Choose a graph database such as Neo4j.

ii. Define an appropriate graph schema (nodes, edges, relationships). 

iii. Insert structured data into the database using Py2neo (Python)

In [1]:
import os
from pathlib import Path

import pandas as pd
from py2neo import Graph, Node, Relationship

In [2]:
# Connecting to Neo4j database.
# Connection settings are read from the environment so the password does not
# have to be committed with the notebook; the fallbacks reproduce the original
# local-development values, so the cell still runs unchanged on the same setup.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "moviesgraphdb")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [3]:
# Optional reset: remove every node (DETACH also drops its relationships)
# so the inserts below start from an empty database.
wipe_all_cypher = "MATCH (n) DETACH DELETE n"
graph.run(wipe_all_cypher)

In [4]:
# Load the merged movie table from disk and preview its first rows.
merged_df = pd.read_csv("merged_movies.csv")
merged_df.head(5)

Unnamed: 0,Title,YEAR,IMDb Rating,Runtime,Votes,Metascore,MPAA Rating,Short Description
0,The Shawshank Redemption,1994,9.3,142,3000000,82.0,TV-14,"Wrongly convicted, Andy Dufresne (Tim Robbins)..."
1,The Godfather,1972,9.2,175,2100000,100.0,TV-14,Francis Ford Coppola's epic features Marlon Br...
2,The Dark Knight,2008,9.0,152,3000000,84.0,TV-14,Batman raises the stakes in his war on crime. ...
3,12 Angry Men,1957,9.0,96,914000,97.0,Approved,"12 Angry Men, by Sidney Lumet, is a behind-clo..."
4,The Lord of the Rings: The Return of the King,2003,9.0,201,2100000,94.0,PG-13,Sauron's forces have laid siege to Minas Tirit...


In [13]:
# Sanity checks before loading into Neo4j: column names, a sample of the
# fields used for the graph schema, and the number of nulls per column.
key_columns = ['Title', 'YEAR', 'MPAA Rating']
for summary in (
    merged_df.columns,
    merged_df[key_columns].head(10),
    merged_df.isnull().sum(),
):
    print(summary)

Index(['Title', 'YEAR', 'IMDb Rating', 'Runtime', 'Votes', 'Metascore',
       'MPAA Rating', 'Short Description'],
      dtype='object')
                                               Title  YEAR MPAA Rating
0                           The Shawshank Redemption  1994       TV-14
1                                      The Godfather  1972       TV-14
2                                    The Dark Knight  2008       TV-14
3                                       12 Angry Men  1957    Approved
4      The Lord of the Rings: The Return of the King  2003       PG-13
5                                   Schindler's List  1993           R
6                                       Pulp Fiction  1994       TV-14
7  The Lord of the Rings: The Fellowship of the Ring  2001       PG-13
8                     The Good, the Bad and the Ugly  1966       TV-14
9                                       Forrest Gump  1994       TV-PG
Title                0
YEAR                 0
IMDb Rating          0
Runtime     

In [5]:
# Insert data into Neo4j
def _clean(value):
    """Return None for a missing (NaN/NA) cell, otherwise the value unchanged.

    py2neo treats a None property as absent, so missing CSV cells are simply
    not stored instead of being written to the database as NaN.
    """
    return None if pd.isna(value) else value


for _, row in merged_df.iterrows():
    # Create or match the Movie node; "Title" is the merge key, so re-running
    # this cell updates existing movies instead of duplicating them.
    movie = Node(
        "Movie",
        Title=row["Title"],
        Year=_clean(row["YEAR"]),
        IMDB_Rating=_clean(row["IMDb Rating"]),
        Runtime=_clean(row["Runtime"]),
        Votes=_clean(row["Votes"]),
        Metascore=_clean(row["Metascore"]),
        Short_Description=_clean(row["Short Description"]),
    )
    graph.merge(movie, "Movie", "Title")

    # A missing rating cannot serve as a merge key (NaN never equals itself),
    # so such movies are stored without a HAS_RATING relationship.
    rating_value = _clean(row["MPAA Rating"])
    if rating_value is None:
        continue

    # Create or match the MPAA_Rating node (one node per distinct rating).
    mpaa_rating = Node("MPAA_Rating", Rating=rating_value)
    graph.merge(mpaa_rating, "MPAA_Rating", "Rating")

    # Create relationship (Movie)-[:HAS_RATING]->(MPAA_Rating)
    has_rating = Relationship(movie, "HAS_RATING", mpaa_rating)
    graph.merge(has_rating)


Movie Nodes → Contain movie details like Title, Year, IMDb Rating, Runtime, Votes, Metascore, Short Description.

MPAA_Rating Nodes → Unique rating categories (G, PG, PG-13, R, etc.).

HAS_RATING Relationship → Links each Movie node to its respective MPAA_Rating node.

In [35]:
import webbrowser
from pyvis.network import Network
from py2neo import Graph

def visualize_graph():
    """Render all (Movie)-[r]->(n) patterns as an interactive Pyvis graph.

    Reads from the module-level ``graph`` connection, writes the rendering to
    ``graph.html`` in the current working directory and opens that file in
    the default web browser. Takes no arguments and returns nothing.
    """
    # Create Pyvis network (notebook=True for Jupyter)
    net = Network(notebook=True, directed=True, cdn_resources="in_line")

    # Query to fetch nodes & relationships.
    # MPAA_Rating nodes only carry a `Rating` property, so it must be part of
    # the COALESCE; without it every movie pointed at a single 'Unknown' node.
    query = """
    MATCH (m:Movie)-[r]->(n)
    RETURN m.Title AS Movie, type(r) AS Relationship, labels(n)[0] AS NodeType, 
           COALESCE(n.Title, n.Rating, n.name, 'Unknown') AS ConnectedNode
    """
    result = graph.run(query).data()

    # Add nodes and edges to Pyvis network
    for record in result:
        movie = record["Movie"]
        connected_node = record["ConnectedNode"]
        relationship = record["Relationship"]

        net.add_node(movie, label=movie, color="blue")  # Movie nodes in blue
        net.add_node(connected_node, label=connected_node, color="green")  # Other nodes in green
        net.add_edge(movie, connected_node, label=relationship)

    # Save and open the graph
    output_file = "graph.html"
    net.show(output_file)
    # Browsers expect a URL; a bare relative filename is not reliably opened,
    # so hand over an absolute file:// URI instead.
    webbrowser.open(Path(output_file).resolve().as_uri())

# Call the function to visualize the graph: it queries the database, writes
# graph.html and opens it in the browser.
visualize_graph()


graph.html


## The Database Looks Like This:

<img src='DataBasePic.png'></img>

# 4. Querying the Graph Database 

Write at least three queries using Cypher (Neo4j) to retrieve insights.

1. Find all Movies and their MPAA Ratings

In [20]:
# Pair every movie with the MPAA rating node it points to.
query = """
MATCH (m:Movie)-[:HAS_RATING]->(r:MPAA_Rating)
RETURN m.Title AS Title, m.Year AS Year, r.Rating AS MPAA_Rating
"""

cursor = graph.run(query)
result = cursor.data()  # fetch all rows before building the frame
pd.DataFrame(result)

Unnamed: 0,Title,Year,MPAA_Rating
0,The Lion King,1994,PG
1,Spider-Man: Across the Spider-Verse,2023,PG
2,Cinema Paradiso,1988,PG
3,Your Name.,2016,PG
4,Scarface,1983,PG
...,...,...,...
230,Room,2015,Not
231,The Handmaiden,2016,Not
232,The Battle of Algiers,1966,Not
233,Gangs of Wasseypur,2012,Not


2. Retrieve Data for a Specific Movie

In [48]:
movie_title = "Inception"

# The title is passed as the bound parameter $title rather than being pasted
# into the query text, so titles containing quotes (e.g. "Schindler's List")
# work and the value can never be interpreted as Cypher.
query = """
MATCH (m:Movie)
WHERE m.Title = $title
RETURN m
"""

result = graph.run(query, title=movie_title).data()

# Always bind `df`, so a title with no match shows an empty frame instead of
# raising NameError or silently displaying a stale `df` from an earlier run.
if result:
    movie_data = result[0]["m"]
    df = pd.DataFrame([movie_data])
else:
    df = pd.DataFrame()
df


Unnamed: 0,Runtime,Year,Metascore,Title,IMDB_Rating,Votes,Short_Description
0,148,2010,74.0,Inception,8.8,2700000,"Dom Cobb is a skilled thief, the absolute best..."


3. Find MPAA Ratings and how many movies belong to each category

In [49]:
# Count the movies attached to each MPAA rating, most common rating first.
query = """
MATCH (m:Movie)-[:HAS_RATING]->(r:MPAA_Rating)
RETURN r.Rating AS MPAA_Rating, COUNT(m) AS Movie_Count
ORDER BY Movie_Count DESC
"""

cursor = graph.run(query)
result = cursor.data()
pd.DataFrame(data=result)


Unnamed: 0,MPAA_Rating,Movie_Count
0,R,50
1,TV-14,48
2,TV-PG,27
3,PG-13,24
4,TV-MA,20
5,PG,15
6,Not,15
7,Passed,12
8,Approved,9
9,TV-G,6


4. Find all movies released in a specific year (e.g., 2010)

In [50]:
# Year to look up; change this value to query a different release year.
release_year = 2010

# The year is supplied as the bound parameter $year so the query text itself
# stays fixed regardless of which year is requested.
query = """
MATCH (m:Movie) 
WHERE m.Year = $year
RETURN m.Title AS Title, m.IMDB_Rating AS IMDb_Rating, m.Metascore AS Metascore
ORDER BY m.IMDB_Rating DESC
"""

result = graph.run(query, year=release_year).data()
pd.DataFrame(result)


Unnamed: 0,Title,IMDb_Rating,Metascore
0,Inception,8.8,74.0
1,Toy Story 3,8.3,92.0
2,Incendies,8.3,80.0
3,Shutter Island,8.2,63.0
4,How to Train Your Dragon,8.1,75.0


5. Find movies with IMDb rating greater than 9 and Metascore greater than 95

In [51]:
# Movies that clear both bars: IMDb rating above 9 and Metascore above 95,
# sorted by IMDb rating and then Metascore (both descending).
query = """
MATCH (m:Movie) 
WHERE m.IMDB_Rating > 9 AND m.Metascore > 95
RETURN m.Title AS Title, m.IMDB_Rating AS IMDb_Rating, m.Metascore AS Metascore
ORDER BY m.IMDB_Rating DESC, m.Metascore DESC
"""

cursor = graph.run(query)
result = cursor.data()
pd.DataFrame(data=result)


Unnamed: 0,Title,IMDb_Rating,Metascore
0,The Godfather,9.2,100.0


6. Find top 10 highest-rated movies (IMDb rating descending)

In [52]:
# Ten movies with the highest IMDb rating, best first.
query = """
MATCH (m:Movie) 
RETURN m.Title AS Title, m.IMDB_Rating AS IMDb_Rating
ORDER BY m.IMDB_Rating DESC
LIMIT 10
"""

cursor = graph.run(query)
result = cursor.data()
pd.DataFrame(data=result)


Unnamed: 0,Title,IMDb_Rating
0,The Shawshank Redemption,9.3
1,The Godfather,9.2
2,The Lord of the Rings: The Return of the King,9.0
3,The Dark Knight,9.0
4,12 Angry Men,9.0
5,Schindler's List,9.0
6,Pulp Fiction,8.9
7,The Lord of the Rings: The Fellowship of the Ring,8.9
8,"The Good, the Bad and the Ugly",8.8
9,Forrest Gump,8.8


7. Find movies with the longest runtime (Top 5)

In [53]:
# Five movies with the longest runtime, longest first.
query = """
MATCH (m:Movie) 
RETURN m.Title AS Title, m.Runtime AS Runtime
ORDER BY m.Runtime DESC
LIMIT 5
"""

cursor = graph.run(query)
result = cursor.data()
pd.DataFrame(data=result)


Unnamed: 0,Title,Runtime
0,Gangs of Wasseypur,321
1,Gone with the Wind,238
2,Once Upon a Time in America,229
3,Ben-Hur,212
4,Seven Samurai,207
