## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

## Connecting to Graph DB

In [2]:
import os

from py2neo import Graph, Node

# Connection settings are read from the environment so real credentials never
# have to be saved in the notebook; the fallbacks are the local defaults.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")

# WARNING: delete_all() removes every node and relationship in the database.
# Keep True for a full rebuild (the population cells create nodes
# unconditionally, so re-running them without a reset would duplicate data).
# Set to False when re-running this cell only to query an already-built graph.
RESET_GRAPH = True

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
print("Connected to Neo4j!")
if RESET_GRAPH:
    graph.delete_all()

Connected to Neo4j!


## Loading Dataset

In [3]:
# Read the raw movie table from the CSV file in the working directory and
# show its (rows, columns) shape as the cell output.
dataset_path = 'bollywood_data_set.csv'
df = pd.read_csv(dataset_path)
df.shape

(9999, 10)

## Data Cleaning

In [4]:
# Count the missing values in each column before deciding what to drop.
missing_per_column = df.isna().sum()
print(missing_per_column)

Unnamed: 0            0
imdb-id               0
movie_name            0
year_of_release     200
runtime               0
IMDB_rating           0
no_of_votes           0
plot_description      0
director              0
actors              465
dtype: int64


In [5]:
# Keep one row per IMDb id, then require every field that is used later on.
required_columns = ['movie_name', 'plot_description', 'director', 'actors', 'IMDB_rating']
df = df.drop_duplicates(subset=['imdb-id'])
df = df.dropna(subset=required_columns)

# Vote counts are text with thousands separators: remove the commas and
# surrounding whitespace, then parse; anything unparseable becomes NaN.
votes_text = df['no_of_votes'].str.replace(',', '').str.strip()
df['no_of_votes'] = pd.to_numeric(votes_text, errors='coerce')

# Rows whose vote count could not be parsed are discarded.
df = df.dropna(subset=['no_of_votes'])
df.shape

(7645, 10)

In [6]:
# Take the first run of four digits from each release-year string, convert it
# to a number, and drop the rows where no year could be obtained.
year_digits = df['year_of_release'].str.extract(r'(\d{4})', expand=False)
df['year_of_release'] = pd.to_numeric(year_digits, errors='coerce')
df = df.dropna(subset=['year_of_release'])

In [7]:
# Remove rows whose plot description is exactly the string 'Add a Plot'
# (presumably a placeholder for a missing plot; it carries no content).
has_real_plot = df['plot_description'].ne('Add a Plot')
df = df[has_real_plot]

In [8]:
# (rows, columns) of the DataFrame after the cleaning steps in the cells above.
df.shape

(6580, 10)

## Normalize Numerical Features

In [9]:
# Min-max scale vote counts and release years so both can be compared on a
# common scale. The one scaler object is re-fitted for each column in turn.
scaler = MinMaxScaler()
scaled_columns = (
    ('no_of_votes', 'votes_scaled'),
    ('year_of_release', 'year_scaled'),
)
for source_col, target_col in scaled_columns:
    df[target_col] = scaler.fit_transform(df[[source_col]])

# Show raw and scaled values side by side as a sanity check.
preview_columns = ['no_of_votes', 'votes_scaled', 'year_of_release', 'year_scaled']
print(df[preview_columns].head())

   no_of_votes  votes_scaled  year_of_release  year_scaled
0     548031.0      1.000000           2022.0     1.000000
1     387020.0      0.706198           2009.0     0.857143
2     188938.0      0.344752           2007.0     0.835165
3     183452.0      0.334741           2016.0     0.934066
4     180108.0      0.328640           2014.0     0.912088


## TF-IDF for plot descriptions

In [10]:
# Vectorise every plot description with TF-IDF, ignoring English stop words.
# The rows of tfidf_matrix follow the row order of df.
plot_texts = df['plot_description']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(plot_texts)

print("Data preprocessing complete!")

Data preprocessing complete!


## Populating Neo4j Database

### Add movies as nodes

In [11]:
# Node property name -> DataFrame column that supplies its value.
movie_property_columns = {
    'imdb_id': 'imdb-id',
    'name': 'movie_name',
    'year': 'year_of_release',
    'votes': 'no_of_votes',
    'rating': 'IMDB_rating',
    'plot': 'plot_description',
    'directors': 'director',
    'actors': 'actors',
}

# Create one "Movie" node per cleaned row, one graph.create call per node.
for _, movie_row in df.iterrows():
    properties = {
        prop: movie_row[column]
        for prop, column in movie_property_columns.items()
    }
    graph.create(Node("Movie", **properties))

print("Movies added to Neo4j!")

Movies added to Neo4j!


## Compute Movie Similarities

In [12]:

# Pairwise cosine similarity between all plot TF-IDF vectors. With a single
# argument the matrix is compared against itself, so entry [i, j] is the
# similarity between the plots of rows i and j.
plot_similarity = cosine_similarity(tfidf_matrix)

def _split_names(raw):
    """Split a comma-separated string of names into a set of clean names."""
    # Split on the bare comma and strip each piece so that "A,B", "A, B" and
    # "A ,  B" all produce the same names; empty pieces are ignored so that an
    # empty string never counts as a name.
    return {name.strip() for name in raw.split(',') if name.strip()}


# Function to compute actor/director similarity (exact name match for simplicity)
def compute_actor_director_similarity(row1, row2):
    """Return the cast/crew overlap between two movies as a score in [0, 1].

    The score is the average of two overlap ratios (shared names divided by
    all distinct names): one over the actors and one over the directors.

    Parameters
    ----------
    row1, row2 : mapping-like (for example a DataFrame row or a dict)
        Each must provide 'actors' and 'director' entries holding either a
        comma-separated string of names or a missing value.

    Returns
    -------
    float
        0.5 * actor overlap + 0.5 * director overlap, or 0.0 when any of the
        four fields is missing.
    """
    fields = (row1['actors'], row2['actors'], row1['director'], row2['director'])
    if any(pd.isna(value) for value in fields):
        return 0.0

    actors1, actors2 = _split_names(row1['actors']), _split_names(row2['actors'])
    directors1, directors2 = _split_names(row1['director']), _split_names(row2['director'])

    # max(..., 1) avoids a division by zero when both name sets are empty.
    common_actors = len(actors1 & actors2) / max(len(actors1 | actors2), 1)
    common_directors = len(directors1 & directors2) / max(len(directors1 | directors2), 1)
    return 0.5 * common_actors + 0.5 * common_directors

# Combine similarities into a score
def compute_similarity(idx1, idx2):
    """Blend plot, cast/crew, release-year and vote-count closeness into one score.

    idx1 and idx2 are positional row numbers into df, and are used with the
    same meaning as indices into plot_similarity.

    Weights: 0.4 plot similarity, 0.3 actor/director similarity,
    0.2 closeness of scaled release year, 0.1 closeness of scaled vote count.
    """
    first_movie, second_movie = df.iloc[idx1], df.iloc[idx2]

    plot_sim = plot_similarity[idx1, idx2]
    people_sim = compute_actor_director_similarity(first_movie, second_movie)

    # Absolute gaps between the scaled values; a smaller gap means more similar.
    year_gap = abs(first_movie['year_scaled'] - second_movie['year_scaled'])
    vote_gap = abs(first_movie['votes_scaled'] - second_movie['votes_scaled'])

    plot_term = 0.4 * plot_sim
    people_term = 0.3 * people_sim
    year_term = 0.2 * (1 - year_gap)
    vote_term = 0.1 * (1 - vote_gap)
    return plot_term + people_term + year_term + vote_term

# Create relationships in Neo4j based on similarity.
# Every unordered pair of rows is scored once; a SIMILAR_TO relationship is
# written from the earlier row's movie to the later row's movie whenever the
# score exceeds 0.5.
movie_count = len(df)
imdb_ids = df['imdb-id'].tolist()

for first_pos in range(movie_count):
    for second_pos in range(first_pos + 1, movie_count):
        sim_score = compute_similarity(first_pos, second_pos)
        if not sim_score > 0.5:
            continue
        query = """
            MATCH (m1:Movie {imdb_id: $imdb1}), (m2:Movie {imdb_id: $imdb2})
            CREATE (m1)-[:SIMILAR_TO {score: $score}]->(m2);
            """
        graph.run(
            query,
            imdb1=imdb_ids[first_pos],
            imdb2=imdb_ids[second_pos],
            score=float(sim_score),
        )

print("Similar relationships created!")

Similar relationships created!


## Recommendation System

In [3]:
def recommend_movies(movie_title, limit=3):
    """Return the top movies linked to `movie_title` by a SIMILAR_TO relationship.

    SIMILAR_TO relationships are stored in one direction only (from the movie
    that appears earlier in the dataset to the later one), although the
    similarity score itself is symmetric. The relationship is therefore matched
    in either direction; following only outgoing relationships would return
    few or no results for movies that appear late in the dataset.

    Parameters
    ----------
    movie_title : str
        Exact value of the `name` property of the Movie node to start from.
    limit : int, optional
        Maximum number of recommendations to return (default 3).

    Returns
    -------
    list of dict
        One dict per recommendation with the keys 'Recommendation', 'Rating'
        and 'Votes', ordered by rating and then votes, both descending.
        Empty when the title is unknown or has no similar movies.
    """
    query = """
    MATCH (m:Movie {name: $movie_title})-[:SIMILAR_TO]-(similar:Movie)
    RETURN similar.name AS Recommendation, similar.rating AS Rating, similar.votes AS Votes
    ORDER BY similar.rating DESC, similar.votes DESC
    LIMIT $limit;
    """
    recommendations = graph.run(query, movie_title=movie_title, limit=int(limit)).data()
    return recommendations

## Test the system

In [5]:
# Look up recommendations for one sample title and print them as a numbered list.
movie_title = "Doraemon: Nobita no Dorabian Naito"
recommendations = recommend_movies(movie_title)

print(f"Recommendations for '{movie_title}':")
for rank, movie in enumerate(recommendations, 1):
    print(f"{rank}. {movie['Recommendation']} (Rating: {movie['Rating']}, Votes: {movie['Votes']})")

Recommendations for 'Doraemon: Nobita no Dorabian Naito':
