# Movie Recommendation System

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from neo4j import *
import dotenv
import os

In [2]:
# Load the movie catalogue and the per-site ratings from the local Input/ folder.
movies_data = pd.read_csv('Input/movies.csv')
ratings_data = pd.read_csv('Input/ratings.csv')

In [3]:
movies_data.head()

Unnamed: 0,MovieID,Title,Year,Genre
0,1,The Shawshank Redemption,1994,Drama
1,2,The Godfather,1972,Drama|Crime
2,3,The Dark Knight,2008,Drama|Crime|Action
3,4,The Godfather Part II,1974,Drama|Crime
4,5,12 Angry Men,1957,Drama|Crime


In [4]:
ratings_data.head()

Unnamed: 0,MovieID,SourceSite,Rating,NumberOfRatings
0,1,IMDb,9.2,2609677
1,1,RottenTomatoes,9.8,250000
2,1,Metacritic,9.0,2013
3,2,IMDb,9.2,1805379
4,2,RottenTomatoes,9.8,250000


In [5]:
def set_genres(genres, col):
    """Return 1 when the genre label ``genres`` is one of the ``|``-separated
    entries of ``col``, otherwise 0.

    ``col`` is converted with ``str()`` first, so a non-string cell (for
    example NaN) is compared through its string form instead of raising.
    """
    listed = str(col).split('|')
    return int(genres in listed)

In [6]:
# Work on a copy: the genre indicator columns added in the next cell and the
# later in-place column drop would otherwise also modify movies_data, because
# a plain assignment only creates a second name for the same DataFrame.
mov_genres_df = movies_data.copy()

The below cell creates an adjacency matrix which relates each movie to all the genres it belongs to. If the value is 1, it means that movie belongs to the genre and if the value is 0, it means that movie is not linked to the genre.

In [7]:
# Genre labels that get an indicator column; the list order is the column
# order of the resulting frame.
GENRE_COLUMNS = [
    "Action", "Adventure", "Animation", "Biography", "Comedy", "Crime",
    "Drama", "Family", "Fantasy", "Film-Noir", "Horror", "History",
    "Music", "Musical", "Mystery", "Romance", "Sci-Fi", "Sport",
    "Thriller", "War", "Western", "(no_genres_listed)",
]

for genre_name in GENRE_COLUMNS:
    # 1 if the movie's '|'-separated Genre string lists this genre, else 0.
    mov_genres_df[genre_name] = mov_genres_df["Genre"].apply(
        lambda cell: set_genres(genre_name, cell)
    )

In [8]:
# Only MovieID and the genre indicator columns are kept from here on.
mov_genres_df.drop(columns=['Title', 'Year', 'Genre'], inplace=True)
mov_genres_df.head()

Unnamed: 0,MovieID,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,...,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,(no_genres_listed)
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


The function set_year_scale maps each movie's release year to a score on a scale of 0 to 10.

In [9]:
# Re-read the raw movie table so this cell starts from the original Title/Year/Genre columns.
movies =  pd.read_csv('Input/movies.csv')

def set_year_scale(year):
    """Map a release year to a score between 0 and 10.

    Years up to and including 1920 score ``year / 1920``. Each later period
    listed in ``periods`` has an integer base score (1 to 9) and the year is
    interpolated linearly inside its period. Years from 2021 on score 10.
    Any value that matches none of the ranges (for example NaN) scores 0.

    The first range includes 1920 itself; with a strict ``< 1920`` test that
    year matched no range and scored 0, below both 1919 and 1921.
    """
    if year <= 1920:
        return 0 + year / 1920

    periods = (
        (1921, 1945),
        (1946, 1960),
        (1961, 1975),
        (1976, 1985),
        (1986, 1995),
        (1996, 2000),
        (2001, 2010),
        (2011, 2015),
        (2016, 2020),
    )
    # The base score is the 1-based position of the period in the table.
    for base, (first, last) in enumerate(periods, start=1):
        if first <= year <= last:
            return base + (year - first) / (last - first)

    if year >= 2021:
        return 10
    return 0

# Score every movie by its release year.
movies['YearScale'] = movies['Year'].apply(set_year_scale)
# Title and Genre are not needed here; MovieID, Year and YearScale remain.
movies.drop(columns=['Title', 'Genre'], inplace=True)

In [10]:
movies.head()

Unnamed: 0,MovieID,Year,YearScale
0,1,1994,5.888889
1,2,1972,3.785714
2,3,2008,7.777778
3,4,1974,3.928571
4,5,1957,2.785714


Calculating the weighted mean of ratings based on the ratings on IMDb, Rotten Tomatoes and Metacritic, and also calculating the total number of user ratings given to a movie.

In [11]:
def getMeanAndCount(data):
    """Summarise the rating rows of one movie.

    Returns a Series with two entries: ``RatingMean``, the mean of ``Rating``
    weighted by ``NumberOfRatings``, and ``RatingCount``, the sum of
    ``NumberOfRatings``.
    """
    vote_counts = data['NumberOfRatings']
    return pd.Series({
        'RatingMean': np.average(data['Rating'], weights=vote_counts),
        'RatingCount': np.sum(vote_counts),
    })

# Collapse the per-site rating rows into one (RatingMean, RatingCount) row per movie.
agg_movies_rat = (
    ratings_data
    .groupby(['MovieID'])
    .apply(getMeanAndCount)
    .reset_index()
)
agg_movies_rat.columns = ['MovieID', 'RatingMean', 'RatingCount']
agg_movies_rat.head()

Unnamed: 0,MovieID,RatingMean,RatingCount
0,1,9.252276,2861690.0
1,2,9.272844,2059209.0
2,3,9.035001,2838257.0
3,4,9.117343,1493290.0
4,5,8.897516,781745.0


The function set_rating_scale maps each movie's total number of ratings to a score on a scale of 0 to 10.

In [12]:
def set_rating_scale(rating_counts):
    """Map a total number of ratings to a score between 0 and 10.

    Counts of 1 or less score 0. Each range listed in ``ranges`` has an
    integer base score (1 to 9) and the count is interpolated linearly inside
    its range. Counts from 200001 on score 10. Any value that matches none of
    the ranges (for example NaN) scores 0.

    Every range interpolates from its own lower bound. The 30001-40000 range
    previously subtracted 40000, which produced scores from 3.0 to 4.0
    instead of 4.0 to 5.0.
    """
    if rating_counts <= 1:
        return 0

    ranges = (
        (2, 10000),
        (10001, 20000),
        (20001, 30000),
        (30001, 40000),
        (40001, 60000),
        (60001, 80000),
        (80001, 110000),
        (110001, 140000),
        (140001, 200000),
    )
    # The base score is the 1-based position of the range in the table.
    for base, (first, last) in enumerate(ranges, start=1):
        if first <= rating_counts <= last:
            return base + (rating_counts - first) / (last - first)

    if rating_counts >= 200001:
        return 10
    return 0

# Score each movie by how many ratings it received, then discard the raw count.
agg_movies_rat['RatingScale'] = agg_movies_rat['RatingCount'].apply(set_rating_scale)
agg_movies_rat = agg_movies_rat.drop(columns='RatingCount')
# The left join keeps every movie; missing rating values are filled with 0.
mov_rating_df = (
    movies
    .merge(agg_movies_rat, on='MovieID', how='left')
    .fillna(0)
    .drop(columns='YearScale')
)
mov_rating_df.head()

Unnamed: 0,MovieID,Year,RatingMean,RatingScale
0,1,1994,9.252276,10.0
1,2,1972,9.272844,10.0
2,3,2008,9.035001,10.0
3,4,1974,9.117343,10.0
4,5,1957,8.897516,10.0


In [13]:
print(mov_rating_df)

     MovieID  Year  RatingMean  RatingScale
0          1  1994    9.252276    10.000000
1          2  1972    9.272844    10.000000
2          3  2008    9.035001    10.000000
3          4  1974    9.117343    10.000000
4          5  1957    8.897516    10.000000
..       ...   ...         ...          ...
245      246  1992    8.152811    10.000000
246      247  1982    8.117807    10.000000
247      248  2021    8.000000     9.872765
248      249  2011    8.162427    10.000000
249      250  1991    7.999992    10.000000

[250 rows x 4 columns]


In [14]:
# Index the three feature tables by MovieID so that .values exposes only the feature columns.
mov_year_df = movies.set_index('MovieID')
mov_genres_df = mov_genres_df.set_index('MovieID')
mov_rating_df = mov_rating_df.set_index('MovieID')

In [15]:
mov_year_df.head()

Unnamed: 0_level_0,Year,YearScale
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1994,5.888889
2,1972,3.785714
3,2008,7.777778
4,1974,3.928571
5,1957,2.785714


In [16]:
mov_genres_df.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,...,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,(no_genres_listed)
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
mov_rating_df.head()

Unnamed: 0_level_0,Year,RatingMean,RatingScale
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1994,9.252276,10.0
2,1972,9.272844,10.0
3,2008,9.035001,10.0
4,1974,9.117343,10.0
5,1957,8.897516,10.0


Cosine similarity measures how similar two vectors are by computing the cosine of the angle between them. The cosine is 1 at 0 degrees and -1 at 180 degrees, so the score is highest for two vectors that point in the same direction and lowest for two vectors that point in exactly opposite directions.

We use the cosine similarity function to find similarity score between any two movies. We take similarities based on all three data frames i.e. YearScale (0.2 weight), Genre (0.4 weight) and RatingScale and RatingMean (0.4 weight).

In [18]:
# Release-year block (columns Year and YearScale), weight 0.2.
cos_year = 0.2 * cosine_similarity(mov_year_df.values)
# Genre indicator block, weight 0.4.
cos_genres = 0.4 * cosine_similarity(mov_genres_df.values)
# Rating block (columns Year, RatingMean and RatingScale), weight 0.4.
cos_rating = 0.4 * cosine_similarity(mov_rating_df.values)
# NOTE(review): the raw Year column (values around 2000) sits next to 0-10
# scores in the year and rating blocks, so it probably dominates those two
# cosines - confirm that keeping Year in these frames is intended.
# Final score: sum of the three weighted blocks.
cos = cos_year + cos_genres + cos_rating

In [19]:
# Label both axes of the similarity matrix with MovieID.
inx = mov_year_df.index
cols = inx.values
movies_sim = pd.DataFrame(cos, index=inx, columns=cols)
movies_sim.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,241,242,243,244,245,246,247,248,249,250
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.882843,0.83094,0.882843,0.882842,0.83094,0.83094,0.882843,0.83094,0.6,...,0.599999,0.6,0.83094,0.830938,0.83094,0.6,0.882843,0.83094,1.0,0.6
2,0.882843,1.0,0.926598,1.0,1.0,0.763299,0.763299,1.0,0.763299,0.6,...,0.6,0.6,0.763299,0.763297,0.763299,0.6,0.8,0.926598,0.882842,0.6
3,0.83094,0.926598,1.0,0.926598,0.926598,0.733333,0.866667,0.926599,0.866667,0.6,...,0.599999,0.6,0.733333,0.733331,0.733333,0.6,0.763299,0.866666,0.83094,0.6
4,0.882843,1.0,0.926598,1.0,1.0,0.763299,0.763299,1.0,0.763299,0.6,...,0.6,0.6,0.763299,0.763297,0.763299,0.6,0.8,0.926598,0.882842,0.6
5,0.882842,1.0,0.926598,1.0,1.0,0.763299,0.763299,1.0,0.763299,0.6,...,0.6,0.6,0.763299,0.763297,0.763298,0.6,0.8,0.926597,0.882842,0.6


- The get_similar function takes the MovieID of a movie and compares it with every other movie. The results for all movies are collected in the movies_similarity dataframe, which has the MovieID, the SimilarMovieID and the SimilarityScore between the two movies.
- The SimilarityScore is sorted in descending order. 

In [20]:
def get_similar(movieId):
    """Return the similarity scores of ``movieId`` in long format.

    Reads the module-level ``movies_sim`` matrix and returns a frame with the
    columns MovieID, SimilarMovieID and SimilarityScore, sorted by
    SimilarityScore in descending order, without the movie's own entry.
    """
    selected = movies_sim.loc[movies_sim.index == movieId].reset_index()
    pairs = selected.melt(
        id_vars='MovieID',
        var_name='SimilarMovieID',
        value_name='SimilarityScore',
    )
    ranked = pairs.sort_values('SimilarityScore', axis=0, ascending=False)
    # Remove the pair that compares the movie with itself.
    is_self = ranked['MovieID'] == ranked['SimilarMovieID']
    return ranked[~is_self]
#create empty df
movies_similarity = pd.DataFrame(columns=['MovieID','SimilarMovieID','SimilarityScore'])

In [21]:
# Build every per-movie frame first and concatenate once: calling pd.concat
# inside the loop copies all previously accumulated rows on every iteration.
per_movie_frames = [get_similar(movie_id) for movie_id in movies_sim.index.tolist()]
movies_similarity = pd.concat([movies_similarity, *per_movie_frames], axis=0)
movies_similarity.head()

Unnamed: 0,MovieID,SimilarMovieID,SimilarityScore
11,1,12,1.0
167,1,168,1.0
66,1,67,1.0
83,1,84,1.0
17,1,18,1.0


In [22]:
# Reload the raw movie table; Genre is left out of the frame that is later
# written to Output/movies.csv.
movies = pd.read_csv('Input/movies.csv')
movies_df = movies.drop(columns='Genre')

# Per-movie mean rating, weighted by each site's NumberOfRatings.
agg_rating_avg = (
    ratings_data
    .groupby(['MovieID'])
    .apply(lambda grp: np.average(grp['Rating'], weights=grp['NumberOfRatings']))
    .reset_index()
)
agg_rating_avg.columns = ['MovieID', 'RatingMean']

movies_df = movies_df.merge(agg_rating_avg, on='MovieID', how='left')
movies_df.head()

Unnamed: 0,MovieID,Title,Year,RatingMean
0,1,The Shawshank Redemption,1994,9.252276
1,2,The Godfather,1972,9.272844
2,3,The Dark Knight,2008,9.035001
3,4,The Godfather Part II,1974,9.117343
4,5,12 Angry Men,1957,8.897516


In [23]:
# Genre labels for the genre table written to Output/genres.csv; these are the
# same labels, in the same order, as the indicator columns of mov_genres_df.
genres = [
    "Action",
    "Adventure",
    "Animation",
    "Biography",
    "Comedy",
    "Crime",
    "Drama",
    "Family",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "History",
    "Music",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Sport",
    "Thriller",
    "War",
    "Western",
    "(no_genres_listed)"
    ]

# Single-column frame (column 'Genre') built from the list above.
genres_df = pd.DataFrame(genres, columns=['Genre'])
genres_df.head()

Unnamed: 0,Genre
0,Action
1,Adventure
2,Animation
3,Biography
4,Comedy


In [24]:
users_movies_df = ratings_data
users_movies_df.head()

Unnamed: 0,MovieID,SourceSite,Rating,NumberOfRatings
0,1,IMDb,9.2,2609677
1,1,RottenTomatoes,9.8,250000
2,1,Metacritic,9.0,2013
3,2,IMDb,9.2,1805379
4,2,RottenTomatoes,9.8,250000


In [25]:
movies_genres_df = movies.drop('Title', axis = 1)

In [26]:
def get_movie_genres(movieId):
    """Return one (MovieID, Genre) row per genre listed for ``movieId``.

    Looks the movie up in the module-level ``movies_genres_df`` and splits
    its ``Genre`` string on ``|``.
    """
    matches = movies_genres_df.loc[movies_genres_df['MovieID'] == movieId, 'Genre']
    labels = []
    for genre_string in matches.tolist():
        labels.extend(genre_string.split('|'))
    rows = pd.DataFrame(labels, columns=['Genre'])
    # MovieID goes first so the column order is MovieID, Genre.
    rows.insert(loc=0, column='MovieID', value=movieId)
    return rows

In [27]:
# Start from an empty frame that fixes the column layout, then append all
# per-movie genre rows with a single concat: calling pd.concat inside the loop
# copies all previously accumulated rows on every iteration.
movies_genres = pd.DataFrame(columns=['MovieID','Genre'])
genre_frames = [get_movie_genres(movie_id) for movie_id in movies_genres_df['MovieID'].tolist()]
movies_genres = pd.concat([movies_genres, *genre_frames], axis=0)
movies_genres.head()

Unnamed: 0,MovieID,Genre
0,1,Drama
0,2,Drama
1,2,Crime
0,3,Drama
1,3,Crime


### Generating the CSV files required for the Knowledge Graph

In [28]:
movies_df.to_csv("Output/movies.csv", sep=',', header=True, index=False)

In [29]:
genres_df.to_csv("Output/genres.csv", sep=',', header=True, index=False)

In [30]:
movies_genres.to_csv("Output/movies_genres.csv", sep=',', header=True, index=False)

In [31]:
movies_similarity.to_csv("Output/movies_similarity.csv", sep=',', header=True, index=False)

### Connecting the code to Neo4j Database and Running a Sample Query

Proceed to this part of the code only after creating the Neo4j Aura database and saving the credentials environment file in the project folder.

In [32]:
# Load the Aura credentials file into the process environment.
dotenv.load_dotenv("credentials-04d4e7f0.env")

NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
AURA_INSTANCENAME = os.environ.get("AURA_INSTANCENAME")

# Stop here with a clear message when a connection setting is absent, instead
# of handing None to the driver in the next cell.
missing_settings = [
    name
    for name, value in (
        ("NEO4J_URI", NEO4J_URI),
        ("NEO4J_USERNAME", NEO4J_USERNAME),
        ("NEO4J_PASSWORD", NEO4J_PASSWORD),
    )
    if not value
]
if missing_settings:
    raise RuntimeError("Missing Neo4j settings: " + ", ".join(missing_settings))

In [33]:
# Open the connection using the settings read from the environment above.
driver = GraphDatabase.driver(
    NEO4J_URI,
    auth=(NEO4J_USERNAME, NEO4J_PASSWORD)
)

# This session and the driver are closed at the end of the query cell below.
session = driver.session()

In [34]:
# Five highest-scoring movies linked to MovieID 14 by a `Similar to`
# relationship, together with their mean rating and release year.
query = '''MATCH (m:Movie{MovieID: 14})-[r1:`Similar to`]->(s:Movie)-[r2:`Released in`]->(y:Year) 
return s.Title, r1.SimilarityScore, s.RatingMean, y.Year 
ORDER BY r1.SimilarityScore DESC LIMIT 5'''

try:
    result = session.run(query)

    # One tuple per record: (title, similarity in percent, mean rating, year).
    similar_movies = [
        (
            record["s.Title"],
            round(float(record["r1.SimilarityScore"]) * 100, 2),
            round(float(record["s.RatingMean"]), 2),
            record["y.Year"],
        )
        for record in result
    ]

    print(similar_movies)
finally:
    # Close the session and driver even when the query or the conversion fails.
    session.close()
    driver.close()

[('The Lord of the Rings: The Return of the King', 100.0, 8.86, 2003), ('The Lord of the Rings: The Fellowship of the Ring', 100.0, 8.89, 2001), ('Gladiator', 99.99, 8.53, 2000), ('Avengers: Endgame', 99.98, 8.33, 2019), ('Top Gun: Maverick', 92.66, 8.75, 2022)]
