In [1]:
# Welcome to Project MovieMind - A MOVIE RECOMMENDER WITH MOVIE GENRE AND RATINGS
# Created by Manufacturing Engineering Technology Students - MANTECH 4AI3 : Artificial Intelligence
# Group Memebers: Nikesh Sethuraman - Muhammad Shah - Dylan Miguel and Calvin Hu
# Note: Kindly used add all csv and database files to run the program smoothly
# These are the names of the database csv files to upload: genome-scores, genome-tags, links, movies, ratings and tags.

import pandas as pd # imports the pandas library and gives it an alias
import sklearn # imports the scikit-learn library for machine learning algorithms
import ipywidgets as widgets # imports the ipywidgets library for interactive widgets
from IPython.display import display # imports the display function from IPython.display module
import re # imports the regular expressions module
#After importing these modules, the code reads two CSV files into pandas DataFrame objects.
#The first CSV file is named movies.csv, and the second CSV file is named ratings.csv.
#Both files are stored in the DataFrame objects named movies and ratings, respectively
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

def clean_title(title):# This function takes a movie title as input, removes any characters that are not alphanumeric or spaces
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title

def clean_genre(genre_str): # This function takes a string of movie genres as input, converts the string to lowercase, removes any non-alphanumeric characters
    genre_str = genre_str.lower()
    genre_str = re.sub('[^a-zA-Z0-9\s]', '', genre_str)
    genre_str = re.sub('\s+', ' ', genre_str)
    genre_str = genre_str.strip()
    return genre_str

#Function below applies title and genres columns of the DataFrame using the apply method. 
#The cleaned titles are stored in a new column called clean_title.
#The cleaned genre strings are stored in a new column called clean_genres.
movies["clean_title"] = movies["title"].apply(clean_title)
movies["clean_genres"] = movies["genres"].apply(lambda x: ' '.join(x.split('|'))) # The lambda function is used to split the multiple genres in the genres column 

# This section of code converts the cleaned movie titles and genres into numerical features that can be used for machine learning models.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer_title = TfidfVectorizer(ngram_range=(1,2)) # The vectorizer function will consider single words and pairs of consecutive words when creating the features.
vectorizer_genres = TfidfVectorizer()

tfidf_title = vectorizer_title.fit_transform(movies["clean_title"])
tfidf_genres = vectorizer_genres.fit_transform(movies["clean_genres"])

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# This section of code defines a search function that takes a movie title and a list of genres as inputs
# It then returns the top 5 movies that are most similar to the input based on their titles and genres.
def search(title, genres): #
    """Search function, using the sklearn cosine similarity."""
    title = clean_title(title)
    query_vec_title = vectorizer_title.transform([title]) # This line transforms the cleaned movie title into a matrix of numerical features
    query_vec_genres = None
    
    if genres is not None:
        genres = ' '.join(genres)
        genres = clean_genre(genres)
        query_vec_genres = vectorizer_genres.transform([genres])

    similarity_title = cosine_similarity(query_vec_title, tfidf_title).flatten() # Cosine similarity between the input movie title and all the movie titles
    
    if query_vec_genres is not None:
        similarity_genres = cosine_similarity(query_vec_genres, tfidf_genres).flatten()
        similarity = similarity_title + similarity_genres
    else:
        similarity = similarity_title

    indices = np.argpartition(similarity, -5)[-5:] # This finds the indices of the 5 movies in the movies DataFrame with the highest similarity scores. 
    results = movies.iloc[indices].iloc[::-1] #Argpartition function does partition to the similarity scores array, and then take the last 5 indices.
    
    return results

# This section analyses movie genres and names using natural language processing techniques, and it calculates movie similarity using cosine similarity.
def find_rec_list_mov(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    
    rec_percent = pd.concat([similar_user_recs, all_users_recs], axis = 1)
    rec_percent.columns = ["similar", "all"]
    
    rec_percent["score"] = rec_percent["similar"] / rec_percent["all"]
    
    rec_percent = rec_percent.sort_values("score", ascending = False)
    
    return rec_percent.head(10).merge(movies, left_index = True, right_on = "movieId")[["score", "title", "genres"]]

# This section creates a list of unique movie genres from the genres column of the movies dataframe
# It also creates two widgets: one for inputting the movie title and another for selecting the movie genres
# Also creates a widget output that will be used to display the recommended movies.
genre_list = sorted(list(set('|'.join(movies["genres"]).split('|'))))
movie_input_name = widgets.Text(
    value = "Toy Story", 
    description = "Movie Title:",
    disabled = False
)
movie_input_genres = widgets.SelectMultiple(
    options = sorted(list(movies["clean_genres"].unique())),
    description = "Movie Genres:",
    disabled = False
)
rec_list = widgets.Output()

# The movie_input_name widget is displayed along with the movie_input_genres and rec_list widgets.
def on_type(data):# Whenever the widget is used or updated, this function is utilized.
    with rec_list:
        rec_list.clear_output()
        title = data["new"]
        genres = movie_input_genres.value
        if len(title)>5 or len(genres)>0:
            results = search(title, genres)
            movie_id = results.iloc[0]["movieId"]
            display(find_rec_list_mov(movie_id))
            
movie_input_name.observe(on_type, names = "value")
display(movie_input_name, movie_input_genres, rec_list)

Text(value='Toy Story', description='Movie Title:')

SelectMultiple(description='Movie Genres:', options=('(no genres listed)', 'Action', 'Action Adventure', 'Acti…

Output()