In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ipywidgets as widgets
from IPython.display import display

# Data collection

In [3]:
# Load the two CSV tables from the local "dataset" directory (relative paths, so
# they resolve against the notebook's working directory).
# `movies` is read later for its "movieId", "title" and "genres" columns.
movies = pd.read_csv("dataset/movies.csv")
# `ratings` is read later for its "userId", "movieId" and "rating" columns.
ratings = pd.read_csv("dataset/ratings.csv")

# Data pre-processing

In [5]:
def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Parameters
    ----------
    title : str
        Raw movie title or user query.

    Returns
    -------
    str
        The input with every other character (punctuation, brackets, accented
        letters, ...) deleted. Case and spacing are left untouched.
    """
    disallowed = "[^a-zA-Z0-9 ]"
    stripped = re.sub(disallowed, "", title)
    return stripped
# Add a "clean_title" column holding each title after clean_title() has stripped
# its punctuation. Note: this mutates `movies` in place.
movies["clean_title"] = movies["title"].apply(clean_title)

# Movie title vectorization (for the search algorithm)

In [6]:
# Fit a TF-IDF model over the cleaned titles using single words and adjacent word
# pairs (ngram_range=(1, 2)), so a query can match on short phrases as well as on
# individual words. TfidfVectorizer is already imported in the notebook's first cell.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# One TF-IDF row per movie, in the same row order as `movies`.
movies_vec = vectorizer.fit_transform(movies["clean_title"])

# Search engine

In [7]:
def search(title, n=5):
    """Return the movies whose titles are most similar to ``title``.

    The query is cleaned with ``clean_title``, projected into the TF-IDF space of
    the fitted ``vectorizer`` and compared with every row of ``movies_vec`` by
    cosine similarity.

    Parameters
    ----------
    title : str
        Free-text query typed by the user.
    n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``n`` rows of ``movies``, ordered from best to worst match. Rows
        whose similarity is zero are dropped, so the frame is empty when the
        query shares no term with any title.
    """
    title = clean_title(title)
    input_vec = vectorizer.transform([title])  # TF-IDF vector of the user input
    # movies_vec holds one vector per movie; flatten to a 1-D score per movie.
    similarity = cosine_similarity(input_vec, movies_vec).flatten()

    # argpartition raises if asked for more elements than exist.
    n = min(n, similarity.size)
    if n <= 0:
        return movies.iloc[:0]

    # argpartition only guarantees that the last n positions hold the n largest
    # scores; their relative order is undefined, so sort them explicitly.
    top = np.argpartition(similarity, -n)[-n:]
    top = top[np.argsort(-similarity[top], kind="stable")]

    # A score of zero means no shared term at all: not a match.
    top = top[similarity[top] > 0]
    return movies.iloc[top]

# Search engine UI

In [8]:
# Output area that holds the current search results.
movie_output = widgets.Output()

# Text box the user types the query into.
movie_input = widgets.Text(
    value=None,
    description="Search: ",
    disabled=False
)


def on_type(data):
    """Event handler: refresh the search results whenever the text box changes.

    It fires on every change of the input's value, i.e. once per typed
    character (first character, first two characters, ...). ``data["new"]``
    carries the current text.
    """
    with movie_output:
        # Drop whatever the previous keystroke rendered.
        movie_output.clear_output()
        query = data["new"]
        if len(query) < 3:
            return
        display(search(query))


# Observe the input's "value" so on_type runs on each edit, then show the widgets.
movie_input.observe(on_type, names="value")
display(movie_input, movie_output)

Text(value='', description='Search: ')

Output()

# Recommender

In [16]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies liked disproportionately by users who liked ``movie_id``.

    "Liked" means a rating strictly greater than ``min_rating``.

    Steps:
      1. Find the users who liked ``movie_id`` (the "similar users").
      2. For every movie, compute the share of similar users who liked it and
         keep those above ``min_share`` ("similar").
      3. For those candidates, divide each movie's number of likes by the number
         of distinct users who liked any candidate ("all").
      4. Rank by ``similar / all`` and return the ``top_n`` best.

    The queried movie itself is not excluded from the candidates.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating must be strictly greater than this to count as a like.
    min_share : float, default 0.10
        Minimum share of similar users that must like a movie for it to be
        considered.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``genres`` for the recommended movies; empty when
        no user liked ``movie_id``.
    """
    # Filter the ratings table once instead of re-evaluating the mask per step.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: nothing to base a recommendation on.
        return movies.iloc[:0][["title", "genres"]]

    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A high score means the movie is far more popular among similar users than
    # among everyone who liked any of the candidates.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["title", "genres"]]

# Recommender UI

In [17]:
# Text box for the movie the recommendations should be based on.
movie_name_input = widgets.Text(
    value=None,
    description="Movie Title: ",
    disabled=False
)
# Output area that holds the current recommendations.
recommendations = widgets.Output()


# Named distinctly from the search box's `on_type` handler so that this cell does
# not shadow it.
def on_movie_title_change(data):
    """Event handler: refresh the recommendations whenever the text box changes.

    ``data["new"]`` carries the current text. The first row returned by
    ``search`` is taken as the movie the user means, and its ``movieId`` is
    passed to ``find_similar_movies``.
    """
    with recommendations:
        # Drop whatever the previous keystroke rendered.
        recommendations.clear_output()
        title = data["new"]
        # NOTE(review): the search box reacts at len >= 3 while this one needs
        # more than 3 characters; confirm whether the difference is intended.
        if len(title) > 3:
            results = search(title)
            if not results.empty:
                movie_id = results.iloc[0]["movieId"]
                display(find_similar_movies(movie_id))
            else:
                print("No results found for the given title.")


# Observe the input's "value" so the handler runs on each edit, then show the widgets.
movie_name_input.observe(on_movie_title_change, names="value")
display(movie_name_input, recommendations)

Text(value='', description='Movie Title: ')

Output()