Reading Movie Dataset using Pandas

In [1]:
import pandas as pd

# Load the movie metadata; the displayed frame has movieId, title and genres columns
movie = pd.read_csv("movie.csv")
movie

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


**CLEANING THE TITLES AND BUILDING A SEARCH ENGINE**

Cleaning Movie Titles using Regular Expressions

In [2]:
import re

# Creating a function to clean titles
def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Characters are dropped, not replaced, so punctuation, parentheses and
    accented letters simply disappear: ``"Toy Story (1995)"`` becomes
    ``"Toy Story 1995"``.
    """
    disallowed = re.compile(r"[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [3]:
# Store a cleaned copy of every title next to the original in the movie dataframe
movie["clean_title"] = movie["title"].map(clean_title)
movie

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy,Kein Bund frs Leben 2007
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,Feuer Eis Dosenbier 2002
27275,131258,The Pirates (2014),Adventure,The Pirates 2014
27276,131260,Rentun Ruusu (2001),(no genres listed),Rentun Ruusu 2001


Creating A TF-IDF Matrix

In [4]:
# TF-IDF = term frequency * inverse document frequency; the idf part gives more weight to terms that occur in few titles
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range = (1,2) indexes single words as well as pairs of consecutive words,
# so a multi-word query can also match on word pairs, which sharpens the search
vectorizer = TfidfVectorizer(ngram_range = (1,2))

# Fit the vectorizer on the cleaned titles and turn them into a matrix of TF-IDF weights (one row per title)
tfidf = vectorizer.fit_transform(movie["clean_title"])

Creating A Search Function

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the rows of ``movie`` whose titles best match ``title``.

    The query is cleaned with ``clean_title``, encoded with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` using cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of matches to return. Fewer rows are returned when the
        dataset itself is smaller.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movie``, ordered from most to least similar,
        so ``.iloc[0]`` is the best match.
    """
    title = clean_title(title)

    # Encode the query in the same vector space as the titles
    query_vec = vectorizer.transform([title])

    # One similarity value per movie; flatten() collapses the (1, n) result to 1-D
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist (argpartition raises otherwise)
    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return movie.iloc[:0]

    # argpartition only guarantees WHICH entries are the top_n largest, not their
    # order, so the candidates are sorted explicitly by similarity afterwards.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]

    return movie.iloc[ranked]


Building an Interactive Search Box with Jupyter

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Creating an input text widget
# NOTE: `value` is the initial content of the box, so the prompt text is real input that the user overwrites
movie_input = widgets.Text( 
    value = "Enter a movie name", 
    description = "Movie Title: ", 
    disabled = False
)

# Creating an output widget in which the search results are shown
movie_list = widgets.Output()

# Function that is called whenever the text in the input widget changes
def on_type(data):
    """Show the search results for the text currently in the input widget.

    ``data`` is the change notification passed in by ``observe``; its ``"new"``
    entry is the updated text. Text of two characters or fewer only clears the
    previously shown results.
    """
    # Anything displayed inside this context is rendered in the output widget
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 2:
            return
        display(search(query))

# Hooking up the input widget with the on_type function: it runs on every change of the widget's `value`
movie_input.observe(on_type, names = 'value')
# Displaying both the widgets
display(movie_input, movie_list)

Text(value='Enter a movie name', description='Movie Title: ')

Output()

**BUILDING THE ACTUAL RECOMMENDATION SYSTEM**

Reading In The Rating Dataset

In [7]:
# Load the user ratings; each row holds a userId, a movieId, the rating given and its timestamp
rating = pd.read_csv("rating.csv")
rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


In [8]:
# Inspect the column dtypes; timestamp is read in as a plain object column
rating.dtypes

userId         int64
movieId        int64
rating       float64
timestamp     object
dtype: object

Finding Users Who Liked The Same Movie

In [9]:
# Convert the timestamp column's dtype from object to datetime64.
# errors = 'coerce' turns any value that cannot be parsed into NaT instead of raising.
rating["timestamp"] = pd.to_datetime(rating.timestamp, errors = 'coerce')

In [10]:
# Check the dtypes again to confirm that timestamp is now datetime64[ns]
rating.dtypes

userId                int64
movieId               int64
rating              float64
timestamp    datetime64[ns]
dtype: object

In [11]:
# Hard-coded movie id (1 = Toy Story) used to walk through the steps below
movie_id = 1

# Distinct ids of the users who rated this movie 4 or higher, i.e. who liked the same movie as us
similar_users = rating.loc[
    rating["movieId"].eq(movie_id) & rating["rating"].ge(4),
    "userId",
].unique()
similar_users

array([     3,      6,      8, ..., 138481, 138483, 138486], dtype=int64)

In [12]:
# Movie ids of every rating of 4 or higher given by the similar users
similar_user_recommendations = rating.loc[
    rating["userId"].isin(similar_users) & rating["rating"].ge(4),
    "movieId",
]
similar_user_recommendations

236             1
238            32
239            50
242           175
244           223
            ...  
19999506    53953
19999507    54771
19999508    55232
19999509    55282
19999510    56174
Name: movieId, Length: 4223798, dtype: int64

In [13]:
# Count how many similar users liked each movie, divide by the number of similar
# users to get a share, and keep only the movies liked by more than 10% of them
similar_user_recs = (
    similar_user_recommendations
    .value_counts()
    .div(len(similar_users))
    .loc[lambda share: share > 0.1]
)
similar_user_recs

movieId
1       1.000000
318     0.527783
260     0.522917
296     0.492011
356     0.490028
          ...   
3408    0.101159
899     0.101129
3421    0.100949
708     0.100799
910     0.100318
Name: count, Length: 267, dtype: float64

Finding How Much All Users Like Movies

In [14]:
# Finding how much all users liked the candidate movies kept in similar_user_recs.
# NOTE(review): this filter keeps ratings strictly above 4, while the similar-user
# filters in the earlier cells use >= 4. Both shares feed the same ratio, so confirm
# that the two thresholds are really meant to differ.
all_users = rating[(rating["movieId"].isin(similar_user_recs.index)) & (rating["rating"] > 4)]

# Share of users recommending each candidate movie; the denominator is the number of
# distinct users left in all_users after the filter above, not every user in the dataset
all_users_recommendations = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
all_users_recommendations

movieId
318     0.307220
296     0.269810
593     0.221542
527     0.212621
356     0.210485
          ...   
2302    0.018750
586     0.018478
2321    0.017114
2       0.016024
708     0.013010
Name: count, Length: 267, dtype: float64

Creating A Recommendation Score

In [15]:
# Put both series side by side (aligned on movieId) so the two shares can be compared
# We want movies that have a big difference between these two numbers
rec_percentages = pd.concat([similar_user_recs, all_users_recommendations], axis = 1)
# Column order follows the concat order: similar users first, all users second
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.122361
318,0.527783,0.307220
260,0.522917,0.209221
296,0.492011,0.269810
356,0.490028,0.210485
...,...,...
3408,0.101159,0.020325
899,0.101129,0.034486
3421,0.100949,0.027436
708,0.100799,0.013010


In [16]:
# Score = share among similar users divided by share among all users.
# The higher the score, the better the recommendation, so the rows are
# ordered with the biggest score first.
rec_percentages = (
    rec_percentages
    .assign(score=lambda pct: pct["similar"] / pct["all"])
    .sort_values("score", ascending=False)
)

In [17]:
# Taking top 10 recommendations and merging them with the movie dataset;
# rec_percentages is indexed by movie id, so its index is matched against movie["movieId"]
rec_percentages.head(10).merge(movie, left_index = True, right_on = "movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.122361,8.172546,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
696,0.100799,0.01301,7.747789,708,"Truth About Cats & Dogs, The (1996)",Comedy|Romance,Truth About Cats Dogs The 1996
2270,0.202919,0.026792,7.573747,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
640,0.209798,0.031798,6.597806,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,Mission Impossible 1996
1,0.103652,0.016024,6.46857,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2236,0.107437,0.017114,6.27754,2321,Pleasantville (1998),Comedy|Drama|Fantasy,Pleasantville 1998
436,0.127591,0.020606,6.192057,440,Dave (1993),Comedy|Romance,Dave 1993
586,0.232594,0.038394,6.058099,592,Batman (1989),Action|Crime|Thriller,Batman 1989
2711,0.167388,0.027663,6.050878,2797,Big (1988),Comedy|Drama|Fantasy|Romance,Big 1988
2901,0.165405,0.027527,6.008827,2987,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...,Who Framed Roger Rabbit 1988


Building A Recommendation Function

In [18]:
# Putting it all together in a recommendation function

# Output of this function will be a dataframe that you can display in the widget created in the final step
def find_similar_movies(movie_id, min_rating=4, min_share=0.1, top_n=10):
    """Recommend movies liked by the users who also liked ``movie_id``.

    A rating counts as a "like" when it is at least ``min_rating``. The same
    threshold is applied to the similar users and to the all-user baseline;
    mixing ``>= 4`` with ``> 4`` would compare two different definitions of
    "liked" and inflate every score.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        Lowest rating that counts as liking a movie.
    min_share : float, default 0.1
        A movie is only considered when more than this share of the similar
        users liked it.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, best score first. The
        frame is empty when nobody liked ``movie_id``.
    """
    # Computed once and reused by every filter below
    liked = rating["rating"] >= min_rating

    # Users who liked the given movie
    similar_users = rating[liked & (rating["movieId"] == movie_id)]["userId"].unique()
    if len(similar_users) == 0:
        # Nothing to compare against; also avoids dividing by zero below
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Share of the similar users that liked each movie, keeping the popular ones only
    similar_user_recs = rating[liked & rating["userId"].isin(similar_users)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Baseline: share of users (among those who liked any candidate) that liked each candidate
    all_users = rating[liked & rating["movieId"].isin(similar_user_recs.index)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A high ratio means the movie is liked far more by similar users than by users in general
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # Selecting only three columns to show - "score", "title", and "genres"
    top = rec_percentages.head(top_n).merge(movie, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]

The Final Part - Creating An Interactive Recommendation Widget

In [19]:
# Creating an input text widget for the movie the recommendations are based on
movie_name_input = widgets.Text(
    value = "Enter a movie name",
    description = "Movie Title",
    disabled = False
)

# Creating an output widget in which the recommendations are shown
recommendation_list = widgets.Output()

# Callback for the recommendation box. It rebinds the name on_type; the search box
# created earlier keeps the callback object it was registered with.
def on_type(data):
    """Show recommendations for the movie that best matches the typed title.

    ``data`` is the change notification passed in by ``observe``; its ``"new"``
    entry is the updated text. Text of two characters or fewer only clears the
    previously shown recommendations.
    """
    with recommendation_list:
        recommendation_list.clear_output()

        # Grabbing the title from the input widget
        typed_title = data["new"]
        if len(typed_title) <= 2:
            return

        # The first row of the search result is treated as the best match
        best_match = search(typed_title).iloc[0]

        # Recommend from that movie's id and display the resulting dataframe
        display(find_similar_movies(best_match["movieId"]))

# Run on_type whenever the `value` of movie_name_input changes
movie_name_input.observe(on_type, names = "value")

# Displaying the widgets
display( movie_name_input, recommendation_list)

Text(value='Enter a movie name', description='Movie Title')

Output()