In [84]:
# Loading useful libraries

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [99]:
# Loading the movie data set to we'll use to build our recommendation system

df_movies = pd.read_csv("ml-25m/movies.csv")
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [86]:
# clean the titles of movies by removing stop words and punctuations

import re

def clean_title(title): 
    return re.sub("[^a-zA-Z0-9]", " ", title)


In [100]:
# creating a column for our cleaned title

df_movies["clean_title"] = df_movies["title"].apply(clean_title)
df_movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [101]:
# this line allows for searchign words by using a range (our range in this case is 2)
# teh vectorizer transforms our titles into a matrix of numbers since that's the language computers understand. 

vectorizer = TfidfVectorizer(ngram_range= (1,2))

# tfidf is an algorithms that calculates the frequency of a term in a corpus of documents to determine its level of significance 
# fit_transform also learns about the vocabulary of our title. Vectorizer fit_transform is a method in scikit-learn that converts a collection of text documents into a numerical matrix. This matrix can then be used as input for machine learning models. The process of converting text into numbers is known as vectorization.
tfidf = vectorizer.fit_transform(df_movies['clean_title']) 

In [89]:
# creating a search engine 

def search(title):
    title = clean_title(title)

    query_vec = vectorizer.transform([title]) # transforms our title into a matrix of numbers
    similarity = cosine_similarity(query_vec, tfidf).flatten() # calculates the similaritity between the title we're searching for with other titles in our dataset
    indices = np.argpartition(similarity, -5) [-5:]
    results = df_movies.iloc[indices][::-1]
    return results

search("Avatar")

Unnamed: 0,movieId,title,genres,clean_title
14102,72998,Avatar (2009),Action|Adventure|Sci-Fi|IMAX,Avatar 2009
19997,103676,My Avatar and Me (Min Avatar og mig) (2010),Documentary,My Avatar and Me Min Avatar og mig 2010
46636,172851,Avatar: Creating the World of Pandora (2010),Documentary,Avatar Creating the World of Pandora 2010
20807,107565,"Fuck You, Goethe (Fack Ju Göhte) (2013)",Comedy,Fuck You Goethe Fack Ju G hte 2013
20806,107563,"Princess for Christmas, A (2011)",Children|Comedy,Princess for Christmas A 2011


In [106]:
# creating an interactive search bar to enter that shows our search and the results

import ipywidgets as widget
from IPython.display import display

movie_input = widget.Text(
    value = "Avatar",
    description = "Movie Title:",
    disabled = False
)
movie_output = widget.Output()

def on_type(data):
    with movie_output:
        movie_output.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))


movie_input.observe(on_type, names = 'value')

display(movie_input, movie_output)

Text(value='Avatar', description='Movie Title:')

Output()

In [103]:
# uploading the rates from the ratings data set

ratings = pd.read_csv("ml-25m/ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [105]:
# creating to find similar movies to the ones we watched

def find_similar_movies(movie_id):
    # finding unique users who watched the same movie as us. This line gives the list of unique user-id of similar users
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

    # finding the movies users who watched the same movie as us watched
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    
    # value_counts returns a list with the count of occurence of unique items in our table (in our case, items are movies). 
    # It counts how many times each movies appears 
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    # since we have a high number of movies we can recommend, we only want to keep the top movies with the highest recommenedation rates
    # this allows us to only consider the movies with more than 10% recommendation rate
    similar_user_recs = similar_user_recs[similar_user_recs > .10]

    # Now, we wan tto find only the movies people with similar tastes as us also liked
    # We want to stay away from movies everybody likes. We want movies that are related to the one we watched and also recommended by similar users
    # we are trying to find the people who highly rated the movies that are our recommendation list
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

    # finding the percentage of all users watched the same movies we were recommended 
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # comparing the recommendation percentage. We want to compare how much a movie was liked by similar users from all users
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    # calculating the ratio between the recommendation score from similar users and all other users
    # the higher the score, the better the recommendation
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    # sorting the recommendation score by descending score
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # returning the top ten recommended movies
    return rec_percentages.head(10).merge(df_movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [97]:
# building an interactive search bar that will show our recommended movies

# import the widget library

import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()