In [2]:
import pandas as pd
# Load the movie catalogue (the cell output shows the columns movieId, title, genres).
movies = pd.read_csv("movies.csv")


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
import re

def clean_title(title) :
    """Return ``title`` with every character removed that is not an ASCII
    letter, a digit, or a space (e.g. "Toy Story (1995)" -> "Toy Story 1995").
    """
    # "^" inside the brackets negates the set, so this matches the characters to drop.
    unwanted_chars = re.compile('[^a-zA-Z0-9 ]')
    return unwanted_chars.sub("", title)



In [4]:
# Add a "clean_titles" column holding each title with clean_title() applied,
# e.g. "Toy Story (1995)" -> "Toy Story 1995". Series.apply runs the function
# on every element of the selected "title" column. Note: this adds the column
# to `movies` in place.
movies["clean_titles"] = movies["title"].apply(clean_title)

Unnamed: 0,movieId,title,genres,clean_titles
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer over unigrams (single words) and bigrams (adjacent word
# pairs), as selected by ngram_range=(1, 2).
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and encode every title as one
# row of a sparse matrix (one row per movie, one column per n-gram).
tfidf = vectorizer.fit_transform(movies["clean_titles"])


<62423x170073 sparse matrix of type '<class 'numpy.float64'>'
	with 446566 stored elements in Compressed Sparse Row format>

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose titles are most similar to ``title``, best match first.

    The query is cleaned with ``clean_title``, encoded with the fitted
    ``vectorizer``, and compared against every row of ``tfidf`` using cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, default 5
        Maximum number of matches to return. Clamped to the number of
        available titles.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered by decreasing similarity,
        so ``.iloc[0]`` is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])  # encode the query in the same feature space as tfidf
    similarity = cosine_similarity(query_vec, tfidf).flatten()  # one score per movie

    # Never ask for more results than there are movies (argpartition would raise).
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[:0]

    # argpartition only guarantees that the last k entries are the k largest;
    # it does NOT order them. Sort that small slice explicitly so the result
    # really is best-first.
    top = np.argpartition(similarity, -k)[-k:]
    top = top[np.argsort(similarity[top])[::-1]]
    return movies.iloc[top]

In [7]:
# Load the user ratings (the cell output shows the columns userId, movieId, rating, timestamp).
ratings = pd.read_csv("ratings.csv")

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [8]:
def find_similar_movies(movieId):
    """Recommend movies favoured by the users who liked ``movieId``.

    A rating of 4 or higher counts as "liked". For each candidate movie the
    score is

        (share of similar users who liked it) / (share of all users who liked it)

    where "similar users" are those who liked ``movieId`` and "all users" are
    those who liked at least one candidate. A high score means the movie is
    disproportionately popular among similar users rather than simply popular
    overall.

    Parameters
    ----------
    movieId : int
        ID of the movie to base the recommendations on.

    Returns
    -------
    pandas.DataFrame
        Up to 10 rows with the columns "recommendation score", "title" and
        "genres", highest score first. The queried movie itself may appear in
        the result. Empty if nobody liked ``movieId``.
    """
    liked = ratings["rating"] >= 4  # computed once, reused by every filter below

    # Users who liked the queried movie.
    similar_users = ratings.loc[liked & (ratings["movieId"] == movieId), "userId"].unique()
    if len(similar_users) == 0:
        # Nothing to base a recommendation on; also avoids dividing by zero below.
        return pd.DataFrame(columns=["recommendation score", "title", "genres"])

    # Every movie those users liked (one entry per user/movie rating).
    similar_users_records = ratings.loc[liked & ratings["userId"].isin(similar_users), "movieId"]

    # value_counts() maps movieId -> number of similar users who liked it;
    # dividing by the number of similar users turns that into a share.
    similar_user_recs = similar_users_records.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > 0.1]  # keep movies liked by more than 10%

    # Baseline: how popular the candidate movies are among all users.
    # The candidates are the movie IDs in similar_user_recs.index (NOT the row
    # labels of similar_users_records, which are positions in `ratings`).
    all_users = ratings[liked & ratings["movieId"].isin(similar_user_recs.index)]
    all_user_records = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Put both shares side by side (aligned on movieId) and take their ratio.
    rec_percent = pd.concat([similar_user_recs, all_user_records], axis=1)
    rec_percent.columns = ["similar", "all"]
    rec_percent["recommendation score"] = rec_percent["similar"] / rec_percent["all"]
    rec_percent = rec_percent.sort_values("recommendation score", ascending=False)

    # Attach title/genres for the ten best candidates and show only the
    # score, title and genres columns.
    top = rec_percent.head(10).merge(movies, left_index=True, right_on="movieId")
    return top[["recommendation score", "title", "genres"]]
    


In [9]:
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a movie title into; it starts out pre-filled with "Toy Story".
movie_input = widgets.Text(
    value = "Toy Story",
    description="Movie Title:",
    disabled=False
)

# Output area that on_type() clears and renders its results into.
recommendation = widgets.Output()

def on_type(data):
    """Observer callback: refresh the recommendations for the typed title.

    ``data`` is the change dictionary passed by the widget; its "new" entry
    holds the current text. Everything displayed here is captured by the
    ``recommendation`` output area, which is cleared first.
    """
    with recommendation:
        recommendation.clear_output()
        typed = data["new"]

        # Inputs shorter than 5 characters are ignored; the output stays cleared.
        if len(typed) < 5:
            return

        matches = search(typed)
        # The first search result is used as the movie to base recommendations on.
        best_match_id = matches.iloc[0]["movieId"]
        display(matches)
        display(find_similar_movies(best_match_id))

# Call on_type whenever the text box's `value` trait changes.
movie_input.observe(on_type, names='value')

# Render the input box followed by the output area.
display(movie_input, recommendation)


Text(value='Toy Story', description='Movie Title:')

Output()