In [2]:
import pandas as pd

# Movie metadata file, resolved relative to the notebook's working directory.
movies_path = 'movies.csv'
movies = pd.read_csv(movies_path)

In [3]:
# Preview the loaded table (the notebook renders only the first and last rows).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [12]:
import re

def clean_title(title):
    """Return ``title`` with every character that is not an ASCII letter or
    digit replaced by a single space.

    The replacement is one-for-one, so the result has the same length as the
    input and runs of punctuation become runs of spaces.
    """
    non_alphanumeric = re.compile("[^a-zA-Z0-9]")
    return non_alphanumeric.sub(" ", title)

In [13]:
# Run every title through clean_title and store the result as a new column.
cleaned_titles = movies['title'].apply(clean_title)
movies['clean_title'] = cleaned_titles

In [14]:
# Show the table again to confirm the new clean_title column.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1, 2) makes the vectorizer index single words as well as pairs of
# consecutive words, so multi-word titles match more precisely than with single
# words alone.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit the vectorizer on the cleaned titles and encode them as a TF-IDF matrix.
tfidf = vectorizer.fit_transform(movies['clean_title'])

In [27]:
# Compute the similarity between the search term we enter and every movie title in our list
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is normalised with ``clean_title``, encoded with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, optional
        Maximum number of matches to return (default 5). Values larger than
        the number of indexed titles are clamped; values <= 0 give an empty
        result.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` for the closest matches, ordered from most to
        least similar.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    # Compare the query to each of the titles in the dataset.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist: argpartition raises when kth is out
    # of bounds.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the k largest scores end up in the
    # last k slots; their relative order there is undefined, so rank them
    # explicitly to put the most similar title first.
    candidates = np.argpartition(similarity, -k)[-k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

In [28]:
# Try the search on a sample query. query_vec, similarity, indices and results
# are local to search() and are not defined at notebook level, so referencing
# them here fails on a fresh kernel; call the function and display its result.
search('Toy Story')

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [26]:
# Build an interactive search box
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a title into, pre-filled with a sample query.
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False,
)
# Output area that the search results are rendered into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the displayed search results when the text box value changes.

    ``data`` is the change notification handed over by ``observe``; the newly
    typed text is read from its ``'new'`` entry. Input of five characters or
    fewer only clears the previous results.
    """
    with movie_list:
        # Drop whatever was rendered for the previous value.
        movie_list.clear_output()
        typed = data['new']

        if len(typed) <= 5:
            return
        display(search(typed))

# Call on_type whenever the text box's `value` changes.
movie_input.observe(on_type, names='value')

# Render the text box followed by the results area.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [30]:
# Find movies similar to the one we searched for.
# Load the user ratings and check the column types. Only the last expression
# of a cell is rendered, so dtypes is the single thing displayed here.
ratings = pd.read_csv('ratings.csv')
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [31]:
# movieId 1 is "Toy Story (1995)" in the movies table displayed earlier.
movie_id = 1

In [35]:
# Users who gave the chosen movie a rating of at least 4.
liked_chosen_movie = (ratings['movieId'] == movie_id) & (ratings['rating'] >= 4)
similar_users = ratings[liked_chosen_movie]['userId'].unique()

In [36]:
# Inspect the user IDs that were selected.
similar_users

array([    36,     75,     86, ..., 162518, 162519, 162530], dtype=int64)

In [40]:
# Every movie that one of the similar users rated at least 4
# (one entry per rating row, so popular movies appear many times).
from_similar_user = ratings['userId'].isin(similar_users)
rated_highly = ratings['rating'] >= 4
similar_user_recs = ratings[from_similar_user & rated_highly]['movieId']
similar_user_recs

5101           1
5104          11
5105          34
5106          46
5108          60
            ... 
24998389    3735
24998390    3751
24998391    3763
24998392    4187
24998393    4321
Name: movieId, Length: 1783202, dtype: int64

In [44]:
# Number of high ratings each movie received from the similar users, divided
# by the number of similar users (a share of users, assuming each user rates a
# movie at most once -- TODO confirm).
# NOTE: similar_user_recs is rebound below, so re-running this cell on its own
# would count the already-aggregated values instead of the raw movie IDs.
liked_share = similar_user_recs.value_counts() / len(similar_users)

# Keep only the movies whose share exceeds 10%.
similar_user_recs = liked_share[liked_share > .1]
similar_user_recs

1       1.000000
260     0.521102
318     0.516733
356     0.502443
296     0.458315
          ...   
1358    0.101733
1485    0.101659
16      0.101140
899     0.100326
8874    0.100252
Name: movieId, Length: 256, dtype: float64