# Movies Recommendation System


In [1]:
#Imports 
import re # Regular expression lib
import unicodedata

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer #python ML lib
from sklearn.metrics.pairwise import cosine_similarity

import ipywidgets as widgets
from IPython.display import display

In [2]:
# Load the movie catalogue; the relative path means movies.csv must sit in the working directory.
movies = pd.read_csv("movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
def clean_title(title):
    """Return ``title`` reduced to ASCII letters, digits and spaces.

    The title is first NFKD-normalised so that accented letters are split
    into a base letter plus a combining mark. The mark is then removed by
    the character filter while the base letter survives, e.g. "Amélie"
    becomes "Amelie" instead of "Amlie". Pure-ASCII titles are cleaned
    exactly as before.

    Parameters
    ----------
    title : str
        Raw movie title or search query.

    Returns
    -------
    str
        The title with every character outside ``[a-zA-Z0-9 ]`` removed.
    """
    decomposed = unicodedata.normalize("NFKD", title)
    return re.sub("[^a-zA-Z0-9 ]", "", decomposed)
# Add the cleaned title to the data as a new column.
movies["clean_title"] = movies["title"].apply(clean_title)
# Why not `movies["clean_title"] = clean_title(movies["title"])`?
# clean_title works on a single string, not on a whole Series;
# .apply calls it once per title instead.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


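Search engine:
* We need a TF-IDF matrix: each title is converted into a vector of term weights (term frequency × inverse document frequency).
* The inverse document frequency down-weights terms that appear in many titles, so rarer, more distinctive words count more when we compute the similarity between the input and each title and pick the best matches.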



In [4]:
# ngram_range=(1,2) makes the vectorizer use single words and pairs of
# adjacent words as terms, so taking two words together makes the search more accurate.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Fit on the cleaned titles and keep the resulting TF-IDF matrix for searching.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [5]:
# compute similarity between input (title) and all movies  
def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    ``vectorizer`` and compared with every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text title typed by the user.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies`` ordered by descending similarity,
        so ``.iloc[0]`` is the closest match. Fewer rows are returned when
        the catalogue holds fewer than ``top_n`` movies.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Clamp k: argpartition raises when asked for more elements than exist.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[:0]

    # argpartition only guarantees that the last top_n positions hold the
    # top_n largest scores, in arbitrary order; rank them explicitly so the
    # first returned row really is the best match.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies.iloc[ranked]

In [6]:
# Load the user ratings; the relative path means ratings.csv must sit in the working directory.
ratings = pd.read_csv("ratings.csv")

In [7]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [10]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies liked by the users who also liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``min_rating``. For each candidate movie two shares are computed:

    * ``similar`` - the fraction of users who liked ``movie_id`` and also
      liked the candidate;
    * ``all`` - likes of the candidate divided by the number of distinct
      users who liked at least one candidate.

    Candidates are ranked by ``score = similar / all``.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the reference movie.
    min_rating : float, default 4
        Ratings must be strictly above this value to count as a like.
    min_share : float, default 0.10
        Candidates must have a ``similar`` share strictly above this value.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for the ``top_n``
        highest-scoring candidates, best first.
    """
    # Filter the high ratings once instead of rebuilding the same boolean
    # mask over the whole ratings table for each of the three lookups below.
    liked = ratings.loc[ratings["rating"] > min_rating, ["userId", "movieId"]]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]


In [12]:
# Interactive search box: a text field pre-filled with "Toy Story".
movie_input = widgets.Text(
    value="Toy Story", 
    description = "Movie Title: ",
    disabled = False
)
# Output area that on_type clears and prints the recommendations into.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh the recommendations for the text currently in the search box.

    ``data`` is the change dictionary passed by the widget observer; only
    its ``"new"`` entry (the current text) is read. The output area is
    always cleared. Nothing more happens until the text is longer than five
    characters; after that the first search hit is used as the reference
    movie and its recommendations are printed into the output area.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        best_match = search(query).iloc[0]  # first row of the search results
        print(find_similar_movies(best_match["movieId"]))
            


# Call on_type whenever the text box's value changes.
movie_input.observe(on_type, names='value')

# Show the text box followed by the recommendations output area.
display(movie_input, recommendation_list)

Text(value='Toy Story', description='Movie Title: ')

Output()

## Next steps:
Improve the quality of the recommendations:
* Use genres.
* Use tags and other metadata.