# Movies Recommendation System


In [None]:
#Imports 
import numpy as np
import pandas as pd
import re # Regular expression lib
from sklearn.feature_extraction.text import TfidfVectorizer #python ML lib
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display

In [None]:
# Load the movie catalogue. Later cells read its "title", "movieId" and
# "genres" columns.
movies = pd.read_csv("data/movies.csv")
# Bare expression on the last line: the notebook renders the frame as the cell output.
movies

In [None]:
def clean_title(title):
    """Return ``title`` with every character removed that is not an ASCII
    letter, a digit, or a space."""
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)
# Store the cleaned title as a new column; the TF-IDF index is built from it.
movies["clean_title"] = movies["title"].apply(clean_title)
# .apply is required here: clean_title calls re.sub, which accepts a single
# string, so clean_title(movies["title"]) on the whole Series would raise a
# TypeError. .apply calls the function once per title instead.
movies

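Search engine:
* We need a TF-IDF matrix: every title is converted into a row of numbers, one column per term, weighted by how often the term occurs in that title (term frequency).
* We need the inverse document frequency (IDF): it down-weights terms that appear in many titles, so the similarity between the user's input and our data is driven by the distinctive words, which lets us pick the best match.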



In [None]:
# ngram_range=(1,2) indexes single words as well as pairs of consecutive
# words, which makes the search more accurate for multi-word titles.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Learn the vocabulary from the cleaned titles and encode them as a TF-IDF
# matrix with one row per title.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [None]:
# compute similarity between input (title) and all movies  
def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, projected into the fitted
    TF-IDF space with ``vectorizer`` and compared with every row of ``tfidf``
    by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text title typed by the user.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies`` ordered by decreasing similarity,
        so ``.iloc[0]`` is the closest match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist: argpartition raises when kth is out
    # of bounds for the array.
    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the top_n highest scores occupy the
    # last top_n slots; it does not order them, so rank them explicitly.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies.iloc[ranked]

In [None]:
# Load the user ratings. find_similar_movies filters this frame on its
# "userId", "movieId" and "rating" columns.
ratings = pd.read_csv("data/ratings.csv")

In [None]:
def find_similar_movies(movie_id):
    """Recommend movies that fans of ``movie_id`` like unusually often.

    "Similar users" are the users who rated ``movie_id`` above 4. For every
    movie that more than 10% of them also rated above 4, the function compares
    that share with the movie's share among all users who rated any of those
    candidate movies above 4.

    Parameters
    ----------
    movie_id
        Value to look up in the ``movieId`` column of ``ratings``.

    Returns
    -------
    pandas.DataFrame
        At most 10 rows with the columns ``score``, ``title`` and ``genres``,
        sorted by decreasing ``score``. ``score`` is the share among similar
        users divided by the share among all users, so values above 1 mean the
        movie is more popular with similar users than in general.
    """
    # Build the "liked" mask once. Each comparison must be parenthesised (or
    # precomputed like this) because `&` binds tighter than `>`.
    liked = ratings["rating"] > 4

    # Users who liked the target movie.
    similair_users = ratings[(ratings["movieId"] == movie_id) & liked]["userId"].unique()

    # Share of similar users who liked each movie.
    similair_users_recs = ratings[ratings["userId"].isin(similair_users) & liked]["movieId"]
    similair_users_recs = similair_users_recs.value_counts() / len(similair_users)
    # Keep only movies liked by more than 10% of the similar users.
    similair_users_recs = similair_users_recs[similair_users_recs > .1]

    # Share of all users (who liked any candidate movie) that liked each one.
    all_users = ratings[ratings["movieId"].isin(similair_users_recs.index) & liked]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Align both shares on movieId and score each candidate by their ratio.
    rec_percentages = pd.concat([similair_users_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similair", "all"]
    rec_percentages["score"] = rec_percentages["similair"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)  # best first

    # The index holds movieId values; join on it to attach title and genres.
    top = rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]

In [None]:
# Interactive search box: an enabled text field pre-filled with a sample title.
movie_input = widgets.Text(value="Toy Story", description="Movie Title: ", disabled=False)
# Output area that on_type clears and redraws with the recommendations.
recommendation_list = widgets.Output()
def on_type(data):
    """Redraw the recommendations for the text currently in the search box.

    ``data`` is the change notification delivered by ``observe``; only its
    ``"new"`` entry (the updated text) is read. The output area is always
    cleared; recommendations are shown only when the text is longer than
    5 characters.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        # The first row of the search result is taken as the movie the user means.
        best_match = search(query).iloc[0]
        display(find_similar_movies(best_match["movieId"]))



# Call on_type only for changes of the widget's `value` trait.
movie_input.observe(on_type, names='value')

# Render the text box followed by the recommendation output area.
display(movie_input, recommendation_list)