In [1]:
# Data source: MovieLens 25M dataset - https://grouplens.org/datasets/movielens/25m/

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer as Tf
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display

# Movie metadata table, read from the working directory. Other cells in this
# notebook use its "movieId", "title" and "genres" columns.
movies = pd.read_csv("movies.csv")

def clean(title):
    """Strip filler characters from a title.

    Every character that is not an ASCII letter, a digit or a space is
    removed; everything else is kept as-is (case and spacing included).
    """
    return re.sub("[^a-zA-Z0-9 ]", "", title)
    
# Searchable copy of each title with punctuation and other filler removed.
movies["clean_title"] = movies["title"].map(clean)

# ngram_range=(1, 2) indexes single words and adjacent word pairs, so a title
# contributes both "toy" and "toy story" as features.
vectorizer = Tf(ngram_range=(1, 2))

# One TF-IDF row per movie, in the same row order as `movies`.
tfidf = vectorizer.fit_transform(movies["clean_title"])

def searching(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is cleaned with ``clean``, transformed with the fitted
    ``vectorizer`` and compared with every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies`` ordered by decreasing similarity,
        so ``.iloc[0]`` is the closest match. Empty when there is nothing to
        rank or ``top_n`` is not positive.
    """
    title = clean(title)
    query_vector = vectorizer.transform([title])
    similarity = cosine_similarity(query_vector, tfidf).flatten()

    # Never request more rows than exist: argpartition raises when kth is out
    # of bounds.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the k largest scores end up in the
    # last k slots, in no particular order, so rank those candidates
    # explicitly instead of just reversing them.
    candidates = np.argpartition(similarity, -k)[-k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

# Per-user ratings table, read from the working directory. find_same_movies
# filters on its "userId", "movieId" and "rating" columns.
ratings = pd.read_csv("ratings.csv")


In [2]:

def find_same_movies(movie_id):
    """Recommend movies liked disproportionately often by fans of ``movie_id``.

    A "like" is a rating strictly above 4. For every movie liked by more than
    10% of the users who liked ``movie_id``, the score is that share divided
    by the share of likes the movie gets among all users who liked at least
    one of those candidate movies.

    Returns the 15 highest-scoring candidates joined with ``movies``, as a
    DataFrame with the columns ``score``, ``title`` and ``genres``.
    """
    liked = ratings["rating"] > 4

    # Users who liked the target movie.
    fans = ratings.loc[liked & (ratings["movieId"] == movie_id), "userId"].unique()

    # Share of those fans that liked each movie; keep only movies above 10%.
    fan_likes = ratings.loc[liked & ratings["userId"].isin(fans), "movieId"]
    fan_share = fan_likes.value_counts() / len(fans)
    fan_share = fan_share[fan_share > 0.1]

    # Same share, measured over everyone who liked any candidate movie.
    candidate_likes = ratings.loc[liked & ratings["movieId"].isin(fan_share.index)]
    audience_size = len(candidate_likes["userId"].unique())
    overall_share = candidate_likes["movieId"].value_counts() / audience_size

    rec_percents = pd.concat([fan_share, overall_share], axis=1)
    rec_percents.columns = ["similar", "all"]
    rec_percents["score"] = rec_percents["similar"] / rec_percents["all"]

    best = rec_percents.sort_values("score", ascending=False).head(15)
    joined = best.merge(movies, left_index=True, right_on="movieId")
    return joined[["score", "title", "genres"]]


In [3]:
# Text box for the movie title, pre-filled with an example query.
movie_input = widgets.Text(value="The Avengers 2012", description="Movie Title : ", disabled=False)

# Output area that on_type clears and writes the recommendation table into.
recommendation = widgets.Output()

def on_type(data):
    """Refresh the recommendations for the text currently in the input box.

    ``data`` is the change notification delivered by ``observe``; its
    ``"new"`` entry holds the updated text. The output area is always cleared;
    recommendations are shown only when the text is longer than one character.
    """
    with recommendation:
        recommendation.clear_output()
        query = data["new"]
        if len(query) <= 1:
            # Too short to search on; leave the output area empty.
            return
        # The first search result is taken as the movie the user means.
        best_match = searching(query).iloc[0]
        display(find_same_movies(best_match["movieId"]))
            
# Call on_type each time the text box's "value" trait changes.
movie_input.observe(on_type , names = "value")

# Show the input box together with the output area that on_type writes into.
display(movie_input , recommendation)

Text(value='The Avengers 2012', description='Movie Title : ')

Output()