# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import ipywidgets as widgets
from IPython.display import display

# Reading in Dataset

In [30]:
# Load the movie catalogue and the user ratings.
# Both paths are relative; change them to wherever your copies of the CSV files live.
movies = pd.read_csv(filepath_or_buffer=r"movies.csv")
ratings = pd.read_csv(filepath_or_buffer=r"ratings.csv")

# Data Cleaning

In [3]:
def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed."""
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [4]:
# Keep a punctuation-free copy of every title; the search index is built from this column.
movies["clean_title"] = movies["title"].map(clean_title)

# Building Search Engine

In [5]:
# Fit a TF-IDF model on the cleaned titles, indexing single words and adjacent word pairs.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(raw_documents=movies["clean_title"])

In [6]:
# Creating a search function for movie titles
def search(title, top_n=5):
    """Return the catalogue rows whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared against every indexed title by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies`` ordered by descending similarity, so
        ``.iloc[0]`` is the closest match. Empty when ``top_n`` is not positive
        or nothing has been indexed.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than there are scores: np.argpartition raises when
    # kth is out of bounds.
    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the top_n largest scores end up in the
    # last top_n slots, in no particular order, so rank that small slice explicitly.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies.iloc[ranked]

# Creating a recommendation system

In [31]:
# Approach: find the users who liked the movie entered in the search box, then recommend the other movies those users also liked.

In [62]:
# Building a recommendation function
def find_similar_movies(movie_id, min_rating=4, min_share=0.1, top_n=10):
    """Recommend movies favoured by the users who liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than ``min_rating``.
    Users who liked ``movie_id`` are the "similar" users. Each movie they liked
    is scored as::

        score = (share of similar users who liked it)
                / (share of all users who liked any candidate movie and liked it)

    so a high score means the movie is disproportionately popular among the
    similar users rather than simply popular overall.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        Ratings strictly above this value count as a like.
    min_share : float, default 0.1
        A candidate is kept only if more than this fraction of the similar
        users liked it.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, ordered by descending
        score. Empty when nobody liked ``movie_id``.
    """
    # Every step below only looks at likes, so apply the rating filter once.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: there is nothing to base a recommendation on.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Share of the similar users who liked each movie.
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    # Only keep movies that more than `min_share` of the similar users liked.
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of users, among everyone who liked at least one candidate, who liked each candidate.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Recommendation score: how much more the similar users like a movie than users in general.
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # Attach titles and genres to the top `top_n` recommendations.
    top_recs = rec_percentages.head(top_n)
    return top_recs.merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [64]:
# Free-text box in which the user types a movie title.
movie_name_input = widgets.Text(description="Movie Title: ", disabled=False)

# Output area in which the recommendations for the typed title are rendered.
recommendations_list = widgets.Output()

def on_type(data):
    """Redraw the recommendations for the text currently in the title box.

    ``data`` is the change notification passed by the widget; its ``"new"``
    entry holds the updated text. The previous output is always cleared, and
    nothing further is shown while the box is empty.
    """
    with recommendations_list:
        recommendations_list.clear_output()
        query = data["new"]
        if len(query) == 0:
            return
        # The first search hit is taken as the movie the user means.
        best_match = search(query).iloc[0]
        display(find_similar_movies(best_match["movieId"]))

# Call on_type whenever the text box's value changes, then render the box above its results.
movie_name_input.observe(handler=on_type, names="value")
display(movie_name_input, recommendations_list)

Text(value='', description='Movie Title: ')

Output()