In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

import ipywidgets as widgets
from IPython.display import display

import re

In [2]:
# Folder holding the MovieLens CSV exports. Kept in one constant so the
# notebook can be pointed at another location by editing a single line
# instead of every read_csv call.
DATA_DIR = "~/Desktop/MovieLens-resources"

# Movie catalogue (movieId, title, genres) and the user ratings table.
movies_df = pd.read_csv(f"{DATA_DIR}/movies.csv")
ratings_df = pd.read_csv(f"{DATA_DIR}/ratings.csv")

In [3]:
# Keep only the ratings given by users whose userId is at most 1050
# (presumably to keep the working set small; confirm the intended cut-off).
ratings_df = ratings_df[ratings_df["userId"].le(1050)]

In [4]:
# The timestamp column is not used by any cell shown here, so drop it.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises KeyError.
ratings_df = ratings_df.drop(columns="timestamp", errors="ignore")
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,110,4.0
2,1,158,4.0
3,1,260,4.5
4,1,356,5.0
...,...,...,...
100825,1050,5445,4.0
100826,1050,5899,3.0
100827,1050,5944,4.0
100828,1050,6184,2.0


In [5]:
def clean_title(title):
    """Return `title` with every character removed except ASCII letters,
    digits and spaces (punctuation such as parentheses and colons is dropped).
    """
    return re.sub("[^a-zA-Z0-9 ]", "", title)


In [6]:
# Store a punctuation-free copy of each title; this is the text the TF-IDF
# vectorizer is fitted on.
movies_df["clean_title"] = movies_df["title"].map(clean_title)

In [7]:
# Preview the frame to check the clean_title column next to the raw title
# (the rendered table is truncated to the first and last rows).
movies_df

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
86532,288967,State of Siege: Temple Attack (2021),Action|Drama,State of Siege Temple Attack 2021
86533,288971,Ouija Japan (2021),Action|Horror,Ouija Japan 2021
86534,288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary,The Men Who Made the Movies Howard Hawks 1973
86535,288977,Skinford: Death Sentence (2023),Crime|Thriller,Skinford Death Sentence 2023


In [8]:
# TF-IDF over the cleaned titles; ngram_range=(1,2) indexes single words as
# well as two-word sequences.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and keep the resulting matrix,
# which search() compares each query vector against.
tfidf = vectorizer.fit_transform(movies_df["clean_title"])

In [9]:
def search(title, top_n=5):
    """Return the movies whose cleaned title best matches `title`.

    The query is cleaned with `clean_title`, vectorized with the fitted
    `vectorizer`, and compared to every row of `tfidf` by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text title typed by the user.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows of `movies_df`, ordered from most to least similar.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist: argpartition raises ValueError when
    # kth is out of bounds (e.g. a catalogue with fewer than top_n titles).
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies_df.iloc[[]]

    # argpartition only guarantees that the last k positions hold the k
    # largest scores, in no particular order, so sort that small slice
    # explicitly to get a true best-first ranking.
    top = np.argpartition(similarity, -k)[-k:]
    ranked = top[np.argsort(similarity[top])[::-1]]

    return movies_df.iloc[ranked]

In [10]:
# Text box for the movie title, pre-filled with an example query.
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)

# Output area that the search results are rendered into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the result panel for a change of the text box value.

    `data` is the change notification; its "new" entry holds the text
    currently in the box. The panel is always cleared first, and a search is
    shown only when the text is longer than 5 characters.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            # Too short to search; leave the panel empty.
            return
        display(search(query))

# Re-run the search every time the text box value changes.
movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

# observe() only fires when the value *changes*, so the pre-filled text would
# otherwise leave the result list empty until the user edits the box.
# Trigger one search manually for the initial value.
on_type({"new": movie_input.value})

Text(value='Toy Story', description='Movie Title:')

Output()