# 📚 **Import Libraries**

In [17]:
import os
import re
import requests
import zipfile
import logging

import numpy  as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise        import cosine_similarity

import ipywidgets as widgets

from IPython.display import display

In [18]:
# urllib3 logs at DEBUG/INFO while requests runs; keep only its warnings and above.
logging.getLogger("urllib3").setLevel(logging.WARNING)

# Everything else in the notebook logs from DEBUG upwards.
logging.basicConfig(level=logging.DEBUG)

# Functions

In [19]:
def clean_title(title: str) -> str:
  """
  Strip every character that is not an ASCII letter, a digit, or a space.

  Args:
    title (str): The raw title to normalise.

  Returns:
    str: The title with only ASCII letters, digits, and spaces kept.
      Case and spacing are left exactly as they were.
  """

  disallowed = re.compile("[^a-zA-Z0-9 ]")

  return disallowed.sub("", title)

In [20]:
def search(title: str, top_n: int = 5) -> pd.DataFrame:
  """
  Find the movies whose cleaned titles are most similar to the given title.

  The query is cleaned with clean_title, vectorised with the module-level
  `vectorizer`, and compared by cosine similarity against every row of the
  module-level `tfidf` matrix. Rows of `tfidf` are looked up positionally in
  the module-level `movies` dataframe.

  Args:
    title (str): The input title to compare against the indexed titles.
    top_n (int): Maximum number of rows to return. Defaults to 5. Values
      larger than the number of indexed titles are clamped; values <= 0
      yield an empty dataframe.

  Returns:
    pd.DataFrame: Up to `top_n` rows of `movies`, ordered from the most
      similar title to the least similar one.
  """

  title      = clean_title(title)
  query_vec  = vectorizer.transform([title])
  similarity = cosine_similarity(tfidf, query_vec).flatten()

  # Clamp so argpartition never receives an out-of-bounds kth on a small corpus.
  top_n = max(0, min(top_n, similarity.size))

  if top_n == 0:
    return movies.iloc[[]]

  # argpartition only guarantees that the top_n best scores occupy the last
  # top_n slots; their relative order is undefined, so rank them explicitly.
  candidates = np.argpartition(similarity, -top_n)[-top_n:]
  ranking    = np.argsort(similarity[candidates])[::-1]
  indices    = candidates[ranking]

  return movies.iloc[indices]

In [None]:
def on_type(data: dict) -> None:
  """
  Refresh the movie list output with search results for the typed title.

  The previous content of `movie_list` is always cleared. A new search is
  displayed only when the typed text is longer than 5 characters.

  Args:
      data: The change payload; its "new" entry is read as the current title.
  """

  with movie_list:
    movie_list.clear_output()

    query = data["new"]

    # Too short to search on; leave the output empty.
    if len(query) <= 5:
      return

    display(search(query))


# 📡 **Data Acquisition**

## Download Data

In [21]:
# Fetch and unpack the MovieLens archive into ./data.
# Any failure is logged rather than raised, so later cells will fail on the
# missing CSV if this step did not succeed.
try:
  # exist_ok avoids the check-then-create race of exists() + mkdir()
  os.makedirs("data", exist_ok=True)

  zip_url  = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
  zip_path = "data/movies"
  csv_path = "data/ml-25m/movies.csv"

  if os.path.exists(csv_path):
    # Keeps Restart & Run All cheap: the archive is not fetched again once
    # the file the notebook reads is already on disk.
    logging.info("🟩 Data already present, skipping download.")
  else:
    # stream=True keeps the archive out of memory; the (connect, read) timeout
    # stops the cell from hanging forever on a stalled connection.
    with requests.get(zip_url, stream=True, timeout=(10, 60)) as response:
      # Fail here on 4xx/5xx instead of writing an error page to disk
      response.raise_for_status()

      with open(zip_path, "wb") as zip_file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
          zip_file.write(chunk)

    with zipfile.ZipFile(zip_path, 'r') as zip_file:
      zip_file.extractall("data/")

    logging.info("🟩 Download Success!")
except Exception as error:
  logging.error("🟥 Download Failed!")
  logging.error(f"🟥 Error: {error}")

INFO:root:🟩 Download Success!


## Read CSV

In [22]:
# Load the movie table from the archive extracted under data/ by the download cell.
movies = pd.read_csv("./data/ml-25m/movies.csv")

# 🔍 **Data Exploration**

In [23]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [24]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [25]:
movies.describe()

Unnamed: 0,movieId
count,62423.0
mean,122220.387646
std,63264.744844
min,1.0
25%,82146.5
50%,138022.0
75%,173222.0
max,209171.0


In [26]:
movies["genres"].value_counts()

genres
Drama                                   9056
Comedy                                  5674
(no genres listed)                      5062
Documentary                             4731
Comedy|Drama                            2386
                                        ... 
Action|Adventure|Crime|Fantasy             1
Drama|Film-Noir|Musical|Thriller           1
Action|Drama|Horror|Mystery                1
Adventure|Comedy|Sci-Fi|Thriller|War       1
Comedy|Horror|Mystery|Sci-Fi|Western       1
Name: count, Length: 1639, dtype: int64

# 🧱 **Data Modeling**

In [27]:
# Normalised copy of each title; this is the text the TF-IDF index is built on.
movies["clean_title"] = movies["title"].map(clean_title)

In [28]:
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


# 🏗️ **Build Searcher**

In [29]:
# ngram_range=(1, 2): index both single words and adjacent word pairs.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit on the cleaned titles; search() compares each query against this matrix
# and maps the matching row positions back to `movies`.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [30]:
search("1995")

Unnamed: 0,movieId,title,genres,clean_title
60174,202701,Любить по-русски (1995),Drama|Romance,1995
68,69,Friday (1995),Comedy,Friday 1995
3856,3960,Haunted (1995),Drama|Thriller,Haunted 1995
173,175,Kids (1995),Drama,Kids 1995
5,6,Heat (1995),Action|Crime|Thriller,Heat 1995


In [None]:
# Text box pre-filled with an example title.
movie_input = widgets.Text(
  value="Toy Story (1995)",
  description="Movie Title:",
  disabled=False
)

# Output area that on_type clears and refills with search results.
movie_list = widgets.Output()

# Re-run the search whenever the text box value changes.
movie_input.observe(on_type, names="value")

display(movie_input, movie_list)

# observe() only fires on changes, so the pre-filled title would otherwise show
# no results until the user edits it; run the handler once for the initial value.
on_type({"new": movie_input.value})