# 📚 **Import Libraries**

In [13]:
import os
import re
import requests
import zipfile
import logging

import numpy  as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise        import cosine_similarity

from utils import *

In [14]:
# Configure the root logger to emit every record from DEBUG upwards.
logging.basicConfig(level=logging.DEBUG)
# Raise the threshold of the "urllib3" logger to WARNING so its lower-level
# records are dropped (presumably the HTTP chatter produced while `requests`
# downloads the dataset -- confirm if the noise comes from elsewhere).
logging.getLogger("urllib3").setLevel(logging.WARNING)

# Functions

In [15]:
def clean_title(title: str) -> str:
  """
  Keep only ASCII letters, digits and spaces; drop every other character.

  Punctuation such as parentheses, colons and apostrophes is removed, and so
  are non-ASCII letters (e.g. accented characters), because the character
  class only lists a-z, A-Z, 0-9 and the space. Whitespace is not collapsed
  and letter case is left unchanged.

  Args:
    title (str): The raw title to normalise.

  Returns:
    str: The title with all disallowed characters deleted.
  """

  disallowed = re.compile("[^a-zA-Z0-9 ]")
  return disallowed.sub("", title)

In [16]:
def search(title: str) -> np.ndarray:
  """
  Score every indexed title against a free-text query.

  The query is normalised with `clean_title`, projected into TF-IDF space with
  the module-level `vectorizer`, and compared to the module-level `tfidf`
  matrix using cosine similarity. Both globals must have been created before
  this function is called.

  Args:
    title (str): The query title to look up.

  Returns:
    np.ndarray: A flat array with one cosine-similarity score per row of `tfidf`.
  """

  query_matrix = vectorizer.transform([clean_title(title)])

  return cosine_similarity(tfidf, query_matrix).flatten()

# 📡 **Data Acquisition**

## Download Data

In [17]:
# Download and extract the MovieLens 25M archive into ./data.
# The cell is safe to re-run: if the extracted movies.csv is already present the
# download is skipped. Failures are logged rather than raised, so check the log
# output before running the cells that read the CSV.
try:
  # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
  os.makedirs("data", exist_ok=True)

  zip_url  = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
  zip_path = "data/movies"
  csv_path = "data/ml-25m/movies.csv"

  if os.path.exists(csv_path):
    logging.info("🟩 Dataset already extracted, skipping download.")
  else:
    # Stream the archive to disk in chunks instead of buffering the whole
    # file in memory; the timeout keeps a stalled connection from hanging
    # the kernel forever.
    with requests.get(zip_url, stream=True, timeout=60) as response:
      # Fail on HTTP errors here rather than writing an error page to disk
      # and hitting an opaque BadZipFile later.
      response.raise_for_status()

      with open(zip_path, "wb") as zip_file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
          zip_file.write(chunk)

    with zipfile.ZipFile(zip_path, 'r') as zip_file:
      zip_file.extractall("data/")

    logging.info("🟩 Download Success!")
except Exception as error:
  logging.error("🟥 Download Failed!")
  logging.error(f"🟥 Error: {error}")

INFO:root:🟩 Download Success!


## Read CSV

In [18]:
# Load the movie metadata from the extracted MovieLens archive
# (path is relative to the notebook's working directory).
movies = pd.read_csv("./data/ml-25m/movies.csv")

# 🔍 **Data Exploration**

In [19]:
# Preview the first rows to see the available columns (movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [20]:
# Column dtypes and non-null counts, to check for missing values.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [21]:
# Summary statistics for the numeric columns. Only movieId is numeric here, and
# it looks like an identifier, so only its min/max range is informative.
movies.describe()

Unnamed: 0,movieId
count,62423.0
mean,122220.387646
std,63264.744844
min,1.0
25%,82146.5
50%,138022.0
75%,173222.0
max,209171.0


In [22]:
# Count rows per distinct value of the genres column. Each value is a full
# pipe-separated combination (e.g. "Comedy|Drama"), not an individual genre.
movies["genres"].value_counts()

genres
Drama                                   9056
Comedy                                  5674
(no genres listed)                      5062
Documentary                             4731
Comedy|Drama                            2386
                                        ... 
Action|Adventure|Crime|Fantasy             1
Drama|Film-Noir|Musical|Thriller           1
Action|Drama|Horror|Mystery                1
Adventure|Comedy|Sci-Fi|Thriller|War       1
Comedy|Horror|Mystery|Sci-Fi|Western       1
Name: count, Length: 1639, dtype: int64

# 🧱 **Data Modeling**

In [23]:
# Add a normalised copy of each title (via clean_title) as a new column; this
# is the text the TF-IDF vectorizer is fitted on. Note: mutates `movies` in place.
movies["clean_title"] = movies["title"].apply(clean_title)

In [24]:
# Check the new clean_title column next to the original title.
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


In [25]:
# ngram_range=(1, 2) makes the vocabulary contain both single words and
# two-word sequences extracted from the titles.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Learn the vocabulary/IDF weights and build the document-term matrix, one row
# per entry of movies["clean_title"]. `vectorizer` and `tfidf` are the
# module-level names that search() reads.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [26]:
# Smoke test: query with a year token. The result is the raw array of
# cosine-similarity scores (one per row of the TF-IDF matrix), not a ranked list.
search("1995")

array([0.29475734, 0.32844299, 0.22159051, ..., 0.        , 0.        ,
       0.        ])

In [27]:
# Show the first cleaned titles that the scores above correspond to.
movies["clean_title"].head()

0                      Toy Story 1995
1                        Jumanji 1995
2               Grumpier Old Men 1995
3              Waiting to Exhale 1995
4    Father of the Bride Part II 1995
Name: clean_title, dtype: object