# 📚 **Import Libraries**

In [1]:
import os
import re
import requests
import zipfile
import logging

import numpy  as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise        import cosine_similarity

import ipywidgets as widgets

from IPython.display import display

In [2]:
logging.basicConfig(level=logging.INFO)

logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("ipywidgets").setLevel(logging.WARNING)

# Functions

In [3]:
def clean_title(title: str) -> str:
  """
  Remove alphanumeric characters and spaces from a string.

  Args:
    title (str): The string to be processed.

  Returns:
    str: The resulting string after removing the characters.
  """
  
  return re.sub("[^a-zA-Z0-9 ]", "", title)

In [4]:
def search(title: str) -> pd.DataFrame:
  """
  Search for similarity scores between the given title and a set of titles represented as vectors.

  Args:
      title (str): The input title to compare against a set of titles.

  Returns:
    pd.DataFrame: A dataframe containing the top 5 most similar titles.
  """

  title      = clean_title(title)
  query_vec  = vectorizer.transform([title])
  similarity = cosine_similarity(tfidf, query_vec).flatten()
  indices    = np.argpartition(similarity, -5)[-5:]
  results    = movies.iloc[indices][::-1]

  return results

In [5]:
def on_type(data: dict) -> None:
  """
  Clear the output in a movie list and display search results based on the input title.

  Args:
    data (dict): The input data, which likely contains a "new" key with the title as its value.

  Returns:
    None
  """
  
  with movie_list:
    movie_list.clear_output()

    title = data["new"]

    if len(title) > 5:
      display(search(title))


# 📡 **Data Acquisition**

## Download Data

In [6]:
try:
  if not os.path.exists("data"):
    os.mkdir("data")

  zip_url  = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
  zip_path = "data/movies"

  response = requests.get(zip_url)

  with open(zip_path, "wb") as zip_file:
    zip_file.write(response.content)

  with zipfile.ZipFile(zip_path, 'r') as zip_file:
    zip_file.extractall("data/")

  logging.info("🟩 Download Success!")
except Exception as error:
  logging.error("🟥 Download Failed!")
  logging.error(f"🟥 Error: {error}")

INFO:root:🟩 Download Success!


## Read CSV

In [7]:
movies  = pd.read_csv("./data/ml-25m/movies.csv")
ratings = pd.read_csv("./data/ml-25m/ratings.csv")

# 🔍 **Data Exploration**

## Movies Dataset

In [8]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [10]:
movies.describe()

Unnamed: 0,movieId
count,62423.0
mean,122220.387646
std,63264.744844
min,1.0
25%,82146.5
50%,138022.0
75%,173222.0
max,209171.0


In [11]:
movies["genres"].value_counts()

genres
Drama                                   9056
Comedy                                  5674
(no genres listed)                      5062
Documentary                             4731
Comedy|Drama                            2386
                                        ... 
Action|Adventure|Crime|Fantasy             1
Drama|Film-Noir|Musical|Thriller           1
Action|Drama|Horror|Mystery                1
Adventure|Comedy|Sci-Fi|Thriller|War       1
Comedy|Horror|Mystery|Sci-Fi|Western       1
Name: count, Length: 1639, dtype: int64

## Ratings Dataset

In [12]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [13]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [14]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,25000100.0,25000100.0,25000100.0,25000100.0
mean,81189.28,21387.98,3.533854,1215601000.0
std,46791.72,39198.86,1.060744,226875800.0
min,1.0,1.0,0.5,789652000.0
25%,40510.0,1196.0,3.0,1011747000.0
50%,80914.0,2947.0,3.5,1198868000.0
75%,121557.0,8623.0,4.0,1447205000.0
max,162541.0,209171.0,5.0,1574328000.0


# 🧱 **Data Modeling**

In [15]:
movies["clean_title"] = movies["title"].apply(clean_title)

In [16]:
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


In [17]:
similar_users = ratings[
  (ratings["movieId"] == 1) &
  (ratings["rating"] > 4.0)
]["userId"].unique()

similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533])

In [18]:
similar_users_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4.0)]["movieId"]

similar_users_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [19]:
similar_users_recs.value_counts()

movieId
1         18835
318        8393
260        7605
356        6973
296        6918
          ...  
128478        1
125125        1
119701        1
107563        1
7625          1
Name: count, Length: 19282, dtype: int64

In [20]:
similar_users_recs.value_counts() / len(similar_users)

movieId
1         1.000000
318       0.445607
260       0.403770
356       0.370215
296       0.367295
            ...   
128478    0.000053
125125    0.000053
119701    0.000053
107563    0.000053
7625      0.000053
Name: count, Length: 19282, dtype: float64

In [21]:
similar_users_recs = similar_users_recs.value_counts() / len(similar_users)
similar_users_recs = similar_users_recs[similar_users_recs > 0.10]

similar_users_recs

movieId
1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: count, Length: 113, dtype: float64

In [22]:
all_users = ratings[
  (ratings["movieId"].isin(similar_users_recs.index)) &
  (ratings["rating"] > 4.0)
]

all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [23]:
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

all_users_recs

movieId
318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: count, Length: 113, dtype: float64

In [27]:
rec_percentages = pd.concat([similar_users_recs, all_users_recs], axis=1)

rec_percentages.columns = ["similar", "all"]

rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [29]:
rec_percentages["score"] = rec_percentages["similar"] * rec_percentages["all"]

rec_percentages.sort_values("score", ascending=False)

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
318,0.445607,0.342220,0.152496
1,1.000000,0.124728,0.124728
296,0.367295,0.284674,0.104559
260,0.403770,0.222207,0.089720
356,0.370215,0.235266,0.087099
...,...,...,...
953,0.103053,0.045792,0.004719
50872,0.113990,0.039111,0.004458
551,0.101195,0.040918,0.004141
745,0.100345,0.037031,0.003716


In [30]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124728,0.124728,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
314,0.445607,0.34222,0.152496,318,"Shawshank Redemption, The (1994)",Crime|Drama,Shawshank Redemption The 1994
257,0.40377,0.222207,0.08972,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,Star Wars Episode IV A New Hope 1977
351,0.370215,0.235266,0.087099,356,Forrest Gump (1994),Comedy|Drama|Romance|War,Forrest Gump 1994
292,0.367295,0.284674,0.104559,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,Pulp Fiction 1994
2480,0.346695,0.244033,0.084605,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,Matrix The 1999
1166,0.34197,0.18803,0.0643,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,Star Wars Episode V The Empire Strikes Back 1980
1168,0.320945,0.164614,0.052832,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,Raiders of the Lost Ark Indiana Jones and the ...
585,0.315689,0.225909,0.071317,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,Silence of the Lambs The 1991
522,0.30284,0.215207,0.065173,527,Schindler's List (1993),Drama|War,Schindlers List 1993


# 🏗️ **Build Searcher**

In [24]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

tfidf = vectorizer.fit_transform(movies["clean_title"])

In [25]:
search("1995")

Unnamed: 0,movieId,title,genres,clean_title
60174,202701,Любить по-русски (1995),Drama|Romance,1995
68,69,Friday (1995),Comedy,Friday 1995
3856,3960,Haunted (1995),Drama|Thriller,Haunted 1995
173,175,Kids (1995),Drama,Kids 1995
5,6,Heat (1995),Action|Crime|Thriller,Heat 1995


In [26]:
movie_input = widgets.Text(
  value="Toy Story (1995)",
  description="Movie Title:",
  disabled=False
)

movie_list = widgets.Output()

movie_input.observe(on_type, names="value")

display(movie_input, movie_list)

Text(value='Toy Story (1995)', description='Movie Title:')

Output()