# Data preparation notebook for recommender project

In [2]:
import ast
from pathlib import Path

import numpy as np

# You need a kaggle account to use this
# https://pypi.org/project/opendatasets/
import opendatasets as od
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
od.download("https://www.kaggle.com/datasets/shivamb/netflix-shows")
od.download("https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset")

In [4]:
netflix_movies = pd.read_csv(
    "./netflix-shows/netflix_titles.csv", usecols=["title", "type", "release_year"]
)
netflix_movies = netflix_movies[netflix_movies["type"] == "Movie"]

In [5]:
cols_to_read = [
    "id",
    "title",
    "overview",
    "release_date",
    "original_language",
    "genres",
    "vote_average",
]
df_raw = pd.read_csv(
    "./the-movies-dataset/movies_metadata.csv", usecols=cols_to_read
).loc[
    :,
    [
        "id",
        "title",
        "release_date",
        "original_language",
        "genres",
        "overview",
        "vote_average",
    ],
]

df = df_raw.copy()[~df_raw.duplicated()]

FileNotFoundError: [Errno 2] No such file or directory: './the-movies-dataset/movies_metadata.csv'

In [5]:
df.shape

(45436, 7)

In [6]:
df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce").dropna()
df["release_year"] = df["release_date"].dt.strftime("%Y")

In [7]:
def get_genre_values(row):
    row = ast.literal_eval(row)
    genre_list = [val["name"] for val in row]
    if len(genre_list) == 0:
        return ["Unknown"]
    return genre_list

In [8]:
df = df.loc[df[cols_to_read[:-1]].dropna().index].reset_index(drop=True)

df["id"] = df["id"].astype(int)

In [9]:
df.shape

(44395, 8)

In [10]:
df = pd.merge(
    df,
    pd.read_csv("./the-movies-dataset/keywords.csv").drop_duplicates(),
    how="left",
    on=["id"],
)
df["keywords"] = df.keywords.fillna("[]")
df.shape

(44395, 9)

In [11]:
df["genres"] = df["genres"].apply(get_genre_values)

In [12]:
df["keywords"] = df["keywords"].apply(get_genre_values)

In [13]:
df["keywords"] = df.loc[:, ["keywords"]].apply(
    lambda x: ", ".join(x.keywords) if "Unknown" not in x.keywords else "",
    axis=1,
)

idx_no_kw = df[df["keywords"] == ""].index

In [14]:
df.head()

Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords
0,862,Toy Story,1995-10-30,en,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",7.7,1995,"jealousy, toy, boy, friendship, friends, rival..."
1,8844,Jumanji,1995-12-15,en,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,6.9,1995,"board game, disappearance, based on children's..."
2,15602,Grumpier Old Men,1995-12-22,en,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,6.5,1995,"fishing, best friend, duringcreditsstinger, ol..."
3,31357,Waiting to Exhale,1995-12-22,en,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",6.1,1995,"based on novel, interracial relationship, sing..."
4,11862,Father of the Bride Part II,1995-02-10,en,[Comedy],Just when George Banks has recovered from his ...,5.7,1995,"baby, midlife crisis, confidence, aging, daugh..."


In [None]:
df = df.sort_values(by="release_date").reset_index(drop=True)
df["title_dated"] = df["title"].copy()
df.loc[df.title.duplicated(), "title_dated"] = df.loc[
    df.title.duplicated(), ["title_dated"]
].apply(
    lambda x: " ".join([x["title_dated"], f"({str(df['release_year'][x.name])})"]),
    axis=1,
)

In [16]:
duplicate_title_and_year = df.loc[df.title.duplicated(), "title"].index

In [17]:
df.loc[df.title.duplicated(), "title_dated"] = df.loc[
    df.title.duplicated(), ["title_dated"]
].apply(
    lambda x: x["title_dated"].replace(
        f"({str(df['release_year'][x.name])})",
        f"({str(df['release_date'][x.name].strftime('%Y-%m'))})",
    ),
    axis=1,
)

In [18]:
netflix_movies["release_year"] = netflix_movies["release_year"].astype(str)
df_combined = pd.merge(
    df, netflix_movies[["title", "release_year"]], on=["title", "release_year"]
)
df_combined = df_combined[
    ~df_combined[["title", "release_year", "overview"]].duplicated()
].reset_index(drop=True)

In [19]:
all_genres = list(df_combined["genres"].explode().unique())
all_genres.remove("Unknown")
print(all_genres)

['Documentary', 'War', 'History', 'Drama', 'Comedy', 'Music', 'Romance', 'Adventure', 'Science Fiction', 'Action', 'Crime', 'Thriller', 'Family', 'Horror', 'Mystery', 'Fantasy', 'Western', 'Animation', 'Foreign', 'TV Movie']


In [21]:
df[df.title == "The Haunted Castle"]

Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords,title_dated
51,133063,The Haunted Castle,1896-12-24,xx,"[Fantasy, Horror]",A bat flies into an ancient castle and transfo...,6.7,1896,"supernatural, silent film, georges melies, ear...",The Haunted Castle
54,104471,The Haunted Castle,1897-01-01,fr,[Horror],A man has an encounter with several spooky app...,5.6,1897,"french, horror, silent film, ghost, georges me...",The Haunted Castle (1897-01)
378,6793,The Haunted Castle,1921-04-07,en,"[Crime, Drama, Horror]",The sinister Count Oetsch scandalizes the aris...,6.2,1921,"castle, hunt, fratricide, uninvited guest",The Haunted Castle (1921-04)
8141,108798,The Haunted Castle,1969-12-20,ja,[Horror],The film takes place during the rule of the ei...,5.0,1969,,The Haunted Castle (1969-12)


In [21]:
mlb = MultiLabelBinarizer(classes=all_genres)
binary_labels_all = mlb.fit_transform(df["genres"])


mlb = MultiLabelBinarizer(classes=all_genres)
binary_labels_netflix = mlb.fit_transform(df_combined["genres"])
genres = mlb.classes_



In [22]:
similarity_genre = cosine_similarity(binary_labels_all, binary_labels_netflix)

In [23]:
model = SentenceTransformer("all-mpnet-base-v2")
embeddings_overview = model.encode(
    df.overview.to_numpy().astype(str), show_progress_bar=True, convert_to_numpy=True
)
embeddings_keywords = model.encode(
    df.keywords.to_numpy().astype(str), show_progress_bar=True, convert_to_numpy=True
)

Batches:   0%|          | 0/1388 [00:00<?, ?it/s]

Batches:   0%|          | 0/1388 [00:00<?, ?it/s]

In [24]:
print(embeddings_overview.shape, embeddings_keywords.shape)

(44395, 768) (44395, 768)


In [25]:
indices_shared = df[df.id.isin(df_combined.id)].index
embeddings_overview_netflix = embeddings_overview[indices_shared]
embeddings_keywords_netflix = embeddings_keywords[indices_shared]
similarity_netflix = similarity_genre[indices_shared]

In [26]:
similarity_embedding_overview = util.cos_sim(
    embeddings_overview, embeddings_overview_netflix
)
similarity_embedding_keywords = util.cos_sim(
    embeddings_keywords, embeddings_keywords_netflix
)

In [27]:
similarity_embedding_keywords[idx_no_kw] = 0
for idx_combined, idx_df in enumerate(indices_shared):
    similarity_embedding_overview[idx_df][idx_combined] = -100
    similarity_embedding_overview[idx_df][idx_combined] = -100
    similarity_embedding_keywords[idx_df][idx_combined] = -100
    similarity_embedding_keywords[idx_df][idx_combined] = -100
    similarity_genre[idx_df][idx_combined] = -100

In [28]:
with open(r"./similarity_genre.npy", "wb") as f:
    np.save(f, similarity_genre)
with open(r"./embeddings_keywords.npy", "wb") as f:
    np.save(f, embeddings_keywords)
with open(r"./embeddings_overview.npy", "wb") as f:
    np.save(f, embeddings_overview)
with open(r"./similarity_embedding_overview.npy", "wb") as f:
    np.save(f, similarity_embedding_overview)
with open(r"./similarity_embedding_keywords.npy", "wb") as f:
    np.save(f, similarity_embedding_keywords)

In [29]:
# with open(r"./embeddings_keywords.npy", "rb") as f:
#     embeddings_keywords = np.load(f)
# with open(r"./embeddings_overview.npy", "rb") as f:
#     embeddings_overview = np.load(f)
# with open(r"./similarity_embedding_overview.npy", "rb") as f:
# similarity_embedding_overview = np.load(f)
# with open(r"./similarity_embedding_keywords.npy", "rb") as f:
# similarity_embedding_keywords = np.load(f)
# with open(r"./similarity_genre.npy", "rb") as f:
#     similarity_genre = np.load(f)

In [30]:
Path("./custom_csv").mkdir(parents=True, exist_ok=True)
df.to_csv("./custom_csv/movies_cleaned.csv")
df_combined.to_csv("./custom_csv/movies_combined.csv")
# df = pd.read_csv("./custom_csv/movies_cleaned.csv")
# df_combined = pd.read_csv("./custom_csv/movies_combined.csv")

In [31]:
df[df.title == "Interstellar"]

Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords,title_dated
40199,157336,Interstellar,2014-11-05,en,"[Adventure, Drama, Science Fiction]",Interstellar chronicles the adventures of a gr...,8.1,2014,"saving the world, artificial intelligence, fat...",Interstellar


In [32]:
movie_number = 40199

In [33]:
top_ten = reversed(similarity_embedding_overview[movie_number].argsort())
print(
    df["title"][movie_number],
    "\n====\n",
    df["genres"][movie_number],
    "\n====\n",
    df["keywords"][movie_number],
    "\n====\n",
    df["overview"][movie_number],
    "\n====\n",
)

df_combined.loc[top_ten]

Interstellar 
====
 ['Adventure', 'Drama', 'Science Fiction'] 
====
 saving the world, artificial intelligence, father son relationship, single parent, nasa, expedition, wormhole, space travel, famine, black hole, dystopia, race against time, quantum mechanics, spaceship, space, rescue, family relationships, farmhouse, robot, astronaut, scientist, father daughter relationship, single father, farmer, space station, imax, astrophysics, zero gravity, courage, time paradox, relativity 
====
 Interstellar chronicles the adventures of a group of explorers who make use of a newly discovered wormhole to surpass the limitations on human space travel and conquer the vast distances involved in an interstellar voyage. 
====


Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords,title_dated
1703,313943,Revolt,2017-07-01,en,[Science Fiction],The story of humankind's last stand against a ...,6.0,2017,alien invasion,Revolt
221,8413,Event Horizon,1997-08-15,en,"[Horror, Science Fiction, Mystery]",In the year 2047 a group of astronauts are sen...,6.4,1997,"space marine, nudity, nightmare, hallucination...",Event Horizon
217,607,Men in Black,1997-07-02,en,"[Action, Adventure, Comedy, Science Fiction]",Men in Black follows the exploits of agents Ka...,6.9,1997,"secret identity, sun glasses, undercover, spac...",Men in Black
10,830,Forbidden Planet,1956-03-15,en,"[Adventure, Science Fiction, Action]",Captain Adams and the crew of the Starship C57...,7.2,1956,"lasergun, monster, space marine, loss of fathe...",Forbidden Planet
316,9397,Evolution,2001-06-08,en,"[Action, Comedy, Science Fiction]",A comedy that follows the chaos that ensues wh...,5.7,2001,"grand canyon, governor, shampoo, high school t...",Evolution
...,...,...,...,...,...,...,...,...,...,...
1214,329135,O Kadhal Kanmani,2015-04-17,ta,"[Drama, Family, Romance]",Adhi and Tara are in a live-in relationship an...,5.8,2015,"living together, love, open relationship, sexu...",O Kadhal Kanmani
162,53163,Love on Delivery,1994-02-04,cn,"[Comedy, Action]",Ho Kam-An is a love struck dim-sum delivery bo...,6.1,1994,,Love on Delivery
795,51450,Santa's Apprentice,2010-11-24,fr,"[Family, Animation]","Santa doesn't want to retire, but rules are ru...",4.9,2010,"santa claus, gift, snow, christmas",Santa's Apprentice
1165,320882,The Seven Five,2014-11-14,en,[Documentary],Meet the dirtiest cop in NYC history. Michael ...,7.8,2014,"new york, corruption, mafia, mobster, police c...",The Seven Five
