# Data preparation notebook for recommender project

In [1]:
import ast
from pathlib import Path

import numpy as np

# You need a kaggle account to use this
# https://pypi.org/project/opendatasets/
import opendatasets as od
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
od.download("https://www.kaggle.com/datasets/shivamb/netflix-shows")
od.download("https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset")

Skipping, found downloaded files in "./netflix-shows" (use force=True to force download)
Skipping, found downloaded files in "./the-movies-dataset" (use force=True to force download)


In [3]:
netflix_movies = pd.read_csv(
    "./netflix-shows/netflix_titles.csv", usecols=["title", "type", "release_year"]
)
netflix_movies = netflix_movies[netflix_movies["type"] == "Movie"]

In [4]:
cols_to_read = [
    "id",
    "title",
    "overview",
    "release_date",
    "original_language",
    "genres",
    "vote_average",
]
df_raw = pd.read_csv(
    "./the-movies-dataset/movies_metadata.csv", usecols=cols_to_read
).loc[
    :,
    [
        "id",
        "title",
        "release_date",
        "original_language",
        "genres",
        "overview",
        "vote_average",
    ],
]

df = df_raw.copy()[~df_raw.duplicated()]

In [5]:
df.shape

(45436, 7)

In [6]:
df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce").dropna()
df["release_year"] = df["release_date"].dt.strftime("%Y")

In [7]:
def get_genre_values(row):
    row = ast.literal_eval(row)
    genre_list = [val["name"] for val in row]
    if len(genre_list) == 0:
        return ["Unknown"]
    return genre_list

In [8]:
df = df.loc[df[cols_to_read[:-1]].dropna().index].reset_index(drop=True)

df["id"] = df["id"].astype(int)

In [9]:
df.shape

(44395, 8)

In [10]:
df = pd.merge(
    df,
    pd.read_csv("./the-movies-dataset/keywords.csv").drop_duplicates(),
    how="left",
    on=["id"],
)
df["keywords"] = df.keywords.fillna("[]")
df.shape

(44395, 9)

In [11]:
df["genres"] = df["genres"].apply(get_genre_values)

In [12]:
df["keywords"] = df["keywords"].apply(get_genre_values)

In [13]:
df["keywords"] = df.loc[:, ["keywords"]].apply(
    lambda x: ", ".join(x.keywords) if "Unknown" not in x.keywords else "",
    axis=1,
)

idx_no_kw = df[df["keywords"] == ""].index

In [14]:
df.head()

Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords
0,862,Toy Story,1995-10-30,en,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",7.7,1995,"jealousy, toy, boy, friendship, friends, rival..."
1,8844,Jumanji,1995-12-15,en,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,6.9,1995,"board game, disappearance, based on children's..."
2,15602,Grumpier Old Men,1995-12-22,en,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,6.5,1995,"fishing, best friend, duringcreditsstinger, ol..."
3,31357,Waiting to Exhale,1995-12-22,en,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",6.1,1995,"based on novel, interracial relationship, sing..."
4,11862,Father of the Bride Part II,1995-02-10,en,[Comedy],Just when George Banks has recovered from his ...,5.7,1995,"baby, midlife crisis, confidence, aging, daugh..."


In [15]:
df = df.sort_values(by="release_date").reset_index(drop=True)
df["title_dated"] = df["title"].copy()
df.loc[df.title.duplicated(), "title_dated"] = df.loc[
    df.title.duplicated(), ["title_dated"]
].apply(
    lambda x: " ".join([x["title_dated"], f"({str(df['release_year'][x.name])})"]),
    axis=1,
)

In [16]:
duplicate_title_and_year = df.loc[df.title.duplicated(), "title"].index

In [17]:
df.loc[df.title.duplicated(), "title_dated"] = df.loc[
    df.title.duplicated(), ["title_dated"]
].apply(
    lambda x: x["title_dated"].replace(
        f"({str(df['release_year'][x.name])})",
        f"({str(df['release_date'][x.name].strftime('%Y-%m'))})",
    ),
    axis=1,
)

In [18]:
netflix_movies["release_year"] = netflix_movies["release_year"].astype(str)
df_combined = pd.merge(
    df, netflix_movies[["title", "release_year"]], on=["title", "release_year"]
)
df_combined = df_combined[
    ~df_combined[["title", "release_year", "overview"]].duplicated()
].reset_index(drop=True)

In [19]:
all_genres = list(df_combined["genres"].explode().unique())
all_genres.remove("Unknown")
print(all_genres)

['Documentary', 'War', 'History', 'Drama', 'Comedy', 'Music', 'Romance', 'Adventure', 'Science Fiction', 'Action', 'Crime', 'Thriller', 'Family', 'Horror', 'Mystery', 'Fantasy', 'Western', 'Animation', 'Foreign', 'TV Movie']


In [20]:
df.loc[duplicate_title_and_year]

Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords,title_dated
54,104471,The Haunted Castle,1897-01-01,fr,[Horror],A man has an encounter with several spooky app...,5.6,1897,"french, horror, silent film, ghost, georges me...",The Haunted Castle (1897-01)
83,193411,The Kiss,1900-03-09,en,[Romance],"Nothing new, but an old thing done over again ...",4.5,1900,"kiss, remake, kissing, couple, silent film",The Kiss (1900-03)
223,92349,Cinderella,1914-12-28,en,"[Fantasy, Drama]",Based on Charles Perrault's fairy tale: Cinder...,5.4,1914,,Cinderella (1914-12)
228,87300,Alice in Wonderland,1915-01-15,en,"[Family, Fantasy]",A German adaptation of the classic Lewis Carro...,5.5,1915,"dream, alice in wonderland, rabbit",Alice in Wonderland (1915-01)
317,174928,Carmen,1918-12-20,de,[Drama],"The tragic story of Don Jose, a Spanish cavalr...",4.8,1918,"gypsy, smuggler, short",Carmen (1918-12)
...,...,...,...,...,...,...,...,...,...,...
44333,463906,The Saint,2017-07-11,en,"[Action, Adventure, Crime]","International master thief, Simon Templar, als...",5.8,2017,the saint,The Saint (2017-07)
44339,428501,City of Ghosts,2017-07-14,en,[Documentary],"With unprecedented access, this documentary fo...",2.0,2017,,City of Ghosts (2017-07)
44340,459950,Feed,2017-07-18,en,[Drama],"Olivia and Matthew Grey, 18-year-old twins bor...",7.3,2017,"twins, valedictorian, eating disorder, highsch...",Feed (2017-07)
44341,374720,Dunkirk,2017-07-19,en,"[Action, Drama, History, Thriller, War]",The miraculous evacuation of Allied soldiers f...,7.5,2017,"france, beach, world war ii, evacuation, germa...",Dunkirk (2017-07)


In [21]:
mlb = MultiLabelBinarizer(classes=all_genres)
binary_labels_all = mlb.fit_transform(df["genres"])


mlb = MultiLabelBinarizer(classes=all_genres)
binary_labels_netflix = mlb.fit_transform(df_combined["genres"])
genres = mlb.classes_



In [29]:
similarity_genre = cosine_similarity(binary_labels_all, binary_labels_netflix)

In [23]:
model = SentenceTransformer("all-mpnet-base-v2")
embeddings_overview = model.encode(
    df.overview.to_numpy().astype(str), show_progress_bar=True, convert_to_numpy=True
)
embeddings_keywords = model.encode(
    df.keywords.to_numpy().astype(str), show_progress_bar=True, convert_to_numpy=True
)

Batches:   0%|          | 0/1388 [00:00<?, ?it/s]

Batches:   0%|          | 0/1388 [00:00<?, ?it/s]

In [24]:
print(embeddings_overview.shape, embeddings_keywords.shape)

(44395, 768) (44395, 768)


In [25]:
indices_shared = df[df.id.isin(df_combined.id)].index
embeddings_overview_netflix = embeddings_overview[indices_shared]
embeddings_keywords_netflix = embeddings_keywords[indices_shared]
similarity_netflix = similarity_genre[indices_shared]

In [28]:
similarity_embedding_overview = util.cos_sim(
    embeddings_overview, embeddings_overview_netflix
)
similarity_embedding_keywords = util.cos_sim(
    embeddings_keywords, embeddings_keywords_netflix
)

In [41]:
similarity_embedding_keywords[idx_no_kw] = 0
for idx_combined, idx_df in enumerate(indices_shared):
    similarity_embedding_overview[idx_df][idx_combined] = -100
    similarity_embedding_overview[idx_df][idx_combined] = -100
    similarity_embedding_keywords[idx_df][idx_combined] = -100
    similarity_genre[idx_df][idx_combined] = -1

In [31]:
with open(r"./similarity_genre.npy", "wb") as f:
    np.save(f, similarity_genre)
# with open(r"./similarity_genre.npy", "rb") as f:
#     similarity_genre = np.load(f)

In [32]:
with open(r"./embeddings_overview.npy", "wb") as f:
    np.save(f, embeddings_overview)
#
# with open(r"./embeddings_keywords.npy", "rb") as f:
#     embeddings_overview = np.load(f)

In [33]:
with open(r"./similarity_embedding_overview.npy", "wb") as f:
    np.save(f, similarity_embedding_overview)
#
# with open(r"./similarity_embedding_overview.npy", "rb") as f:
#     similarity_embedding_overview = np.load(f)

In [34]:
with open(r"./similarity_embedding_keywords.npy", "wb") as f:
    np.save(f, similarity_embedding_keywords)
#
# with open(r"./similarity_embedding_overview.npy", "rb") as f:
#     similarity_embedding_keywords = np.load(f)

In [35]:
Path("./custom_csv").mkdir(parents=True, exist_ok=True)
df.to_csv("./custom_csv/movies_cleaned.csv")
df_combined.to_csv("./custom_csv/movies_combined.csv")

In [36]:
df = pd.read_csv("./custom_csv/movies_cleaned.csv")
df_combined = pd.read_csv("./custom_csv/movies_combined.csv")

In [37]:
df["title"][df.title == "The Lord of the Rings: The Return of the King"]

24243    The Lord of the Rings: The Return of the King
Name: title, dtype: object

In [38]:
movie_number = 24243

In [39]:
top_ten = reversed(similarity_embedding_overview[movie_number].argsort())
print(
    df["title"][movie_number],
    "\n====\n",
    df["genres"][movie_number],
    "\n====\n",
    df["keywords"][movie_number],
    "\n====\n",
)

df_combined.loc[top_ten]

The Lord of the Rings: The Return of the King 
====
 ['Adventure', 'Fantasy', 'Action'] 
====
 elves, orcs, middle-earth (tolkien), based on novel, suspicion, bravery, war, honor, troll, brutality, violence, ghost, end of trilogy, quest, sword and sorcery 
====


Unnamed: 0.1,Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords,title_dated
364,364,121,The Lord of the Rings: The Two Towers,2002-12-18,en,"['Adventure', 'Fantasy', 'Action']",Frodo and Sam are trekking to Mordor to destro...,8.0,2002,"elves, orcs, middle-earth (tolkien), hobbit, b...",The Lord of the Rings: The Two Towers
1695,1695,451644,Dragonheart: Battle for the Heartfire,2017-06-13,en,['Adventure'],"When the King Gareth dies, his potential heirs...",5.7,2017,dragons,Dragonheart: Battle for the Heartfire
757,757,10196,The Last Airbender,2010-06-30,en,"['Action', 'Adventure', 'Family', 'Fantasy']","The story follows the adventures of Aang, a yo...",4.7,2010,"fire, ice, war ship, prince, kingdom, water, v...",The Last Airbender
1374,1374,263341,"Crouching Tiger, Hidden Dragon: Sword of Destiny",2016-02-18,en,"['Action', 'Adventure', 'Drama']","A story of lost love, young love, a legendary ...",6.0,2016,"martial arts, wuxia","Crouching Tiger, Hidden Dragon: Sword of Destiny"
1454,1454,390734,Kingsglaive: Final Fantasy XV,2016-07-09,ja,"['Action', 'Animation', 'Adventure', 'Drama', ...",The magical kingdom of Lucis is home to the wo...,6.8,2016,"fight, cgi, based on video game, warrior, anime",Kingsglaive: Final Fantasy XV
...,...,...,...,...,...,...,...,...,...,...,...
110,110,14733,Bill Hicks: Sane Man,1989-12-01,en,['Comedy'],The amazing comedian Bill Hicks passed away in...,8.2,1989,"comedian, honesty, stand-up comedy, cult comed...",Bill Hicks: Sane Man
1401,1401,390293,Bugs,2016-04-16,da,['Documentary'],Although scientists and agribusiness have star...,0.0,2016,"food, sustainable, bugs",Bugs
1385,1385,376570,Hush,2016-03-12,en,"['Horror', 'Thriller']",A deaf woman is stalked by a psychotic killer ...,6.9,2016,"deaf-mute, knife, computer, alone, murder, mut...",Hush (2016-03)
400,400,21542,Love Don't Cost a Thing,2003-12-12,en,"['Comedy', 'Drama', 'Family', 'Romance']",High school loser (Cannon) pays a cheerleader ...,5.4,2003,woman director,Love Don't Cost a Thing


In [42]:
top_ten = reversed(similarity_embedding_keywords[movie_number].argsort())
print(
    df["title"][movie_number],
    "====",
    df["genres"][movie_number],
    "====",
    df["keywords"][movie_number],
    "====",
    sep="\n",
)

df_combined.loc[top_ten]

The Lord of the Rings: The Return of the King
====
['Adventure', 'Fantasy', 'Action']
====
elves, orcs, middle-earth (tolkien), based on novel, suspicion, bravery, war, honor, troll, brutality, violence, ghost, end of trilogy, quest, sword and sorcery
====


Unnamed: 0.1,Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords,title_dated
364,364,121,The Lord of the Rings: The Two Towers,2002-12-18,en,"['Adventure', 'Fantasy', 'Action']",Frodo and Sam are trekking to Mordor to destro...,8.0,2002,"elves, orcs, middle-earth (tolkien), hobbit, b...",The Lord of the Rings: The Two Towers
895,895,62764,Mirror Mirror,2012-03-15,en,"['Adventure', 'Fantasy', 'Drama', 'Comedy', 'S...","After she spends all her money, an evil enchan...",5.5,2012,"attempted murder, fairy tale, black magic, coc...",Mirror Mirror (2012-03)
544,544,2270,Stardust,2007-08-09,en,"['Adventure', 'Fantasy', 'Romance', 'Family']",In a countryside town bordering on a magical l...,7.2,2007,"witch, based on novel, new love, prince, beaut...",Stardust (2007-08)
564,564,2310,Beowulf,2007-11-05,en,"['Adventure', 'Action', 'Animation']","6th-century Scandinavian warrior, Beowulf emba...",5.5,2007,"denmark, nordic mythology, lie, pride and vani...",Beowulf (2007-11)
1171,1171,68737,Seventh Son,2014-12-12,en,"['Adventure', 'Fantasy']","John Gregory, who is a seventh son of a sevent...",5.2,2014,"magic, chosen one, dark fantasy, witch hunter,...",Seventh Son
...,...,...,...,...,...,...,...,...,...,...,...
720,720,20504,The Book of Eli,2010-01-14,en,"['Action', 'Thriller', 'Science Fiction']","A post-apocalyptic tale, in which a lone man f...",6.6,2010,"book, post-apocalyptic, dystopia, faith, blind",The Book of Eli
682,682,19905,"The Goods: Live Hard, Sell Hard",2009-08-14,en,['Comedy'],Who is Don Ready? Salesman? Lover? Song Stylis...,5.4,2009,duringcreditsstinger,"The Goods: Live Hard, Sell Hard"
1123,1123,286521,5 Flights Up,2014-09-05,en,['Drama'],A long-time married couple who've spent their ...,5.9,2014,"new york, married couple, moving out, moving, ...",5 Flights Up
1500,1500,397717,Barry,2016-09-10,en,['Drama'],A biopic of Barack Obama set during his time a...,5.1,2016,"new york city, college student",Barry


In [43]:
top_ten = reversed(similarity_genre[movie_number].argsort())
print(
    df["title"][movie_number],
    "\n====\n",
    df["genres"][movie_number],
    "\n====\n",
    df["keywords"][movie_number],
    "\n====\n",
)

df_combined.loc[top_ten]

The Lord of the Rings: The Return of the King 
====
 ['Adventure', 'Fantasy', 'Action'] 
====
 elves, orcs, middle-earth (tolkien), based on novel, suspicion, bravery, war, honor, troll, brutality, violence, ghost, end of trilogy, quest, sword and sorcery 
====


Unnamed: 0.1,Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords,title_dated
801,801,23047,Season of the Witch,2011-01-07,en,"['Adventure', 'Fantasy', 'Action']",A 14th century Crusader returns with his comra...,5.2,2011,"inquisition, monk, ambush, witch, hero, fight,...",Season of the Witch
585,585,1729,The Forbidden Kingdom,2008-04-18,en,"['Action', 'Adventure', 'Fantasy']",An American teenager who is obsessed with Hong...,6.3,2008,"tempel, shaolin, teenager, urination, staff, w...",The Forbidden Kingdom
1478,1478,396643,A Flying Jatt,2016-08-25,hi,"['Fantasy', 'Action', 'Adventure']",Jatt is a reluctant super hero that fights cri...,3.8,2016,,A Flying Jatt
741,741,18823,Clash of the Titans,2010-04-01,en,"['Adventure', 'Fantasy', 'Action']","Born of a god but raised as a man, Perseus is ...",5.6,2010,"hades, mythology, greek mythology, zeus, medus...",Clash of the Titans (2010-04)
535,535,559,Spider-Man 3,2007-05-01,en,"['Fantasy', 'Action', 'Adventure']",The seemingly invincible Spider-Man goes up ag...,5.9,2007,"dual identity, amnesia, sandstorm, love of one...",Spider-Man 3
...,...,...,...,...,...,...,...,...,...,...,...
1097,1097,266285,The Salvation,2014-05-22,da,"['Drama', 'Western']","In 1870s America, a peaceful American settler ...",6.2,2014,"revenge, murder, mute, oil, gang, shootout, train",The Salvation
1098,1098,273997,Stop at Nothing: The Lance Armstrong Story,2014-05-27,en,['Documentary'],A portrait of the man behind the greatest frau...,7.0,2014,,Stop at Nothing: The Lance Armstrong Story
1099,1099,273641,Bad Grandpa .5,2014-06-03,en,['Comedy'],Bad Grandpa .5 gives you a whole new perspecti...,5.8,2014,"mtv, jackass",Bad Grandpa .5
0,0,56143,The Battle of Midway,1942-09-14,en,"['Documentary', 'War']","The Japanese attack on Midway in June 1942, fi...",4.7,1942,,The Battle of Midway


In [44]:
combined_similarity = (
    similarity_genre * 0.5
    + similarity_embedding_keywords.numpy()
    + similarity_embedding_overview.numpy()
)

In [45]:
top_ten = reversed(combined_similarity[movie_number].argsort())
print(
    df["title"][movie_number],
    "\n====\n",
    df["genres"][movie_number],
    "\n====\n",
    df["keywords"][movie_number],
    "\n====\n",
)

df_combined.loc[top_ten]

The Lord of the Rings: The Return of the King 
====
 ['Adventure', 'Fantasy', 'Action'] 
====
 elves, orcs, middle-earth (tolkien), based on novel, suspicion, bravery, war, honor, troll, brutality, violence, ghost, end of trilogy, quest, sword and sorcery 
====


Unnamed: 0.1,Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords,title_dated
364,364,121,The Lord of the Rings: The Two Towers,2002-12-18,en,"['Adventure', 'Fantasy', 'Action']",Frodo and Sam are trekking to Mordor to destro...,8.0,2002,"elves, orcs, middle-earth (tolkien), hobbit, b...",The Lord of the Rings: The Two Towers
757,757,10196,The Last Airbender,2010-06-30,en,"['Action', 'Adventure', 'Family', 'Fantasy']","The story follows the adventures of Aang, a yo...",4.7,2010,"fire, ice, war ship, prince, kingdom, water, v...",The Last Airbender
564,564,2310,Beowulf,2007-11-05,en,"['Adventure', 'Action', 'Animation']","6th-century Scandinavian warrior, Beowulf emba...",5.5,2007,"denmark, nordic mythology, lie, pride and vani...",Beowulf (2007-11)
1212,1212,308504,Last Knights,2015-04-03,en,"['Action', 'Adventure']","When an evil emperor executes their leader, hi...",6.2,2015,"loyalty, emperor, castle, sword fight, revenge...",Last Knights
648,648,12437,Underworld: Rise of the Lycans,2009-01-22,en,"['Fantasy', 'Action', 'Adventure', 'Science Fi...","A prequel to the first two Underworld films, t...",6.2,2009,"prison, underworld, slavery, castle, vampire, ...",Underworld: Rise of the Lycans
...,...,...,...,...,...,...,...,...,...,...,...
1123,1123,286521,5 Flights Up,2014-09-05,en,['Drama'],A long-time married couple who've spent their ...,5.9,2014,"new york, married couple, moving out, moving, ...",5 Flights Up
1166,1166,300685,Jeff Dunham: All Over the Map,2014-11-18,en,['Comedy'],"Jeff Dunham and his iconic creations, Achmed t...",6.1,2014,"ventriloquist, stand-up comedy, live audience",Jeff Dunham: All Over the Map
201,201,2925,The First Wives Club,1996-09-20,en,['Comedy'],After years of helping their hubbies climb the...,6.5,1996,"divorce, divorced woman, reunited friends",The First Wives Club
1500,1500,397717,Barry,2016-09-10,en,['Drama'],A biopic of Barack Obama set during his time a...,5.1,2016,"new york city, college student",Barry
