# Data preparation notebook for recommender project

In [1]:
import ast
from pathlib import Path

import numpy as np

# You need a kaggle account to use this
# https://pypi.org/project/opendatasets/
import opendatasets as od
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
od.download("https://www.kaggle.com/datasets/shivamb/netflix-shows")
od.download("https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset")

Skipping, found downloaded files in "./netflix-shows" (use force=True to force download)
Skipping, found downloaded files in "./the-movies-dataset" (use force=True to force download)


In [3]:
netflix_movies = pd.read_csv(
    "./netflix-shows/netflix_titles.csv", usecols=["title", "type", "release_year"]
)
netflix_movies = netflix_movies[netflix_movies["type"] == "Movie"]

In [4]:
cols_to_read = [
    "id",
    "title",
    "overview",
    "release_date",
    "original_language",
    "genres",
    "vote_average",
]
df_raw = pd.read_csv(
    "./the-movies-dataset/movies_metadata.csv", usecols=cols_to_read
).loc[
    :,
    [
        "id",
        "title",
        "release_date",
        "original_language",
        "genres",
        "overview",
        "vote_average",
    ],
]

df = df_raw.copy()[~df_raw.duplicated()]

In [5]:
df.shape

(45436, 7)

In [6]:
df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce").dropna()
df["release_year"] = df["release_date"].dt.strftime("%Y")

In [7]:
def get_genre_values(row):
    row = ast.literal_eval(row)
    genre_list = [val["name"] for val in row]
    if len(genre_list) == 0:
        return ["Unknown"]
    return genre_list

In [8]:
df = df.loc[df[cols_to_read[:-1]].dropna().index].reset_index(drop=True)

df["id"] = df["id"].astype(int)

In [9]:
df.shape

(44395, 8)

In [10]:
df = pd.merge(
    df,
    pd.read_csv("./the-movies-dataset/keywords.csv").drop_duplicates(),
    how="left",
    on=["id"],
)
df["keywords"] = df.keywords.fillna("[]")
df.shape

(44395, 9)

In [11]:
df["genres"] = df["genres"].apply(get_genre_values)

In [12]:
df["keywords"] = df["keywords"].apply(get_genre_values)

In [13]:
df["keywords"] = df.loc[:, ["keywords"]].apply(
    lambda x: ", ".join(x.keywords) if "Unknown" not in x.keywords else "",
    axis=1,
)

idx_no_kw = df[df["keywords"] == ""].index

In [14]:
df.head()

Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords
0,862,Toy Story,1995-10-30,en,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",7.7,1995,"jealousy, toy, boy, friendship, friends, rival..."
1,8844,Jumanji,1995-12-15,en,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,6.9,1995,"board game, disappearance, based on children's..."
2,15602,Grumpier Old Men,1995-12-22,en,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,6.5,1995,"fishing, best friend, duringcreditsstinger, ol..."
3,31357,Waiting to Exhale,1995-12-22,en,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",6.1,1995,"based on novel, interracial relationship, sing..."
4,11862,Father of the Bride Part II,1995-02-10,en,[Comedy],Just when George Banks has recovered from his ...,5.7,1995,"baby, midlife crisis, confidence, aging, daugh..."


In [15]:
df = df.sort_values(by="release_date").reset_index(drop=True)
df["title_dated"] = df["title"].copy()
df.loc[df.title.duplicated(), "title_dated"] = df.loc[
    df.title.duplicated(), ["title_dated"]
].apply(
    lambda x: " ".join([x["title_dated"], f"({str(df['release_year'][x.name])})"]),
    axis=1,
)

In [16]:
duplicate_title_and_year = df.loc[df.title.duplicated(), "title"].index

In [17]:
df.loc[df.title.duplicated(), "title_dated"] = df.loc[
    df.title.duplicated(), ["title_dated"]
].apply(
    lambda x: x["title_dated"].replace(
        f"({str(df['release_year'][x.name])})",
        f"({str(df['release_date'][x.name].strftime('%Y-%m'))})",
    ),
    axis=1,
)

In [18]:
netflix_movies["release_year"] = netflix_movies["release_year"].astype(str)
df_combined = pd.merge(
    df, netflix_movies[["title", "release_year"]], on=["title", "release_year"]
)
df_combined = df_combined[
    ~df_combined[["title", "release_year", "overview"]].duplicated()
].reset_index(drop=True)

In [19]:
all_genres = list(df_combined["genres"].explode().unique())
all_genres.remove("Unknown")
print(all_genres)

['Documentary', 'War', 'History', 'Drama', 'Comedy', 'Music', 'Romance', 'Adventure', 'Science Fiction', 'Action', 'Crime', 'Thriller', 'Family', 'Horror', 'Mystery', 'Fantasy', 'Western', 'Animation', 'Foreign', 'TV Movie']


In [20]:
df.loc[duplicate_title_and_year]

Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords,title_dated
54,104471,The Haunted Castle,1897-01-01,fr,[Horror],A man has an encounter with several spooky app...,5.6,1897,"french, horror, silent film, ghost, georges me...",The Haunted Castle (1897-01)
83,193411,The Kiss,1900-03-09,en,[Romance],"Nothing new, but an old thing done over again ...",4.5,1900,"kiss, remake, kissing, couple, silent film",The Kiss (1900-03)
223,92349,Cinderella,1914-12-28,en,"[Fantasy, Drama]",Based on Charles Perrault's fairy tale: Cinder...,5.4,1914,,Cinderella (1914-12)
228,87300,Alice in Wonderland,1915-01-15,en,"[Family, Fantasy]",A German adaptation of the classic Lewis Carro...,5.5,1915,"dream, alice in wonderland, rabbit",Alice in Wonderland (1915-01)
317,174928,Carmen,1918-12-20,de,[Drama],"The tragic story of Don Jose, a Spanish cavalr...",4.8,1918,"gypsy, smuggler, short",Carmen (1918-12)
...,...,...,...,...,...,...,...,...,...,...
44333,463906,The Saint,2017-07-11,en,"[Action, Adventure, Crime]","International master thief, Simon Templar, als...",5.8,2017,the saint,The Saint (2017-07)
44339,428501,City of Ghosts,2017-07-14,en,[Documentary],"With unprecedented access, this documentary fo...",2.0,2017,,City of Ghosts (2017-07)
44340,459950,Feed,2017-07-18,en,[Drama],"Olivia and Matthew Grey, 18-year-old twins bor...",7.3,2017,"twins, valedictorian, eating disorder, highsch...",Feed (2017-07)
44341,374720,Dunkirk,2017-07-19,en,"[Action, Drama, History, Thriller, War]",The miraculous evacuation of Allied soldiers f...,7.5,2017,"france, beach, world war ii, evacuation, germa...",Dunkirk (2017-07)


In [21]:
mlb = MultiLabelBinarizer(classes=all_genres)
binary_labels_all = mlb.fit_transform(df["genres"])


mlb = MultiLabelBinarizer(classes=all_genres)
binary_labels_netflix = mlb.fit_transform(df_combined["genres"])
genres = mlb.classes_



In [None]:
similarity_genre = cosine_similarity(binary_labels_all, binary_labels_netflix)

In [None]:
model = SentenceTransformer("all-mpnet-base-v2")
embeddings_overview = model.encode(
    df.overview.to_numpy().astype(str), show_progress_bar=True, convert_to_numpy=True
)
embeddings_keywords = model.encode(
    df.keywords.to_numpy().astype(str), show_progress_bar=True, convert_to_numpy=True
)

In [None]:
print(embeddings_overview.shape, embeddings_keywords.shape)

In [None]:
indices_shared = df[df.id.isin(df_combined.id)].index
embeddings_overview_netflix = embeddings_overview[indices_shared]
embeddings_keywords_netflix = embeddings_keywords[indices_shared]
similarity_netflix = similarity_genre[indices_shared]

In [None]:
similarity_embedding_overview = util.cos_sim(
    embeddings_overview, embeddings_overview_netflix
)
similarity_embedding_keywords = util.cos_sim(
    embeddings_keywords, embeddings_keywords_netflix
)

In [None]:
similarity_embedding_keywords[idx_no_kw] = 0
for idx_combined, idx_df in enumerate(indices_shared):
    similarity_embedding_overview[idx_df][idx_combined] = -100
    similarity_embedding_overview[idx_df][idx_combined] = -100
    similarity_embedding_keywords[idx_df][idx_combined] = -100
    similarity_embedding_keywords[idx_df][idx_combined] = -100
    similarity_genre[idx_df][idx_combined] = -1

In [22]:
# with open(r"./similarity_genre.npy", "wb") as f:
#     np.save(f, similarity_genre)
with open(r"./similarity_genre.npy", "rb") as f:
    similarity_genre = np.load(f)

In [23]:
# with open(r"./embeddings_keywords.npy", "wb") as f:
#     np.save(f, embeddings_keywords)
# with open(r"./embeddings_overview.npy", "wb") as f:
#     np.save(f, embeddings_overview)
#
with open(r"./embeddings_keywords.npy", "rb") as f:
    embeddings_keywords = np.load(f)
with open(r"./embeddings_overview.npy", "rb") as f:
    embeddings_overview = np.load(f)

In [24]:
# with open(r"./similarity_embedding_overview.npy", "wb") as f:
#     np.save(f, similarity_embedding_overview)
# #
with open(r"./similarity_embedding_overview.npy", "rb") as f:
    similarity_embedding_overview = np.load(f)

In [25]:
# with open(r"./similarity_embedding_keywords.npy", "wb") as f:
#     np.save(f, similarity_embedding_keywords)
# #
with open(r"./similarity_embedding_keywords.npy", "rb") as f:
    similarity_embedding_keywords = np.load(f)

In [26]:
Path("./custom_csv").mkdir(parents=True, exist_ok=True)
df.to_csv("./custom_csv/movies_cleaned.csv")
df_combined.to_csv("./custom_csv/movies_combined.csv")

In [27]:
df = pd.read_csv("./custom_csv/movies_cleaned.csv")
df_combined = pd.read_csv("./custom_csv/movies_combined.csv")

In [None]:
df[df.title == "Interstellar"]

In [28]:
movie_number = 40199

In [29]:
top_ten = reversed(similarity_embedding_overview[movie_number].argsort())
print(
    df["title"][movie_number],
    "\n====\n",
    df["genres"][movie_number],
    "\n====\n",
    df["keywords"][movie_number],
    "\n====\n",
    df["overview"][movie_number],
    "\n====\n",
)

df_combined.loc[top_ten]

Interstellar 
====
 ['Adventure', 'Drama', 'Science Fiction'] 
====
 saving the world, artificial intelligence, father son relationship, single parent, nasa, expedition, wormhole, space travel, famine, black hole, dystopia, race against time, quantum mechanics, spaceship, space, rescue, family relationships, farmhouse, robot, astronaut, scientist, father daughter relationship, single father, farmer, space station, imax, astrophysics, zero gravity, courage, time paradox, relativity 
====
 Interstellar chronicles the adventures of a group of explorers who make use of a newly discovered wormhole to surpass the limitations on human space travel and conquer the vast distances involved in an interstellar voyage. 
====


Unnamed: 0.1,Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords,title_dated
1703,1703,313943,Revolt,2017-07-01,en,['Science Fiction'],The story of humankind's last stand against a ...,6.0,2017,alien invasion,Revolt
221,221,8413,Event Horizon,1997-08-15,en,"['Horror', 'Science Fiction', 'Mystery']",In the year 2047 a group of astronauts are sen...,6.4,1997,"space marine, nudity, nightmare, hallucination...",Event Horizon
217,217,607,Men in Black,1997-07-02,en,"['Action', 'Adventure', 'Comedy', 'Science Fic...",Men in Black follows the exploits of agents Ka...,6.9,1997,"secret identity, sun glasses, undercover, spac...",Men in Black
10,10,830,Forbidden Planet,1956-03-15,en,"['Adventure', 'Science Fiction', 'Action']",Captain Adams and the crew of the Starship C57...,7.2,1956,"lasergun, monster, space marine, loss of fathe...",Forbidden Planet
316,316,9397,Evolution,2001-06-08,en,"['Action', 'Comedy', 'Science Fiction']",A comedy that follows the chaos that ensues wh...,5.7,2001,"grand canyon, governor, shampoo, high school t...",Evolution
...,...,...,...,...,...,...,...,...,...,...,...
1214,1214,329135,O Kadhal Kanmani,2015-04-17,ta,"['Drama', 'Family', 'Romance']",Adhi and Tara are in a live-in relationship an...,5.8,2015,"living together, love, open relationship, sexu...",O Kadhal Kanmani
162,162,53163,Love on Delivery,1994-02-04,cn,"['Comedy', 'Action']",Ho Kam-An is a love struck dim-sum delivery bo...,6.1,1994,,Love on Delivery
795,795,51450,Santa's Apprentice,2010-11-24,fr,"['Family', 'Animation']","Santa doesn't want to retire, but rules are ru...",4.9,2010,"santa claus, gift, snow, christmas",Santa's Apprentice
1165,1165,320882,The Seven Five,2014-11-14,en,['Documentary'],Meet the dirtiest cop in NYC history. Michael ...,7.8,2014,"new york, corruption, mafia, mobster, police c...",The Seven Five


In [30]:
top_ten = reversed(similarity_embedding_keywords[movie_number].argsort())
print(
    df["title"][movie_number],
    "====",
    df["genres"][movie_number],
    "====",
    df["keywords"][movie_number],
    "====",
    sep="\n",
)

df_combined.loc[top_ten]

Interstellar
====
['Adventure', 'Drama', 'Science Fiction']
====
saving the world, artificial intelligence, father son relationship, single parent, nasa, expedition, wormhole, space travel, famine, black hole, dystopia, race against time, quantum mechanics, spaceship, space, rescue, family relationships, farmhouse, robot, astronaut, scientist, father daughter relationship, single father, farmer, space station, imax, astrophysics, zero gravity, courage, time paradox, relativity
====


Unnamed: 0.1,Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords,title_dated
662,662,13475,Star Trek,2009-05-06,en,"['Science Fiction', 'Action', 'Adventure']",The fate of the galaxy rests in the hands of b...,7.4,2009,"spacecraft, teleportation, space mission, para...",Star Trek
10,10,830,Forbidden Planet,1956-03-15,en,"['Adventure', 'Science Fiction', 'Action']",Captain Adams and the crew of the Starship C57...,7.2,1956,"lasergun, monster, space marine, loss of fathe...",Forbidden Planet
221,221,8413,Event Horizon,1997-08-15,en,"['Horror', 'Science Fiction', 'Mystery']",In the year 2047 a group of astronauts are sen...,6.4,1997,"space marine, nudity, nightmare, hallucination...",Event Horizon
174,174,2164,Stargate,1994-10-27,en,"['Action', 'Adventure', 'Science Fiction']","An interstellar teleportation device, found in...",6.8,1994,"space travel, teleportation, uprising, shot to...",Stargate
260,260,817,Austin Powers: The Spy Who Shagged Me,1999-06-08,en,"['Adventure', 'Comedy', 'Crime', 'Science Fict...","When diabolical genius, Dr. Evil travels back ...",6.1,1999,"saving the world, moon, submarine, clone, spy,...",Austin Powers: The Spy Who Shagged Me
...,...,...,...,...,...,...,...,...,...,...,...
1572,1572,339396,True Memoirs of an International Assassin,2016-11-11,en,"['Comedy', 'Action']",After a publisher changes a writer's debut nov...,5.8,2016,assassin,True Memoirs of an International Assassin
749,749,37821,Killers,2010-06-04,en,"['Action', 'Comedy', 'Thriller', 'Romance']",When an elite assassin marries a beautiful com...,5.7,2010,assassin,Killers
1500,1500,397717,Barry,2016-09-10,en,['Drama'],A biopic of Barack Obama set during his time a...,5.1,2016,"new york city, college student",Barry
1394,1394,371645,Hunt for the Wilderpeople,2016-03-31,en,"['Drama', 'Adventure', 'Comedy']",Ricky is a defiant young city kid who finds hi...,7.8,2016,new zealand,Hunt for the Wilderpeople


In [39]:
combined_similarity = (
    similarity_genre + similarity_embedding_keywords + similarity_embedding_overview
)

In [40]:
top_ten = reversed(combined_similarity[movie_number].argsort())
print(
    df["title"][movie_number],
    "\n====\n",
    df["genres"][movie_number],
    "\n====\n",
    df["keywords"][movie_number],
    "\n====\n",
)

df_combined.loc[top_ten]

Interstellar 
====
 ['Adventure', 'Drama', 'Science Fiction'] 
====
 saving the world, artificial intelligence, father son relationship, single parent, nasa, expedition, wormhole, space travel, famine, black hole, dystopia, race against time, quantum mechanics, spaceship, space, rescue, family relationships, farmhouse, robot, astronaut, scientist, father daughter relationship, single father, farmer, space station, imax, astrophysics, zero gravity, courage, time paradox, relativity 
====


Unnamed: 0.1,Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords,title_dated
662,662,13475,Star Trek,2009-05-06,en,"['Science Fiction', 'Action', 'Adventure']",The fate of the galaxy rests in the hands of b...,7.4,2009,"spacecraft, teleportation, space mission, para...",Star Trek
10,10,830,Forbidden Planet,1956-03-15,en,"['Adventure', 'Science Fiction', 'Action']",Captain Adams and the crew of the Starship C57...,7.2,1956,"lasergun, monster, space marine, loss of fathe...",Forbidden Planet
174,174,2164,Stargate,1994-10-27,en,"['Action', 'Adventure', 'Science Fiction']","An interstellar teleportation device, found in...",6.8,1994,"space travel, teleportation, uprising, shot to...",Stargate
1375,1375,245703,Midnight Special,2016-02-18,en,"['Adventure', 'Drama', 'Science Fiction']",A father and son go on the run after the dad l...,6.2,2016,"father son relationship, helicopter, fbi, mote...",Midnight Special
1179,1179,264660,Ex Machina,2015-01-21,en,"['Drama', 'Science Fiction']","Caleb, a 26 year old coder at the world's larg...",7.6,2015,"dancing, artificial intelligence, distrust, is...",Ex Machina
...,...,...,...,...,...,...,...,...,...,...,...
1165,1165,320882,The Seven Five,2014-11-14,en,['Documentary'],Meet the dirtiest cop in NYC history. Michael ...,7.8,2014,"new york, corruption, mafia, mobster, police c...",The Seven Five
682,682,19905,"The Goods: Live Hard, Sell Hard",2009-08-14,en,['Comedy'],Who is Don Ready? Salesman? Lover? Song Stylis...,5.4,2009,duringcreditsstinger,"The Goods: Live Hard, Sell Hard"
751,751,32823,Get Him to the Greek,2010-06-04,en,['Comedy'],Pinnacle records has the perfect plan to get t...,5.9,2010,aftercreditsstinger,Get Him to the Greek
162,162,53163,Love on Delivery,1994-02-04,cn,"['Comedy', 'Action']",Ho Kam-An is a love struck dim-sum delivery bo...,6.1,1994,,Love on Delivery


In [43]:
combined_df = pd.DataFrame(combined_similarity, columns=df_combined.id, index=df.id)

In [ ]:
# import os
# from dotenv import load_dotenv
# from sqlalchemy import create_engine


In [53]:
# load_dotenv("../.env")
# engine = create_engine(os.environ["DATABASE_URL"])

In [54]:
# combined_df.to_sql("combined_scores", engine)

7