# Data preparation notebook for recommender project

In [1]:
import ast
import os

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# # You need a kaggle account to use this
# # https://pypi.org/project/opendatasets/
# import opendatasets as od
# od.download("https://www.kaggle.com/datasets/shivamb/netflix-shows")
# od.download("https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset")

In [3]:
# Load only the columns needed to match Netflix titles against the movie metadata.
netflix_movies = pd.read_csv(
    "./netflix-shows/netflix_titles.csv", usecols=["title", "type", "release_year"]
)
# Keep feature films only. The explicit .copy() makes the filtered frame an
# independent object, so the cell that later casts "release_year" to str assigns
# into a real frame instead of a slice (avoids SettingWithCopyWarning).
netflix_movies = netflix_movies[netflix_movies["type"] == "Movie"].copy()

In [4]:
# Columns pulled from the metadata CSV.
# NOTE: a later cell uses cols_to_read[:-1] as the set of columns that must be
# non-null, so "vote_average" has to stay the last entry.
cols_to_read = [
    "id",
    "title",
    "overview",
    "release_date",
    "original_language",
    "genres",
    "vote_average",
]
# usecols only selects columns; the .loc below fixes the column order of the frame.
df_raw = pd.read_csv(
    "./the-movies-dataset/movies_metadata.csv", usecols=cols_to_read
).loc[
    :,
    [
        "id",
        "title",
        "release_date",
        "original_language",
        "genres",
        "overview",
        "vote_average",
    ],
]

# Drop exact duplicate rows first and copy afterwards: only the surviving rows are
# copied, and df becomes an independent frame that later cells can add columns to
# without triggering SettingWithCopyWarning. df_raw itself is left untouched.
df = df_raw[~df_raw.duplicated()].copy()

In [5]:
df.shape

(45436, 7)

In [6]:
# Parse release dates; values that cannot be parsed become NaT. No .dropna() here:
# assigning back into the column re-aligns on the index, so dropped rows would simply
# reappear as NaT. Rows with a missing release date are removed by the
# required-column filter in a later cell.
df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce")
# Four-digit year as a string (NaN where the date is NaT).
df["release_year"] = df["release_date"].dt.strftime("%Y")

In [7]:
def get_genre_values(row):
    """Return the ``"name"`` of every entry in a stringified list of dicts.

    Args:
        row: String holding a Python literal list of dicts, each with a
            ``"name"`` key (e.g. ``"[{'id': 35, 'name': 'Comedy'}]"``).

    Returns:
        The names in their original order, or ``["Unknown"]`` when the
        parsed list is empty.
    """
    names = []
    for entry in ast.literal_eval(row):
        names.append(entry["name"])
    return names if names else ["Unknown"]

In [8]:
# Every column read from the CSV except the last one ("vote_average") must be present.
required_columns = cols_to_read[:-1]
df = (
    df.dropna(subset=required_columns)
    .reset_index(drop=True)
    .astype({"id": int})
)

In [9]:
df.shape

(44395, 8)

In [10]:
# Attach the keyword strings by movie id; movies without a keyword row get "[]"
# so they parse as an empty list in the next step.
keywords_raw = pd.read_csv("./the-movies-dataset/keywords.csv").drop_duplicates()
df = df.merge(keywords_raw, how="left", on="id")
df["keywords"] = df["keywords"].fillna("[]")
df.shape

(44395, 9)

In [11]:
# Replace each stringified genre list with a list of genre names
# (["Unknown"] when the movie has no genres).
df["genres"] = df["genres"].apply(get_genre_values)

In [12]:
# Keywords are parsed with the same helper, i.e. each value is expected to be a
# stringified list of dicts with a "name" key; empty lists become ["Unknown"].
df["keywords"] = df["keywords"].apply(get_genre_values)

In [13]:
# Flatten each keyword list into one comma-separated string; the ["Unknown"]
# placeholder (no keywords) becomes the empty string.
df["keywords"] = df["keywords"].apply(
    lambda kw_list: "" if "Unknown" in kw_list else ", ".join(kw_list)
)

# Row labels of movies without any keywords.
# NOTE: these labels refer to df's index as of this cell; df is re-sorted and
# re-indexed further down, after which they no longer point at the same movies.
idx_no_kw = df.index[df["keywords"] == ""]

In [14]:
df.head()

Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords
0,862,Toy Story,1995-10-30,en,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",7.7,1995,"jealousy, toy, boy, friendship, friends, rival..."
1,8844,Jumanji,1995-12-15,en,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,6.9,1995,"board game, disappearance, based on children's..."
2,15602,Grumpier Old Men,1995-12-22,en,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,6.5,1995,"fishing, best friend, duringcreditsstinger, ol..."
3,31357,Waiting to Exhale,1995-12-22,en,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",6.1,1995,"based on novel, interracial relationship, sing..."
4,11862,Father of the Bride Part II,1995-02-10,en,[Comedy],Just when George Banks has recovered from his ...,5.7,1995,"baby, midlife crisis, confidence, aging, daugh..."


In [15]:
# df stores the release year as a string, so the Netflix year is cast to match.
netflix_movies["release_year"] = netflix_movies["release_year"].astype(str)

# Inner join on (title, year) keeps only movies that are also on Netflix; rows that
# agree on title, year and overview are collapsed to their first occurrence.
match_keys = ["title", "release_year"]
df_combined = (
    df.merge(netflix_movies[match_keys], on=match_keys)
    .drop_duplicates(subset=match_keys + ["overview"])
    .reset_index(drop=True)
)

In [16]:
# Distinct genres among the Netflix-matched movies, in order of first appearance.
# The "Unknown" placeholder is filtered out rather than removed with list.remove(),
# which would raise ValueError if no matched movie had an empty genre list.
all_genres = [
    genre for genre in df_combined["genres"].explode().unique() if genre != "Unknown"
]
print(all_genres)

['Adventure', 'Action', 'Thriller', 'Comedy', 'Drama', 'Romance', 'Family', 'Animation', 'Fantasy', 'Crime', 'Mystery', 'Science Fiction', 'Western', 'History', 'War', 'Horror', 'Music', 'Documentary', 'Foreign', 'TV Movie']


In [17]:
# Order movies chronologically so the earliest film keeps the bare title; every
# later movie with an already-seen title gets " (<release year>)" appended.
df = df.sort_values(by="release_date").reset_index(drop=True)
repeated_title = df["title"].duplicated()
df.loc[repeated_title, "title"] = (
    df.loc[repeated_title, "title"]
    + " ("
    + df.loc[repeated_title, "release_year"].astype(str)
    + ")"
)

In [18]:
# Row labels whose title is still not unique after the year suffix was added
# (same title and same release year as an earlier row).
duplicate_title_and_year = df.index[df["title"].duplicated()]

In [19]:
# For titles that still clash, widen the "(YYYY)" suffix to "(YYYY-MM)".
still_duplicated = df["title"].duplicated()
clashing = df.loc[still_duplicated, ["title", "release_year", "release_date"]]
df.loc[still_duplicated, "title"] = [
    title.replace(f"({year})", f"({released.strftime('%Y-%m')})")
    for title, year, released in zip(
        clashing["title"], clashing["release_year"], clashing["release_date"]
    )
]

In [20]:
df.loc[duplicate_title_and_year]

Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,release_year,keywords
18699,12254,Emma (1996-10),1996-10-02,en,"[TV Movie, Comedy, Drama, Romance]",Emma Woodhouse has a rigid sense of propriety ...,6.7,1996,"marriage proposal, make a match"
19091,13852,The Castle (1997-04),1997-04-10,en,"[Drama, Comedy]",A Melbourne family is very happy living near t...,7.4,1997,"underdog, airport, court case, australian, fam..."
19140,2966,"20,000 Leagues Under the Sea (1997-05)",1997-05-11,en,"[Adventure, Fantasy, Action, Science Fiction]","The year is 1886, when New England's fishing h...",4.6,1997,
21677,125705,Hamlet (2000-12),2000-12-10,en,[Drama],"To be or not to be, etc.",0.0,2000,shakespeare
25071,3056,Frankenstein (2004-10),2004-10-09,en,"[TV Movie, Drama, Horror, Science Fiction, Mys...",Two hundred years after Mary Shelley's novel t...,4.6,2004,"monster, frankenstein"
25928,39914,Chaos (2005-08),2005-08-10,en,"[Action, Horror, Thriller]",Emily and her friend Angelica go to a rave in ...,4.4,2005,
27413,173288,Black Sheep (2006-10),2006-10-25,de,"[Comedy, Foreign]",Black Sheep (original title: Schwarze Schafe) ...,6.6,2006,
28753,59346,Darling (2007-11),2007-11-07,fr,[Drama],Darling is a woman of today. It seems to alway...,6.5,2007,woman director
30220,10139,Milk (2008-11),2008-11-26,en,"[History, Drama]",The story of California's first openly gay ele...,7.1,2008,"gay, san francisco, homophobia, mayor, biograp..."
31457,26738,Rage (2009-09),2009-09-24,en,"[Thriller, Drama]",A schoolboy uses his cellphone camera to shoot...,4.2,2009,"murder, fashion, woman director"


In [21]:
def _binarize_genres(genre_lists):
    """Multi-hot encode lists of genres over the fixed ``all_genres`` vocabulary.

    Returns the fitted binarizer together with the encoded label matrix.
    """
    binarizer = MultiLabelBinarizer(classes=all_genres)
    return binarizer, binarizer.fit_transform(genre_lists)


# One row per movie in df / df_combined, one column per entry of all_genres.
_, binary_labels_all = _binarize_genres(df["genres"])
mlb, binary_labels_netflix = _binarize_genres(df_combined["genres"])
genres = mlb.classes_



In [22]:
# Cosine similarity between the multi-hot genre vectors: rows follow df (all
# movies), columns follow df_combined (Netflix-matched movies).
genre_similarity = cosine_similarity(binary_labels_all, binary_labels_netflix)

In [23]:
genre_similarity.shape

(44395, 1723)

In [None]:
# Embed the overview and keyword text of every movie in df with the same model;
# both calls return NumPy arrays with one row per df row.
model = SentenceTransformer("all-mpnet-base-v2")
encode_options = {"show_progress_bar": True, "convert_to_numpy": True}
embeddings_overview = model.encode(
    df["overview"].to_numpy().astype(str), **encode_options
)
embeddings_keywords = model.encode(
    df["keywords"].to_numpy().astype(str), **encode_options
)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/1388 [00:00<?, ?it/s]

In [None]:
print(embeddings_overview.shape, embeddings_keywords.shape)

In [None]:
# Map every Netflix-matched movie (row k of df_combined) to its row label in df.
# df was re-sorted by release date after df_combined was built, so the two frames
# are in different orders. Looking the rows up by id keeps position k of
# indices_shared aligned with row k of df_combined, and therefore with column k of
# genre_similarity and of the Netflix embedding matrices sliced below.
row_label_by_id = pd.Series(df.index, index=df["id"])
# In case an id occurs on more than one df row, keep the first so the lookup is 1:1.
row_label_by_id = row_label_by_id[~row_label_by_id.index.duplicated(keep="first")]
indices_shared = pd.Index(row_label_by_id.loc[df_combined["id"].to_numpy()].to_numpy())

# Embeddings / genre-similarity rows of the Netflix-matched movies, in df_combined order.
embeddings_overview_netflix = embeddings_overview[indices_shared]
embeddings_keywords_netflix = embeddings_keywords[indices_shared]
similarity_netflix = genre_similarity[indices_shared]

In [None]:
# Pairwise cosine similarity between every movie in df (rows) and the Netflix-matched
# subset selected by indices_shared (columns), once for overview embeddings and once
# for keyword embeddings.
# NOTE(review): per the sentence-transformers docs util.cos_sim returns a torch.Tensor
# rather than a NumPy array — confirm for the installed version.
similarity_encoding_overview = util.cos_sim(
    embeddings_overview, embeddings_overview_netflix
)
similarity_encoding_keywords = util.cos_sim(
    embeddings_keywords, embeddings_keywords_netflix
)

In [None]:
# Recompute the keyword-less rows from the current df. The idx_no_kw built right
# after the keyword join predates the sort_values/reset_index on df, so its labels
# point at different movies than the rows of the similarity matrices.
idx_no_kw = df[df["keywords"] == ""].index

# A movie must never be recommended to itself: overwrite the (movie, same movie)
# cells with a sentinel below any real score. This assumes column k of each matrix
# corresponds to df row indices_shared[k].
similarity_encoding_overview[indices_shared, range(0, len(indices_shared))] = -100
# Movies without keywords carry no keyword signal: neutral score against everything.
similarity_encoding_keywords[idx_no_kw] = 0
similarity_encoding_keywords[indices_shared, range(0, len(indices_shared))] = -100
genre_similarity[indices_shared, range(0, len(indices_shared))] = -1

In [None]:
# Persist the (masked) genre similarity matrix in NumPy's .npy format.
np.save(r"./genre_similarity.npy", genre_similarity)

In [ ]:
# Persist the raw sentence embeddings (one row per movie in df).
np.save(r"./embeddings_overview.npy", embeddings_overview)
np.save(r"./embeddings_keywords.npy", embeddings_keywords)

In [None]:
# Persist the (masked) overview and keyword similarity matrices.
np.save(r"./similarity_encoding_overview.npy", similarity_encoding_overview)
np.save(r"./similarity_encoding_keywords.npy", similarity_encoding_keywords)

In [None]:
# to_csv does not create missing directories; make sure the target exists so a
# fresh checkout does not fail here, after the expensive embedding cells have run.
os.makedirs("./custom_csv", exist_ok=True)
df.to_csv("./custom_csv/movies_cleaned.csv")
df_combined.to_csv("./custom_csv/movies_combined.csv")

In [None]:
df["title"][df.title == "The Lord of the Rings: The Fellowship of the Ring"]

In [None]:
# Row label in df of the movie used by the commented-out spot checks below.
# NOTE(review): presumably the label shown by the title lookup in the previous
# cell — confirm after any re-run, since it depends on df's current ordering.
movie_number = 4837

In [None]:
# print(
#     df["title"][movie_number],
#     "\n====\n",
#     df["genres"][movie_number],
#     "\n====\n",
#     "\n".join(df_combined.title.loc[reversed(similarity_encoding[movie_number].argsort()[-10:])]),
#     "\n====\n",
#     "\n\n".join(df_combined.genres.loc[reversed(similarity_encoding[movie_number].argsort()[-10:])].astype(str)),
#     "\n====\n",
#     df["overview"][movie_number],
#     "\n\n".join(df_combined.overview.loc[reversed(similarity_encoding[movie_number].argsort()[-10:])]),
#     sep="\n")

In [None]:
# print(
#     df["title"][movie_number],
#     "\n====\n",
#     df["genres"][movie_number],
#     "\n====\n",
#     "\n".join(df_combined.title.loc[reversed(similarity[movie_number].argsort()[-10:])]),
#     "\n====\n",
#     "\n\n".join(df_combined.genres.loc[reversed(similarity[movie_number].argsort()[-10:])].astype(str)),
#     "\n====\n",
#     df["overview"][movie_number],
#     "\n\n".join(df_combined.overview.loc[reversed(similarity[movie_number].argsort()[-10:])]),
#     sep="\n")

In [None]:
# df_combined[df_combined.title == "The Lord of the Rings: The Two Towers"]

In [None]:
# similarity_encoding[movie_number, similarity_encoding[movie_number].argsort()[-10:]]

In [None]:
# similarity_encoding[movie_number][1597]

In [None]:
# combined_similarity = similarity_encoding + similarity

In [None]:
#  print(
#     df["title"][movie_number],
#     "\n====\n",
#     df["genres"][movie_number],
#     "\n====\n",
#     "\n".join(df_combined.title.loc[reversed(combined_similarity[movie_number].argsort()[-10:])]),
#     "\n====\n",
#     "\n\n".join(df_combined.genres.loc[reversed(combined_similarity[movie_number].argsort()[-10:])].astype(str)),
#     "\n====\n",
#     df["overview"][movie_number],
#     "\n\n".join(df_combined.overview.loc[reversed(combined_similarity[movie_number].argsort()[-10:])]),
#     sep="\n")