# Data preparation notebook for recommender project

In [1]:
import ast

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
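# Optional: download the dataset from Kaggle (a Kaggle account is required).
# The cells below read the CSV files from ./the-movies-dataset/.
# import opendatasets as od
# https://pypi.org/project/opendatasets/
# od.download("https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset")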


In [3]:
def get_genre_values(row):
    """Pull the ``"name"`` field out of a stringified list of dicts.

    Despite its name, this helper is not genre-specific: the notebook applies
    it to both the ``genres`` and the ``keywords`` columns.

    Parameters
    ----------
    row : str
        Python-literal text of a list of dicts, each having a ``"name"`` key,
        e.g. ``"[{'id': 16, 'name': 'Animation'}]"``.

    Returns
    -------
    list
        The ``"name"`` values, in their original order.
    """
    entries = ast.literal_eval(row)
    names = []
    for entry in entries:
        names.append(entry["name"])
    return names

In [4]:
# Columns loaded from movies_metadata.csv. The order matters here: every
# column except the LAST one ("adult") must be non-null for a row to be kept.
cols_to_read = [
    "id",
    "title",
    "overview",
    "release_date",
    "original_language",
    "genres",
    "vote_average",
    "adult",
]

# Column order of the working frame.
column_order = [
    "id",
    "title",
    "release_date",
    "original_language",
    "genres",
    "overview",
    "vote_average",
    "adult",
]

df = pd.read_csv("./the-movies-dataset/movies_metadata.csv", usecols=cols_to_read)
df = df[column_order]

# Keep only rows with all required columns present, then renumber rows 0..n-1.
required_cols = cols_to_read[:-1]
df = df.dropna(subset=required_cols).reset_index(drop=True)

# Cast the join key to int before merging (presumably keywords.csv stores
# integer ids -- verify against the file).
df["id"] = df["id"].astype(int)

# An inner join keeps only movies that also have a row in keywords.csv.
keywords_df = pd.read_csv("./the-movies-dataset/keywords.csv")
df = df.merge(keywords_df, how="inner", on=["id"])

df.shape

(45402, 9)

In [5]:
# Preview the first rows of the merged frame; "genres" and "keywords" are
# still raw strings at this point.
df.head()

Unnamed: 0,id,title,release_date,original_language,genres,overview,vote_average,adult,keywords
0,862,Toy Story,1995-10-30,en,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",7.7,False,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,1995-12-15,en,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,6.9,False,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,Grumpier Old Men,1995-12-22,en,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,6.5,False,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,Waiting to Exhale,1995-12-22,en,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",6.1,False,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,Father of the Bride Part II,1995-02-10,en,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,5.7,False,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [6]:
# Values that must be removed from the parsed genre lists. They do not belong
# to the real genre vocabulary (they look like production-company names that
# leaked into the "genres" column of a few rows).
# Each name is listed exactly once.
wrong_genres = [
    "Aniplex",
    "BROSTA TV",
    "Carousel Productions",
    "GoHands",
    "Mardock Scramble Production Committee",
    "Odyssey Media",
    "Pulser Productions",
    "Rogue State",
    "Sentai Filmworks",
    "Telescene Film Group Productions",
    "The Cartel",
    "Vision View Entertainment",
]

In [7]:
# Turn the raw "genres" strings into plain lists of genre names.
# NOTE: not idempotent -- the helper expects strings, so re-running this cell
# on already-parsed lists fails; re-run from the loading cell instead.
df["genres"] = df["genres"].map(get_genre_values)

In [8]:
def _drop_wrong_genres(names):
    """Return ``names`` without any entry that is listed in ``wrong_genres``."""
    kept = []
    for name in names:
        if name in wrong_genres:
            continue
        kept.append(name)
    return kept


df["genres"] = df["genres"].map(_drop_wrong_genres)

In [9]:
# Ids of every movie whose genre list is empty (the variable name says
# "titles", but it holds ids).
has_no_genre = df["genres"].map(len) == 0
no_genre_titles = df.loc[has_no_genre, "id"].to_numpy()

# Drop every row that carries one of those ids, then renumber rows 0..n-1.
keep_mask = ~df["id"].isin(no_genre_titles)
df = df.loc[keep_mask].reset_index(drop=True)

In [10]:
# Flatten the per-movie genre lists into one array and list the distinct
# genre names (np.unique returns them sorted).
genre_lists = df["genres"].to_list()
all_genres = np.unique(np.concatenate(genre_lists))
print(all_genres)

['Action' 'Adventure' 'Animation' 'Comedy' 'Crime' 'Documentary' 'Drama'
 'Family' 'Fantasy' 'Foreign' 'History' 'Horror' 'Music' 'Mystery'
 'Romance' 'Science Fiction' 'TV Movie' 'Thriller' 'War' 'Western']


In [11]:
# Same parsing as for genres: the helper extracts the "name" of every entry,
# so each cell becomes a list of keyword strings.
# NOTE: not idempotent -- re-running on already-parsed lists fails.
df["keywords"] = df["keywords"].map(get_genre_values)

In [12]:
# Append every movie's keywords to its overview, so the text that gets
# embedded further down contains both. Pieces are joined with single spaces;
# the marker itself starts with two spaces, which yields "...   keywords: ...".
#
# A single pass over the two columns replaces the row-wise apply(axis=1) with
# a label lookup per row; the resulting strings are the same.
# NOTE: not idempotent -- re-running appends the keywords a second time.
df["overview"] = [
    " ".join([overview, "  keywords:", *keywords])
    for overview, keywords in zip(df["overview"], df["keywords"])
]

In [13]:
# Spot-check the combined overview + keywords text of the first row.
df.loc[0, "overview"]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.   keywords: jealousy toy boy friendship friends rivalry boy next door new toy toy comes to life"

In [14]:
# Row/column count after the genre clean-up above.
df.shape

(43195, 9)

In [15]:
# One-hot encode the genre lists. The class order is fixed explicitly to the
# order in which genres first appear in the frame.
genre_classes = df["genres"].explode().unique()
mlb = MultiLabelBinarizer(classes=genre_classes)
binary_labels = mlb.fit_transform(df["genres"])
genres = mlb.classes_

In [16]:
# Pairwise genre similarity as integer percentages (0..100).
similarity = cosine_similarity(binary_labels)
# Scale in place to avoid allocating a second n x n float matrix.
similarity *= 100
# Round to the nearest integer before casting: a bare astype(int) truncates,
# so floating-point error can turn a perfect match (99.999...) into 99.
np.rint(similarity, out=similarity)
similarity = similarity.astype(int)
# A movie must never be recommended as similar to itself.
np.fill_diagonal(similarity, 0)

In [17]:
# Persist the genre-similarity matrix; the same literal path is used to load
# it again further down.
# NOTE(review): the ".\" prefix is a Windows-style relative path. On POSIX the
# backslash is an ordinary character, so the file is literally named
# ".\similarity.npy" -- confirm the target platform before changing it.
np.save(r".\similarity.npy", similarity)

In [18]:
# Embed every overview (+ keywords) text with the "all-MiniLM-L6-v2"
# sentence-transformer model.
model = SentenceTransformer("all-MiniLM-L6-v2")
overview_texts = df["overview"].to_numpy().astype(str)
encoding = model.encode(overview_texts)

In [19]:
# Persist the overview embeddings.
# NOTE(review): ".\" is a Windows-style relative path; on POSIX the file is
# literally named ".\encoding.npy" -- confirm the target platform.
np.save(r".\encoding.npy", encoding)

In [20]:
# Cosine similarity of the overview embeddings against themselves, i.e. one
# score per pair of movies. The result is later multiplied and converted with
# .numpy(), so it is tensor-like rather than a plain ndarray.
cos_sim = util.cos_sim(encoding, encoding)

In [21]:
# Integer percentage version of the overview similarity (the cast truncates
# toward zero).
scaled_cos_sim = cos_sim * 100
cos_sim_np = scaled_cos_sim.numpy().astype(int)
# Push the self-similarity to the bottom so a movie never ranks as its own
# nearest neighbour.
np.fill_diagonal(cos_sim_np, -100)

In [26]:
# Persist the overview similarity.
# NOTE(review): this stores the raw `cos_sim` values, not the scaled integer
# `cos_sim_np` matrix that the rest of the notebook works with -- confirm
# which one the consumer of this file expects.
# NOTE(review): ".\" is a Windows-style relative path; on POSIX the file is
# literally named ".\cos_sim_desc.npy".
np.save(r".\cos_sim_desc.npy", cos_sim)

In [24]:
# Locate the row(s) titled "The Hobbit"; the index shown is the row number
# used to look up that movie in the similarity matrices.
df.loc[df["title"] == "The Hobbit", "title"]

8649    The Hobbit
Name: title, dtype: object

In [25]:
# Show the 10 movies whose overview embeddings are closest to "The Hobbit".
#
# The query row is found by title instead of a hard-coded row number, so the
# cell stays correct when upstream filtering changes the row numbering. If
# several movies share the title, the first one is used.
query_pos = np.flatnonzero((df["title"] == "The Hobbit").to_numpy())[0]

# Positions of the 10 highest scores, best match first (computed once).
top_pos = cos_sim_np[query_pos].argsort()[-10:][::-1]

# The similarity matrix is positional, hence .iloc rather than label lookup.
print(
    df["title"].iloc[query_pos],
    "\n====\n",
    "\n".join(df["title"].iloc[top_pos].to_numpy()),
    "\n====\n",
    df["overview"].iloc[query_pos],
    "\n====\n",
    "\n\n".join(df["overview"].iloc[top_pos].to_numpy()),
    sep="\n",
)

The Hobbit

====

The Hobbit: An Unexpected Journey
The Hobbit: The Desolation of Smaug
The Hobbit: The Battle of the Five Armies
The Lord of the Rings: The Fellowship of the Ring
The Lord of the Rings: The Two Towers
The Lord of the Rings: The Return of the King
The Lord of the Rings
The Return of the King
SAGA: Curse of the Shadow
The Dragon Spell

====

Bilbo Baggins the Hobbit was just minding his own business, when his occasional visitor Gandalf the Wizard drops in one night. One by one, a whole group of dwarves drop in, and before he knows it, Bilbo has joined their quest to reclaim their kingdom, taken from them by the evil dragon Smaug. The only problem is that Gandalf has told the dwarves that Bilbo is an expert burglar, but he isn't...   keywords: elves dwarves orcs rivendell robbery gold magic spider eagle dragon

====

Bilbo Baggins, a hobbit enjoying his quiet life, is swept into an epic quest by Gandalf the Grey and thirteen dwarves who seek to reclaim their mountain home

In [None]:
# Release dates expressed as offsets (timedeltas) from the earliest release
# in the frame.
release_dates = pd.to_datetime(df["release_date"])
release = release_dates - release_dates.min()

In [None]:
# Release-date similarity in [0, 1]: 1 for movies released on the same day,
# 0 for the two release dates that are farthest apart.
#
# Cosine similarity cannot be used on a single feature: stacking the same
# column twice makes every row a multiple of (1, 1), so all pairs point in the
# same direction and the score is constant. A normalised date distance is used
# instead.
# NOTE(review): confirm this matches the intended notion of "release
# similarity" before wiring it into the combined score.
release_days = release.dt.days.to_numpy(dtype=np.float32)
# `initial=1.0` avoids a zero divisor when all dates are identical.
release_span = release_days.max(initial=1.0)

# Built in place to keep the number of n x n temporaries down.
release = np.abs(release_days[:, None] - release_days[None, :])
release /= -release_span
release += 1.0

In [None]:
# Sanity check: dimensions of the release-date similarity computed above.
release.shape

In [None]:
# Reload the genre-similarity matrix written earlier in this notebook (same
# literal path as the save cell).
similarity = np.load(r".\similarity.npy")

In [None]:
# Median of the scaled overview-similarity scores (the diagonal entries, set
# to -100 earlier, are included).
np.median(cos_sim_np)

In [None]:
# Combined score: overview similarity plus a quarter of the genre similarity
# (integer floor division).
genre_bonus = np.floor_divide(similarity, 4)
sum_cos = np.add(cos_sim_np, genre_bonus)

In [None]:
# Make sure the combined score is an ndarray. np.asarray returns the input
# unchanged when it already is one, whereas np.array would allocate a full
# second copy of the n x n matrix.
sum_cos = np.asarray(sum_cos)

In [None]:
# Display the combined similarity matrix (the notebook repr abbreviates large
# arrays).
sum_cos

In [None]:
# Locate the row(s) titled "Maleficent"; the index shown is the row number
# used to look up that movie in the similarity matrices.
df.loc[df["title"] == "Maleficent", "title"]

In [None]:
# Show the 15 movies with the highest combined (overview + genre) score for
# "Maleficent".
#
# The query row is found by title instead of a hard-coded row number, so the
# cell stays correct when upstream filtering changes the row numbering. If
# several movies share the title, the first one is used.
query_pos = np.flatnonzero((df["title"] == "Maleficent").to_numpy())[0]

# Positions of the 15 highest scores, best match first (computed once).
top_pos = sum_cos[query_pos].argsort()[-15:][::-1]

# The score matrix is positional, hence .iloc rather than label lookup.
print(
    df["title"].iloc[query_pos],
    "\n====\n",
    "\n".join(df["title"].iloc[top_pos].to_numpy()),
    "\n====\n",
    df["overview"].iloc[query_pos],
    "\n====\n",
    "\n\n".join(df["overview"].iloc[top_pos].to_numpy()),
    sep="\n",
)