In [1]:
import torch
import torch.nn as nn
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval

In [2]:
from models.cb.vectstem import TFIDVectAndStem
from utility import Mapper

## Import Dataset

In [3]:
# Load the movie metadata and the global rating statistics, then join them on MovieID.
metadatas = pd.read_csv("./database/merged/metadatas.csv")
mapper = Mapper.load("./database/merged/mapper.pkl")
movie_ratings = pd.read_csv("./database/merged/global_ratings.csv")
movies = pd.merge(metadatas, movie_ratings, on="MovieID")

movies["ReleaseDate"] = pd.to_datetime(movies["ReleaseDate"])
# Convert the runtime to a timedelta and express it as a float number of hours.
movies["Runtime"] = pd.to_timedelta(movies["Runtime"]) / pd.Timedelta(hours=1)

# These columns are stored in the CSV as Python list literals; parse them back into lists.
# NOTE(review): "Languages" also appears to be serialized as a list literal but is
# left as a raw string here, exactly as before -- parse it too if it is ever needed.
for list_column in ("Genres", "Directors", "Cast", "OriginCountries"):
    movies[list_column] = movies[list_column].apply(literal_eval)

# Missing plots become empty strings so they can be concatenated with titles later.
movies["Plot"] = movies["Plot"].fillna("")

# Order the rows by the mapper's internal item index (the temporary "Index" column
# is dropped again afterwards), so row order follows the index used to address the
# feature tensors.
movies["Index"] = movies["MovieID"].apply(lambda m: mapper.item_fwd_map[m])
movies = movies.sort_values(by="Index").reset_index(drop=True).drop(columns="Index")

In [4]:
# Preview the first rows to sanity-check the merge and the parsed columns.
movies.head(3)

Unnamed: 0,MovieID,Title,Runtime,ReleaseDate,Genres,Directors,Cast,OriginCountries,Languages,Plot,AverageRating,VoteCount
0,1,Toy Story (1995),1.35,1995-10-30,"[Adventure, Animation, Children, Comedy, Fantasy]",[John Lasseter],"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[United States],['English'],A cowboy doll is profoundly threatened and jea...,4.134552,1187323.0
1,2,Jumanji (1995),1.733333,1995-12-15,"[Adventure, Children, Fantasy]",[Joe Johnston],"[Robin Williams, Kirsten Dunst, Bonnie Hunt, J...",[United States],"['English', 'French']",When two kids find and play a magical board ga...,3.536073,426370.0
2,3,Grumpier Old Men (1995),1.683333,1995-12-22,"[Comedy, Romance]",[Howard Deutch],"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",[United States],"['English', 'Italian', 'German']",John and Max resolve to save their beloved bai...,3.287087,43509.0


## TF-IDF Vectorizer + Stemmer

### Utility

In [5]:
def repeat_text(text: str, time: int):
    """Return ``text`` repeated ``time`` times, separated by single spaces.

    Args:
        text: The string to repeat.
        time: Number of repetitions. Values ``<= 0`` yield an empty string.

    Returns:
        The space-joined repetitions, e.g. ``repeat_text("a", 3) == "a a a"``.
    """
    # `[text] * time` is an empty list for time <= 0, so the join yields "".
    return " ".join([text] * time)

def merge_text(l_text_1, l_text_2, weights=(1, 1)):
    """Merge two parallel collections of texts into one document per pair.

    Each element is converted with ``str`` and repeated through ``repeat_text``
    according to its weight, then the two repeated texts are joined by a space.

    Args:
        l_text_1: Iterable of the first text of every pair.
        l_text_2: Iterable of the second text of every pair.
        weights: ``(w1, w2)`` repetition counts for the first and second text.

    Returns:
        A list with one merged string per pair. Pairing uses ``zip``, so the
        result is as long as the shorter input.
    """
    merged = []
    # Use dedicated loop names so the `l_text_2` parameter is never rebound
    # while it is being iterated.
    for first, second in zip(l_text_1, l_text_2):
        first_part = repeat_text(str(first), weights[0])
        second_part = repeat_text(str(second), weights[1])
        merged.append(first_part + " " + second_part)
    return merged


def get_recommendation(movie, mapper: "Mapper", tf_mat: torch.Tensor, top: int = 10):
    """Return the ``top`` items most cosine-similar to ``movie``.

    Args:
        movie: External item id; translated to a row index of ``tf_mat``
            through ``mapper.item_fwd_map``.
        mapper: Object exposing ``item_fwd_map`` (id -> row index) and
            ``item_inv_map`` (row index -> id).
        tf_mat: Feature matrix with one row per item; similarity is computed
            along dimension 1.
        top: Maximum number of recommendations. Values ``<= 0`` return two
            empty lists.

    Returns:
        ``(ids, similarities)``: two lists ordered from most to least similar.
        The query item itself is never included.

    Raises:
        KeyError: If ``movie`` is not present in ``mapper.item_fwd_map``.
    """
    movie_idx = mapper.item_fwd_map[movie]
    if top <= 0:
        # `x[-0:]` is the full slice, so top=0 would otherwise return everything.
        return [], []

    movie_vect = tf_mat[movie_idx].unsqueeze(0)
    similarities = torch.nn.CosineSimilarity(dim=1)(movie_vect, tf_mat)
    # Ascending sort, then take the tail and flip: this keeps the same tie
    # ordering as iterating the tail in reverse.
    sorted_values, sorted_indices = similarities.sort()
    not_query = sorted_indices != movie_idx
    top_indices = sorted_indices[not_query][-top:].flip(0)
    top_values = sorted_values[not_query][-top:].flip(0)
    return (
        [mapper.item_inv_map[i] for i in top_indices.tolist()],
        top_values.tolist(),
    )

### On genres and ratings

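Ratings: each movie's average rating $R$ is shrunk toward the global mean $\bar{R}$ according to its vote count $v$, $\frac{vR + c\bar{R}}{v + c}$ (with $c$ the 95th percentile of vote counts), and the result is min-max normalized to $[0, 1]$.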

In [9]:
# Global mean rating: the value every movie's rating is shrunk toward.
R_bar = movies["AverageRating"].mean()
# Strength of the shrinkage: the 95th percentile of the vote counts.
c = movies["VoteCount"].quantile(0.95)
def average_rating(row):
    """Vote-count-weighted rating ``(v * R + c * R_bar) / (v + c)``.

    ``row`` only needs to support ``row["AverageRating"]`` and
    ``row["VoteCount"]``, so it can be a single row or the whole DataFrame
    (in which case the arithmetic is applied column-wise).
    Reads the notebook-level ``c`` and ``R_bar``.
    """
    R = row["AverageRating"]
    v = row["VoteCount"]
    return (v * R + c * R_bar) / (v + c)
# Evaluate the formula on whole columns at once rather than row by row with
# `DataFrame.apply(axis=1)`; the per-element arithmetic is the same.
weighted = average_rating(movies)
# Min-max normalize the weighted ratings.
weighted_max = weighted.max()
weighted_min = weighted.min()
weighted_range = weighted_max - weighted_min
normalized = (weighted - weighted_min) / weighted_range

Genres

In [15]:
# Count movies per genre: one row per (movie, genre) pair after `explode`,
# grouped by genre name. The "(no genres listed)" placeholder is not a real
# genre, so it is removed; `errors="ignore"` keeps this cell working on data
# that does not contain the placeholder at all.
movies_genres = (
    movies[["MovieID", "Genres"]]
    .explode(column="Genres")
    .groupby("Genres")
    .count()
    .drop("(no genres listed)", errors="ignore")
)
# Genre name -> column position, following the order of the grouped index.
movies_genres_mapper = {genre: i for i, genre in enumerate(movies_genres.index)}

In [28]:
def one_hot_genres_encode(entries, weights=None, genres_mapper=None):
    """Encode genre lists as multi-hot rows prefixed by a per-row weight.

    Args:
        entries: Iterable of genre-name lists, one per item.
        weights: Optional iterable with one scalar per entry, stored in
            column 0 of the result. Defaults to ``1.0`` for every entry.
        genres_mapper: Optional mapping from genre name to a position in
            ``0 .. len(genres_mapper) - 1``. Defaults to the notebook-level
            ``movies_genres_mapper``.

    Returns:
        A float32 tensor of shape ``(len(entries), len(genres_mapper) + 1)``:
        column 0 holds the weight and column ``genres_mapper[g] + 1`` is 1.0
        when the entry lists genre ``g``.

    Raises:
        ValueError: If ``weights`` is given and its length differs from
            ``entries`` (previously the longer one was silently truncated).
        KeyError: If an entry contains a genre missing from the mapper
            (other than the ``"(no genres listed)"`` placeholder).
    """
    if genres_mapper is None:
        genres_mapper = movies_genres_mapper

    entries = list(entries)
    if weights is None:
        weights = [1.0] * len(entries)
    else:
        weights = list(weights)
        if len(weights) != len(entries):
            raise ValueError(
                f"weights has {len(weights)} items but entries has {len(entries)}"
            )

    width = len(genres_mapper) + 1
    rows = []
    for genres, w in zip(entries, weights):
        row = [0.0] * width
        row[0] = float(w)
        for g in genres:
            if g == "(no genres listed)":
                continue
            row[genres_mapper[g] + 1] = 1.0
        rows.append(row)
    # The explicit reshape keeps the column count even when there are no rows.
    return torch.tensor(rows, dtype=torch.float32).reshape(len(rows), width)

In [29]:
# Build the genre feature matrix: the normalized weighted rating goes in
# column 0, followed by one indicator column per genre.
tf_genres_mat = one_hot_genres_encode(movies["Genres"], normalized)
# Persist the matrix so it can be reloaded without recomputing it.
torch.save(tf_genres_mat, "./models/cb/genres_with_ratings.pt")
# Display the (rows, columns) shape as a quick sanity check.
tf_genres_mat.shape

torch.Size([87585, 20])

In [38]:
# Show the ten nearest neighbours of one movie in the genres + rating space.
movie_id = 281096
query_title = movies.loc[movies["MovieID"] == movie_id, "Title"].values[0]
print("Recommendation for", query_title)
rec_ids, rec_sims = get_recommendation(movie_id, mapper, tf_genres_mat, 10)
for m_id, sim in zip(rec_ids, rec_sims):
    rec_title = movies.loc[movies["MovieID"] == m_id, "Title"].values[0]
    print(m_id, rec_title, sim)

Recommendation for Puss in Boots: The Last Wish (2022)
166461 Moana (2016) 0.9999973773956299
3114 Toy Story 2 (1999) 0.9998486638069153
4016 Emperor's New Groove, The (2000) 0.9998302459716797
247988 Luca (2021) 0.9998108744621277
225173 Soul (2020) 0.9998044967651367
213207 Onward (2020) 0.9997608661651611
4886 Monsters, Inc. (2001) 0.9995051622390747
286131 The Super Mario Bros. Movie (2023) 0.9992679357528687
1 Toy Story (1995) 0.9990704655647278
206959 Frozen II (2019) 0.9987330436706543


### On title and plot

In [56]:
# One document per movie: the title followed by the plot, each mentioned once.
titles_and_plots = merge_text(movies["Title"], movies["Plot"], (1, 1))
# NOTE(review): the two positional arguments are presumably document-frequency
# bounds for the vocabulary -- confirm against TFIDVectAndStem's signature.
vectstem = TFIDVectAndStem(0.001, 0.005)
# Fit on the corpus, then transform the same corpus into a feature matrix.
vectstem.fit(titles_and_plots)
tf_plot_mat = vectstem(titles_and_plots)
# Persist the feature matrix together with the fitted vectorizer's features and vocabulary.
torch.save(tf_plot_mat, "./models/cb/titles_and_plots.pt")
np.save("models/cb/keywords.npy", vectstem.features)
with open("models/cb/vocab.json", "w") as file:
    json.dump(vectstem.vocab, file)
# Display the (rows, columns) shape as a quick sanity check.
tf_plot_mat.shape

torch.Size([87585, 2280])

In [61]:
# Show the ten nearest neighbours of one movie in the title + plot space.
movie_id = 1
query_title = movies.loc[movies["MovieID"] == movie_id, "Title"].values[0]
print("Recommendation for", query_title)
rec_ids, rec_sims = get_recommendation(movie_id, mapper, tf_plot_mat, 10)
for m_id, sim in zip(rec_ids, rec_sims):
    rec_title = movies.loc[movies["MovieID"] == m_id, "Title"].values[0]
    print(m_id, rec_title, sim)

Recommendation for Toy Story (1995)
120474 Toy Story That Time Forgot (2014) 0.7035073637962341
239474 Lamp Life (2020) 0.7035073637962341
193759 Power of Grayskull: The Definitive History of He-Man and the Masters of the Universe (2017) 0.7035073637962341
153234 Toy Reanimator (2002) 0.6450608968734741
201588 Toy Story 4 (2019) 0.6366295218467712
115875 Toy Story Toons: Hawaiian Vacation (2011) 0.6327633857727051
120468 Toy Story Toons: Partysaurus Rex (2012) 0.6220443248748779
78499 Toy Story 3 (2010) 0.6127521991729736
208112 Rudolph the Red-Nosed Reindeer & the Island of Misfit Toys (2001) 0.6083803772926331
3114 Toy Story 2 (1999) 0.605545699596405


In [59]:
# Same title + plot lookup for a second query movie.
movie_id = 225173
query_title = movies.loc[movies["MovieID"] == movie_id, "Title"].values[0]
print("Recommendation for", query_title)
rec_ids, rec_sims = get_recommendation(movie_id, mapper, tf_plot_mat, 10)
for m_id, sim in zip(rec_ids, rec_sims):
    rec_title = movies.loc[movies["MovieID"] == m_id, "Title"].values[0]
    print(m_id, rec_title, sim)

Recommendation for Soul (2020)
4301 Calle 54 (2000) 0.48218005895614624
165331 Song of Lahore (2015) 0.48218005895614624
219374 Jammin' the Blues (1944) 0.48218005895614624
155748 Born to Be Blue (2015) 0.48218005895614624
192325 Jazz on a Summer's Day (1960) 0.43974214792251587
252788 Joe & Joe (1996) 0.4334331154823303
210561 Joe Leahy's Neighbors (1988) 0.4298184812068939
170503 People You May Know (2016) 0.4298184812068939
183703 Forever My Girl (2018) 0.42514103651046753
213091 The Skywalk Is Gone (2002) 0.42514103651046753


### On directors and cast

Directors are mentioned twice (to weight them more heavily), and only the first 10 (or fewer) cast members are used.

In [34]:
# Flatten each movie's director list into one space-separated string.
directors = movies["Directors"].apply(lambda x: " ".join(x))
# Keep at most the first 10 cast members of each movie, space-separated.
# NOTE(review): names are joined with spaces, so a vectorizer that tokenizes on
# whitespace would treat first and last names as separate terms -- confirm intended.
cast = movies["Cast"].apply(lambda x: " ".join(x[:10]))
# Directors are repeated twice and the cast once in each merged document.
directors_and_cast = merge_text(directors, cast, (2, 1))
# NOTE(review): the two positional arguments are presumably document-frequency
# bounds for the vocabulary -- confirm against TFIDVectAndStem's signature.
vectstem = TFIDVectAndStem(0.005, 0.1)
# Fit on the corpus, then transform the same corpus into a feature matrix.
vectstem.fit(directors_and_cast)
tf_dircast_mat = vectstem(directors_and_cast)
# Persist the matrix so it can be reloaded without recomputing it.
torch.save(tf_dircast_mat, "./models/cb/directors_and_cast.pt")
# Display the (rows, columns) shape as a quick sanity check.
tf_dircast_mat.shape

torch.Size([87585, 482])

In [35]:
# Show the ten nearest neighbours of one movie in the directors + cast space.
movie_id = 1
query_title = movies.loc[movies["MovieID"] == movie_id, "Title"].values[0]
print("Recommendation for", query_title)
rec_ids, rec_sims = get_recommendation(movie_id, mapper, tf_dircast_mat, 10)
for m_id, sim in zip(rec_ids, rec_sims):
    rec_title = movies.loc[movies["MovieID"] == m_id, "Title"].values[0]
    print(m_id, rec_title, sim)

Recommendation for Toy Story (1995)
3114 Toy Story 2 (1999) 0.6659654974937439
78499 Toy Story 3 (2010) 0.6239543557167053
115879 Toy Story Toons: Small Fry (2011) 0.5229395031929016
106022 Toy Story of Terror (2013) 0.5195543766021729
115875 Toy Story Toons: Hawaiian Vacation (2011) 0.4724394679069519
159801 Old Fashioned: The Story of the Wisconsin Supper Club (2015) 0.4704514443874359
166409 The Sea Around Us (1953) 0.46715396642684937
120474 Toy Story That Time Forgot (2014) 0.42283886671066284
180767 Lucky People Center International (1998) 0.39776840806007385
229119 Rifkin's Festival (2020) 0.3931911885738373


In [36]:
# Same directors + cast lookup for a second query movie.
movie_id = 4306
query_title = movies.loc[movies["MovieID"] == movie_id, "Title"].values[0]
print("Recommendation for", query_title)
rec_ids, rec_sims = get_recommendation(movie_id, mapper, tf_dircast_mat, 10)
for m_id, sim in zip(rec_ids, rec_sims):
    rec_title = movies.loc[movies["MovieID"] == m_id, "Title"].values[0]
    print(m_id, rec_title, sim)

Recommendation for Shrek (2001)


257813 Shrek in the Swamp Karaoke Dance Party (2001) 0.6370930671691895
53121 Shrek the Third (2007) 0.6302591562271118
196229 Full of Grace (2015) 0.5436352491378784
194903 One Small Step (2018) 0.5267082452774048
8360 Shrek 2 (2004) 0.510800302028656
282725 Andrew Santino: Cheeseburger (2023) 0.4690206050872803
1475 Kama Sutra: A Tale of Love (1996) 0.4690206050872803
5430 For a Lost Soldier (Voor een Verloren Soldaat) (1992) 0.4690206050872803
189725 Lek and the Dogs (2017) 0.4690206050872803
166064 Battle Under Orion (2009) 0.4690206050872803
