In [37]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from ast import literal_eval

In [38]:
from models.cb.vectstem import TFIDVectAndStem
from utility import Mapper

## Import Dataset

In [39]:
# Load the merged movie metadata, the MovieID <-> index mapper and the global ratings.
metadatas = pd.read_csv("./database/merged/metadatas.csv")
mapper = Mapper.load("./database/merged/mapper.pkl")
movie_ratings = pd.read_csv("./database/merged/global_ratings.csv")
movies = pd.merge(metadatas, movie_ratings, on="MovieID")

# Parse the typed columns; Runtime ends up as a float number of hours.
movies["ReleaseDate"] = pd.to_datetime(movies["ReleaseDate"])
movies["Runtime"] = pd.to_timedelta(movies["Runtime"]) / pd.Timedelta(hours=1)

# These columns hold Python list literals in the CSV; turn them into real lists.
for list_column in ("Genres", "Directors", "Cast", "OriginCountries"):
    movies[list_column] = movies[list_column].apply(literal_eval)
movies["Plot"] = movies["Plot"].fillna("")

# Order the rows by mapper index: the feature matrices built below are indexed
# with `mapper.item_fwd_map`, so row position must line up with that index.
# NOTE(review): this assumes the mapper indices of the merged movies are exactly
# 0..len(movies)-1 with no gaps — confirm against how the mapper was built.
movies["Index"] = movies["MovieID"].apply(lambda m: mapper.item_fwd_map[m])
movies = movies.sort_values(by="Index").reset_index(drop=True).drop(columns="Index")

In [40]:
# Preview the first rows to sanity-check the parsed columns and the row order.
movies.head(3)

Unnamed: 0,MovieID,Title,Runtime,ReleaseDate,Genres,Directors,Cast,OriginCountries,Languages,Plot,AverageRating,VoteCount
0,1,Toy Story (1995),1.35,1995-10-30,"[Adventure, Animation, Children, Comedy, Fantasy]",[John Lasseter],"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[United States],['English'],A cowboy doll is profoundly threatened and jea...,4.134552,1187323.0
1,2,Jumanji (1995),1.733333,1995-12-15,"[Adventure, Children, Fantasy]",[Joe Johnston],"[Robin Williams, Kirsten Dunst, Bonnie Hunt, J...",[United States],"['English', 'French']",When two kids find and play a magical board ga...,3.536073,426370.0
2,3,Grumpier Old Men (1995),1.683333,1995-12-22,"[Comedy, Romance]",[Howard Deutch],"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",[United States],"['English', 'Italian', 'German']",John and Max resolve to save their beloved bai...,3.287087,43509.0


## TF-IDF Vectorizer + Stemmer

### Utility

In [41]:
def repeat_text(text: str, time: int) -> str:
    """Return ``text`` repeated ``time`` times, separated by single spaces.

    Args:
        text: The string to repeat.
        time: How many copies of ``text`` to emit.

    Returns:
        The space-joined copies. An empty string when ``time`` is zero or
        negative, so that a weight of 0 really removes the text.
    """
    # `[text] * time` is the empty list for time <= 0, which joins to "".
    return " ".join([text] * time)

def merge_text(l_text_1, l_text_2, weights=(1, 1)):
    """Pair up two sequences of texts and join each pair into one document.

    Each element is converted with ``str`` and repeated through
    ``repeat_text`` according to its weight, so a higher weight makes that
    field occur more often in the merged document.

    Args:
        l_text_1: Iterable of first-field values.
        l_text_2: Iterable of second-field values, paired positionally with
            ``l_text_1`` (pairing stops at the shorter iterable).
        weights: ``(repeat count for l_text_1, repeat count for l_text_2)``.

    Returns:
        A list with one string per pair: the repeated first field, a space,
        then the repeated second field.
    """
    # Use fresh loop names: the inputs must not be rebound while iterating.
    return [
        repeat_text(str(first), weights[0]) + " " + repeat_text(str(second), weights[1])
        for first, second in zip(l_text_1, l_text_2)
    ]


def get_recommendation(movie, mapper: Mapper, tf_mat: torch.Tensor, top: int = 10):
    """Return the ``top`` items whose feature rows are most cosine-similar to ``movie``.

    Args:
        movie: External item id; ``mapper.item_fwd_map[movie]`` gives the row
            of ``tf_mat`` that represents it.
        mapper: Object exposing ``item_fwd_map`` (id -> row index) and
            ``item_inv_map`` (row index -> id).
        tf_mat: Feature matrix with one row per item.
        top: Maximum number of recommendations to return.

    Returns:
        ``(ids, similarities)``: two parallel lists ordered from the most to
        the least similar item. The query item itself is never included.
        Both lists are empty when ``top <= 0``.

    Raises:
        KeyError: If ``movie`` is not present in ``mapper.item_fwd_map``.
    """
    # A `[-top:]` style slice with top == 0 would select everything, so handle
    # the "no recommendations wanted" case explicitly.
    if top <= 0:
        return [], []

    movie_idx = mapper.item_fwd_map[movie]
    movie_vect = tf_mat[movie_idx].unsqueeze(0)
    similarities = torch.nn.CosineSimilarity(dim=1)(movie_vect, tf_mat)

    # Ask for one extra candidate so that `top` items remain after the query
    # row (normally the best match) has been discarded.
    k = min(top + 1, similarities.shape[0])
    values, indices = torch.topk(similarities, k)
    keep = indices != movie_idx
    indices, values = indices[keep][:top], values[keep][:top]

    return (
        [mapper.item_inv_map[int(i)] for i in indices],
        [float(s) for s in values],
    )

### On genres and ratings

In [42]:
# Global statistics used to pull the rating of rarely-voted movies toward the mean.
R_bar = movies["AverageRating"].mean()  # mean rating over all movies
c = movies["VoteCount"].quantile(0.95)  # vote count at the 95th percentile


def normalized_average_rating(row):
    """Blend a movie's own rating with the global mean, weighted by vote count.

    Computes ``(v * R + c * R_bar) / (v + c) / 5.0`` where ``R`` is
    ``row["AverageRating"]`` and ``v`` is ``row["VoteCount"]``. With few
    votes the result stays close to ``R_bar``; with many votes it approaches
    ``R``.

    Args:
        row: Anything indexable by ``"AverageRating"`` and ``"VoteCount"``:
            a single DataFrame row, or a whole DataFrame, in which case the
            arithmetic is done column-wise.

    Returns:
        The blended rating divided by 5.0 (a scalar for a row, a Series for
        a DataFrame).
    """
    R = row["AverageRating"]
    v = row["VoteCount"]
    # NOTE(review): 5.0 is presumably the top of the rating scale — confirm.
    return (v * R + c * R_bar) / (v + c) / 5.0


# Evaluate on whole columns at once instead of `movies.apply(..., axis=1)`:
# same values, without a Python-level call per row.
weighted_ratings = normalized_average_rating(movies)

In [43]:
# Count how many movies carry each genre (one row per genre; groupby sorts the
# genre names), leaving out the "(no genres listed)" placeholder.
movies_genres = (
    movies[["MovieID", "Genres"]]
    .explode(column="Genres")
    .groupby("Genres")
    .count()
    # errors="ignore": do not fail when the placeholder is absent from the data.
    .drop("(no genres listed)", errors="ignore")
)
# Give every remaining genre a fixed column position for the genre vectors.
movies_genres_mapper = {genre: i for i, genre in enumerate(movies_genres.index)}

In [44]:
def one_hot_genres_encode(entries, weights=None):
    """Encode genre lists as weighted multi-hot rows of a float tensor.

    Args:
        entries: Sized iterable of genre-name lists, one list per item.
        weights: Optional per-item scale factors, paired positionally with
            ``entries``. Defaults to 1.0 for every item.

    Returns:
        A ``torch.FloatTensor`` with one row per (entry, weight) pair and one
        column per genre in ``movies_genres_mapper``. A cell holds the item's
        weight when the item has that genre and 0.0 otherwise. The
        "(no genres listed)" placeholder is skipped.
    """
    scales = [1.0] * len(entries) if weights is None else weights
    width = len(movies_genres_mapper)

    rows = []
    for genre_names, scale in zip(entries, scales):
        row = [0.0] * width
        for name in genre_names:
            if name != "(no genres listed)":
                row[movies_genres_mapper[name]] = 1.0 * scale
        rows.append(row)
    return torch.FloatTensor(rows)

In [45]:
# Encode every movie's genre list, scaling each row by that movie's weighted rating.
# NOTE(review): cosine similarity ignores a positive per-row scale factor, so
# this weighting does not change the cosine-based recommendations computed from
# this matrix; it only matters to consumers that look at row magnitudes.
tf_genres_mat = one_hot_genres_encode(movies["Genres"], weighted_ratings)
# Persist the matrix so it can be reloaded without recomputing it.
torch.save(tf_genres_mat, "./models/cb/genres_with_ratings.pt")
tf_genres_mat.shape

torch.Size([87585, 19])

In [None]:
# Show the 10 nearest movies to `movie_id` in genre space.
movie_id = 1
print("Recommendation for", movies.loc[movies["MovieID"] == movie_id, "Title"].iloc[0])
rec_ids, rec_sims = get_recommendation(movie_id, mapper, tf_genres_mat, top=10)
for m_id, sim in zip(rec_ids, rec_sims):
    print(m_id, movies.loc[movies["MovieID"] == m_id, "Title"].iloc[0], sim)

Recommendation for Toy Story (1995)
175625 The Dragon Spell (2016) 1.0
33463 DuckTales: The Movie - Treasure of the Lost Lamp (1990) 1.0
278208 Tad the Lost Explorer and the Curse of the Mummy (2022) 1.0
269122 Bunyan and Babe (2017) 1.0
192981 Penguin Highway (2018) 1.0
213207 Onward (2020) 1.0
166461 Moana (2016) 1.0
3754 Adventures of Rocky and Bullwinkle, The (2000) 1.0
289983 The Monkey King (2023) 1.0
200630 Missing Link (2019) 1.0


### On title and plot

In [47]:
# Build one text document per movie: the title followed by the plot, each once.
titles_and_plots = merge_text(movies["Title"], movies["Plot"], (1, 1))
# NOTE(review): the two numeric arguments are presumably the min/max
# document-frequency cut-offs for the vocabulary — confirm in TFIDVectAndStem.
vectstem = TFIDVectAndStem(0.005, 1.0)
vectstem.fit(titles_and_plots)
# Calling the fitted object on the documents yields the feature matrix.
tf_plot_mat = vectstem(titles_and_plots)
# Persist the matrix so it can be reloaded without refitting.
torch.save(tf_plot_mat, "./models/cb/titles_and_plots.pt")
tf_plot_mat.shape

torch.Size([87585, 702])

In [48]:
# Show the 10 nearest movies to `movie_id` in title + plot space.
movie_id = 1
print("Recommendation for", movies.loc[movies["MovieID"] == movie_id, "Title"].iloc[0])
rec_ids, rec_sims = get_recommendation(movie_id, mapper, tf_plot_mat, top=10)
for m_id, sim in zip(rec_ids, rec_sims):
    print(m_id, movies.loc[movies["MovieID"] == m_id, "Title"].iloc[0], sim)

Recommendation for Toy Story (1995)
6099 Megaforce (1982) 0.6118521690368652
201975 Action Figures (2015) 0.5887300968170166
201977 Action Figures 2 (2018) 0.5816023349761963
200884 Quad (1982) 0.5105335116386414
72485 Lemon Tree (2008) 0.4880746304988861
32959 Boy (Shônen) (1969) 0.48507559299468994
202517 Kung Fu Panda: Secrets of the Scroll (2016) 0.45864298939704895
111628 Mega Python vs. Gatoroid (2011) 0.45693689584732056
136489 Terror on the Midway (1942) 0.4384334087371826
272627 Foreboding (1992) 0.41186511516571045


### On directors and cast

Repeat the directors twice (so they weigh more than the cast) and keep only the first 10 (or fewer) cast members.

In [51]:
# Flatten each movie's director list into one space-separated string.
directors = movies["Directors"].apply(lambda x: " ".join(x))
# Same for the cast, keeping at most the first 10 names.
# NOTE(review): names are joined with spaces, so "First Last" may be split into
# separate word tokens and unrelated people sharing a first name could match —
# confirm how TFIDVectAndStem tokenizes.
cast = movies["Cast"].apply(lambda x: " ".join(x[:10]))
# Directors are repeated twice so they count more than the cast.
directors_and_cast = merge_text(directors, cast, (2, 1))
# A fresh vectorizer is fitted here; it rebinds the `vectstem` name.
vectstem = TFIDVectAndStem(0.005, 1.0)
vectstem.fit(directors_and_cast)
tf_dircast_mat = vectstem(directors_and_cast)
# Persist the matrix so it can be reloaded without refitting.
torch.save(tf_dircast_mat, "./models/cb/directors_and_cast.pt")
tf_dircast_mat.shape

torch.Size([87585, 485])

In [56]:
# Show the 10 nearest movies to `movie_id` in directors + cast space.
movie_id = 1
print("Recommendation for", movies.loc[movies["MovieID"] == movie_id, "Title"].iloc[0])
rec_ids, rec_sims = get_recommendation(movie_id, mapper, tf_dircast_mat, top=10)
for m_id, sim in zip(rec_ids, rec_sims):
    print(m_id, movies.loc[movies["MovieID"] == m_id, "Title"].iloc[0], sim)

Recommendation for Toy Story (1995)
3114 Toy Story 2 (1999) 0.7406386137008667
78499 Toy Story 3 (2010) 0.6615656018257141
169318 Theo Von: No Offense (2016) 0.5818730592727661
123200 Jim Jefferies: I Swear to God (2009) 0.5722253322601318
168474 Ukonvaaja (2016) 0.5556421279907227
281708 Come Back Anytime (2021) 0.5556421279907227
169552 Ralphie May: Unruly (2015) 0.5556421279907227
95856 Knick Knack (1989) 0.5556421279907227
169648 John Pinette: I'm Starvin'! (2007) 0.5556421279907227
170159 The Bridge on the River Kwai: An Appreciation by Filmmaker John Milius (2000) 0.5556421279907227
