## Metadata Based Recommender

In [1]:
from ast import literal_eval

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

using https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset

In [2]:
data = pd.read_csv(
    "../../data/movies_dataset_kaggle/movies_metadata.csv",
    low_memory=False,
)

In [3]:
credits = pd.read_csv("../../data/movies_dataset_kaggle/credits.csv")
keywords = pd.read_csv("../../data/movies_dataset_kaggle/keywords.csv")

In [4]:
display(data.head())
display(credits.head())
keywords.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [5]:
def top_3(x):
    return [i["name"] for i in literal_eval(x)][:3]

In [6]:
def strip(x):
    return [i.lower().replace(" ", "") for i in x]

In [7]:
def preprocess(df, credits, keywords):
    return (
        df[
            [
                "title",
                "genres",
                "release_date",
                "runtime",
                "vote_average",
                "vote_count",
                "id",
            ]
        ]
        .assign(
            release_date=lambda _df: pd.to_datetime(
                _df["release_date"], errors="coerce"
            ),
            year=lambda _df: _df.release_date.apply(
                lambda x: (
                    int(str(x).split("-")[0])
                    if x != np.nan and str(x).split("-")[0] != "NaT"
                    else np.nan
                )
            ),
            genres=lambda _df: (_df.genres.apply(top_3).apply(strip)),
            id=lambda _df: _df.id.apply(lambda x: int(x) if "-" not in x else np.nan),
        )
        .drop(
            [
                "release_date",
            ],
            axis=1,
        )
        .merge(
            credits.assign(
                cast=lambda _df: (_df.cast.apply(top_3).apply(strip)),
                director=lambda _df: (
                    _df.crew.apply(literal_eval)
                    .apply(lambda x: [i["name"] for i in x if i["job"] == "Director"])
                    .apply(lambda x: x[0] if x else None)
                    .apply(lambda x: x.lower().replace(" ", "") if x else x)
                ),
            ).drop("crew", axis=1)
        )
        .merge(
            keywords.assign(
                keywords=lambda _df: (_df.keywords.apply(top_3).apply(strip))
            )
        )
        .dropna()
    )

In [8]:
df = preprocess(data, credits, keywords)
df

Unnamed: 0,title,genres,runtime,vote_average,vote_count,id,year,cast,director,keywords
0,Toy Story,"[animation, comedy, family]",81.0,7.7,5415.0,862.0,1995.0,"[tomhanks, timallen, donrickles]",johnlasseter,"[jealousy, toy, boy]"
1,Jumanji,"[adventure, fantasy, family]",104.0,6.9,2413.0,8844.0,1995.0,"[robinwilliams, jonathanhyde, kirstendunst]",joejohnston,"[boardgame, disappearance, basedonchildren'sbook]"
2,Grumpier Old Men,"[romance, comedy]",101.0,6.5,92.0,15602.0,1995.0,"[waltermatthau, jacklemmon, ann-margret]",howarddeutch,"[fishing, bestfriend, duringcreditsstinger]"
3,Waiting to Exhale,"[comedy, drama, romance]",127.0,6.1,34.0,31357.0,1995.0,"[whitneyhouston, angelabassett, lorettadevine]",forestwhitaker,"[basedonnovel, interracialrelationship, single..."
4,Father of the Bride Part II,[comedy],106.0,5.7,173.0,11862.0,1995.0,"[stevemartin, dianekeaton, martinshort]",charlesshyer,"[baby, midlifecrisis, confidence]"
...,...,...,...,...,...,...,...,...,...,...
46622,Robin Hood,"[drama, action, romance]",104.0,5.7,26.0,30840.0,1991.0,"[patrickbergin, umathurman, davidmorrissey]",johnirvin,[]
46624,Century of Birthing,[drama],360.0,9.0,3.0,111109.0,2011.0,"[angelaquino, perrydizon, hazelorencio]",lavdiaz,"[artist, play, pinoy]"
46625,Betrayal,"[action, drama, thriller]",90.0,3.8,6.0,67758.0,2003.0,"[erikaeleniak, adambaldwin, juliedupage]",markl.lester,[]
46626,Satan Triumphant,[],87.0,0.0,0.0,227506.0,1917.0,"[iwanmosschuchin, nathalielissenko, pavelpavlov]",yakovprotazanov,[]


In [9]:
metadata = df.apply(
    lambda x: (
        " ".join(
            [
                " ".join(x.genres),
                " ".join(x.cast),
                x.director,
                " ".join(x.keywords),
            ]
        )
    ),
    axis=1,
)

metadata

0        animation comedy family tomhanks timallen donr...
1        adventure fantasy family robinwilliams jonatha...
2        romance comedy waltermatthau jacklemmon ann-ma...
3        comedy drama romance whitneyhouston angelabass...
4        comedy stevemartin dianekeaton martinshort cha...
                               ...                        
46622    drama action romance patrickbergin umathurman ...
46624    drama angelaquino perrydizon hazelorencio lavd...
46625    action drama thriller erikaeleniak adambaldwin...
46626     iwanmosschuchin nathalielissenko pavelpavlov ...
46627                                        daisyasquith 
Length: 45458, dtype: object

In [10]:
counter = CountVectorizer()

counter_matrix = counter.fit_transform(metadata)

In [11]:
cosine_sim = linear_kernel(counter_matrix, counter_matrix)

In [12]:
def get_recommendations(title, n=10):
    titles = df.title.reset_index(drop=True)
    idx = titles.where(titles == title).first_valid_index()

    return titles.iloc[np.argsort(cosine_sim[idx])[-2::-1][:n]]

In [13]:
get_recommendations("Toy Story")

15410                   Toy Story 3
27249                         Anina
2998                    Toy Story 2
25660               Partysaurus Rex
25662    Toy Story That Time Forgot
21863          Toy Story of Terror!
17410                        Cars 2
11144                 Monster House
24803            Jetsons: The Movie
3309              Creature Comforts
Name: title, dtype: object