## Description Based Recommender

In [1]:
from ast import literal_eval

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

Data source: [The Movies Dataset](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset) from Kaggle. This notebook reads its `movies_metadata.csv` file.

In [2]:
# Load the movie metadata table from the local copy of the Kaggle dataset.
# low_memory=False tells pandas to parse the file in one pass rather than in
# chunks, which avoids inconsistent (mixed) dtype inference within a column.
data = pd.read_csv(
    "../../data/movies_dataset_kaggle/movies_metadata.csv",
    low_memory=False,
)

In [3]:
# Preview the two columns this notebook works with: the title and its plot overview.
data[["title", "overview"]]

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...
45461,Subdue,Rising and falling between a man and woman.
45462,Century of Birthing,An artist struggles to finish his work while a...
45463,Betrayal,"When one of her hits goes wrong, a professiona..."
45464,Satan Triumphant,"In a small town live two brothers, one a minis..."


In [4]:
# Shared text-processing objects used by the cleaning and vectorising cells.
lemmatizer = WordNetLemmatizer()
# Tokens are maximal runs of word characters, so punctuation is discarded.
tokenizer = RegexpTokenizer(r"\w+")
# Stored as a set: the stopword filter is evaluated once per token across every
# overview, and set membership is O(1) whereas scanning a list is O(len(list)).
stopwords_ = set(stopwords.words("english"))
tfidf = TfidfVectorizer()

In [5]:
def clean(text):
    """Normalise a free-text description into a space-separated token string.

    The text is lowercased and split with the module-level ``tokenizer``;
    tokens found in ``stopwords_`` are discarded, and each remaining token is
    passed through ``lemmatizer.lemmatize`` before being re-joined.

    Parameters
    ----------
    text : str
        Raw description. Must be a string (``.lower()`` is called on it).

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    tokens = tokenizer.tokenize(text.lower())
    # The stopword check is made on the lowercased token *before* lemmatizing.
    kept = (lemmatizer.lemmatize(tok) for tok in tokens if tok not in stopwords_)
    return " ".join(kept)

In [6]:
# Missing overviews become empty strings so that `clean` always receives a str;
# the result keeps the same row order and index as `data`.
desc_cleaned = data["overview"].fillna("").map(clean)
desc_cleaned

0        led woody andy toy live happily room andy birt...
1        sibling judy peter discover enchanted board ga...
2        family wedding reignites ancient feud next doo...
3        cheated mistreated stepped woman holding breat...
4        george bank recovered daughter wedding receive...
                               ...                        
45461                             rising falling man woman
45462    artist struggle finish work storyline cult pla...
45463    one hit go wrong professional assassin end sui...
45464    small town live two brother one minister one h...
45465    50 year decriminalisation homosexuality uk dir...
Name: overview, Length: 45466, dtype: object

In [7]:
# Learn the vocabulary / IDF weights and vectorise every cleaned overview
# (one sparse row per movie, in the same order as `desc_cleaned`).
tfidf_matrix = tfidf.fit_transform(desc_cleaned)

# Pairwise dot products of every row with every other row. With the
# vectorizer's default L2 row normalisation this equals cosine similarity.
# NOTE(review): the result is a dense N x N float array; for the ~45k rows
# shown above that is on the order of 16 GB of RAM -- confirm this is intended.
cosine_sim = linear_kernel(tfidf_matrix)

In [8]:
def get_recommendations(title, n=10):
    """Return the titles whose overviews are most similar to ``title``'s.

    Similarity scores are read from the module-level ``cosine_sim`` matrix,
    whose rows/columns are expected to be in the same order as the rows of
    ``data``.

    Parameters
    ----------
    title : str
        Exact movie title to look up in ``data.title``. If several rows share
        the title, the first one is used.
    n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``n`` titles ordered from most to least similar, indexed by
        their labels in ``data``. The queried row itself is never included.

    Raises
    ------
    KeyError
        If ``title`` does not occur in ``data.title``.
    """
    titles = data.title

    # Look the title up by *position*: `cosine_sim` is a plain array addressed
    # positionally, so an index label would be wrong for any non-default index.
    matches = np.flatnonzero((titles == title).to_numpy())
    if matches.size == 0:
        raise KeyError(f"title not found: {title!r}")
    idx = matches[0]

    # Rank every row from most to least similar, then remove the query row
    # explicitly. Simply dropping the top-ranked entry is not safe: when scores
    # tie (e.g. duplicate or empty overviews) the query is not guaranteed to be
    # ranked first.
    ranked = np.argsort(cosine_sim[idx])[::-1]
    ranked = ranked[ranked != idx]

    return titles.iloc[ranked[:n]]

In [9]:
# Example query: titles whose overviews are most similar to "Toy Story" (default n=10).
get_recommendations("Toy Story")

15348                                     Toy Story 3
2997                                      Toy Story 2
24523                                       Small Fry
10301                          The 40 Year Old Virgin
23843                     Andy Hardy's Blonde Trouble
29202                                      Hot Splash
43427                Andy Kaufman Plays Carnegie Hall
38476    Superstar: The Life and Times of Andy Warhol
42721    Andy Peters: Exclamation Mark Question Point
8327                                        The Champ
Name: title, dtype: object