In [3]:
import polars as pd
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import numpy as np
from ast import literal_eval
import time


In [2]:
nlp = spacy.load("en_core_web_lg")  # This loads the pre-trained word vectors

In [4]:
tmdb_movies = pd.read_csv("../../data/clean/final_movies.csv") #load dataset

In [9]:
tmdb_movies.head(), tmdb_movies.shape

(shape: (5, 9)
 ┌─────┬─────┬─────────────┬─────────────┬───┬─────────────┬─────────────┬─────────────┬────────────┐
 │     ┆ id  ┆ title       ┆ overview    ┆ … ┆ genres      ┆ cast        ┆ director    ┆ popularity │
 │ --- ┆ --- ┆ ---         ┆ ---         ┆   ┆ ---         ┆ ---         ┆ ---         ┆ ---        │
 │ i64 ┆ f64 ┆ str         ┆ str         ┆   ┆ str         ┆ str         ┆ str         ┆ f64        │
 ╞═════╪═════╪═════════════╪═════════════╪═══╪═════════════╪═════════════╪═════════════╪════════════╡
 │ 0   ┆ 2.0 ┆ Ariel       ┆ After the   ┆ … ┆ ['Drama',   ┆ ['Matti     ┆ ['Aki Kauri ┆ 23.17      │
 │     ┆     ┆             ┆ coal mine   ┆   ┆ 'Comedy',   ┆ Pellonpää', ┆ smäki']     ┆            │
 │     ┆     ┆             ┆ he works a… ┆   ┆ 'Romance',… ┆ 'Esko Nikk… ┆             ┆            │
 │ 1   ┆ 3.0 ┆ Shadows in  ┆ Nikander, a ┆ … ┆ ['Drama',   ┆ ['Aki Kauri ┆ ['Aki Kauri ┆ 35.528     │
 │     ┆     ┆ Paradise    ┆ rubbish     ┆   ┆ 'Comedy',   ┆ smäki'

In [20]:
# Sample target keywords
target_keywords = ["Action","Mystery", 
"Thriller",
"Suspenseful",
"Gripping" ,
"Plot twist"]

In [21]:
# Calculate word embeddings for target keywords
target_embeddings = np.mean([nlp(keyword).vector for keyword in target_keywords], axis=0)

In [41]:

# Function to calculate embeddings of movie keywords
def calculate_embeddings(movie_keywords):
    if list(movie_keywords):#isinstance(movie_keywords, list):
        return np.mean([nlp(keyword).vector for keyword in movie_keywords], axis=0)    
    return None


In [23]:

# Function to calculate cosine similarity between target keywords and movie keywords
def calculate_similarity(embedding):
    #movie_embeddings = np.mean([nlp(keyword).vector for keyword in movie_keywords.split(",")], axis=0)
    if not isinstance(embedding, np.ndarray):
        return 0.0
    similarity = cosine_similarity([target_embeddings], [embedding])[0][0]
    return similarity


In [6]:
tmdb_movies_keywords = tmdb_movies.drop_nulls()

In [7]:
tmdb_movies_keywords.shape

(90344, 9)

In [11]:
tmdb_movies_keywords.write_csv("../../data/clean/no_nulls.csv")

In [34]:
tmdb_movies_keywords = tmdb_movies_keywords.with_columns(pd.col("keywords").map_elements(literal_eval))

  tmdb_movies_keywords = tmdb_movies_keywords.with_columns(pd.col("keywords").map_elements(literal_eval))


In [43]:
# Apply the function to calculate similarity for each movie
start_time = time.time()
embeddings = tmdb_movies_keywords.with_columns(pd.col("keywords").map_elements(calculate_embeddings))["keywords"]
end_time = time.time()
print(f"Time taken to calculate {len(embeddings)} embeddings was {end_time-start_time}")

  embeddings = tmdb_movies_keywords.with_columns(pd.col("keywords").map_elements(calculate_embeddings))["keywords"]


Time taken to calculate 90344 embeddings was 5436.755422830582


In [45]:
score = []
for i in range(len(embeddings)):
    score.append((i, calculate_similarity(embeddings[i])))

In [46]:
score.sort(key=lambda x: (x[1],x[0]), reverse=True)

In [47]:
score[0:5]

[(46850, 0.6543091),
 (42030, 0.65016574),
 (2920, 0.65016574),
 (58760, 0.65006727),
 (67154, 0.6461362)]

In [49]:
for x in score[0:10]:
    print(tmdb_movies["title"][x[0]], tmdb_movies["keywords"][x[0]], x[1])

Befriend and Betray None 0.6543091
Rika barn leka bäst None 0.65016574
Amori, letti e tradimenti ['beautiful\xa0 woman', 'sex comedy'] 0.65016574
Taken 2 ['kidnapping', 'fbi', 'turkey', 'police chase', 'teenage daughter', 'stealing a car', 'ex-husband ex-wife relationship', 'albanian', 'u.s. embassy'] 0.65006727
Two Chips and a Miss ['short film'] 0.6461362
Frisian Terror ['gore', 'medieval', 'horror comedy'] 0.6440673
Holy Man ['salesclerk', 'tv ratings', 'guru', 'television producer', 'religion'] 0.6344439
The Debt None 0.62841535
Léon Morin, Priest ['faith', 'widow', 'world war ii', 'atheist', 'confessional', 'catholic priest', 'occupied france', 'nouvelle vague'] 0.6270883
Hobo with a Shotgun ['prostitute', 'shotgun', 'dystopia', 'pimp', 'vigilante', 'massacre', 'brutality', 'white suit', 'psychotronic'] 0.6270883


In [50]:
tmdb_movies.head()

Unnamed: 0_level_0,id,title,overview,keywords,genres,cast,director,popularity
i64,f64,str,str,str,str,str,str,f64
0,2.0,"""Ariel""","""After the coal mine he works a…","""['prison', 'underdog', 'helsin…","""['Drama', 'Comedy', 'Romance',…","""['Matti Pellonpää', 'Esko Nikk…","""['Aki Kaurismäki']""",23.17
1,3.0,"""Shadows in Paradise""","""Nikander, a rubbish collector …","""['helsinki, finland', 'salescl…","""['Drama', 'Comedy', 'Romance']""","""['Aki Kaurismäki', 'Kati Outin…","""['Aki Kaurismäki']""",35.528
2,5.0,"""Four Rooms""","""It's Ted the Bellhop's first n…","""['hotel', ""new year's eve"", 'w…","""['Comedy']""","""['Marisa Tomei', 'Antonio Band…","""['Quentin Tarantino', 'Robert …",52.481
3,6.0,"""Judgment Night""","""Four young friends, while taki…","""['drug dealer', 'chicago, illi…","""['Action', 'Crime', 'Thriller'…","""['Jeremy Piven', 'Stephen Dorf…","""['Stephen Hopkins']""",41.054
4,8.0,"""Life in Loops (A Megacities RM…","""Timo Novotny labels his new pr…","""['megacities']""","""['Documentary']""",,"""['Timo Novotny']""",10.613


NameError: name 'tmdb_movies_keywords' is not defined