In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

### Import data
---

In [None]:
# Read pickle file
movies_df = pickle.load(open('data\\movies_list.pkl', 'rb'))

In [8]:
# Display dataframe
movies_df

Unnamed: 0,tmdb_id,release_date,original_title,title,keywords,genres,production_companies,original_language,origin_country,director_name,actor_1_name,actor_2_name,actor_3_name,year
0,19995,2009-12-16,Avatar,Avatar,paraplegic|attachment to nature|culture clash|...,Action|Adventure|Fantasy|Science Fiction,Dune Entertainment|Lightstorm Entertainment|20...,en,US,James Cameron,Sam Worthington,Zoe Saldaña,Sigourney Weaver,2009
1,285,2007-05-19,Pirates of the Caribbean: At World's End,Pirates of the Caribbean: At World's End,exotic island|strong woman|love of one's life|...,Adventure|Fantasy|Action,Jerry Bruckheimer Films|Second Mate Production...,en,US,Gore Verbinski,Johnny Depp,Geoffrey Rush,Orlando Bloom,2007
2,206647,2015-10-26,Spectre,Spectre,based on novel or book|spy|secret agent|sequel...,Action|Adventure|Thriller,Metro-Goldwyn-Mayer|Columbia Pictures|EON Prod...,en,GB,Sam Mendes,Daniel Craig,Christoph Waltz,Léa Seydoux,2015
3,49026,2012-07-17,The Dark Knight Rises,The Dark Knight Rises,airplane|fight|burglar|hostage|secret identity...,Action|Crime|Drama|Thriller,Syncopy|Legendary Pictures|DC Entertainment|Wa...,en,GB|US,Christopher Nolan,Christian Bale,Gary Oldman,Tom Hardy,2012
4,140607,2015-12-15,Star Wars: The Force Awakens,Star Wars: The Force Awakens,android|spacecraft|space opera|absurd,Adventure|Action|Science Fiction,Lucasfilm Ltd.|Bad Robot,en,US,J.J. Abrams,Harrison Ford,Mark Hamill,Carrie Fisher,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7691,1143440,2025-09-17,La tour de glace,The Ice Tower,fairy tale|1970s|snow|female protagonist|woman...,Drama|Fantasy,3B Productions|Sutor Kolonko|Davis Films|ARTE ...,fr,FR,Lucile Hadžihalilović,Marion Cotillard,Clara Pacini,August Diehl,2025
7692,1313229,2025-09-17,Nino,Nino,"paris, france|cancer|radiology",Drama,Blue Monday Productions|France 2 Cinéma,fr,FR,Pauline Loquès,Théodore Pellerin,Salomé Dewaels,Jeanne Balibar,2025
7693,1309725,2025-10-29,La Femme la plus riche du monde,The Richest Woman in the World,artist|scandal|political corruption|political ...,Drama|Comedy,Récifilms|Versus Production|Haut et Court|Ciné...,fr,BE|FR,Thierry Klifa,Isabelle Huppert,Marina Foïs,Laurent Lafitte,2025
7694,1290432,2025-09-27,Vie privée,A Private Life,suicide|paranoia|suspicion of murder|private i...,Drama|Crime|Mystery,Les Films Velvet|France 3 Cinéma|Auvergne-Rhôn...,fr,FR,Rebecca Zlotowski,Jodie Foster,Daniel Auteuil,Virginie Efira,2025


In [12]:
# Check missing values
movies_df.isna().sum()

tmdb_id                 0
release_date            0
original_title          0
title                   0
keywords                0
genres                  0
production_companies    0
original_language       0
origin_country          0
director_name           0
actor_1_name            0
actor_2_name            0
actor_3_name            0
year                    0
dtype: int64

### Vectorize data
---

In [14]:
# regroup actors in one same column
movies_df['cast'] = movies_df[["actor_1_name", "actor_2_name", "actor_3_name"]].apply(lambda r: "|".join([a for a in r if a.strip() != ""]), axis=1)

In [15]:
# columns with single values -> default tokenizer (whitespace). binary=True makes features 0/1.
unityVector = CountVectorizer(binary=True)

director = unityVector.fit_transform(movies_df['director_name']).toarray()
language = unityVector.fit_transform(movies_df['original_language']).toarray()


# other columns -> pipe-split tokenizer
split_by_pipe = CountVectorizer(binary=True, tokenizer=lambda x: x.split('|'), token_pattern=None)

keywords = split_by_pipe.fit_transform(movies_df['keywords']).toarray()
genres = split_by_pipe.fit_transform(movies_df['genres']).toarray()
production_companies = split_by_pipe.fit_transform(movies_df['production_companies']).toarray()
countries = split_by_pipe.fit_transform(movies_df['origin_country']).toarray()
cast = split_by_pipe.fit_transform(movies_df['cast']).toarray()

In [16]:
# Concatenate feature blocks
movies_binary_full = np.concatenate(
    [keywords, genres, production_companies,language, director, cast, countries],
    axis=1
)
# Check result
movies_binary_full

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(7696, 39514))

In [17]:
# Compute cosine similarity
movies_sim = cosine_similarity(movies_binary_full)

# Check the result
movies_sim

array([[1.        , 0.1250782 , 0.09549105, ..., 0.        , 0.        ,
        0.        ],
       [0.1250782 , 1.        , 0.11227218, ..., 0.        , 0.        ,
        0.        ],
       [0.09549105, 0.11227218, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.15430335,
        0.18156826],
       [0.        , 0.        , 0.        , ..., 0.15430335, 1.        ,
        0.19611614],
       [0.        , 0.        , 0.        , ..., 0.18156826, 0.19611614,
        1.        ]], shape=(7696, 7696))

### Recommend movies
---

In [24]:
# Recommend movies based on content similarity
def recommend(title : str, movies_df : pd.DataFrame, top_n=10):
    if title in movies_df['title'].values:
        movies_index = movies_df[movies_df['title'] == title].index.item() # return the index of the row matching the title (eg; "avatar" index = 0)
        scores = dict(enumerate(movies_sim[movies_index])) # returns the indices of the movies and their similarities
        sorted_scores = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True)) # sort the list of pairs (key = "movie_index", item = "similarity_score")
        
        selected_movies_index = [movie_id for movie_id, _ in sorted_scores.items()] # take the movies indices sorted by similarity scores
        selected_movies_score = [scores for _, scores in sorted_scores.items()] # take the similarity scores
        
        # Create a new dataframe that is indexed using the sorted movies' indices by score similarity
        rec_movies = movies_df.iloc[selected_movies_index]
        rec_movies['similarity'] = selected_movies_score
        
        rec_movies = rec_movies.reset_index(drop=True)
        return rec_movies[1:top_n+1] # Return the top_n similar movies while skipping the first row (because the first row represent the movie itself [it is naturally the most similar movie to itself])
    else:
        print("Title not in dataset. Please check spelling.")
        return pd.NA

In [25]:
# Test function using "avatar"
recommend("Avatar", movies_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_movies['similarity'] = selected_movies_score


Unnamed: 0,tmdb_id,release_date,original_title,title,keywords,genres,production_companies,original_language,origin_country,director_name,actor_1_name,actor_2_name,actor_3_name,year,cast,similarity
1,76600,2022-12-14,Avatar: The Way of Water,Avatar: The Way of Water,dying and death|loss of loved one|alien life-f...,Action|Adventure|Science Fiction,20th Century Studios|Lightstorm Entertainment|...,en,US,James Cameron,Sam Worthington,Zoe Saldaña,Sigourney Weaver,2022,Sam Worthington|Zoe Saldaña|Sigourney Weaver,0.394491
2,83533,2025-12-17,Avatar: Fire and Ash,Avatar: Fire and Ash,witch|clone|space war|tribe|sequel|alien|famil...,Science Fiction|Adventure|Fantasy,20th Century Studios|Lightstorm Entertainment|...,en,US,James Cameron,Sam Worthington,Zoe Saldaña,Sigourney Weaver,2025,Sam Worthington|Zoe Saldaña|Sigourney Weaver,0.373182
3,679,1986-07-18,Aliens,Aliens,android|space marine|extraterrestrial technolo...,Action|Thriller|Science Fiction,SLM Production Group|20th Century Fox|Brandywi...,en,US,James Cameron,Sigourney Weaver,Carrie Henn,Michael Biehn,1986,Sigourney Weaver|Carrie Henn|Michael Biehn,0.256928
4,118340,2014-07-30,Guardians of the Galaxy,Guardians of the Galaxy,spacecraft|based on comic|space|orphan|adventu...,Action|Science Fiction|Adventure,Marvel Studios,en,US,James Gunn,Chris Pratt,Zoe Saldaña,Dave Bautista,2014,Chris Pratt|Zoe Saldaña|Dave Bautista,0.234246
5,283995,2017-04-19,Guardians of the Galaxy Vol. 2,Guardians of the Galaxy Vol. 2,superhero|based on comic|sequel|misfit|space|a...,Science Fiction|Adventure|Action,Marvel Studios,en,US,James Gunn,Chris Pratt,Zoe Saldaña,Dave Bautista,2017,Chris Pratt|Zoe Saldaña|Dave Bautista,0.228315
6,36955,1994-07-15,True Lies,True Lies,florida|horseback riding|gun|kidnapping|spy|fl...,Action|Thriller,Lightstorm Entertainment|20th Century Fox,en,US,James Cameron,Arnold Schwarzenegger,Jamie Lee Curtis,Tom Arnold,1994,Arnold Schwarzenegger|Jamie Lee Curtis|Tom Arnold,0.224574
7,76170,2013-07-21,The Wolverine,The Wolverine,japan|samurai|superhero|mutant|world war ii|ba...,Action|Science Fiction|Adventure,The Donners' Company|20th Century Fox|Marvel E...,en,US,James Mangold,Hugh Jackman,Hiroyuki Sanada,Tao Okamoto,2013,Hugh Jackman|Hiroyuki Sanada|Tao Okamoto,0.222812
8,47933,2016-06-22,Independence Day: Resurgence,Independence Day: Resurgence,alien|alien invasion,Action|Adventure|Science Fiction,20th Century Fox|TSG Entertainment|Centropolis...,en,US,Roland Emmerich,Liam Hemsworth,Jeff Goldblum,Jessie T. Usher,2016,Liam Hemsworth|Jeff Goldblum|Jessie T. Usher,0.218797
9,14164,2009-03-12,Dragonball Evolution,Dragonball Evolution,martial arts|superhero|karate|revenge|dragon|b...,Action|Adventure|Fantasy|Science Fiction|Thriller,World Film Magic|Dune Entertainment|Star Overs...,en,US,James Wong,Justin Chatwin,Chow Yun-Fat,Joon Park,2009,Justin Chatwin|Chow Yun-Fat|Joon Park,0.218797
10,36668,2006-05-24,X-Men: The Last Stand,X-Men: The Last Stand,superhero|mutant|based on comic|superhuman|bea...,Adventure|Action|Science Fiction|Thriller,The Donners' Company|20th Century Fox|Marvel E...,en,US,Brett Ratner,Hugh Jackman,Patrick Stewart,Ian McKellen,2006,Hugh Jackman|Patrick Stewart|Ian McKellen,0.21769


### Save the similarities matrix
---

In [None]:
pickle.dump(movies_sim, open('data\\similarity.pkl', 'wb'))