<a href="https://colab.research.google.com/github/FelipeAce96/Movies-Embeddings-Recommender/blob/master/DEMO_MOVIES_RECOMMENDERipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movies-Embeddings-Recommender 🎥

A movies recommender based in embeddings.
I used a transformer model from hugging face, the instructor large model to obtain the embeddings from each movie.
The embeddings are calculated using the following prompt:

    MOVIE FEATURES
    TITLE: Toy Story
    COLLECTION: Toy Story Collection
    COMPANY NAME: Pixar Animation Studios
    RELEASED: 1995
    GENRES:
      Animation
      Comedy
      Family

# DOWNLOAD DATASET AND EMBEDDINGS

In [1]:
!git clone https://github.com/FelipeAce96/Movies-Embeddings-Recommender.git

Cloning into 'Movies-Embeddings-Recommender'...
remote: Enumerating objects: 13, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 13 (delta 3), reused 0 (delta 0), pack-reused 4[K
Unpacking objects: 100% (13/13), 54.51 MiB | 8.45 MiB/s, done.


In [2]:
import numpy as np
import pandas as pd
import json
pd.options.display.max_columns=None

df = pd.read_csv('/content/Movies-Embeddings-Recommender/movies_metadata.csv', on_bad_lines='skip')
df = df.drop_duplicates(subset=['id'])
print(df.shape)
df.head(2)

(45436, 24)


  df = pd.read_csv('/content/Movies-Embeddings-Recommender/movies_metadata.csv', on_bad_lines='skip')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [3]:
# Formats
df['year']=pd.to_datetime(df['release_date'], errors= 'coerce').dt.strftime('%Y')

In [4]:
template = (
'''
TITLE: {title}
COLLECTION: {collection}
COMPANY NAME: {company_name}
RELEASED: {released_date}
GENRES:{genres}
'''
)

In [5]:
def create_prompt(row):
  try:
    title = row['original_title']
    overview = row['overview']
    try:
      company_name = json.loads(row['production_companies'].replace("'",'"'))[0]['name'] if str(row['production_companies'])!= 'nan' else ''
    except:
      company_name = ''
    released_date = row['year'] if str(row['year'])!= 'nan' else ''
    collection = json.loads(row['belongs_to_collection'].replace("'",'"')).get('name') if str(row['belongs_to_collection'])!= 'nan' else ''
    genres = ", ".join([f'{e.get("name")}' for e in  json.loads(row['genres'].replace("'",'"'))]) if str(row['genres'])!= 'nan' else ''
    prompt = template.format(
        title=title,
        overview = overview,
        company_name = company_name,
        released_date = released_date,
        collection = collection,
        genres = genres
    )
    return prompt
  except Exception as e:
    print(e)
    return np.nan

In [6]:
df['PROMPTS'] = df.apply(lambda row: create_prompt(row), axis=1)

Expecting value: line 1 column 116 (char 115)
Expecting value: line 1 column 111 (char 110)
Expecting value: line 1 column 117 (char 116)
Expecting value: line 1 column 114 (char 113)
Expecting value: line 1 column 72 (char 71)
Expecting value: line 1 column 65 (char 64)
Expecting value: line 1 column 58 (char 57)
Expecting value: line 1 column 115 (char 114)
Expecting value: line 1 column 118 (char 117)
Expecting value: line 1 column 122 (char 121)
Expecting value: line 1 column 120 (char 119)
Expecting value: line 1 column 121 (char 120)
Expecting value: line 1 column 142 (char 141)
Expecting value: line 1 column 122 (char 121)
Expecting value: line 1 column 126 (char 125)
Expecting value: line 1 column 129 (char 128)
Expecting value: line 1 column 120 (char 119)
Expecting value: line 1 column 65 (char 64)
Expecting value: line 1 column 120 (char 119)
Expecting value: line 1 column 121 (char 120)
Expecting value: line 1 column 117 (char 116)
Expecting ',' delimiter: line 1 column 43 

In [7]:
import h5py

# Cargar los embeddings desde el archivo HDF5
with h5py.File("/content/Movies-Embeddings-Recommender/movie_embeddings.h5", "r") as archivo:
    grupo = archivo["embeddings"]
    embeddings_dict = {clave: grupo[clave][:] for clave in grupo}


In [8]:
embeddings_dict.get('862').shape[0]

128

In [9]:
EMBEDDING_SIZE = embeddings_dict.get('862').shape[0]
results = pd.DataFrame(embeddings_dict).T.reset_index(drop=False).rename(columns={'index':'id'})
results.shape

(44111, 129)

In [10]:
results.head(1)

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127
0,100,-0.069642,-0.062266,0.030718,0.007524,-0.084341,0.099216,-0.032305,0.013084,-0.024202,-0.040082,-0.047217,-0.063484,0.016388,0.018544,0.035417,0.040696,-0.009108,-0.00938,-0.012013,-0.046498,0.024706,-0.019615,-0.006221,0.030825,-0.057565,0.010527,0.029242,-0.013914,0.007834,-0.021472,-0.010252,0.029923,0.032425,0.063781,-0.022683,-0.011105,-0.025637,-0.017386,0.010426,-0.030079,-0.033772,-0.028726,-0.042189,0.090219,0.03102,-0.001883,0.041639,0.010259,0.033806,0.012182,-0.033738,0.003929,-0.066782,0.04855,-0.006198,-0.003964,0.025115,-0.009891,-0.019963,0.039208,-0.002385,-0.006475,0.020112,0.038639,0.015056,0.035261,-0.020338,0.005723,0.032744,-0.010967,0.026152,-0.018417,-0.022531,-0.017128,0.018905,0.044735,-0.004021,0.056351,0.041996,-0.015156,-0.044115,0.016969,0.014601,-0.015886,-0.020658,-0.02287,0.026342,-0.05502,0.008658,-0.002945,-0.029967,0.010601,-0.017854,0.015828,0.031109,-0.047492,-0.01456,0.002527,-0.028201,0.02847,-0.02882,0.007935,-0.004729,0.010766,0.0192,-0.033267,-0.029501,-0.008102,-0.003028,0.014859,-0.029524,-0.025954,-0.006798,0.023206,0.000614,-0.014477,0.00919,-0.016965,-0.00429,0.003902,-0.013468,-0.005083,-0.04391,0.007811,-0.003955,0.00183,-0.003314,-0.043223


In [11]:
emb_pca = results[[i for i in range(EMBEDDING_SIZE)]].to_numpy()
emb_pca.shape

(44111, 128)

# KNN MODEL

In [12]:
# Test if embeddings make sense
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='cosine')
knn.fit(emb_pca)

In [13]:
#Test with toy story
p_id_test = "862"
p_name_test = df.loc[df['id'] == p_id_test,'original_title'].values[0]
print(f'Test Movie: {p_name_test} ')
p_emb_test = embeddings_dict.get(p_id_test)
p_emb_test.shape

Test Movie: Toy Story 


(128,)

In [14]:
#Results
distances, indices = knn.kneighbors(p_emb_test.reshape(1,-1),return_distance=True)
distances, indices = distances[0], indices[0]
_ids = results.loc[indices, 'id']
temp = df.loc[df['id'].isin(_ids),[
  'id','original_title','PROMPTS'
]].reset_index(drop=True)
temp['similarity'] = 1 - distances

for index, row in temp.iterrows():
    print(f"{index + 1}. Similarity: {row['similarity']:.4f}")
    print(row['PROMPTS'],'\n')

1. Similarity: 1.0000

TITLE: Toy Story
COLLECTION: Toy Story Collection
COMPANY NAME: Pixar Animation Studios
RELEASED: 1995
GENRES:Animation, Comedy, Family
 

2. Similarity: 0.8979

TITLE: The Lion King
COLLECTION: The Lion King Collection
COMPANY NAME: Walt Disney Pictures
RELEASED: 1994
GENRES:Family, Animation, Drama
 

3. Similarity: 0.7205

TITLE: The Adventures of Pinocchio
COLLECTION: 
COMPANY NAME: New Line Cinema
RELEASED: 1996
GENRES:Animation, Family, Fantasy
 

4. Similarity: 0.6363

TITLE: Toy Story 2
COLLECTION: Toy Story Collection
COMPANY NAME: Pixar Animation Studios
RELEASED: 1999
GENRES:Animation, Comedy, Family
 

5. Similarity: 0.6351

TITLE: Finding Nemo
COLLECTION: Finding Nemo Collection
COMPANY NAME: Pixar Animation Studios
RELEASED: 2003
GENRES:Animation, Family
 

6. Similarity: 0.6291

TITLE: Tom and Jerry: The Movie
COLLECTION: Tom and Jerry Collection
COMPANY NAME: Miramax Films
RELEASED: 1992
GENRES:Animation, Family
 

7. Similarity: 0.6083

TITLE: To

In [23]:
temp = df.sort_values(by='revenue', ascending=False).head(50)[['id','title']]
temp = temp[~temp['title'].str.contains("'")]
options = []

for idx, row in temp.iterrows():
  options.append(row['title'])
options[0]

'Avatar'

# Use the model

In [35]:
#@title Find similar movies to:  { run: "auto" }

MOVIES_LIKE = 'Finding Nemo' #@param ['Avatar', 'Star Wars: The Force Awakens', 'Titanic', 'The Avengers', 'Jurassic World', 'Furious 7', 'Avengers: Age of Ultron', 'Harry Potter and the Deathly Hallows: Part 2', 'Frozen', 'Beauty and the Beast', 'The Fate of the Furious', 'Iron Man 3', 'Minions', 'Captain America: Civil War', 'Transformers: Dark of the Moon', 'The Lord of the Rings: The Return of the King', 'Skyfall', 'Transformers: Age of Extinction', 'The Dark Knight Rises', 'Toy Story 3', 'Rogue One: A Star Wars Story', 'Pirates of the Caribbean: On Stranger Tides', 'Finding Dory', 'Alice in Wonderland', 'Zootopia', 'The Hobbit: An Unexpected Journey', 'Despicable Me 3', 'The Dark Knight', 'Despicable Me 2', 'The Jungle Book', 'The Hobbit: The Desolation of Smaug', 'The Hobbit: The Battle of the Five Armies', 'Harry Potter and the Deathly Hallows: Part 1', 'Finding Nemo', 'Harry Potter and the Order of the Phoenix', 'Harry Potter and the Half-Blood Prince', 'The Lord of the Rings: The Two Towers', 'Star Wars: Episode I - The Phantom Menace', 'Jurassic Park', 'Shrek 2', 'Harry Potter and the Goblet of Fire', 'Spider-Man 3', 'Ice Age: Dawn of the Dinosaurs', 'Spectre', 'Ice Age: Continental Drift', 'Harry Potter and the Chamber of Secrets', 'The Secret Life of Pets']

#Test

p_id_test = df.loc[df['title']==MOVIES_LIKE, 'id'].values[0]
p_emb_test = embeddings_dict.get(p_id_test)

#Results
distances, indices = knn.kneighbors(p_emb_test.reshape(1,-1),return_distance=True)
distances, indices = distances[0], indices[0]
_ids = results.loc[indices, 'id']
temp = df.loc[df['id'].isin(_ids),[
  'id','original_title','PROMPTS'
]].reset_index(drop=True)
temp['similarity'] = 1 - distances

for index, row in temp.iterrows():
    print(f"{index + 1}. Similarity: {row['similarity']:.4f}")
    print(row['PROMPTS'],'\n')

1. Similarity: 1.0000

TITLE: Toy Story
COLLECTION: Toy Story Collection
COMPANY NAME: Pixar Animation Studios
RELEASED: 1995
GENRES:Animation, Comedy, Family
 

2. Similarity: 0.7032

TITLE: Little Nemo: Adventures In Slumberland
COLLECTION: 
COMPANY NAME: 
RELEASED: 1989
GENRES:Adventure, Animation, Family, Fantasy
 

3. Similarity: 0.6576

TITLE: Toy Story 2
COLLECTION: Toy Story Collection
COMPANY NAME: Pixar Animation Studios
RELEASED: 1999
GENRES:Animation, Comedy, Family
 

4. Similarity: 0.5805

TITLE: Lilo & Stitch
COLLECTION: Lilo & Stitch Collection
COMPANY NAME: Walt Disney Pictures
RELEASED: 2002
GENRES:Animation, Family
 

5. Similarity: 0.5763

TITLE: Finding Nemo
COLLECTION: Finding Nemo Collection
COMPANY NAME: Pixar Animation Studios
RELEASED: 2003
GENRES:Animation, Family
 

6. Similarity: 0.5709

TITLE: The SpongeBob SquarePants Movie
COLLECTION: SpongeBob Collection
COMPANY NAME: Paramount Pictures
RELEASED: 2004
GENRES:Animation, Comedy, Family
 

7. Similarity: 0