<a href="https://colab.research.google.com/github/FelipeAce96/Movies-Embeddings-Recommender/blob/master/MOVIES_EMBEDDING_RECOMMENDER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INSTALL DEPENDENCIES

In [1]:
!pip install InstructorEmbedding
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!nvidia-smi

Tue Jun  6 20:07:44 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Helper Functions
import numpy as np
import torch
import gc

#Clean GPU
def clean_gpu():
  gc.collect()
  torch.cuda.empty_cache()
  return

#Split in batches
def divide_lista_en_batches(lista, tamaño_batch):
    batches = []
    for i in range(0, len(lista), tamaño_batch):
        batch = lista[i:i + tamaño_batch]
        batches.append(batch)
    return batches

# CUDA is available?
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

# READ THE DATASET

In [4]:
import numpy as np
import pandas as pd
import json
pd.options.display.max_columns=None

df = pd.read_csv('/content/movies_metadata.csv')
print(df.shape)
df.head(2)

(45466, 24)


  df = pd.read_csv('/content/movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [5]:
# drop duplicates

df = df.drop_duplicates(subset = ['id']).reset_index(drop=True)

In [6]:
df['belongs_to_collection'].iloc[0]

"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}"

In [7]:
df.dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

In [8]:
df.isna().sum()

adult                        0
belongs_to_collection    40945
budget                       0
genres                       0
homepage                 37659
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25035
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [9]:
# Formats
df['year']=pd.to_datetime(df['release_date'], errors= 'coerce').dt.strftime('%Y')

## Create the Prompt

In [10]:
template = (
'''MOVIE FEATURES
TITLE: {title}
COLLECTION: {collection}
COMPANY NAME: {company_name}
RELEASED: {released_date}
OVREVIEW: {overview}
GENRES:
{genres}
'''
)

In [11]:
row = df.loc[0]
row

adult                                                                False
belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
homepage                              http://toystory.disney.com/toy-story
id                                                                     862
imdb_id                                                          tt0114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     

In [12]:
title = row['original_title']
overview = row['overview']
company_name = json.loads(row['production_companies'].replace("'",'"'))[0]['name'] if str(row['production_companies'])!= 'nan' else ''
released_date = row['year'] if str(row['year'])!= 'nan' else ''
collection = json.loads(row['belongs_to_collection'].replace("'",'"')).get('name') if str(row['belongs_to_collection'])!= 'nan' else ''
genres = "\n".join([f'\t{e.get("name")}' for e in  json.loads(row['genres'].replace("'",'"'))]) if str(row['genres'])!= 'nan' else ''

In [13]:
prompt = template.format(
    title=title,
    overview = overview,
    company_name = company_name,
    released_date = released_date,
    collection = collection,
    genres = genres
)
print(prompt)

MOVIE FEATURES
TITLE: Toy Story
COLLECTION: Toy Story Collection
COMPANY NAME: Pixar Animation Studios
RELEASED: 1995
OVREVIEW: Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.
GENRES:
	Animation
	Comedy
	Family



In [14]:
def create_prompt(row):
  try:
    title = row['original_title']
    overview = row['overview']
    try:
      company_name = json.loads(row['production_companies'].replace("'",'"'))[0]['name'] if str(row['production_companies'])!= 'nan' else ''
    except:
      company_name = ''
    released_date = row['year'] if str(row['year'])!= 'nan' else ''
    collection = json.loads(row['belongs_to_collection'].replace("'",'"')).get('name') if str(row['belongs_to_collection'])!= 'nan' else ''
    genres = "\n".join([f'\t{e.get("name")}' for e in  json.loads(row['genres'].replace("'",'"'))]) if str(row['genres'])!= 'nan' else ''
    prompt = template.format(
        title=title,
        overview = overview,
        company_name = company_name,
        released_date = released_date,
        collection = collection,
        genres = genres
    )
    return prompt
  except Exception as e:
    print(e)
    return np.nan

In [15]:
df['PROMPTS'] = df.apply(lambda row: create_prompt(row), axis=1)

Expecting value: line 1 column 116 (char 115)
Expecting value: line 1 column 111 (char 110)
Expecting value: line 1 column 117 (char 116)
Expecting value: line 1 column 114 (char 113)
Expecting value: line 1 column 72 (char 71)
Expecting value: line 1 column 65 (char 64)
Expecting value: line 1 column 58 (char 57)
Expecting value: line 1 column 115 (char 114)
Expecting value: line 1 column 118 (char 117)
Expecting value: line 1 column 122 (char 121)
Expecting value: line 1 column 120 (char 119)
Expecting value: line 1 column 121 (char 120)
Expecting value: line 1 column 142 (char 141)
Expecting value: line 1 column 122 (char 121)
Expecting value: line 1 column 126 (char 125)
Expecting value: line 1 column 129 (char 128)
Expecting value: line 1 column 120 (char 119)
Expecting value: line 1 column 65 (char 64)
Expecting value: line 1 column 120 (char 119)
Expecting value: line 1 column 121 (char 120)
Expecting value: line 1 column 117 (char 116)
Expecting ',' delimiter: line 1 column 43 

In [16]:
df= df.dropna(subset=['PROMPTS']).reset_index(drop=True)
df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,PROMPTS
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,MOVIE FEATURES\nTITLE: Toy Story\nCOLLECTION: ...
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,MOVIE FEATURES\nTITLE: Jumanji\nCOLLECTION: \n...


In [17]:
prompts = df['PROMPTS'].to_list()

In [18]:
import random
print(random.choice(prompts))

MOVIE FEATURES
TITLE: 127 Hours
COLLECTION: 
COMPANY NAME: Fox Searchlight Pictures
RELEASED: 2010
OVREVIEW: The true story of mountain climber Aron Ralston's remarkable adventure to save himself after a fallen boulder crashes on his arm and traps him in an isolated canyon in Utah.
GENRES:
	Adventure
	Drama
	Thriller



# LOAD THE MODEL

In [19]:
from InstructorEmbedding import INSTRUCTOR
model = INSTRUCTOR('hkunlp/instructor-large')
sentence = "3D ActionSLAM: wearable person tracking in multi-floor environments"
instruction = "Represent the Science title:"
embeddings = model.encode([[instruction,sentence]])
print(embeddings.shape)

  from tqdm.autonotebook import trange


Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)c7233/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

Downloading (…)15c7233/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512
(1, 768)


In [20]:
instruction = 'Represent the movie:'
input_texts = [[instruction, prompt] for prompt in prompts]
len(input_texts)

44111

In [21]:
# Run in Batches

batch_size = 128
batches = divide_lista_en_batches(input_texts, batch_size)
print(len(batches))

345


In [22]:
batches[0][0]

['Represent the movie:',
 "MOVIE FEATURES\nTITLE: Toy Story\nCOLLECTION: Toy Story Collection\nCOMPANY NAME: Pixar Animation Studios\nRELEASED: 1995\nOVREVIEW: Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.\nGENRES:\n\tAnimation\n\tComedy\n\tFamily\n"]

In [23]:
import numpy as np
results = np.zeros((len(prompts), embeddings.shape[1]))
results.shape

(44111, 768)

In [24]:
clean_gpu()

In [25]:
%%time
for i, batch in enumerate(batches):
  #Sentences are encoded by calling model.encode()
  embeddings = model.encode(batch)
  results[i* batch_size : (i+1)* batch_size] = embeddings
  if i % 50 == 0:
    print(f'Batch number: {i}')
    clean_gpu()

Batch number: 0
Batch number: 50
Batch number: 100
Batch number: 150
Batch number: 200
Batch number: 250
Batch number: 300
CPU times: user 27min 54s, sys: 6.43 s, total: 28min
Wall time: 27min 29s


In [26]:
results

array([[-0.02746302,  0.02125845, -0.03350887, ..., -0.03979352,
         0.05030221,  0.02034248],
       [-0.07327545,  0.00317347, -0.01115879, ..., -0.0335513 ,
         0.03612265,  0.01066509],
       [-0.04299108,  0.01801763,  0.00998771, ..., -0.04291728,
         0.03587092,  0.03809695],
       ...,
       [-0.07294752,  0.00906606, -0.00939354, ..., -0.01449475,
         0.00298197,  0.02695587],
       [-0.05927648,  0.02055962,  0.00771868, ..., -0.03929051,
         0.01854493,  0.03403043],
       [-0.03773106,  0.0425097 ,  0.0140347 , ..., -0.03719359,
         0.01696873,  0.05181765]])

In [27]:
from sklearn.decomposition import PCA
#pca algorithm
modelo_pca = PCA(n_components=128)
emb_pca = modelo_pca.fit_transform(results)
explained_variance = np.sum(modelo_pca.explained_variance_ratio_)
print(f'Explained Variance: {explained_variance}')
emb_pca.shape

Explained Variance: 0.7103661295382249


(44111, 128)

In [28]:
# Test if embeddings make sense
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors=25, algorithm='brute', metric='cosine')
knn.fit(emb_pca)

In [29]:
df.sort_values(by='revenue', ascending=False).head(35)[['id','title']]

Unnamed: 0,id,title
14083,19995,Avatar
25735,140607,Star Wars: The Force Awakens
1605,597,Titanic
17262,24428,The Avengers
24322,135397,Jurassic World
27936,168259,Furious 7
25738,99861,Avengers: Age of Ultron
16886,12445,Harry Potter and the Deathly Hallows: Part 2
21437,109445,Frozen
40948,321612,Beauty and the Beast


In [30]:
# p_id_test = 1262642
p_id_test = "209112"
p_name_test = df.loc[df['id'] == p_id_test,'original_title'].values[0]
print(f'Test Movie: {p_name_test} ')
idx = df.loc[df['id']== p_id_test].index.values[0]
p_emb_test = emb_pca[idx]
p_emb_test.shape

Test Movie: Batman v Superman: Dawn of Justice 


(128,)

In [31]:
distances, indices = knn.kneighbors(p_emb_test.reshape(1,-1),return_distance=True)
distances, indices = distances[0], indices[0]
temp = df.iloc[indices][['id','original_title','belongs_to_collection','release_date']]
temp['similarity'] = 1 - distances
temp

Unnamed: 0,id,original_title,belongs_to_collection,release_date,similarity
30092,209112,Batman v Superman: Dawn of Justice,"{'id': 209131, 'name': 'Man of Steel Collectio...",2016-03-23,1.0
40962,408220,Justice League Dark,,2017-01-24,0.675949
18440,103269,Superman vs. The Elite,,2012-06-12,0.669103
31244,126712,Superman,,1948-07-15,0.633796
20437,49521,Man of Steel,"{'id': 209131, 'name': 'Man of Steel Collectio...",2013-06-12,0.630394
9781,272,Batman Begins,"{'id': 263, 'name': 'The Dark Knight Collectio...",2005-06-10,0.627511
29587,297761,Suicide Squad,,2016-08-02,0.620714
30476,323027,Justice League: Gods and Monsters,,2015-07-28,0.610353
12440,13851,Batman: Gotham Knight,,2008-07-03,0.608096
19616,142061,"Batman: The Dark Knight Returns, Part 2","{'id': 248534, 'name': 'Batman: The Dark Knigh...",2013-01-18,0.600447


In [32]:
df['id'].unique().shape, df.shape

((44111,), (44111, 26))

In [33]:
# SAVE THE EMBEDDINGS

df_final = pd.DataFrame(emb_pca)
df_final['index'] = df.index
df_final['id']= df['id'].copy()
df_final.tail(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,index,id
44110,0.071556,-0.014197,0.001722,-0.022765,-0.033013,-0.039907,0.081867,-0.00292,-0.01099,-0.038553,0.075607,0.006512,-0.017914,0.015865,-0.019535,0.037231,0.058021,-0.030515,-0.038144,0.047793,-0.001473,-0.014805,-0.058162,0.058656,-0.021303,-0.044224,0.010252,0.026348,-0.004067,0.007567,-0.01366,0.017469,-0.032483,0.006104,0.054968,-0.029459,-0.054557,0.015556,0.004617,0.023314,-0.029687,0.024915,0.038768,-0.00173,-0.026557,0.040181,-0.006223,0.01107,0.017848,0.013274,-7e-05,-0.036463,0.013048,-0.012038,0.015352,-0.030557,0.057593,0.043619,-0.054329,-0.03991,-0.045836,-0.046153,0.028609,-0.012168,0.004227,0.04444,0.001662,0.022607,0.0043,-0.039055,0.024142,-0.039478,-0.029935,-0.050255,-0.018726,0.031394,0.005837,0.021595,-0.064409,0.025494,-0.018357,0.012532,0.005921,-0.005455,-0.035631,0.036093,0.000843,0.024796,0.032629,-0.01925,-0.014187,0.063244,0.033437,-0.056163,-0.003104,-0.000779,-0.039811,-0.023388,-0.019846,0.033164,-0.034154,0.011724,0.016376,-0.015513,0.004132,0.074506,-0.02683,-0.022107,0.033023,-0.016515,-0.019092,0.018523,0.035052,0.009758,0.011224,-0.002693,0.008072,-0.043035,0.009369,0.040761,-0.01332,0.010675,-0.020795,-0.006288,0.040684,0.044635,-0.007005,0.017151,44110,461257


In [62]:
df_final.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,index,id
0,-0.010352,-0.109725,-0.062225,-0.100388,0.054307,0.066607,0.013209,0.014714,-0.071336,-0.038813,0.018894,0.011224,-0.003395,-0.09681,0.039129,-0.129869,0.008544,-0.032298,0.018761,-0.042438,0.057079,-0.013376,-0.010261,0.001951,-0.013345,0.022643,0.071815,0.007298,-0.042318,-0.006264,0.010947,-0.039461,0.052103,-0.000242,-0.035527,0.005191,-0.010463,-0.006173,0.071666,-0.009036,-0.020215,-0.010564,-0.000572,0.032483,0.005065,0.019375,-0.01662,0.010953,0.029225,-0.028295,-0.033211,-0.032334,-0.031656,0.029124,0.031891,0.025658,0.016678,0.026943,-0.028055,-0.007799,0.001954,-0.029574,0.024506,-0.007716,0.01282,-0.014218,-0.022681,0.027775,-0.034297,0.021132,0.012322,-0.040865,-0.023167,-0.003735,0.03398,0.031559,-0.019893,-0.012157,-0.002902,-0.002385,-0.048046,-0.01693,-0.027603,-0.011605,-0.00563,0.00957,0.003939,0.036371,0.029462,-0.004575,-0.005465,0.02449,-0.028694,-0.031127,0.019757,-0.00863,-0.025559,-0.013616,0.010084,-0.01944,-0.009672,0.012875,-0.034693,0.007695,-0.04646,-0.03511,0.015021,0.028285,0.006852,-0.006141,-0.018861,0.012544,0.034802,0.007092,0.01917,0.004868,-0.007822,0.036249,-0.018969,-0.011178,-0.007823,-0.022376,0.003776,-0.028548,-0.001008,0.010152,0.014478,-0.001312,0,862
1,-0.073505,-0.067131,-0.045962,-0.057251,0.126018,0.073942,-0.053341,0.012343,-0.016966,0.002733,-0.009679,0.012093,-0.006149,-0.062374,-0.044416,-0.067908,0.041868,-0.026769,-0.054938,-0.020103,-0.024177,0.024219,-0.019293,0.018133,-0.01036,0.00851,-0.002245,-0.013853,-0.03841,0.008494,0.059245,-0.083403,0.039006,-0.061262,0.015955,0.000994,-0.02311,0.026382,-0.077564,-0.001058,0.010966,0.023934,-0.024722,-0.002422,0.003183,-0.001043,-0.046377,0.061106,0.105467,0.027892,0.02717,-0.03121,0.031523,-0.017877,0.031253,0.020328,-0.015824,-0.025856,-0.01202,0.013494,0.001821,-0.000193,-0.01679,-0.01956,0.029921,-0.0189,0.026806,-0.017743,-0.019631,-0.028259,-0.003501,0.006191,-0.0418,0.026505,0.014352,0.072209,-0.028924,0.002875,-0.005569,0.024378,-0.006681,-0.026937,0.056024,0.036959,0.012524,-0.017657,-0.030983,0.00662,0.00286,0.036689,-0.028491,-0.015895,-0.051001,-0.041594,-0.004992,-0.030332,-0.003412,-0.032847,-0.004332,0.009098,0.013756,0.015122,0.004724,0.002202,0.002137,0.028135,-0.024062,-0.013961,-0.009638,-0.020476,-0.036396,-0.00997,-0.016877,-0.009229,0.022892,-0.013186,-0.011428,-0.022163,-0.004308,-0.005794,-0.006943,-0.031027,0.008236,0.003521,-0.003926,0.002674,-0.015579,0.010915,1,8844
2,0.04274,-0.125301,0.048395,-0.050283,0.039837,0.064085,-0.025201,0.000854,0.009713,-0.008938,-0.04624,0.012983,-0.108286,-0.02046,-0.09355,0.023581,0.001775,0.029975,0.026769,-0.001529,-0.007545,-0.001246,-0.020216,0.044947,0.003342,0.01035,0.006991,-0.008868,-0.029886,0.02983,0.046528,-0.020007,0.010025,-0.071954,0.012266,0.009602,-0.057991,0.008122,0.038744,-0.034821,-0.047871,0.030035,-0.041303,0.001753,0.021189,-0.004794,-0.03967,0.015217,0.010119,-0.057927,0.027466,-0.057817,-0.009016,0.016186,0.011137,-0.011707,0.005668,-0.001572,0.021838,0.008052,-0.001609,-0.057741,0.021924,-0.032519,0.019263,0.020655,0.012623,0.022378,-0.00499,0.010146,-0.026451,0.006482,-0.023838,-0.038005,0.01974,-0.00734,0.017404,0.016608,-0.007577,-0.003612,0.01031,-0.017884,-0.008681,-0.022086,-0.040261,-0.026674,0.042254,-0.013268,0.015312,0.039376,-0.001309,-0.047317,0.011568,0.010592,-0.047776,0.023583,0.016352,-0.003821,-0.002179,-0.007442,-0.014977,0.000523,0.053799,-0.03855,0.027721,0.001053,-0.004366,0.005495,-0.03467,0.020452,-0.030732,-0.01159,0.022934,0.01434,-0.010956,0.010093,0.017052,0.014904,0.034133,-0.043057,0.028795,-0.011159,0.013353,0.021073,0.009022,-0.006534,0.005873,0.003102,2,15602
3,0.071676,-0.108978,-0.01232,0.044606,0.000823,-0.071537,-0.007767,0.025085,-0.04076,0.062478,-0.006376,-0.018335,-0.016748,0.017802,-0.044839,0.015175,0.006037,-0.004343,0.041434,-0.01782,0.04415,0.014084,-0.041041,0.024761,0.01261,0.022253,0.004834,-0.031334,-0.01719,-0.008224,0.010153,-0.013286,0.000788,-0.028615,0.010371,0.009229,-0.048306,-0.002773,0.030067,-0.002685,0.012992,-0.029494,0.000304,-0.011143,0.003202,0.011456,-0.0033,0.017431,0.023496,-0.020742,0.037005,0.011057,-0.006877,0.026224,-0.034425,0.030396,0.042336,0.016443,0.02427,0.038726,0.017738,0.015275,-0.020615,-0.026062,0.007494,0.002563,-0.010629,0.00891,0.027643,0.002799,-0.032043,-0.02043,-0.005891,-0.022443,-0.029185,0.000271,0.021481,0.005915,-0.001354,0.050914,0.011651,-0.030891,-0.020011,0.006098,0.006561,-0.02075,0.031901,-0.050901,0.02269,-0.020439,-0.022364,-0.023124,-0.032972,-0.024057,-0.003157,-0.017646,-0.01279,0.028737,0.038249,-0.015236,-0.019265,0.024384,0.013817,-0.005071,0.053047,0.023339,0.058388,0.011948,0.017921,-0.057032,0.018751,0.022899,0.029178,-0.01648,0.014361,0.012218,0.035077,0.019247,0.003085,-0.049072,-0.020642,-0.043032,0.008131,0.022873,-0.020976,-0.022131,0.018358,0.004523,3,31357
4,0.064185,-0.114923,0.004181,0.011906,-0.016618,0.018914,-0.059879,-0.017174,0.001549,-0.032225,-0.016199,0.031824,-0.048488,-0.090421,-0.080895,-0.00662,0.083604,0.001838,0.000922,-0.047553,-0.027574,-0.063761,-0.06254,0.004902,-0.052422,0.026224,-0.011361,-0.079242,0.029631,-0.022124,-0.035761,0.019713,-0.040178,0.03224,-0.008216,0.048503,-0.046916,0.029809,-2e-05,-0.028255,-0.054103,0.05358,-0.067148,-0.046304,-0.013408,-0.041302,0.0326,0.024252,-0.002614,0.013464,0.029073,0.00042,-0.003712,0.029087,-0.004743,-0.010554,-0.008093,-0.01929,0.037916,-0.023638,0.05594,0.015763,0.037072,-0.003775,0.007076,0.034775,0.02984,0.026517,0.015665,0.00149,0.001433,-0.030828,-0.002123,0.000118,-0.020077,0.018921,-0.032039,0.006518,-0.049314,-0.008816,-0.028701,-0.018161,0.041745,0.058944,-0.010406,-0.021492,-0.02646,0.004817,-0.036373,0.017688,0.00617,-0.033828,0.006414,0.002262,0.02968,-0.021984,0.022156,-0.01878,0.037965,-0.006522,-0.03579,-0.017818,0.025243,-0.018072,-0.020283,-0.025662,0.043904,0.003447,-0.016642,-0.004111,-0.025701,0.007105,0.00994,0.024935,0.014572,-0.035795,-0.033443,-0.002981,-0.002774,0.006062,0.005394,-0.009143,-0.011988,0.012007,-0.014966,0.044435,0.015945,0.014108,4,11862


In [59]:
results = df_final[[i for i in range(128)]].to_numpy()
results.shape

(44111, 128)

In [60]:
ids = df_final['id'].copy()

In [74]:
embeddings_dict  = {}

# Recorrer la serie e imprimir el índice y el valor
for indice, movie_id in ids.iteritems():
    embeddings_dict [str(movie_id)] = results[indice]

  for indice, movie_id in ids.iteritems():


In [77]:
# Embedding from toy story
embeddings_dict.get('862')

array([-0.01035223, -0.10972525, -0.06222505, -0.1003879 ,  0.05430709,
        0.06660701,  0.01320904,  0.01471363, -0.07133633, -0.03881311,
        0.01889355,  0.01122382, -0.00339526, -0.09680998,  0.0391289 ,
       -0.1298691 ,  0.00854411, -0.03229799,  0.01876071, -0.04243776,
        0.05707888, -0.01337589, -0.01026136,  0.00195072, -0.01334548,
        0.02264325,  0.07181543,  0.00729827, -0.04231753, -0.00626432,
        0.01094671, -0.03946125,  0.0521028 , -0.00024185, -0.03552674,
        0.00519065, -0.01046341, -0.00617261,  0.07166632, -0.00903567,
       -0.0202154 , -0.01056421, -0.00057193,  0.03248255,  0.00506453,
        0.01937478, -0.0166199 ,  0.01095301,  0.02922454, -0.0282946 ,
       -0.03321112, -0.03233412, -0.03165616,  0.02912424,  0.03189067,
        0.02565843,  0.01667795,  0.02694346, -0.02805517, -0.00779936,
        0.00195383, -0.02957352,  0.02450601, -0.00771578,  0.01281997,
       -0.01421824, -0.02268125,  0.02777522, -0.03429703,  0.02

In [78]:
import h5py


# Crear un archivo HDF5
with h5py.File("movie_embeddings.h5", "w") as archivo:
    grupo = archivo.create_group("embeddings")

    # Guardar los embeddings en el grupo
    for clave, embedding in embeddings_dict.items():
        grupo.create_dataset(clave, data=embedding)

In [79]:
import h5py

# Cargar los embeddings desde el archivo HDF5
with h5py.File("movie_embeddings.h5", "r") as archivo:
    grupo = archivo["embeddings"]
    embeddings_cargados = {clave: grupo[clave][:] for clave in grupo}


In [80]:
embeddings_cargados.get('862')

array([-0.01035223, -0.10972525, -0.06222505, -0.1003879 ,  0.05430709,
        0.06660701,  0.01320904,  0.01471363, -0.07133633, -0.03881311,
        0.01889355,  0.01122382, -0.00339526, -0.09680998,  0.0391289 ,
       -0.1298691 ,  0.00854411, -0.03229799,  0.01876071, -0.04243776,
        0.05707888, -0.01337589, -0.01026136,  0.00195072, -0.01334548,
        0.02264325,  0.07181543,  0.00729827, -0.04231753, -0.00626432,
        0.01094671, -0.03946125,  0.0521028 , -0.00024185, -0.03552674,
        0.00519065, -0.01046341, -0.00617261,  0.07166632, -0.00903567,
       -0.0202154 , -0.01056421, -0.00057193,  0.03248255,  0.00506453,
        0.01937478, -0.0166199 ,  0.01095301,  0.02922454, -0.0282946 ,
       -0.03321112, -0.03233412, -0.03165616,  0.02912424,  0.03189067,
        0.02565843,  0.01667795,  0.02694346, -0.02805517, -0.00779936,
        0.00195383, -0.02957352,  0.02450601, -0.00771578,  0.01281997,
       -0.01421824, -0.02268125,  0.02777522, -0.03429703,  0.02