In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity
import torch.nn.functional as F
import pandas as pd

In [2]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

### Obtaining the data

In [3]:
# Load the info
user_pref  = pd.read_csv(r'../../../user_preferences_full_cleaned.csv',sep=';')
filtered_data = pd.read_csv(r'../../../filtered_data.csv',sep=';')
user_pref.head(2)

Unnamed: 0,userId,PeliculasFavoritas,TitulosPeliculas,SeriesFavoritas,TitulosSeries,GenerosFavoritos,DetallesPeliculas,DetallesSeries,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,3169503e-8011-707f-a637-23e5f9a37d5d,653346;693134;996154,El reino del planeta de los simios;Dune: Parte...,121,Doctor Who,"['Accion', 'Aventura', 'Animacion', 'Comedia',...","[{""Titulo"": ""El reino del planeta de los simio...","[{""Titulo"": ""Doctor Who"", ""Sinopsis"": ""Doctor ...",,,,,
1,"vive entre la luz y la oscuridad, llena de per...",no pueden regresar a casa porque la ley los p...,los mutantes. Dotados de extranos y variados ...,aparecer en su concurso televisivo preferido....,"de lo contrario, su existencia no seria posib...","[{""Titulo"": ""Juego de tronos"", ""Sinopsis"": ""En...",,,,,,,


In [4]:
filtered_data.head(2)

Unnamed: 0,PlatformName,ExternalIds,CleanTitle,Deeplinks,Synopsis,Image,Genres,Cast,Crew,Directors
0,TMDB,11,La guerra de las galaxias,,"La princesa Leia, líder del movimiento rebelde...",/ahT4ObS7XKedQkOSpGr1wQ97aKA.jpg,"['Aventura', 'Acción', 'Ciencia ficción']",,,
1,TMDB,12,Buscando a Nemo,,"Nemo, un pececillo, hijo único muy querido y p...",/jPhak722pNGxQIXSEfeWIUqBrO5.jpg,"['Animación', 'Familia']",,,


### Sentences for vectorize from 'filtered_data'

In [5]:
# Sentences we want sentence embeddings for
filtered_data['sentences_to_embed'] = (filtered_data.CleanTitle.fillna('') + filtered_data.Synopsis.fillna('') + filtered_data.Genres.fillna(''))

# >>>> Here I make this attemp with only 25 to not to crash it all. Remove '.head(25)' once you're sure to send all the info to vectorize
sentences_from_filtered_data = filtered_data.sentences_to_embed.head(250).astype(str).tolist()


In [6]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Tokenize sentences
encoded_input = tokenizer(sentences_from_filtered_data, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings_from_filtered_data = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings from filtered_data:")
print(sentence_embeddings_from_filtered_data[0])

Sentence embeddings from filtered_data:
tensor([ 3.2867e-02, -6.8492e-02,  4.3265e-02, -1.1126e-01, -5.1304e-02,
         2.2752e-02,  3.1482e-03,  5.1561e-02,  2.6399e-02,  8.8553e-02,
         2.4195e-02, -3.4258e-03,  5.4374e-02, -6.8434e-02, -2.8044e-04,
        -3.0558e-02, -1.4775e-02,  2.6595e-02, -6.6962e-03,  8.9228e-02,
         1.5272e-01, -4.5240e-02, -3.4046e-02,  1.2077e-01, -4.9471e-02,
         3.7264e-02, -3.0139e-03, -3.4076e-02, -1.0938e-01, -5.4833e-02,
         1.8205e-02, -1.6173e-03,  3.2669e-02,  6.5272e-03, -5.8715e-03,
        -3.0002e-02,  3.3804e-02, -1.2000e-02,  3.8254e-02, -6.9378e-02,
        -6.7582e-02, -1.0259e-02,  1.9385e-02, -3.2723e-02,  3.2871e-03,
        -6.6965e-02, -3.5510e-02,  3.2741e-02,  2.5061e-02,  2.5534e-02,
         4.7079e-02, -1.6016e-02,  4.6271e-02, -3.6409e-02,  5.3023e-02,
         2.8934e-02, -4.9449e-02, -1.1708e-01,  1.6114e-01, -5.0066e-03,
         4.6694e-03,  1.3566e-01,  3.8408e-02,  3.9393e-02, -3.7163e-02,
        -9.

### Sentences for vectorize from 'user_preferences'

In [7]:
user_pref.head(2)

Unnamed: 0,userId,PeliculasFavoritas,TitulosPeliculas,SeriesFavoritas,TitulosSeries,GenerosFavoritos,DetallesPeliculas,DetallesSeries,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,3169503e-8011-707f-a637-23e5f9a37d5d,653346;693134;996154,El reino del planeta de los simios;Dune: Parte...,121,Doctor Who,"['Accion', 'Aventura', 'Animacion', 'Comedia',...","[{""Titulo"": ""El reino del planeta de los simio...","[{""Titulo"": ""Doctor Who"", ""Sinopsis"": ""Doctor ...",,,,,
1,"vive entre la luz y la oscuridad, llena de per...",no pueden regresar a casa porque la ley los p...,los mutantes. Dotados de extranos y variados ...,aparecer en su concurso televisivo preferido....,"de lo contrario, su existencia no seria posib...","[{""Titulo"": ""Juego de tronos"", ""Sinopsis"": ""En...",,,,,,,


In [8]:
# Sentences we want sentence embeddings for
user_pref['sentences_to_embed'] = (user_pref.TitulosPeliculas.fillna('') +
                                   user_pref.TitulosSeries.fillna('') + 
                                   user_pref.GenerosFavoritos.fillna('') +
                                   user_pref.DetallesPeliculas.fillna('') +
                                   user_pref.DetallesSeries.fillna(''))

# >>>> Here I make this attemp with only 25 to not to crash it all. Remove '.head(25)' once you're sure to send all the info to vectorize
sentences_from_user_pref = user_pref.sentences_to_embed.head(25).astype(str).tolist()

In [9]:
# Tokenize sentences
encoded_input = tokenizer(sentences_from_user_pref, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentences_embeddings_from_user_pref = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings from user_pref:")
print(sentences_embeddings_from_user_pref[0])

Sentence embeddings from user_pref:
tensor([-8.6983e-03,  1.9204e-02, -5.8308e-03, -2.3660e-02,  1.7928e-02,
         2.5893e-02, -2.8368e-02,  1.7070e-02,  6.3262e-02,  6.4569e-02,
         1.5900e-01, -4.2928e-02,  2.1678e-02, -9.4919e-03,  1.1661e-02,
        -3.6294e-03, -3.7695e-02,  7.3656e-02, -1.6738e-02,  8.6832e-02,
         7.0808e-02, -1.8388e-02, -9.0843e-03,  2.7893e-02, -1.1812e-01,
         2.6061e-02, -1.0339e-01,  3.6359e-02, -1.5528e-01, -4.7489e-02,
        -3.8649e-02,  8.6731e-02, -4.6083e-03, -4.1934e-02, -2.4812e-03,
         5.4407e-02,  6.8242e-03, -5.7281e-02, -1.5889e-02,  2.9276e-02,
        -1.2272e-01, -1.2706e-02, -1.0463e-02, -7.0701e-02, -3.8347e-02,
        -8.9384e-02,  4.0357e-02, -1.2260e-02,  4.9424e-02, -4.1969e-02,
        -6.5599e-02,  6.0545e-03, -6.0653e-02, -1.8321e-02, -4.7025e-02,
        -2.2548e-02, -3.8654e-02, -1.3480e-03,  6.8287e-02, -3.7368e-02,
        -1.8930e-02,  1.8002e-02, -4.1229e-02,  6.3963e-02,  2.4025e-02,
        -2.8520

In [10]:
# Select a user's embedding (first tensor as an example)
user_embedding = sentences_embeddings_from_user_pref[0]

# Calculate the cosine similarity between the user's embedding and all content embeddings
content_similarities = cosine_similarity(user_embedding.unsqueeze(0), sentence_embeddings_from_filtered_data)

# Sort the indices by similarity score in descending order
most_similar_indices = content_similarities.argsort(descending=True)

# Display the top indices and their similarity scores
print("Índices más similares:", most_similar_indices[:5])
print("Similitudes más altas:", content_similarities[most_similar_indices[:5]])


Índices más similares: tensor([ 54,   7, 237,  19, 171])
Similitudes más altas: tensor([0.7390, 0.6898, 0.6683, 0.6597, 0.6457])
