In [93]:
import ast
import json
import math
import numpy as np
import os
import pandas as pd
import requests
import time
import torch
import torch.nn.functional as F

from dotenv import load_dotenv
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
#from torch.nn.functional import cosine_similarity

In [94]:
# for uploading the environment variables and get the API key
load_dotenv()
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")

In [95]:
# Obtainig the data
user_pref  = pd.read_csv(r'../../../user_preferences_full_cleaned.csv',sep=';')
filtered_data = pd.read_csv(r'../../../filtered_data.csv',sep=';')

In [96]:
# Cleaning dfs
user_pref = user_pref[user_pref['userId'].str.len()==36]
user_pref.reset_index(inplace=True,drop=True)

### Sentences for vectorize from 'filtered_data'

In [97]:
# Making the sentences to embed 
filtered_data['sentences_to_embed'] = (
    filtered_data.CleanTitle.fillna('') +
    filtered_data.Synopsis.fillna('') +
    filtered_data.Genres.fillna('').apply(
        lambda x: ', '.join(ast.literal_eval(x)) if x.startswith('[') and x.endswith(']') else x
    )
)

# Formatting as str-list to send to the model
sentences_from_filtered_data = filtered_data.sentences_to_embed.dropna().astype(str).tolist()


In [98]:
# Since the model on Hugging Face processes only requests that can be completed within 60 seconds, we need to divide the sentences into batches.
def split_into_batches(sentences, batch_size):
    return [sentences[i:i + batch_size] for i in range(0, len(sentences), batch_size)]

# After trying with different values, we've reach the maximum batch size to get response succesfully
batches = split_into_batches(sentences_from_filtered_data, 100)

In [99]:
# Check key availability
if HUGGINGFACE_API_KEY is None:
    print("Error: No se encontró la clave de API de Hugging Face.")
else:
    print("Clave de API cargada correctamente.")

# Model URL
API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"

# API header and key
headers = {"Authorization": f"Bearer {HUGGINGFACE_API_KEY}"}  

# Function to get embeddings from Hugging Face API
def get_embeddings_from_api(sentences):
    url = API_URL
    payload = {"inputs": sentences}
    
    response = requests.post(url, headers=headers, json=payload)
    
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error {response.status_code}: {response.text}")
        return None
    

# Sequence Batch Processing Process
all_embeddings_from_filtered_data = []
for batch in batches:
    print(f"Processing batch with {len(batch)} sentences...")
    embeddings = get_embeddings_from_api(batch)
    if embeddings:
        all_embeddings_from_filtered_data.extend(embeddings)

print("Embeddings processed successfully:")
print(all_embeddings_from_filtered_data[:2])  

Clave de API cargada correctamente.
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 se

### Sentences for vectorize from 'user_preferences'

In [100]:
# Sentences we want to be embedded from user_preferences
user_pref['sentences_to_embed'] = (user_pref.TitulosPeliculas.fillna('') +
                                   user_pref.TitulosSeries.fillna('') + 
                                   user_pref.GenerosFavoritos.fillna('') +
                                   user_pref.DetallesPeliculas.fillna('') +
                                   user_pref.DetallesSeries.fillna(''))

# Formatting as str-list to send to the model
sentences_from_user_pref = user_pref.sentences_to_embed.dropna().astype(str).tolist()

# We split the sentences in batches as we did previously with filtered_data
batches_user_pref = split_into_batches(sentences_from_user_pref, 100)

In [101]:
# Sequence Batch Processing Process
all_embeddings_from_user_pref = []
for batch in batches_user_pref:
    print(f"Processing batch with {len(batch)} sentences...")
    embeddings = get_embeddings_from_api(batch)
    if embeddings:
        all_embeddings_from_user_pref.extend(embeddings)

print("Embeddings processed successfully:")
print(all_embeddings_from_user_pref[:2])  

Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100

### Similarities

In [102]:
# We have to convert the embeddings list to numpy arrays in order to calculate cosine similarities wiht sklearn
all_embeddings_from_user_pref_array = np.array(all_embeddings_from_user_pref)
all_embeddings_from_filtered_data_array = np.array(all_embeddings_from_filtered_data)

user_for_example = 5

# Taking first user as example to calculate the cosine_similarity
user_embedding_example = all_embeddings_from_user_pref_array[user_for_example].reshape(1, -1)  # Asegurar forma correcta para cosine_similarity

# To calculate similarity between the user example embeding and the whole content from filtered data
content_similarities = cosine_similarity(user_embedding_example, all_embeddings_from_filtered_data_array).flatten()

# Sort indexes by similarity
most_similar_indexes = content_similarities.argsort()[::-1]

# Top-10
topten_most_similar_indexes = most_similar_indexes[:10]

In [103]:
# To display the most similar indexes and their similarity scores
print("Most similar indexes:", most_similar_indexes[:10])
print("Highest similarities:", content_similarities[most_similar_indexes[:10]])

Most similar indexes: [ 740  560  318 1119 1065  911 3293 1067  171 3225]
Highest similarities: [0.82248653 0.65683419 0.65205892 0.63631373 0.63552686 0.63268192
 0.63263276 0.63172285 0.6300957  0.62968809]


In [104]:
# buscar indices de most_similar_indexes y traer los titulos de esas peliculas/series desde filtered data
user_id = user_pref.loc[user_for_example]['userId']
print(f'Recomentations for user: {user_id}')
filtered_data.loc[topten_most_similar_indexes]['CleanTitle']

Recomentations for user: 2149009e-2091-702e-5874-8a93ba7b0823


740                                Origen
560                            A todo gas
318                          Toro salvaje
1119                              Elysium
1065                        Desafío total
911     La jungla: Un buen día para morir
3293                      Triple frontera
1067                 Infiltrados en clase
171                     El último samurái
3225                El guardián invisible
Name: CleanTitle, dtype: object