In [2]:
import ast
import json
import math
import numpy as np
import os
import pandas as pd
import requests
import time
import torch
import torch.nn.functional as F

from dotenv import load_dotenv
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
#from torch.nn.functional import cosine_similarity

In [3]:
# for uploading the environment variables and get the API key
load_dotenv()
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")

In [4]:
# Obtainig the data
user_pref  = pd.read_csv(r'../../../user_preferences_full_cleaned.csv',sep=';')
filtered_data = pd.read_csv(r'../../../filtered_data.csv',sep=';')

In [5]:
# Cleaning dfs
user_pref = user_pref[user_pref['userId'].str.len()==36]
user_pref.reset_index(inplace=True,drop=True)

### Sentences for vectorize from 'filtered_data'

In [6]:
# Making the sentences to embed 
filtered_data['sentences_to_embed'] = (
    filtered_data.CleanTitle.fillna('') +
    filtered_data.Synopsis.fillna('') +
    filtered_data.Genres.fillna('').apply(
        lambda x: ', '.join(ast.literal_eval(x)) if x.startswith('[') and x.endswith(']') else x
    )
)

# Formatting as str-list to send to the model
sentences_from_filtered_data = filtered_data.sentences_to_embed.dropna().astype(str).tolist()


In [7]:
# Since the model on Hugging Face processes only requests that can be completed within 60 seconds, we need to divide the sentences into batches.
def split_into_batches(sentences, batch_size):
    return [sentences[i:i + batch_size] for i in range(0, len(sentences), batch_size)]

# After trying with different values, we've reach the maximum batch size to get response succesfully
batches = split_into_batches(sentences_from_filtered_data, 100)

In [8]:
# Check key availability
if HUGGINGFACE_API_KEY is None:
    print("Error: No se encontró la clave de API de Hugging Face.")
else:
    print("Clave de API cargada correctamente.")

# Model URL
API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"

# API header and key
headers = {"Authorization": f"Bearer {HUGGINGFACE_API_KEY}"}  

# Function to get embeddings from Hugging Face API
def get_embeddings_from_api(sentences):
    url = API_URL
    payload = {"inputs": sentences}
    
    response = requests.post(url, headers=headers, json=payload)
    
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error {response.status_code}: {response.text}")
        return None
    

# Sequence Batch Processing Process
all_embeddings_from_filtered_data = []
for batch in batches:
    print(f"Processing batch with {len(batch)} sentences...")
    embeddings = get_embeddings_from_api(batch)
    if embeddings:
        all_embeddings_from_filtered_data.extend(embeddings)

print("Embeddings processed successfully:")
print(all_embeddings_from_filtered_data[:2])  

Clave de API cargada correctamente.
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Error 503: {"error":"Model sentence-transformers/all-MiniLM-L6-v2 is currently loading","estimated_time":20.0}
Processing batch with 100 sentences...
Error 503: {"error":"Model sentence-transformers/all-MiniLM-L6-v2 is currently loading","estimated_time":20.0}
Processing batch with 100 sentences...
Error 503: {"error":"Model sentence-transformers/all-MiniLM-L6-v2 is currently loading","estimated_time":20.0}
Processing batch with 100 sentences...
Error 503: {"error":"Model sentence-transformers/all-MiniLM-L6-v2 is currently loading","estimated_time":20.0}
Processing batch with 100 sentences...
Error 503: {"

### Sentences for vectorize from 'user_preferences'

In [9]:
# Sentences we want to be embedded from user_preferences
user_pref['sentences_to_embed'] = (user_pref.TitulosPeliculas.fillna('') +
                                   user_pref.TitulosSeries.fillna('') + 
                                   user_pref.GenerosFavoritos.fillna('') +
                                   user_pref.DetallesPeliculas.fillna('') +
                                   user_pref.DetallesSeries.fillna(''))

# Formatting as str-list to send to the model
sentences_from_user_pref = user_pref.sentences_to_embed.dropna().astype(str).tolist()

# We split the sentences in batches as we did previously with filtered_data
batches_user_pref = split_into_batches(sentences_from_user_pref, 100)

In [10]:
# Sequence Batch Processing Process
all_embeddings_from_user_pref = []
for batch in batches_user_pref:
    print(f"Processing batch with {len(batch)} sentences...")
    embeddings = get_embeddings_from_api(batch)
    if embeddings:
        all_embeddings_from_user_pref.extend(embeddings)

print("Embeddings processed successfully:")
print(all_embeddings_from_user_pref[:2])  

Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100

### Similarities

In [24]:
# We have to convert the embeddings list to numpy arrays in order to calculate cosine similarities wiht sklearn
all_embeddings_from_user_pref_array = np.array(all_embeddings_from_user_pref)
all_embeddings_from_filtered_data_array = np.array(all_embeddings_from_filtered_data)

user_for_example = 1

# Taking first user as example to calculate the cosine_similarity
user_embedding_example = all_embeddings_from_user_pref_array[user_for_example].reshape(1, -1)  # Asegurar forma correcta para cosine_similarity

# To calculate similarity between the user example embeding and the whole content from filtered data
content_similarities = cosine_similarity(user_embedding_example, all_embeddings_from_filtered_data_array).flatten()

# Sort indexes by similarity
most_similar_indexes = content_similarities.argsort()[::-1]

# Top-10
topten_most_similar_indexes = most_similar_indexes[:10]

In [25]:
# To display the most similar indexes and their similarity scores
print("Most similar indexes:", most_similar_indexes[:10])
print("Highest similarities:", content_similarities[most_similar_indexes[:10]])

Most similar indexes: [  54   61 2487 1256 4190   72   38  584 1896  666]
Highest similarities: [0.76089586 0.71299264 0.64834726 0.64739428 0.62114582 0.61801408
 0.61334271 0.6105758  0.61034363 0.60730179]


In [28]:
# search most_similar_indexes, and preferences, and get recommendations
user_id = user_pref.loc[user_for_example]['userId']
movies_preferred = user_pref[user_pref['userId']==user_id]['TitulosPeliculas']
series_preferred = user_pref[user_pref['userId']==user_id]['TitulosSeries']

# Displaying Preferences & Recommendations
print(f'''User {user_id} Preferences:
      ''')
print(f'''Movies preference:''')
for movie in movies_preferred.iloc[0].split(';'):
    print(f'      {movie.strip()}')

print(f'''TV Shows preference:''')
for series in series_preferred.iloc[0].split(';'):
    print(f'      {series.strip()}')

print(f'''Recomendations for user: {user_id}
      ''')

recomendations_user = filtered_data.loc[topten_most_similar_indexes]['CleanTitle']
for recommendation in recomendations_user:
    print(f'      {recommendation}')


User e129f0be-7021-70f6-8c43-24806020bacd Preferences:
      
Movies preference:
      Regreso al futuro
      Toy Story
      Ratatouille
      Bohemian Rhapsody
      La ciudad de las estrellas (La La Land)
TV Shows preference:
      Yo nunca
      Big Time Rush
      Victorious
      Con amor, Victor
      O11CE
Recomendations for user: e129f0be-7021-70f6-8c43-24806020bacd
      
      Regreso al futuro II
      Regreso al futuro III
      Think Thank Presents: Think Thank Almanac
      Guardaespaldas
      Living the Dream Tour
      Alta fidelidad
      El gran Lebowski
      La dama y el vagabundo
      Ney Matogrosso Interpreta Cartola
      El niño con el pijama de rayas
