In [1]:
import ast
import boto3
import json
import math
import numpy as np
import os
import pandas as pd
import requests
import time
import torch
import torch.nn.functional as F

from botocore.exceptions import ClientError
from decimal import Decimal
from dotenv import load_dotenv
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
#from torch.nn.functional import cosine_similarity

In [2]:
# Load variables from the .env file into the process environment, then read
# the credentials this notebook needs (each one is None when it is not set).
load_dotenv()
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY")
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_KEY")

### User Preferences from DynamoDB

In [13]:
# Connection settings and the attributes to read from the user-preferences table
CONFIG = dict(
    aws=dict(
        access_key=AWS_ACCESS_KEY,
        secret_key=AWS_SECRET_KEY,
        region='eu-west-3',
        table='User-7kkcm5dn2rb77hst5nh7gbdisa-staging',
    ),
    columns=['userId', 'favoriteMoviesIds', 'favoriteGenresIds', 'favoriteSeriesIds'],
)

# Open a boto3 session with the credentials and region configured above
session = boto3.Session(
    region_name=CONFIG['aws']['region'],
    aws_access_key_id=CONFIG['aws']['access_key'],
    aws_secret_access_key=CONFIG['aws']['secret_key'],
)

# Handle to the DynamoDB table that stores the user preferences
dynamodb = session.resource('dynamodb')
table = dynamodb.Table(CONFIG['aws']['table'])

# Values to String
def _process_value(value):
    if isinstance(value, Decimal):
        return str(int(value))
    return str(value)

def fetch_preferences():
    """Scan the DynamoDB table and return the user preferences as a DataFrame.

    The scan is paginated through ``LastEvaluatedKey`` / ``ExclusiveStartKey``
    and only requests the attributes listed in ``CONFIG['columns']``.

    Returns:
        pandas.DataFrame: one row per scanned item with the string columns
        ``userId``, ``favoriteMoviesIds``, ``favoriteGenresIds`` and
        ``favoriteSeriesIds``; the three list attributes are joined with
        ``';'``. The frame always carries these four columns, even when the
        table is empty or the scan fails with a ``ClientError``, so column
        access in later cells does not raise ``KeyError``.
    """
    output_columns = ['userId', 'favoriteMoviesIds', 'favoriteGenresIds', 'favoriteSeriesIds']

    # The projection is the same for every page, so build the parameters once
    scan_params = {'ProjectionExpression': ', '.join(CONFIG['columns'])}

    try:
        items = []
        while True:
            response = table.scan(**scan_params)
            items.extend(response.get('Items', []))

            # A missing LastEvaluatedKey means there are no more pages
            start_key = response.get('LastEvaluatedKey')
            if not start_key:
                break
            scan_params['ExclusiveStartKey'] = start_key
    except ClientError as e:
        print(f"Error al conectar con DynamoDB: {e}")
        return pd.DataFrame(columns=output_columns)

    def _join_ids(values):
        # `or ()` covers attributes that are absent or present but None,
        # which would otherwise make map() raise TypeError
        return ';'.join(map(_process_value, values or ()))

    processed_data = [{
        'userId': _process_value(item.get('userId', '')),
        'favoriteMoviesIds': _join_ids(item.get('favoriteMoviesIds')),
        'favoriteGenresIds': _join_ids(item.get('favoriteGenresIds')),
        'favoriteSeriesIds': _join_ids(item.get('favoriteSeriesIds')),
    } for item in items]

    # Passing the columns explicitly keeps the schema stable for an empty scan
    return pd.DataFrame(processed_data, columns=output_columns)

# Load the user preferences from DynamoDB into a DataFrame
user_pref = fetch_preferences()


In [14]:
# Keep only complete users: a 36-character userId plus non-empty genre,
# movie and series favourites.
has_uuid_length = user_pref['userId'].str.len() == 36
user_pref = (
    user_pref[has_uuid_length]
    .replace("", pd.NA)      # empty strings become missing values so dropna() removes them
    .dropna()
    .reset_index(drop=True)
)
print(f'Duplicates: {user_pref.duplicated().sum()}')
user_pref.info()

Duplicates: 0
<class 'pandas.core.frame.DataFrame'>
Index: 5133 entries, 0 to 13665
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   userId             5133 non-null   object
 1   favoriteMoviesIds  5133 non-null   object
 2   favoriteGenresIds  5133 non-null   object
 3   favoriteSeriesIds  5133 non-null   object
dtypes: object(4)
memory usage: 200.5+ KB


In [5]:
# NOTE: this part of the notebook is on hold while the content retrieval from Firebase is being worked on,
# because 'Genres', 'CleanTitle' and 'Synopsis' must be fetched from the Firebase database to build the sentences.

### Content Data from FireBase

In [None]:
# Firebase Realtime Database REST endpoint holding the content catalogue
url = "https://bubbo-dfba0-default-rtdb.europe-west1.firebasedatabase.app/documents/Content_latest_ad_flixole_jsonl.json"

# requests applies no timeout by default; without one an unresponsive
# server would block this cell indefinitely
response = requests.get(url, timeout=30)

if response.status_code == 200:
    data = response.json()

    # The decoded JSON is not guaranteed to be an object: it can also be
    # null (None) or an array, and neither of those has .items()
    if isinstance(data, dict):
        records = list(data.items())
    elif isinstance(data, list):
        records = list(enumerate(data))
    else:
        records = []

    # Preview only the first 10 key/value pairs
    first_10 = records[:10]

    print("Primeros 10 registros:")
    for key, value in first_10:
        print(f"Clave: {key}, Valor: {value}")
else:
    print(f"Error al acceder a la base de datos: {response.status_code}")


In [20]:
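# From Firebase
# TODO: `filtered_data` is not defined anywhere in this notebook while the CSV load below stays commented out;
# the cells that follow need it (they read its `sentences_to_embed` and `CleanTitle` columns).
# filtered_data = pd.read_csv(r'../../../filtered_data.csv',sep=';')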

### Sentences to vectorize from 'filtered_data' (now sourced from Firebase)

In [None]:
# Build the list of content sentences to embed.
# Rows without a sentence are dropped and the rest are cast to str, so the
# result is a plain list of strings ready to send to the model.
content_sentences = filtered_data['sentences_to_embed'].dropna()
sentences_from_filtered_data = content_sentences.astype(str).tolist()


In [7]:
# The Hugging Face endpoint only serves requests that complete within 60 seconds,
# so the sentences are sent in batches rather than all at once.
def split_into_batches(sentences, batch_size):
    """Split ``sentences`` into consecutive chunks of at most ``batch_size`` items.

    The final chunk holds the remainder and may be shorter; an empty input
    yields an empty list.
    """
    chunks = []
    for start in range(0, len(sentences), batch_size):
        chunks.append(sentences[start:start + batch_size])
    return chunks

# 100 sentences per request was the largest batch size that still returned
# a successful response in our trials.
batches = split_into_batches(sentences_from_filtered_data, batch_size=100)

In [None]:
# Report whether the Hugging Face API key was found in the environment
if HUGGINGFACE_API_KEY is not None:
    print("Clave de API cargada correctamente.")
else:
    print("Error: No se encontró la clave de API de Hugging Face.")

# Feature-extraction endpoint for the sentence-transformers/all-MiniLM-L6-v2 model
API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"

# Bearer-token authorization header sent with every request to the endpoint
headers = {"Authorization": f"Bearer {HUGGINGFACE_API_KEY}"}

def get_embeddings_from_api(sentences, max_retries=3, timeout=60):
    """Request embeddings for ``sentences`` from the Hugging Face endpoint.

    Posts ``{"inputs": sentences}`` to ``API_URL`` with the module-level
    ``headers``. Network errors and transient HTTP statuses (429 and 5xx)
    are retried with a short exponential back-off; other error statuses are
    reported once and not retried.

    Args:
        sentences: list of strings to embed.
        max_retries: maximum number of attempts before giving up.
        timeout: seconds passed to ``requests.post`` so a stalled connection
            cannot block the notebook indefinitely.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None``.
    """
    payload = {"inputs": sentences}
    retryable_statuses = {429, 500, 502, 503, 504}

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            # Connection problems and timeouts are worth another attempt
            print(f"Request failed (attempt {attempt}/{max_retries}): {exc}")
        else:
            if response.status_code == 200:
                return response.json()

            print(f"Error {response.status_code}: {response.text}")
            # Other statuses (e.g. bad credentials or payload) will not
            # succeed on a retry, so give up immediately
            if response.status_code not in retryable_statuses:
                return None

        if attempt < max_retries:
            time.sleep(2 ** attempt)

    return None
    

# Embed the content sentences batch by batch, keeping the results in sentence order
all_embeddings_from_filtered_data = []
for batch_number, batch in enumerate(batches, start=1):
    print(f"Processing batch with {len(batch)} sentences...")
    embeddings = get_embeddings_from_api(batch)

    # A usable response holds exactly one embedding per sentence in the batch.
    # Skipping a failed batch would shift every later embedding onto the wrong
    # content row (rows are looked up by position later), so stop instead.
    if not isinstance(embeddings, list) or len(embeddings) != len(batch):
        raise RuntimeError(
            f"Embedding request failed for batch {batch_number} of {len(batches)}; "
            "re-run this cell before computing similarities."
        )

    all_embeddings_from_filtered_data.extend(embeddings)

print("Embeddings processed successfully:")
print(all_embeddings_from_filtered_data[:2])

### Sentences to vectorize from 'user_preferences' (now sourced from DynamoDB)

In [9]:
# Text fields that make up each user's sentence.
# NOTE(review): the DataFrame loaded from DynamoDB only has userId and the
# favorite*Ids columns; the columns listed here are not created anywhere in
# this notebook — confirm where they are added before running this cell.
user_text_columns = [
    'TitulosPeliculas',
    'TitulosSeries',
    'GenerosFavoritos',
    'DetallesPeliculas',
    'DetallesSeries',
]

# Join the fields with a space so the last word of one field does not fuse
# with the first word of the next; missing values become empty strings.
user_text_parts = [user_pref[column].fillna('').astype(str) for column in user_text_columns]
user_pref['sentences_to_embed'] = (
    user_text_parts[0].str.cat(user_text_parts[1:], sep=' ').str.strip()
)

# One sentence per user in row order; no row is dropped, so list positions
# match the row positions of user_pref.
sentences_from_user_pref = user_pref['sentences_to_embed'].tolist()

# Split into batches the same way as the content sentences
batches_user_pref = split_into_batches(sentences_from_user_pref, 100)

In [None]:
# Embed the user sentences batch by batch, keeping the results in user order
all_embeddings_from_user_pref = []
for batch_number, batch in enumerate(batches_user_pref, start=1):
    print(f"Processing batch with {len(batch)} sentences...")
    embeddings = get_embeddings_from_api(batch)

    # A usable response holds exactly one embedding per sentence in the batch.
    # Skipping a failed batch would shift every later embedding onto the wrong
    # user (users are looked up by position later), so stop instead.
    if not isinstance(embeddings, list) or len(embeddings) != len(batch):
        raise RuntimeError(
            f"Embedding request failed for batch {batch_number} of {len(batches_user_pref)}; "
            "re-run this cell before computing similarities."
        )

    all_embeddings_from_user_pref.extend(embeddings)

print("Embeddings processed successfully:")
print(all_embeddings_from_user_pref[:2])

### Similarities

In [24]:
# Positional index of the user taken as the worked example (1 is the second row)
user_for_example = 1

# sklearn's cosine_similarity works on arrays, so convert both embedding lists first
all_embeddings_from_filtered_data_array = np.array(all_embeddings_from_filtered_data)
all_embeddings_from_user_pref_array = np.array(all_embeddings_from_user_pref)

# Pull out the example user's embedding as a single-row matrix
user_embedding_example = all_embeddings_from_user_pref_array[user_for_example].reshape(1, -1)

# Score the example user against every content embedding, then flatten the
# one-row result into a 1-D array with one score per content embedding
similarity_matrix = cosine_similarity(user_embedding_example, all_embeddings_from_filtered_data_array)
content_similarities = similarity_matrix.flatten()

# Content positions ordered from most to least similar
most_similar_indexes = np.argsort(content_similarities)[::-1]

# Keep the ten best matches
topten_most_similar_indexes = most_similar_indexes[:10]

In [None]:
# Show the ten best-matching content positions together with their scores
print("Most similar indexes:", topten_most_similar_indexes)
print("Highest similarities:", content_similarities[topten_most_similar_indexes])

In [None]:
# Look the example user up by position: the embeddings are positional, so
# .iloc keeps the displayed preferences tied to the embedding that was scored.
user_rows = user_pref.iloc[[user_for_example]]
user_id = user_rows['userId'].iloc[0]
# NOTE(review): 'TitulosPeliculas' / 'TitulosSeries' are not among the columns
# loaded from DynamoDB in this notebook — confirm where they are added.
movies_preferred = user_rows['TitulosPeliculas']
series_preferred = user_rows['TitulosSeries']

# Displaying Preferences & Recommendations
print(f'''User {user_id} Preferences:
      ''')
print(f'''Movies preference:''')
for movie in movies_preferred.iloc[0].split(';'):
    print(f'      {movie.strip()}')

print(f'''TV Shows preference:''')
for series in series_preferred.iloc[0].split(';'):
    print(f'      {series.strip()}')

print(f'''Recomendations for user: {user_id}
      ''')

# Embeddings were computed only for rows with a non-null sentence, so the
# similarity indexes are positions within that subset, not index labels of
# filtered_data. Select by position from the same subset to get the right titles.
embedded_content = filtered_data.dropna(subset=['sentences_to_embed'])
recomendations_user = embedded_content.iloc[topten_most_similar_indexes]['CleanTitle']
for recommendation in recomendations_user:
    print(f'      {recommendation}')
