### Environment Variables

In [1]:
import ast
import boto3
import firebase_admin
import json
import numpy as np
import os
import pandas as pd
import requests
import time
import torch
import torch.nn.functional as F
from botocore.exceptions import ClientError
from decimal import Decimal
from dotenv import load_dotenv
from firebase_admin import credentials, firestore
from requests.exceptions import ReadTimeout
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

In [2]:
# Populate the process environment via load_dotenv(), then read the credentials
# used later in the notebook (each one is None when the variable is not set).
load_dotenv()
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY")
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_KEY")

In [3]:
# Initialise the Firebase Admin SDK only when no app is registered yet,
# so the cell can be executed more than once in the same kernel.
if len(firebase_admin._apps) == 0:
    cred_path = r'../../../bubbo-dfba0-firebase-adminsdk-fbsvc-79dc4511e7.json'
    cred = credentials.Certificate(cred_path)
    firebase_admin.initialize_app(cred)

# MOVIES AND SERIES CONTENT

### ETL

In [None]:
# This cell pulls the movies/series catalogue from Firestore into 'filtered_data',
# cleans it and prepares it for the embeddings; the genres catalogue is loaded
# into 'df_genres' further down in the same cell.

# Firestore connection and source collection
db = firestore.client()
collection_ref = db.collection('Data_Clean')
docs = collection_ref.stream()

# One record per document; the Firestore document id is stored under 'id'
data = [dict(doc.to_dict(), id=doc.id) for doc in docs]
df = pd.DataFrame(data)

# Empty strings count as missing; rows with any missing field are discarded.
# NOTE(review): drop_duplicates() runs while the per-document 'id' column is
# still present, so it probably removes nothing - confirm whether the
# de-duplication was meant to happen after dropping 'id'.
filtered_data = (
    df.replace("", pd.NA)
    .dropna()
    .drop_duplicates()
    .drop(columns='id')
)
filtered_data['ID'] = filtered_data['ID'].astype(str)

# Build the text that will be embedded for every title.
def _genre_to_text(value):
    """Return a genre cell as plain comma-separated text.

    Accepts a real list/tuple, a string holding a Python list literal such as
    "['Drama', 'Crime']", or a plain string (returned unchanged). A string that
    looks like a list but cannot be parsed is returned unchanged instead of
    aborting the whole cell.
    """
    if isinstance(value, (list, tuple)):
        return ', '.join(map(str, value))
    if not isinstance(value, str):
        return str(value)
    if value.startswith('[') and value.endswith(']'):
        try:
            return ', '.join(map(str, ast.literal_eval(value)))
        except (ValueError, SyntaxError):
            return value
    return value


# Fields are joined with a single space so the last word of one field is not
# fused with the first word of the next one (e.g. "...TitleSynopsis...").
filtered_data['sentences_to_embed'] = (
    filtered_data.CleanTitle.fillna('') + ' ' +
    filtered_data.Synopsis.fillna('') + ' ' +
    filtered_data.Genre.fillna('').apply(_genre_to_text) + ' ' +
    filtered_data.Cast.fillna('') + ' ' +
    filtered_data.Directors.fillna('')
)

# Genres catalogue: one record per document of the 'Genres_DB' collection
collection_ref_2 = db.collection('Genres_DB')
docs_2 = collection_ref_2.stream()
data_2 = [dict(doc.to_dict(), id=doc.id) for doc in docs_2]
df_2 = pd.DataFrame(data_2)

# The document id becomes 'genero_id'; df_genres and df_2 are the same object
df_2.rename(columns={'id': 'genero_id'}, inplace=True)
df_genres = df_2

# User Preferences CONTENT

### Extracción de Datos de user_preference (por el momento de dynamodb)

In [82]:
# Settings used to read the user preferences from DynamoDB
CONFIG = {
    'aws': {
        'access_key': AWS_ACCESS_KEY,
        'secret_key': AWS_SECRET_KEY,
        'region': 'eu-west-3',
        'table': 'User-7kkcm5dn2rb77hst5nh7gbdisa-staging'
    },
    'columns': ['userId', 'favoriteMoviesIds', 'favoriteGenresIds', 'favoriteSeriesIds'],
}

# Connection: a boto3 session built from the credentials above ...
_aws_settings = CONFIG['aws']
session = boto3.Session(
    aws_access_key_id=_aws_settings['access_key'],
    aws_secret_access_key=_aws_settings['secret_key'],
    region_name=_aws_settings['region'],
)

# ... and a handle on the configured table
_dynamodb = session.resource('dynamodb')
table = _dynamodb.Table(_aws_settings['table'])

# Values to String
def _process_value(value):
    if isinstance(value, Decimal):
        return str(int(value))
    return str(value)

def fetch_preferences():
    """Scan the DynamoDB user table and return the preferences as a DataFrame.

    Uses the module-level ``table`` and ``CONFIG``. The scan is paginated:
    every ``LastEvaluatedKey`` returned by DynamoDB is sent back as
    ``ExclusiveStartKey`` until no further page is reported.

    Returns:
        pandas.DataFrame with the columns ``userId``, ``favoriteMoviesIds``,
        ``favoriteGenresIds`` and ``favoriteSeriesIds``. The three favourites
        columns hold the ids joined with ';'. The same columns are present
        (with zero rows) when the table is empty or when DynamoDB raises a
        ``ClientError``, so downstream column access does not fail.
    """
    columns = ['userId', 'favoriteMoviesIds', 'favoriteGenresIds', 'favoriteSeriesIds']

    def _joined_ids(item, key):
        # `or []` also covers attributes that exist but are stored as None
        return ';'.join(map(_process_value, item.get(key) or []))

    try:
        # The projection never changes, so it is built once for all pages
        scan_params = {'ProjectionExpression': ', '.join(CONFIG['columns'])}
        items = []
        while True:
            response = table.scan(**scan_params)
            items.extend(response.get('Items', []))

            # Check for a next page
            start_key = response.get('LastEvaluatedKey')
            if not start_key:
                break
            scan_params['ExclusiveStartKey'] = start_key
    except ClientError as e:
        print(f"Error al conectar con DynamoDB: {e}")
        return pd.DataFrame(columns=columns)

    processed_data = [{
        'userId': _process_value(item.get('userId', '')),
        'favoriteMoviesIds': _joined_ids(item, 'favoriteMoviesIds'),
        'favoriteGenresIds': _joined_ids(item, 'favoriteGenresIds'),
        'favoriteSeriesIds': _joined_ids(item, 'favoriteSeriesIds'),
    } for item in items]

    return pd.DataFrame(processed_data, columns=columns)

# Fetch the user preferences; fetch_preferences() returns an empty DataFrame
# when DynamoDB raises a ClientError.
user_pref = fetch_preferences()


### Transforming 'user_pref' to our purposes

In [83]:
# Clean the dataframe, keeping only users that have genres, favourite movies and favourite series.
# NOTE(review): this cell reassigns user_pref (the favourites columns end up holding lists),
# so it cannot be re-run on its own output - re-run the fetch cell first.
user_pref = user_pref[user_pref['userId'].str.len()==36]  # only 36-character ids (presumably UUIDs - confirm)
user_pref = user_pref.replace("",pd.NA)
user_pref = user_pref.dropna()
user_pref.reset_index(inplace=True,drop=True)

# Turn the ';'-joined strings into lists so they can be expanded with explode
user_pref['favoriteGenresIds'] = user_pref['favoriteGenresIds'].apply(lambda x: x.split(';'))
user_pref['favoriteMoviesIds'] = user_pref['favoriteMoviesIds'].apply(lambda x: x.split(';'))
user_pref['favoriteSeriesIds'] = user_pref['favoriteSeriesIds'].apply(lambda x: x.split(';'))

# Expand favoriteMoviesIds, favoriteGenresIds and favoriteSeriesIds to one row per (userId, id)
user_fav_genres = user_pref[['userId','favoriteGenresIds']].explode('favoriteGenresIds')
user_fav_movies = user_pref[['userId','favoriteMoviesIds']].explode('favoriteMoviesIds')
user_fav_series = user_pref[['userId','favoriteSeriesIds']].explode('favoriteSeriesIds')


# Merges that bring in the genre names and the CleanTitle / Synopsis / Cast / Directors of each favourite
user_fav_genres['favoriteGenresIds'] = user_fav_genres['favoriteGenresIds']  # no-op: the .astype(int) cast is disabled; TODO review for the embeddings
user_fav_genres = user_fav_genres.merge(df_genres[['genero_id','genero_name']], left_on='favoriteGenresIds', right_on='genero_id')  # inner merge: unknown genre ids are dropped


# Normalise both sides of the join key to stripped strings (this also mutates filtered_data)
filtered_data = filtered_data.dropna(subset=['ID'])
filtered_data['ID'] = filtered_data['ID'].astype(str).str.strip()
user_fav_movies['favoriteMoviesIds'] = user_fav_movies['favoriteMoviesIds'].astype(str).str.strip()
user_fav_series['favoriteSeriesIds'] = user_fav_series['favoriteSeriesIds'].astype(str).str.strip()

# Left merges: favourites with no match in filtered_data are kept, with missing values in the content columns
user_fav_movies = user_fav_movies.merge(filtered_data[['ID','CleanTitle','Synopsis', 'Cast', 'Directors']], left_on='favoriteMoviesIds', right_on='ID', how='left')  # Synopsis was added to this merge and the next one
user_fav_series = user_fav_series.merge(filtered_data[['ID','CleanTitle','Synopsis', 'Cast', 'Directors']], left_on='favoriteSeriesIds', right_on='ID', how='left')

# Drop the join keys and give the content columns movie/series specific names
user_fav_genres = user_fav_genres.drop(columns='genero_id')
user_fav_genres.rename(columns={'genero_name':'Genres'}, inplace=True)
user_fav_movies = user_fav_movies.drop(columns='ID')
user_fav_movies.rename(columns={'CleanTitle':'Movies_Titles', 'Synopsis':'Movies_Synopsis', 'Cast':'Movies_Cast', 'Directors':'Movies_Directors'}, inplace=True)
user_fav_series = user_fav_series.drop(columns='ID')
user_fav_series.rename(columns={'CleanTitle':'Series_Titles', 'Synopsis':'Series_Synopsis', 'Cast':'Series_Cast', 'Directors':'Series_Directors'}, inplace=True)

# Regroup by userId so every user gets one list per column (ids, titles, synopses, cast, directors, genres)
user_fav_genres = user_fav_genres.groupby('userId')[['favoriteGenresIds','Genres']].agg(list).reset_index()
user_fav_movies = user_fav_movies.groupby('userId')[['favoriteMoviesIds','Movies_Titles', 'Movies_Synopsis', 'Movies_Cast', 'Movies_Directors']].agg(list).reset_index()
user_fav_series = user_fav_series.groupby('userId')[['favoriteSeriesIds','Series_Titles', 'Series_Synopsis', 'Series_Cast', 'Series_Directors']].agg(list).reset_index()

# Finish arranging 'user_pref' ahead of the embeddings: attach the grouped lists and keep
# the original id columns (the '_x' side of each merge), dropping the duplicated '_y' side
user_pref = user_pref.merge(user_fav_genres, left_on='userId', right_on='userId').drop(columns=['favoriteGenresIds_y'])
user_pref.rename(columns={'favoriteGenresIds_x':'favoriteGenresIds'},inplace=True)
user_pref = user_pref.merge(user_fav_movies, left_on='userId', right_on='userId').drop(columns=['favoriteMoviesIds_y'])
user_pref.rename(columns={'favoriteMoviesIds_x':'favoriteMoviesIds'},inplace=True)
user_pref = user_pref.merge(user_fav_series, left_on='userId', right_on='userId').drop(columns=['favoriteSeriesIds_y'])
user_pref.rename(columns={'favoriteSeriesIds_x':'favoriteSeriesIds'},inplace=True)
user_pref = user_pref.reindex(['userId', 'favoriteGenresIds', 'Genres', 'favoriteMoviesIds', 'Movies_Titles','Movies_Synopsis', 'Movies_Cast', 'Movies_Directors', 'favoriteSeriesIds', 'Series_Titles', 'Series_Synopsis', 'Series_Cast', 'Series_Directors'], axis=1)

# Sentences to embed for every user (movies and series).
# The source columns hold Python lists (one entry per favourite), so they are
# flattened into real text here; otherwise the string sent to the model would
# be the repr of a list, full of brackets, quotes and 'nan' entries.
def _join_user_text(row, columns):
    """Flatten the given columns of one user row into a single sentence.

    Args:
        row: mapping of column name -> value for one user (values are
            normally lists, but scalars are accepted too).
        columns: column names to read, in the order their text should appear.

    Returns:
        The non-missing, non-blank entries converted to str, stripped and
        joined with single spaces ('' when nothing is left).
    """
    pieces = []
    for column in columns:
        values = row[column]
        if not isinstance(values, (list, tuple)):
            values = [values]
        for value in values:
            # Missing entries come from favourites with no match in the left merges
            if value is None or value is pd.NA or (isinstance(value, float) and value != value):
                continue
            text = str(value).strip()
            if text:
                pieces.append(text)
    return ' '.join(pieces)


_MOVIES_TEXT_COLUMNS = ['Movies_Titles', 'Movies_Synopsis', 'Genres', 'Movies_Cast', 'Movies_Directors']
_SERIES_TEXT_COLUMNS = ['Series_Titles', 'Series_Synopsis', 'Genres', 'Series_Cast', 'Series_Directors']

_user_rows = user_pref.to_dict('records')

# Sentences we want to be embedded from user_preferences MOVIES
user_pref['movies_sentences_to_embed'] = [
    _join_user_text(row, _MOVIES_TEXT_COLUMNS) for row in _user_rows
]

# Sentences we want to be embedded from user_preferences SERIES
user_pref['series_sentences_to_embed'] = [
    _join_user_text(row, _SERIES_TEXT_COLUMNS) for row in _user_rows
]

### Building the sentences to vectorize from 'filtered_data' and from 'user_pref' (Movies and Series)

In [84]:
# Ids (used as the index) and sentences, in the shape needed to request the embeddings.

# From filtered_data (the whole catalogue)
ids_from_filtered_data = filtered_data['ID'].tolist()  # keep the IDs
_content_sentences = filtered_data['sentences_to_embed'].dropna()
sentences_from_filtered_data = _content_sentences.astype(str).tolist()

# From user_pref, split into movies and series.
# The userId is kept next to the sentence that will be vectorized.
_movies_columns = ['userId', 'movies_sentences_to_embed']
_series_columns = ['userId', 'series_sentences_to_embed']
movies_sentences_from_user_pref = user_pref[_movies_columns].dropna().astype(str)
series_sentences_from_user_pref = user_pref[_series_columns].dropna().astype(str)


### Batches to send to HF

In [None]:
# The hosted model on Hugging Face only serves requests that finish within
# 60 seconds, so the sentences have to be sent in batches.
def split_into_batches(sentences, batch_size):
    """Cut ``sentences`` into consecutive slices of at most ``batch_size`` items.

    Works on anything that supports ``len`` and slicing; the last slice may be
    shorter than ``batch_size``. Returns the slices in a list.
    """
    batches = []
    for start in range(0, len(sentences), batch_size):
        batches.append(sentences[start:start + batch_size])
    return batches

# After trying different values, 50 was the largest batch size that still got
# a successful response.
_BATCH_SIZE = 50

# Catalogue sentences
batches = split_into_batches(sentences_from_filtered_data, _BATCH_SIZE)

# User-preference sentences, batched the same way as the catalogue
movies_batches_user_pref = split_into_batches(movies_sentences_from_user_pref, _BATCH_SIZE)
series_batches_user_pref = split_into_batches(series_sentences_from_user_pref, _BATCH_SIZE)

### Loading Access to HF

In [None]:
# Report whether the Hugging Face API key was found
_key_status = (
    "Error: No se encontró la clave de API de Hugging Face."
    if HUGGINGFACE_API_KEY is None
    else "Clave de API cargada correctamente."
)
print(_key_status)

# Feature-extraction endpoint of the model
API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"

# Authorization header sent with every request
headers = {"Authorization": "Bearer {}".format(HUGGINGFACE_API_KEY)}

def get_embeddings_from_api(sentences):
    """Request the embeddings of ``sentences`` from the Hugging Face API.

    Posts ``{"inputs": sentences}`` to ``API_URL`` with the module-level
    ``headers`` and a 10 second timeout.

    Returns:
        The decoded JSON body when the status code is 200; otherwise the
        status and body are printed and ``None`` is returned.
    """
    response = requests.post(
        API_URL,
        headers=headers,
        json={"inputs": sentences},
        timeout=10,
    )

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return None

    return response.json()
    
# No prestar atencion nadie mas que Agus // esta aqui para cuando se completen los 200k que funcionen ok y esten almacenados para lograr obtener 
# los nuevos embeedings validos

# all_embeddings_from_filtered_data = []
# for batch in batches:
#     print(f"Processing batch with {len(batch)} sentences...")
#     time.sleep(7)
#     embeddings = get_embeddings_from_api(batch)
#     if embeddings:
#         all_embeddings_from_filtered_data.extend(embeddings)

# # Asociamos cada embedding con su respectivo ID
# all_embeddings_dict = {id_: emb for id_, emb in zip(ids_from_filtered_data, all_embeddings_from_filtered_data)}

# print("Embeddings processed successfully:")
# print(list(all_embeddings_dict.items())[:2])  # Muestra los primeros pares ID - embedding


Clave de API cargada correctamente.


### Sending the sentences to Hugging Face for vectorization

In [None]:
### MOVIES
# Each batch is retried a limited number of times. A persistent failure
# (for example a rejected API key, which makes get_embeddings_from_api return
# None every time) stops the cell with an error instead of looping forever.
MOVIES_MAX_ATTEMPTS_PER_BATCH = 5

movies_embeddings_dict = {}  # {userId: embedding}
num_batches = len(movies_batches_user_pref)  # how many batches there are

for i, batch in enumerate(movies_batches_user_pref, start=1):
    batch_user_ids = batch['userId'].tolist()
    batch_sentences = batch['movies_sentences_to_embed'].tolist()

    embeddings = None
    for attempt in range(1, MOVIES_MAX_ATTEMPTS_PER_BATCH + 1):
        print(f"Processing movies batch {i}/{num_batches} with {len(batch)} sentences...")
        time.sleep(1)
        try:
            embeddings = get_embeddings_from_api(batch_sentences)
        except requests.exceptions.Timeout:
            # Covers read and connect timeouts alike
            print(f'Timeout on batch {i}. Retrying...')
            embeddings = None

        if embeddings:
            break
        if attempt < MOVIES_MAX_ATTEMPTS_PER_BATCH:
            time.sleep(5)  # wait before the next attempt

    if not embeddings:
        raise RuntimeError(
            f"Movies batch {i}/{num_batches} failed after "
            f"{MOVIES_MAX_ATTEMPTS_PER_BATCH} attempts."
        )

    movies_embeddings_dict.update(zip(batch_user_ids, embeddings))

print("Movies Embeddings processed successfully.")

In [None]:
# Series
# Each batch is retried a limited number of times. A persistent failure
# (for example a rejected API key, which makes get_embeddings_from_api return
# None every time) stops the cell with an error instead of looping forever.
SERIES_MAX_ATTEMPTS_PER_BATCH = 5

series_embeddings_dict = {}  # {userId: embedding}
num_batches_series = len(series_batches_user_pref)  # total number of batches

for i, batch in enumerate(series_batches_user_pref, start=1):
    batch_user_ids = batch['userId'].tolist()
    batch_sentences = batch['series_sentences_to_embed'].tolist()

    embeddings = None
    for attempt in range(1, SERIES_MAX_ATTEMPTS_PER_BATCH + 1):
        print(f"Processing series batch {i}/{num_batches_series} with {len(batch)} sentences...")
        time.sleep(1)
        try:
            embeddings = get_embeddings_from_api(batch_sentences)
        except requests.exceptions.Timeout:
            # Covers read and connect timeouts alike
            print(f"Timeout on batch {i}. Retrying...")
            embeddings = None

        if embeddings:
            break  # the batch was processed correctly
        if attempt < SERIES_MAX_ATTEMPTS_PER_BATCH:
            time.sleep(5)  # wait before the next attempt

    if not embeddings:
        raise RuntimeError(
            f"Series batch {i}/{num_batches_series} failed after "
            f"{SERIES_MAX_ATTEMPTS_PER_BATCH} attempts."
        )

    series_embeddings_dict.update(zip(batch_user_ids, embeddings))

print("Series Embeddings processed successfully.")

# Working with Similarities

### Getting local similarities for testing purposes, while waiting for the 200k well-populated titles needed to obtain new, effective embeddings

In [86]:
# DO NOT USE THIS - it is only kept to load the embeddings from a CSV for testing until
# the new content embeddings are available in Firebase/Vertex.


# Content embeddings, read from a local CSV for now; later they have to be served from Firebase or another store.
all_embeddings_from_filtered_data = pd.read_csv(r'../../../all_content_embeddings.csv')

# Conversion used for the embeddings that were saved in the CSV
def fast_convert(emb):
    """Decode one stored embedding.

    A string is parsed as JSON and returned as a float32 numpy array (float32
    keeps memory usage down); any other value is returned untouched.
    """
    if not isinstance(emb, str):
        return emb
    return np.array(json.loads(emb), dtype=np.float32)

# Map every content ID to its decoded embedding
all_embeddings_dict = dict(
    zip(
        all_embeddings_from_filtered_data['ID'],
        map(fast_convert, all_embeddings_from_filtered_data['Embedding']),
    )
)

In [None]:
import faiss
import pickle


# Paths of the saved FAISS indexes and of the pickled user ids that go with them
movies_embeddings = r'example_files_to_delete/movies_embeddings.faiss'
movies_user_ids = r'example_files_to_delete/movies_user_ids.pkl'
series_embeddings = r'example_files_to_delete/series_embeddings.faiss'
series_user_ids = r'example_files_to_delete/series_user_ids.pkl'


def load_movie_data(embedding_file, user_ids_file):
    """Load the movies FAISS index and its pickled user ids.

    Args:
        embedding_file: path of the FAISS index file.
        user_ids_file: path of the pickle file holding the user ids.

    Returns:
        Tuple ``(index, user_ids)``.
    """
    index = faiss.read_index(embedding_file)

    # pickle can execute code while loading: only open files you trust
    with open(user_ids_file, 'rb') as handle:
        user_ids = pickle.load(handle)

    return index, user_ids

def load_series_data(embedding_file, user_ids_file):
    """Load the series FAISS index and its pickled user ids.

    Args:
        embedding_file: path of the FAISS index file.
        user_ids_file: path of the pickle file holding the user ids.

    Returns:
        Tuple ``(index, user_ids)``.
    """
    index = faiss.read_index(embedding_file)

    # pickle can execute code while loading: only open files you trust
    with open(user_ids_file, 'rb') as handle:
        user_ids = pickle.load(handle)

    return index, user_ids

def create_embeddings_dict(user_ids, embeddings):
    """Build a ``{user_id: embedding}`` dict from an index and its user ids.

    All ``embeddings.ntotal`` vectors are read back with ``reconstruct_n`` and
    paired with ``user_ids`` in order; every vector is stored as a plain list.
    """
    vectors = embeddings.reconstruct_n(0, embeddings.ntotal)
    return {user_id: vector.tolist() for user_id, vector in zip(user_ids, vectors)}

# Load the movies and series data.
# Note: the second call rebinds series_embeddings / series_user_ids from the
# file paths to the loaded index and user ids.
movie_embeddings, movie_user_ids = load_movie_data(movies_embeddings, movies_user_ids)
series_embeddings, series_user_ids = load_series_data(series_embeddings, series_user_ids)

# Build the {user_id: embedding} dicts; these replace any dicts of the same
# name produced earlier by the Hugging Face cells.
movies_embeddings_dict = create_embeddings_dict(movie_user_ids, movie_embeddings)
series_embeddings_dict = create_embeddings_dict(series_user_ids, series_embeddings)


### Movies Similarities 

In [None]:
### >>>>>> DO NOT RUN THIS CELL LOCALLY: IT CAN TAKE HOURS <<<<<<<< ###

# Content IDs and content matrix are both taken from all_embeddings_dict, so
# row i of the matrix is always the embedding of content_ids[i]. Looking the
# row number up in filtered_data instead is only right when that frame happens
# to have exactly the same rows in the same order as the embeddings.
# IDs are kept as str so the recommendations stay JSON-serialisable.
content_ids = [str(content_id) for content_id in all_embeddings_dict]
all_embeddings_from_filtered_data_array = np.array(list(all_embeddings_dict.values()))

# How many titles are recommended to each user
number_target_movies_recommend = 10

# {userId: [recommended content IDs, most similar first]}
movies_recommendations_dict = {}

for user_id, user_embedding in movies_embeddings_dict.items():
    user_embedding_array = np.array(user_embedding).reshape(1, -1)  # one row, as cosine_similarity expects
    movies_content_similarities = cosine_similarity(user_embedding_array, all_embeddings_from_filtered_data_array).flatten()

    # Sort by similarity (descending) and keep the top entries
    movies_most_similar_indexes = movies_content_similarities.argsort()[::-1][:number_target_movies_recommend]

    # Turn the row indexes into the real content IDs
    movies_recommended_ids = [content_ids[i] for i in movies_most_similar_indexes]

    movies_recommendations_dict[user_id] = movies_recommended_ids


# Show an example (nothing more is printed when there are no users)
print("Ejemplo de recomendaciones para un usuario en Movies:")
if movies_recommendations_dict:
    example_user = next(iter(movies_recommendations_dict))
    print(f"Usuario: {example_user} - Recomendaciones: {movies_recommendations_dict[example_user]}")




### Tv Shows (Series) Similarities

In [None]:
### >>>>>> DO NOT RUN THIS CELL LOCALLY: IT CAN TAKE HOURS <<<<<<<< ###
# {userId: [recommended content IDs, most similar first]}
series_recommendations_dict = {}

number_target_series_recommend = 25

# The content matrix was built from all_embeddings_dict.values(), so the dict
# keys (in the same order) are the IDs of its rows. Looking the row number up
# in filtered_data instead is only right when that frame happens to have
# exactly the same rows in the same order as the embeddings.
# IDs are kept as str so the recommendations stay JSON-serialisable.
series_content_ids = [str(content_id) for content_id in all_embeddings_dict]
assert len(series_content_ids) == all_embeddings_from_filtered_data_array.shape[0], \
    "content matrix and all_embeddings_dict are out of sync; re-run the movies similarities cell"

# Recommendations for every user in series_embeddings_dict
for user_id, user_embedding in series_embeddings_dict.items():
    user_embedding_array = np.array(user_embedding).reshape(1, -1)  # one row, as cosine_similarity expects
    series_content_similarities = cosine_similarity(user_embedding_array, all_embeddings_from_filtered_data_array).flatten()

    # Sort by similarity (descending) and keep the top entries
    series_most_similar_indexes = series_content_similarities.argsort()[::-1][:number_target_series_recommend]

    # Turn the row indexes into the real content IDs
    series_recommended_ids = [series_content_ids[i] for i in series_most_similar_indexes]

    series_recommendations_dict[user_id] = series_recommended_ids

# Show an example (nothing more is printed when there are no users)
print("\nEjemplo de recomendaciones para un usuario en Series:")
if series_recommendations_dict:
    example_user = next(iter(series_recommendations_dict))
    print(f"Usuario: {example_user} - Recomendaciones: {series_recommendations_dict[example_user]}")


# Recommendations

In [None]:
# Save both recommendation dicts as JSON files
def _dump_json(path, payload):
    """Write ``payload`` to ``path`` as JSON indented with 4 spaces."""
    with open(path, "w") as f:
        json.dump(payload, f, indent=4)


_dump_json("movies_recommendations.json", movies_recommendations_dict)
_dump_json("series_recommendations.json", series_recommendations_dict)

print("Recomendaciones guardadas en JSON")
