### Environmental Variables

In [1]:
import ast
import boto3
import firebase_admin
import json
import numpy as np
import os
import pandas as pd
import requests
import time
import torch
import torch.nn.functional as F
import vertexai
from botocore.exceptions import ClientError
from decimal import Decimal
from dotenv import load_dotenv
from firebase_admin import credentials, firestore
from requests.exceptions import ReadTimeout
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
from vertexai.language_models import TextEmbeddingModel

In [2]:
# uploading the environment variables and get the API key
load_dotenv()
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")  # Borrar esto cuando se completen los embeddings desde VERTEX
AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")

# Asegurar que Vertex AI use las credenciales correctas
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../../../bubbo-dfba0-47e395cdcdc7.json"


In [3]:
# Ruta a las credenciales de Firebase para la app principal
cred_default_path = "../../../bubbo-dfba0-47e395cdcdc7.json"

# Inicializar Firebase
if not firebase_admin._apps:
    # Inicializar la app por defecto
    cred_default = credentials.Certificate(cred_default_path)
    default_app = firebase_admin.initialize_app(cred_default, name="default")

# Conectar Firestore con la app inicializada
db = firestore.client(app=default_app)  
collection_Data_EN = db.collection('Data_EN') 
collection_Genres_DB = db.collection('Genres_DB') 

In [4]:
# 🔹 3. Inicializar Vertex AI con la credencial correcta
PROJECT_ID = "bubbo-dfba0"
REGION = "us-central1"  # Región donde está alojado el servicio
MODEL_ID = "text-multilingual-embedding-002"

vertexai.init(project=PROJECT_ID, location=REGION)

# 🔹 4. Cargar modelo de embeddings
model = TextEmbeddingModel.from_pretrained(MODEL_ID)

# User Preferences CONTENT

### Extracción de Datos de user_preference (por el momento de dynamodb)

In [5]:
# To get the info from DynamoDB, user preferences
CONFIG = {
    'aws': {
        'access_key': AWS_ACCESS_KEY,
        'secret_key': AWS_SECRET_KEY,
        'region': 'eu-west-3',
        'table': 'User-7kkcm5dn2rb77hst5nh7gbdisa-staging'
    },
    'columns': ['userId', 'favoriteMoviesIds', 'favoriteGenresIds', 'favoriteSeriesIds'],
}

# conexion
session = boto3.Session(
    aws_access_key_id=CONFIG['aws']['access_key'],
    aws_secret_access_key=CONFIG['aws']['secret_key'],
    region_name=CONFIG['aws']['region']
)

table = session.resource('dynamodb').Table(CONFIG['aws']['table'])

# Values to String
def _process_value(value):
    if isinstance(value, Decimal):
        return str(int(value))
    return str(value)

# Retrive info from DynamoDB and gets a DataFrame
def fetch_preferences():
    try:
        items = []
        start_key = None

        while True:
            # scan with defined 'columns'  in previous 'CONFIG'
            scan_params = {
                'ProjectionExpression': ', '.join(CONFIG['columns'])
            }
            if start_key:
                scan_params['ExclusiveStartKey'] = start_key

            response = table.scan(**scan_params)
            items.extend(response.get('Items', []))

            # check for next pages
            start_key = response.get('LastEvaluatedKey')
            if not start_key:
                break

        # data extracted processing
        processed_data = [{
            'userId': _process_value(item.get('userId', '')),
            'favoriteMoviesIds': ';'.join(map(_process_value, item.get('favoriteMoviesIds', []))),     
            'favoriteGenresIds': ';'.join(map(_process_value, item.get('favoriteGenresIds', []))),
            'favoriteSeriesIds': ';'.join(map(_process_value, item.get('favoriteSeriesIds', [])))
        } for item in items]

        df = pd.DataFrame(processed_data)
        return df

    except ClientError as e:
        print(f"Error al conectar con DynamoDB: {e}")
        return pd.DataFrame()

# calling function to get the df
user_pref = fetch_preferences()


In [6]:
# Para resetear sin tener que consultar dynamo de nuevo
#user_pref_aux = user_pref
#user_pref = user_pref_aux

### Transforming 'user_pref' to our purposes

In [7]:
# limpio el dataframe dejando solo users con genero, movie_favs y tvshow_favs
user_pref = user_pref[user_pref['userId'].str.len()==36]
user_pref = user_pref.replace("",pd.NA)
user_pref = user_pref.dropna()             
user_pref.reset_index(inplace=True,drop=True)

In [8]:
user_pref.head()

Unnamed: 0,userId,favoriteMoviesIds,favoriteGenresIds,favoriteSeriesIds
0,1189e07e-b011-7011-7fbb-a75a4fe1dd6a,27205;157336;155;19995;293660,12;10759;80;10751;27;28,1399;71446;1402;93405;1396;807
1,e119804e-3011-700f-a5e3-e111aec24ac8,27205;157336;155;19995;118340,80;18;27;14;10765;9648;53;10766;878;10752;1076...,1399;66732;93405;84958;82856;121;274
2,d17900ae-a001-70a3-d3bc-9463452af02c,155;293660;24428;299536;118340,28;10759;12;16;10762;80;99;10764;10763;18;35;1...,1399;71446;1402;76479;100088
3,c1d9a05e-50c1-7041-a60e-9a56c092e612,27205;157336;155;19995;293660,28;10759;12;16;10762;80;35;99;10764;10763;18;1...,1399;71446;66732;1402;93405
4,b179c0ce-30f1-70a4-81eb-bb2d5b0020a3,27205;157336;550;680;13,99;10764;10763;80;9648;27;18;53;10766;12;10759...,1399;66732;93405;1396;456


In [9]:
user_updating_preferences = user_pref[user_pref['userId']== 'e119804e-3011-700f-a5e3-e111aec24ac8'].head()
user_updating_preferences

Unnamed: 0,userId,favoriteMoviesIds,favoriteGenresIds,favoriteSeriesIds
1,e119804e-3011-700f-a5e3-e111aec24ac8,27205;157336;155;19995;118340,80;18;27;14;10765;9648;53;10766;878;10752;1076...,1399;66732;93405;84958;82856;121;274


In [10]:
# convertir los valores en listas para expandirlos con explode
user_updating_preferences['favoriteGenresIds'] = user_updating_preferences['favoriteGenresIds'].apply(lambda x: x.split(';'))
user_updating_preferences['favoriteMoviesIds'] = user_updating_preferences['favoriteMoviesIds'].apply(lambda x: x.split(';'))
user_updating_preferences['favoriteSeriesIds'] = user_updating_preferences['favoriteSeriesIds'].apply(lambda x: x.split(';'))

# expandir preferencias de favoriteMoviesIds, favoriteGenresIds, y favoriteSeriesIds por userId
user_fav_genres = user_updating_preferences[['userId','favoriteGenresIds']].explode('favoriteGenresIds')
user_fav_movies = user_updating_preferences[['userId','favoriteMoviesIds']].explode('favoriteMoviesIds')
user_fav_series = user_updating_preferences[['userId','favoriteSeriesIds']].explode('favoriteSeriesIds')

# IDs únicos que necesitamos de firestore
movie_ids = user_fav_movies['favoriteMoviesIds'].dropna().unique().tolist()
series_ids = user_fav_series['favoriteSeriesIds'].dropna().unique().tolist()
all_ids = set(movie_ids + series_ids) ## Esto es para buscar todo junto en una sola consulta

# Traer solo esos documentos desde Firestore
user_updating_preferences_data = []
for doc_id in all_ids:
    doc = collection_Data_EN.document(doc_id).get()
    if doc.exists:
        doc_data = doc.to_dict()
        doc_data['ID'] = doc.id  # arego el ID al dict
        user_updating_preferences_data.append(doc_data)

user_updating_preferences_data = pd.DataFrame(user_updating_preferences_data)

# lim´pio los IDs para los merge
user_updating_preferences_data['ID'] = user_updating_preferences_data['ID'].astype(str).str.strip()
user_updating_preferences_data['Genre'] = user_updating_preferences_data['Genre'].astype(str).str.strip()
user_fav_movies['favoriteMoviesIds'] = user_fav_movies['favoriteMoviesIds'].astype(str).str.strip()
user_fav_series['favoriteSeriesIds'] = user_fav_series['favoriteSeriesIds'].astype(str).str.strip()

# merges
user_fav_movies = user_fav_movies.merge(user_updating_preferences_data[['ID', 'CleanTitle', 'Synopsis','Genre']], 
                                        left_on='favoriteMoviesIds', right_on='ID', how='left')
user_fav_movies = user_fav_movies.drop('ID', axis=1)

user_fav_series = user_fav_series.merge(user_updating_preferences_data[['ID', 'CleanTitle', 'Synopsis','Genre']], 
                                        left_on='favoriteSeriesIds', right_on='ID', how='left')
user_fav_series = user_fav_series.drop('ID', axis=1)

# renombro las columnas para mejor orden y trabajo simple 
user_fav_movies.rename(columns={'CleanTitle':'Movies_Titles', 'Synopsis':'Movies_Synopsis', 'Genre':'Genres'}, inplace=True) ## Volver a incluir esto >> , 'Cast':'Movies_Cast', 'Directors':'Movies_Directors'
user_fav_series.rename(columns={'CleanTitle':'Series_Titles', 'Synopsis':'Series_Synopsis', 'Genre':'Genres'}, inplace=True) ## Volver a incluir esto >>, 'Cast':'Series_Cast', 'Directors':'Series_Directors'


In [11]:
user_fav_movies

Unnamed: 0,userId,favoriteMoviesIds,Movies_Titles,Movies_Synopsis,Genres
0,e119804e-3011-700f-a5e3-e111aec24ac8,27205,Inception,"Dom is a skilled thief, the best in the danger...",Suspense
1,e119804e-3011-700f-a5e3-e111aec24ac8,157336,Interstellar,From director Christopher Nolan (Inception) co...,Drama
2,e119804e-3011-700f-a5e3-e111aec24ac8,155,The Dark Knight,As a zealous prosecutor has just been appointe...,Drama; Suspense
3,e119804e-3011-700f-a5e3-e111aec24ac8,19995,,,
4,e119804e-3011-700f-a5e3-e111aec24ac8,118340,Guardians of the Galaxy,"From Marvel, the studio that brought you Iron ...",Comedy; Action; Fantasy; Adventure


In [12]:
# Sentences we want to be embedded from user_preferences MOVIES
user_fav_movies['movies_sentences_to_embed'] = (user_fav_movies.Movies_Titles.fillna('') + ' ' +
                                   user_fav_movies.Movies_Synopsis.fillna('')+
                                   user_fav_movies.Genres.fillna('') ) 
'''+
                                   user_pref.Movies_Cast.fillna('') +
                                   user_pref.Movies_Directors.fillna(''))''' ########## Lo comentado aca es porque Data_EN no tiene Cast y Directors

# Sentences we want to be embedded from user_preferences SERIES
user_fav_series['series_sentences_to_embed'] = (user_fav_series.Series_Titles.fillna('') + ' ' +
                                   user_fav_series.Series_Synopsis.fillna('') +
                                   user_fav_series.Genres.fillna('') )
'''+
                                   user_pref.Series_Cast.fillna('') +
                                   user_pref.Series_Directors.fillna(''))''' ########## Iden coment anterior Descomentar luego 

"+\n                                   user_pref.Series_Cast.fillna('') +\n                                   user_pref.Series_Directors.fillna(''))"

In [29]:
pd.set_option('display.max_colwidth', None)

In [30]:
print(user_fav_movies['movies_embeddings'][:1])

0    [-0.026602301746606827, 0.028560519218444824, 0.06465091556310654, 0.02077191136777401, -0.004675277974456549, -0.011423131451010704, 0.04505652189254761, -0.0007553789182566106, -0.017281683161854744, -0.003544130129739642, -0.04188522323966026, 0.08269699662923813, -0.02344983071088791, -0.0047472440637648106, -0.0375727117061615, 0.04891278222203255, -0.020134152844548225, -0.05802438408136368, -0.021529223769903183, 0.022875092923641205, -0.09180022776126862, 0.04565851762890816, 0.11059856414794922, -0.08253663778305054, 0.0022585606202483177, -0.02051333524286747, 0.03603992983698845, 0.061868369579315186, -0.062214385718107224, 0.05766313523054123, 0.003034910187125206, 0.04189164191484451, -0.027310272678732872, 0.061250992119312286, -0.011283356696367264, -0.04602236673235893, -0.008484824560582638, 0.06412173062562943, 0.010962994769215584, -0.02589067816734314, -0.006816479843109846, -0.11265116184949875, -0.009289391338825226, 0.04866735264658928, -0.019435374066233635

In [18]:
user_fav_series

Unnamed: 0,userId,favoriteSeriesIds,Series_Titles,Series_Synopsis,Genres,series_sentences_to_embed,series_embeddings
0,e119804e-3011-700f-a5e3-e111aec24ac8,1399,The Iron Throne,"In medieval times, in a world where summer str...",Drama,"The Iron Throne In medieval times, in a world ...","[-0.06221407651901245, 0.026763223111629486, 0..."
1,e119804e-3011-700f-a5e3-e111aec24ac8,66732,,,,,
2,e119804e-3011-700f-a5e3-e111aec24ac8,93405,,,,,
3,e119804e-3011-700f-a5e3-e111aec24ac8,84958,R U Invited?,Anika’s tribute to the Great Bear Rainforest g...,Drama,R U Invited? Anika’s tribute to the Great Bear...,"[-0.028456561267375946, 0.060793571174144745, ..."
4,e119804e-3011-700f-a5e3-e111aec24ac8,82856,Ninja Shinobuden,Shinobu is an apprentice ninja at a ninja scho...,Music,Ninja Shinobuden Shinobu is an apprentice ninj...,"[-0.11264573782682419, -0.0491345152258873, 0...."
5,e119804e-3011-700f-a5e3-e111aec24ac8,121,The Lord of the Rings: The Two Towers (Extende...,Frodo and Sam must trust Gollum with their liv...,Adventure; Action,The Lord of the Rings: The Two Towers (Extende...,"[-0.08823558688163757, 0.004783325362950563, 0..."
6,e119804e-3011-700f-a5e3-e111aec24ac8,274,The Silence of the Lambs,FBI agent Clarice Starling is sent to intervie...,Suspense,The Silence of the Lambs FBI agent Clarice Sta...,"[-0.05498502030968666, -0.012232079170644283, ..."


### Building Sentences for vectorize 'user_pref==Movies' & 'user_pref==Series' to VERTEX

In [21]:
# genero embedings del user
def get_embedding(text):
    """Genera el embedding para un texto usando Vertex AI"""
    if pd.isna(text) or text.strip() == "":  # Si el texto está vacío o es NaN
        return None
    return model.get_embeddings([text])[0].values  # Devuelve la lista de valores del embedding

user_fav_movies["movies_embeddings"] = user_fav_movies["movies_sentences_to_embed"].apply(get_embedding)
user_fav_series["series_embeddings"] = user_fav_series["series_sentences_to_embed"].apply(get_embedding)

In [22]:
user_fav_movies.head(2)

Unnamed: 0,userId,favoriteMoviesIds,Movies_Titles,Movies_Synopsis,Genres,movies_sentences_to_embed,movies_embeddings
0,e119804e-3011-700f-a5e3-e111aec24ac8,27205,Inception,"Dom is a skilled thief, the best in the danger...",Suspense,"Inception Dom is a skilled thief, the best in ...","[-0.026602301746606827, 0.028560519218444824, ..."
1,e119804e-3011-700f-a5e3-e111aec24ac8,157336,Interstellar,From director Christopher Nolan (Inception) co...,Drama,Interstellar From director Christopher Nolan (...,"[-0.04311944916844368, -0.004800629336386919, ..."


### Sending Sentences to vectorize for VERTEX now

# Working with Similarities

### getting local similarities for testing purposes, but awaiting for availability of 200k well-fed titles to obtain new effective embeddings

### Movies Similarities 

In [None]:
### >>>>>> NO CORRER ESTA CELDA EN LOCAL PORQUE PUEDE DEMORAR HORAS <<<<<<<< ###

# Convertir los embeddings a arrays de numpy para cálculos más rápidos
all_embeddings_from_filtered_data_array = np.array(list(all_embeddings_dict.values()))

# Diccionarios para almacenar las recomendaciones
movies_recommendations_dict = {}

# Obtener recomendaciones para cada usuario en movies_embeddings_dict
for user_id, user_embedding in movies_embeddings_dict.items():
    user_embedding_array = np.array(user_embedding).reshape(1, -1)  # Asegurar la forma correcta
    movies_content_similarities = cosine_similarity(user_embedding_array, all_embeddings_from_filtered_data_array).flatten()
    
    # Ordenar por similitud y seleccionar el top-10
    movies_most_similar_indexes = movies_content_similarities.argsort()[::-1][:10]
    
    # Convertir los índices en IDs reales
    movies_recommended_ids = [filtered_data.iloc[i]['ID'] for i in movies_most_similar_indexes]
    
    # Guardar en el diccionario
    movies_recommendations_dict[user_id] = movies_recommended_ids


# Mostrar ejemplos
print("Ejemplo de recomendaciones para un usuario en Movies:")
example_user = list(movies_recommendations_dict.keys())[0]
print(f"Usuario: {example_user} - Recomendaciones: {movies_recommendations_dict[example_user]}")




### Tv Shows (Series) Similarities

In [None]:
### >>>>>> NO CORRER ESTA CELDA EN LOCAL PORQUE PUEDE DEMORAR HORAS <<<<<<<< ###
# dict para las recomendaciones
series_recommendations_dict = {}

number_target_series_recommend = 25

# recommend's para cada usuario en series_embeddings_dict
for user_id, user_embedding in series_embeddings_dict.items():
    user_embedding_array = np.array(user_embedding).reshape(1, -1)  # Asegurar la forma correcta
    series_content_similarities = cosine_similarity(user_embedding_array, all_embeddings_from_filtered_data_array).flatten()
    
    # orden por similitud y top-goal
    series_most_similar_indexes = series_content_similarities.argsort()[::-1][:number_target_series_recommend]
    
    # paso los indices a IDs reales
    series_recommended_ids = [filtered_data.iloc[i]['ID'] for i in series_most_similar_indexes]
    
    # Guardar en el diccionario
    series_recommendations_dict[user_id] = series_recommended_ids

# Mostrar ejemplos
print("\nEjemplo de recomendaciones para un usuario en Series:")
example_user = list(series_recommendations_dict.keys())[0]
print(f"Usuario: {example_user} - Recomendaciones: {series_recommendations_dict[example_user]}")


# Recomendations

In [None]:
# guardo
with open("movies_recommendations.json", "w") as f:
    json.dump(movies_recommendations_dict, f, indent=4)

with open("series_recommendations.json", "w") as f:
    json.dump(series_recommendations_dict, f, indent=4)

print("Recomendaciones guardadas en JSON")
