### Environment Variables

In [2]:
import ast
import boto3
import firebase_admin
import json
import numpy as np
import os
import pandas as pd
import requests
import time
import torch
import torch.nn.functional as F
import vertexai
from botocore.exceptions import ClientError
from decimal import Decimal
from dotenv import load_dotenv
from firebase_admin import credentials, firestore
from google.cloud import storage
from requests.exceptions import ReadTimeout
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
from vertexai.language_models import TextEmbeddingModel

In [3]:
# Load the variables declared in the .env file into the process environment and
# read the AWS credentials (each one is None when the variable is not set).
load_dotenv()
AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")

# Make sure Vertex AI uses the correct credentials.
# NOTE(review): this overwrites any GOOGLE_APPLICATION_CREDENTIALS already set
# in the environment, and the path is relative to the working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "../../../bubbo-dfba0-47e395cdcdc7.json"

# Handle to the Cloud Storage bucket that later cells list and download from.
BUCKET_NAME = 'embeddings_bucket_backup'
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)


In [4]:
# Path to the Firebase service-account credentials for the main app
cred_default_path = "../../../bubbo-dfba0-47e395cdcdc7.json"

# Reuse the app registered under the name "default" when it already exists in
# this kernel; otherwise create it. Looking the app up by name (instead of
# testing the private ``firebase_admin._apps`` registry) guarantees that
# ``default_app`` is always bound, even if some other Firebase app was
# initialised first.
try:
    default_app = firebase_admin.get_app(name="default")
except ValueError:
    # get_app raises ValueError when no app with that name has been created yet.
    cred_default = credentials.Certificate(cred_default_path)
    default_app = firebase_admin.initialize_app(cred_default, name="default")

# Connect Firestore through the initialised app
db = firestore.client(app=default_app)
collection_Data_EN = db.collection('Data_EN')
collection_Genres_DB = db.collection('Genres_DB')

In [5]:
# Initialise Vertex AI
PROJECT_ID = "bubbo-dfba0"
REGION = "us-central1"  # Region where the service is hosted
MODEL_ID = "text-multilingual-embedding-002"

vertexai.init(project=PROJECT_ID, location=REGION)

# Load the text-embedding model identified by MODEL_ID
model = TextEmbeddingModel.from_pretrained(MODEL_ID)

# User Preferences CONTENT

### Extracción de Datos de user_preference (por el momento de dynamodb)

In [6]:
# Settings used to read the user preferences out of DynamoDB
CONFIG = dict(
    aws=dict(
        access_key=AWS_ACCESS_KEY,
        secret_key=AWS_SECRET_KEY,
        region='eu-west-3',
        table='User-7kkcm5dn2rb77hst5nh7gbdisa-staging',
    ),
    columns=['userId', 'favoriteMoviesIds', 'favoriteGenresIds', 'favoriteSeriesIds'],
)

# Open an AWS session with the configured credentials and region
_aws_settings = CONFIG['aws']
session = boto3.Session(
    aws_access_key_id=_aws_settings['access_key'],
    aws_secret_access_key=_aws_settings['secret_key'],
    region_name=_aws_settings['region'],
)

# Handle to the DynamoDB table named in the configuration
table = session.resource('dynamodb').Table(_aws_settings['table'])

# Values to String
def _process_value(value):
    if isinstance(value, Decimal):
        return str(int(value))
    return str(value)

# Retrieve the user preferences from DynamoDB as a DataFrame
def fetch_preferences():
    """Scan the DynamoDB table and return one row per stored item.

    Reads the module-level ``table`` and ``CONFIG``; the scan is paginated
    with ``LastEvaluatedKey`` / ``ExclusiveStartKey`` until no page is left.

    Returns:
        pandas.DataFrame with the columns ``userId``, ``favoriteMoviesIds``,
        ``favoriteGenresIds`` and ``favoriteSeriesIds``. ``userId`` is the
        stringified id; each favourites column is the ';'-joined list of ids
        ('' when the item has none). The frame always carries those four
        columns, including when the table is empty or when the scan fails with
        a ``ClientError`` (the error is printed and an empty frame returned),
        so downstream column access does not raise KeyError.
    """
    output_columns = ['userId', 'favoriteMoviesIds', 'favoriteGenresIds', 'favoriteSeriesIds']
    list_columns = output_columns[1:]

    # The projection is the same for every page, so build the parameters once
    # and only add the continuation key between pages.
    scan_params = {'ProjectionExpression': ', '.join(CONFIG['columns'])}

    try:
        items = []
        while True:
            response = table.scan(**scan_params)
            items.extend(response.get('Items', []))

            # A missing LastEvaluatedKey means this was the last page.
            start_key = response.get('LastEvaluatedKey')
            if not start_key:
                break
            scan_params['ExclusiveStartKey'] = start_key
    except ClientError as e:
        print(f"Error al conectar con DynamoDB: {e}")
        return pd.DataFrame(columns=output_columns)

    processed_data = []
    for item in items:
        row = {'userId': _process_value(item.get('userId', ''))}
        for column in list_columns:
            # `or []` also covers attributes that are present but stored as NULL.
            row[column] = ';'.join(map(_process_value, item.get(column) or []))
        processed_data.append(row)

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(processed_data, columns=output_columns)

# Run the scan once and keep the result; the cells below filter and reshape this frame
user_pref = fetch_preferences()


In [None]:
# Reset `user_pref` without having to query DynamoDB again.
# The first run snapshots the fetched frame; every later run restores that
# snapshot. Creating the snapshot here (rather than in a commented-out line)
# keeps the cell from failing with NameError on a fresh kernel.
if 'user_pref_aux' not in globals():
    user_pref_aux = user_pref.copy()
user_pref = user_pref_aux.copy()

### Transforming 'user_pref' for our purposes

In [8]:
# Clean the frame: keep only users whose id is 36 characters long and who have
# genres, favourite movies and favourite series all filled in
user_pref = (
    user_pref.loc[user_pref['userId'].str.len() == 36]
    .replace("", pd.NA)
    .dropna()
    .reset_index(drop=True)
)

In [9]:
# Draw a reproducible sample of user ids to test with later. The sample size is
# capped at the number of available rows so the cell does not raise ValueError
# when fewer than 15 users survive the cleaning step.
user_random_list = user_pref['userId'].sample(min(15, len(user_pref)), random_state=420)


In [10]:
# Pick a single user id from the sample; the draw is unseeded on purpose, so it
# changes from one test run to the next
random_user = user_random_list.sample(n=1).iat[0]
random_user

'01b900ce-b071-7059-6adf-05f64709f92e'

In [11]:
# Rows stored for the selected user (head() keeps at most the first five)
user_updating_preferences = user_pref.loc[user_pref['userId'] == str(random_user)].head()
user_updating_preferences

Unnamed: 0,userId,favoriteMoviesIds,favoriteGenresIds,favoriteSeriesIds
3929,01b900ce-b071-7059-6adf-05f64709f92e,155;19995;293660;24428;118340,9648;10749;27;18;80;16;10762;10770;10766;10767...,93405;85271;1416;456;119051
4697,01b900ce-b071-7059-6adf-05f64709f92e,27205;155;19995;293660;24428,10749;18;12;10759;28;9648;14;10765;27,93405;85271;84958;1416;18165


In [12]:
# Test only: take an existing user but make up the onboarding answers so that they
# match some of the embeddings that already exist in the bucket (delete this once the
# embeddings bucket is complete).
# NOTE: this replaces the frame built from `user_pref` in the previous cell.

user_updating_preferences = pd.DataFrame([{
    "userId": "6119d08e-6071-70d7-0018-d3eaf3ceb928",
    "favoriteMoviesIds": "1003996;10028000;1001595;299534;597",
    "favoriteGenresIds": "28;10759;80;10402;9648;99;10764;10763;27;10751",
    "favoriteSeriesIds": "1002156;100477;10002349;1402;93405"
}])

user_updating_preferences


Unnamed: 0,userId,favoriteMoviesIds,favoriteGenresIds,favoriteSeriesIds
0,6119d08e-6071-70d7-0018-d3eaf3ceb928,1003996;10028000;1001595;299534;597,28;10759;80;10402;9648;99;10764;10763;27;10751,1002156;100477;10002349;1402;93405


In [13]:
# Turn the ';'-separated strings into lists so they can be expanded with explode.
# Work on a copy (the frame may be a slice of `user_pref`) and split only the
# values that are still strings, so re-running this cell is harmless instead
# of failing with AttributeError on lists that were already split.
user_updating_preferences = user_updating_preferences.copy()
for _column in ('favoriteGenresIds', 'favoriteMoviesIds', 'favoriteSeriesIds'):
    user_updating_preferences[_column] = user_updating_preferences[_column].apply(
        lambda value: value.split(';') if isinstance(value, str) else value
    )

# Expand favoriteGenresIds, favoriteMoviesIds and favoriteSeriesIds to one row per (userId, id)
user_fav_genres = user_updating_preferences[['userId', 'favoriteGenresIds']].explode('favoriteGenresIds')
user_fav_movies = user_updating_preferences[['userId', 'favoriteMoviesIds']].explode('favoriteMoviesIds')
user_fav_series = user_updating_preferences[['userId', 'favoriteSeriesIds']].explode('favoriteSeriesIds')

# Unique ids that have to be fetched from the embeddings bucket
movie_ids = user_fav_movies['favoriteMoviesIds'].dropna().unique().tolist()
series_ids = user_fav_series['favoriteSeriesIds'].dropna().unique().tolist()


In [14]:
# Fetch only the JSON files that are needed: list the bucket once, then keep
# the blobs whose name ("<id>.json") corresponds to one of the user's ids.
blobs = list(bucket.list_blobs())

# Sets give constant-time membership tests; with lists every blob in the
# bucket was compared against every favourite id.
_movie_id_set = set(movie_ids)
_series_id_set = set(series_ids)

movies_matching_blobs = [blob for blob in blobs if blob.name.removesuffix('.json') in _movie_id_set]
series_matching_blobs = [blob for blob in blobs if blob.name.removesuffix('.json') in _series_id_set]

def getting_matching_blobs(matching_blobs, example_data):
    """Download each blob's JSON and append its id and embedding to ``example_data``.

    Args:
        matching_blobs: iterable of blob-like objects exposing a ``name``
            attribute and a ``download_as_text()`` method that returns JSON text.
        example_data: DataFrame the new rows are appended to. It is not
            modified in place.

    Returns:
        A DataFrame made of ``example_data`` followed by one row per blob with
        the columns ``ID`` (blob name without the '.json' suffix) and
        ``embedding`` (the value under the 'embedding' key, ``None`` when the
        key is absent). When ``matching_blobs`` is empty, ``example_data``
        itself is returned.
    """
    # Collect plain records first and build the frame once: concatenating
    # inside the loop copies the accumulated frame on every iteration.
    records = [
        {
            'ID': blob.name.removesuffix('.json'),
            # .get() avoids a KeyError when the JSON has no 'embedding' key
            'embedding': json.loads(blob.download_as_text()).get('embedding'),
        }
        for blob in matching_blobs
    ]

    if not records:
        return example_data

    return pd.concat([example_data, pd.DataFrame(records)], ignore_index=True)

# Download the matching embeddings, starting each table from an empty
# (ID, embedding) frame
user_fav_movies_embeddings = getting_matching_blobs(
    movies_matching_blobs,
    pd.DataFrame(columns=['ID', 'embedding']),
)
user_fav_series_embeddings = getting_matching_blobs(
    series_matching_blobs,
    pd.DataFrame(columns=['ID', 'embedding']),
)

# Stack movies and series into a single table (TODO: decide whether to keep this)
user_updating_preferences_data = pd.concat(
    [user_fav_movies_embeddings, user_fav_series_embeddings],
    ignore_index=True,
)


In [15]:
# Display the combined movie and series embeddings retrieved for the user
user_updating_preferences_data

Unnamed: 0,ID,embedding
0,1001595,"[-0.006456031929701567, 0.007282349746674299, ..."
1,10028000,"[-0.018329104408621788, -0.027649477124214172,..."
2,1003996,"[-0.005377643741667271, 0.036034710705280304, ..."
3,10002349,"[-0.034691113978624344, 0.042049847543239594, ..."
4,1002156,"[-0.010540147311985493, -0.01704169251024723, ..."
5,100477,"[-0.06357842683792114, -0.005006975494325161, ..."


# Working with Similarities (FROM HERE IS OLD! Must be re-done)

IndentationError: unexpected indent (1802731378.py, line 3)

### Movie Similarities

In [None]:
### >>>>>> DO NOT RUN THIS CELL LOCALLY: IT CAN TAKE HOURS <<<<<<<< ###
# NOTE(review): `all_embeddings_dict`, `movies_embeddings_dict` and `filtered_data`
# are not defined in the visible cells above; they must come from elsewhere
# before this cell can run — TODO confirm where they are built.

# Convert the embeddings to a numpy array for faster computation
all_embeddings_from_filtered_data_array = np.array(list(all_embeddings_dict.values()))

# Dictionary that stores the recommendations
movies_recommendations_dict = {}

# Build the recommendations for every user in movies_embeddings_dict
for user_id, user_embedding in movies_embeddings_dict.items():
    user_embedding_array = np.array(user_embedding).reshape(1, -1)  # Force a single-row 2-D array
    movies_content_similarities = cosine_similarity(user_embedding_array, all_embeddings_from_filtered_data_array).flatten()
    
    # Sort by similarity (highest first) and keep the top 10
    movies_most_similar_indexes = movies_content_similarities.argsort()[::-1][:10]
    
    # Turn the positions into real IDs
    # NOTE(review): assumes the rows of `filtered_data` are in the same order as
    # `all_embeddings_dict.values()` — confirm.
    movies_recommended_ids = [filtered_data.iloc[i]['ID'] for i in movies_most_similar_indexes]
    
    # Store in the dictionary
    movies_recommendations_dict[user_id] = movies_recommended_ids


# Show an example (takes the first user; fails with IndexError if the dict is empty)
print("Ejemplo de recomendaciones para un usuario en Movies:")
example_user = list(movies_recommendations_dict.keys())[0]
print(f"Usuario: {example_user} - Recomendaciones: {movies_recommendations_dict[example_user]}")




NameError: name 'all_embeddings_dict' is not defined

### Tv Shows (Series) Similarities

In [None]:
### >>>>>> DO NOT RUN THIS CELL LOCALLY: IT CAN TAKE HOURS <<<<<<<< ###
# NOTE(review): `series_embeddings_dict` and `filtered_data` are not defined in
# the visible cells; `all_embeddings_from_filtered_data_array` is created by the
# movies cell, which therefore has to run first.
# Dictionary for the recommendations
series_recommendations_dict = {}

# Number of series to recommend per user
number_target_series_recommend = 25

# Recommendations for every user in series_embeddings_dict
for user_id, user_embedding in series_embeddings_dict.items():
    user_embedding_array = np.array(user_embedding).reshape(1, -1)  # Force a single-row 2-D array
    series_content_similarities = cosine_similarity(user_embedding_array, all_embeddings_from_filtered_data_array).flatten()
    
    # Sort by similarity (highest first) and keep the target number of items
    series_most_similar_indexes = series_content_similarities.argsort()[::-1][:number_target_series_recommend]
    
    # Turn the positions into real IDs
    # NOTE(review): assumes the rows of `filtered_data` line up with the rows of
    # the embeddings array — confirm.
    series_recommended_ids = [filtered_data.iloc[i]['ID'] for i in series_most_similar_indexes]
    
    # Store in the dictionary
    series_recommendations_dict[user_id] = series_recommended_ids

# Show an example (takes the first user; fails with IndexError if the dict is empty)
print("\nEjemplo de recomendaciones para un usuario en Series:")
example_user = list(series_recommendations_dict.keys())[0]
print(f"Usuario: {example_user} - Recomendaciones: {series_recommendations_dict[example_user]}")


# Recommendations

In [None]:
# Persist both recommendation dictionaries as indented JSON files in the
# current working directory
with open("movies_recommendations.json", mode="w") as movies_file:
    json.dump(movies_recommendations_dict, fp=movies_file, indent=4)

with open("series_recommendations.json", mode="w") as series_file:
    json.dump(series_recommendations_dict, fp=series_file, indent=4)

print("Recomendaciones guardadas en JSON")
