In [18]:
import ast
import boto3
import firebase_admin
import json
import math
import numpy as np
import os
import pandas as pd
import requests
import time
import torch
import torch.nn.functional as F

from botocore.exceptions import ClientError
from decimal import Decimal
from dotenv import load_dotenv
from firebase_admin import credentials, firestore
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
#from torch.nn.functional import cosine_similarity

In [19]:
# uploading the environment variables and get the API key
load_dotenv()
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")

In [20]:
# Genres database, local by now
df_genres = pd.read_csv(r'generos.csv', sep=',')


### Content Data from FireBase (remains missing the conextion to firebase collection)

In [21]:
# Start Firebase if it's not done
if not firebase_admin._apps:
    cred_path = r'C:\Users\Agustín\OneDrive\Formación\2. Practicas\Data Scientist & ML Engineer\bubbo-dfba0-firebase-adminsdk-fbsvc-79dc4511e7.json'  
    cred = credentials.Certificate(cred_path)
    firebase_admin.initialize_app(cred)

In [22]:
# Firestore conexion and db collection name
db = firestore.client()
collection_ref = db.collection('Data_Clean')
# to get it all
docs = collection_ref.stream()
# documents to dictionaries
data = [{**doc.to_dict(), 'id': doc.id} for doc in docs]
df = pd.DataFrame(data)

In [None]:
# Cuando corro esta celda, "reseteo" a filtered_data
filtered_data = df
filtered_data = filtered_data.replace("",pd.NA)
filtered_data = filtered_data.dropna()                                                                
filtered_data = filtered_data.drop_duplicates()
filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 141207 entries, 0 to 141441
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   CleanTitle    141207 non-null  object
 1   PlatformName  141207 non-null  object
 2   ID            141207 non-null  object
 3   Genre         141207 non-null  object
 4   Type          141207 non-null  object
 5   Synopsis      141207 non-null  object
 6   Cast          141207 non-null  object
 7   Directors     141207 non-null  object
 8   id            141207 non-null  object
dtypes: object(9)
memory usage: 10.8+ MB


### User Preferences from DynamoDB

In [51]:
# To get the info from DynamoDB, user preferences
CONFIG = {
    'aws': {
        'access_key': AWS_ACCESS_KEY,
        'secret_key': AWS_SECRET_KEY,
        'region': 'eu-west-3',
        'table': 'User-7kkcm5dn2rb77hst5nh7gbdisa-staging'
    },
    'columns': ['userId', 'favoriteMoviesIds', 'favoriteGenresIds', 'favoriteSeriesIds'],
}

# conexion
session = boto3.Session(
    aws_access_key_id=CONFIG['aws']['access_key'],
    aws_secret_access_key=CONFIG['aws']['secret_key'],
    region_name=CONFIG['aws']['region']
)

table = session.resource('dynamodb').Table(CONFIG['aws']['table'])

# Values to String
def _process_value(value):
    if isinstance(value, Decimal):
        return str(int(value))
    return str(value)

# Retrive info from DynamoDB and gets a DataFrame
def fetch_preferences():
    try:
        items = []
        start_key = None

        while True:
            # scan with defined 'columns'  in previous 'CONFIG'
            scan_params = {
                'ProjectionExpression': ', '.join(CONFIG['columns'])
            }
            if start_key:
                scan_params['ExclusiveStartKey'] = start_key

            response = table.scan(**scan_params)
            items.extend(response.get('Items', []))

            # check for next pages
            start_key = response.get('LastEvaluatedKey')
            if not start_key:
                break

        # data extracted processing
        processed_data = [{
            'userId': _process_value(item.get('userId', '')),
            'favoriteMoviesIds': ';'.join(map(_process_value, item.get('favoriteMoviesIds', []))),     ###################### DIRECTOR MAS CAST HAY QUE TRAER DESPUES CUANDO COMPLETO EL DF LUEGO DE FILTERED_DATA
            'favoriteGenresIds': ';'.join(map(_process_value, item.get('favoriteGenresIds', []))),
            'favoriteSeriesIds': ';'.join(map(_process_value, item.get('favoriteSeriesIds', [])))
        } for item in items]

        df = pd.DataFrame(processed_data)
        return df

    except ClientError as e:
        print(f"Error al conectar con DynamoDB: {e}")
        return pd.DataFrame()

# calling function to get the df
user_pref = fetch_preferences()


In [52]:
# limpio el dataframe dejando solo users con genero, movie_favs y tvshow_favs
user_pref = user_pref[user_pref['userId'].str.len()==36]
user_pref = user_pref.replace("",pd.NA)
user_pref = user_pref.dropna()             
user_pref.reset_index(inplace=True,drop=True)
print(f'Duplicates: {user_pref.duplicated().sum()}')
user_pref.info()

Duplicates: 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8879 entries, 0 to 8878
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   userId             8879 non-null   object
 1   favoriteMoviesIds  8879 non-null   object
 2   favoriteGenresIds  8879 non-null   object
 3   favoriteSeriesIds  8879 non-null   object
dtypes: object(4)
memory usage: 277.6+ KB


In [54]:
# Ahora tengo que traer tambien las Cast y Director

# convertir los valores en listas para expandirlos con explode
user_pref['favoriteGenresIds'] = user_pref['favoriteGenresIds'].apply(lambda x: x.split(';'))
user_pref['favoriteMoviesIds'] = user_pref['favoriteMoviesIds'].apply(lambda x: x.split(';'))
user_pref['favoriteSeriesIds'] = user_pref['favoriteSeriesIds'].apply(lambda x: x.split(';'))

# expandir preferencias de favoriteMoviesIds, favoriteGenresIds, y favoriteSeriesIds por userId
user_fav_genres = user_pref[['userId','favoriteGenresIds']].explode('favoriteGenresIds')
user_fav_movies = user_pref[['userId','favoriteMoviesIds']].explode('favoriteMoviesIds')
user_fav_series = user_pref[['userId','favoriteSeriesIds']].explode('favoriteSeriesIds')


# merge para traerme los CleanTitle, Synopsis, 'Genre'
user_fav_genres['favoriteGenresIds'] = user_fav_genres['favoriteGenresIds'].astype(int)
user_fav_genres = user_fav_genres.merge(df_genres[['genero_id','genero_name']], left_on='favoriteGenresIds', right_on='genero_id') 


filtered_data = filtered_data.dropna(subset=['ID'])                                                                                    # <- Nuevo agregado# <- Nuevo agregado# <- Nuevo agregado
filtered_data['ID'] = filtered_data['ID'].astype(str)                                                                                   # <- Nuevo agregado# <- Nuevo agregado# <- Nuevo agregado

user_fav_movies = user_fav_movies.merge(filtered_data[['ID','CleanTitle','Synopsis', 'Cast', 'Directors']], left_on='favoriteMoviesIds', right_on='ID', how='left')  ###### en esta y lasig fila agregué synopsis
user_fav_series = user_fav_series.merge(filtered_data[['ID','CleanTitle','Synopsis', 'Cast', 'Directors']], left_on='favoriteSeriesIds', right_on='ID', how='left')

user_fav_genres = user_fav_genres.drop(columns='genero_id')
user_fav_genres.rename(columns={'genero_name':'Genres'}, inplace=True)
user_fav_movies = user_fav_movies.drop(columns='ID')
user_fav_movies.rename(columns={'CleanTitle':'Movies_Titles', 'Synopsis':'Movies_Synopsis', 'Cast':'Movies_Cast', 'Directors':'Movies_Directors'}, inplace=True)
user_fav_series = user_fav_series.drop(columns='ID')
user_fav_series.rename(columns={'CleanTitle':'Series_Titles', 'Synopsis':'Series_Synopsis', 'Cast':'Series_Cast', 'Directors':'Series_Directors'}, inplace=True)

# reAGRUPO por userId para que me queden las listas CleanTitle, Synopsis, 'Genre' por user segun sus favoriteMoviesIds, favoriteGenresIds, y favoriteSeriesIds por userId
user_fav_genres = user_fav_genres.groupby('userId')[['favoriteGenresIds','Genres']].agg(list).reset_index()
user_fav_movies = user_fav_movies.groupby('userId')[['favoriteMoviesIds','Movies_Titles', 'Movies_Synopsis', 'Movies_Cast', 'Movies_Directors']].agg(list).reset_index()
user_fav_series = user_fav_series.groupby('userId')[['favoriteSeriesIds','Series_Titles', 'Series_Synopsis', 'Series_Cast', 'Series_Directors']].agg(list).reset_index()


In [56]:
#termino de acomodar 'user_pref' para dar paso a los embeddings
user_pref = user_pref.merge(user_fav_genres, left_on='userId', right_on='userId').drop(columns=['favoriteGenresIds_y'])
user_pref.rename(columns={'favoriteGenresIds_x':'favoriteGenresIds'},inplace=True)
user_pref = user_pref.merge(user_fav_movies, left_on='userId', right_on='userId').drop(columns=['favoriteMoviesIds_y'])
user_pref.rename(columns={'favoriteMoviesIds_x':'favoriteMoviesIds'},inplace=True)
user_pref = user_pref.merge(user_fav_series, left_on='userId', right_on='userId').drop(columns=['favoriteSeriesIds_y'])
user_pref.rename(columns={'favoriteSeriesIds_x':'favoriteSeriesIds'},inplace=True)
user_pref = user_pref.reindex(['userId', 'favoriteGenresIds', 'Genres', 'favoriteMoviesIds', 'Movies_Titles','Movies_Synopsis', 'Movies_Cast', 'Movies_Directors', 'favoriteSeriesIds', 'Series_Titles', 'Series_Synopsis', 'Series_Cast', 'Series_Directors'], axis=1)

### Sentences for vectorize from 'filtered_data' / AHORA DESDE FIREBASE

In [60]:
# Making the sentences to embed 
filtered_data['sentences_to_embed'] = (
    filtered_data.CleanTitle.fillna('') +
    filtered_data.Synopsis.fillna('') +
    filtered_data.Genre.fillna('').apply(
        lambda x: ', '.join(ast.literal_eval(x)) if x.startswith('[') and x.endswith(']') else x ) +
    filtered_data.Cast.fillna('') +
    filtered_data.Directors.fillna('')
)

ids_from_filtered_data = filtered_data['ID'].tolist()  # Guardamos los IDs
sentences_from_filtered_data = filtered_data['sentences_to_embed'].dropna().astype(str).tolist()



In [61]:
# Since the model on Hugging Face processes only requests that can be completed within 60 seconds, we need to divide the sentences into batches.
def split_into_batches(sentences, batch_size):
    return [sentences[i:i + batch_size] for i in range(0, len(sentences), batch_size)]

# After trying with different values, we've reach the maximum batch size to get response succesfully
batches = split_into_batches(sentences_from_filtered_data, 75)

In [66]:
# Check key availability
if HUGGINGFACE_API_KEY is None:
    print("Error: No se encontró la clave de API de Hugging Face.")
else:
    print("Clave de API cargada correctamente.")

# Model URL
API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"

# API header and key
headers = {"Authorization": f"Bearer {HUGGINGFACE_API_KEY}"}  

# Function to get embeddings from Hugging Face API
def get_embeddings_from_api(sentences):
    url = API_URL
    payload = {"inputs": sentences}
    
    response = requests.post(url, headers=headers, json=payload, timeout=10)
    
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error {response.status_code}: {response.text}")
        return None
    

########################################################################################################################## NUEVO DESDE AQUI EN ESTA CELDA, mantengo los IDs

all_embeddings_from_filtered_data = []
for batch in batches:
    print(f"Processing batch with {len(batch)} sentences...")
    embeddings = get_embeddings_from_api(batch)
    if embeddings:
        all_embeddings_from_filtered_data.extend(embeddings)

# Asociamos cada embedding con su respectivo ID
all_embeddings_dict = {id_: emb for id_, emb in zip(ids_from_filtered_data, all_embeddings_from_filtered_data)}

print("Embeddings processed successfully:")
print(list(all_embeddings_dict.items())[:2])  # Muestra los primeros pares ID - embedding


Clave de API cargada correctamente.
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing batch with 75 sentences...
Processing bat

In [69]:
# Verificar cuántos embeddings se procesaron correctamente
num_processed_batches = len(all_embeddings_from_filtered_data) // 100  # 100 es el tamaño de cada batch

print(f"Se procesaron {num_processed_batches} batches exitosamente.")

# Ver el último batch procesado con éxito
if num_processed_batches > 0:
    print("Último batch procesado correctamente:")
    print(batches[num_processed_batches - 1])
else:
    print("No se procesó ningún batch correctamente.")


Se procesaron 750 batches exitosamente.
Último batch procesado correctamente:
['Love RiceAt the Kokuritsu Inaho Academy, five new students try to supplant bread as the popular grain at school. The new students form the Love Rice team and dare themselves to perform at the Harvest Show to convey the tasty appeal of rice grains.AnimeNo CastNo Directors', 'Pirata & CapitanoPirata y Murana buscan el tesoro del pirata Sardina, como Pirata va ganando con ayuda de Weboo, Murana lo aleja enviándole una invitación falsa para un concurso de música. Pero al llegar al lugar del tesoro, ¡ambas necesitan a Weboo para entrar!ChildrenKaycie Chase; Tiffany HofstetterFrançois Narboux', 'Memorial de Maria MouraMaria Moura perde o pai na infância e aos 17 anos encontra a mãe morta em casa. Acreditando ser o padrasto o assassino, Maria procura se vingar a morte da sua mãe.DramaNo CastNo Directors', 'Un conte peut en cacher un autreVersions animées des poèmes de Roald Dahl, un auteur de livres pour enfants t

In [None]:
import pandas as pd

# Cargar el archivo manualmente y procesarlo línea por línea
file_path = "../../../batches_error.csv"

with open(file_path, "r", encoding="utf-8") as file:
    lines = file.readlines()

# Convertir a DataFrame
df = pd.DataFrame({"output": [line.strip() for line in lines]})

# Identificar los índices de los batches fallidos
failed_batches = []
batch_index = 1  # Inicializa el contador de batches

for line in df["output"]:
    if "Processing batch with" in line:
        current_batch = batch_index
        batch_index += 1
    elif "Error" in line:
        failed_batches.append(current_batch)

# Mostrar los batches que deben reintentarse
print("Batches que deben reintentarse MAÑANA:", failed_batches)


Batches que deben reintentarse: [165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169,

### Sentences for vectorize from 'user_preferences' / AHORA DESDE DYNAMODB

In [65]:
# Sentences we want to be embedded from user_preferences MOVIES
user_pref['movies_sentences_to_embed'] = (user_pref.Movies_Titles.fillna('') +
                                   user_pref.Movies_Synopsis.fillna('')+
                                   user_pref.Genres.fillna('') +
                                   user_pref.Movies_Cast.fillna('') +
                                   user_pref.Movies_Directors.fillna(''))

# Sentences we want to be embedded from user_preferences SERIES
user_pref['series_sentences_to_embed'] = (user_pref.Series_Titles.fillna('') + 
                                   user_pref.Series_Synopsis.fillna('') +
                                   user_pref.Genres.fillna('') +
                                   user_pref.Series_Cast.fillna('') +
                                   user_pref.Series_Directors.fillna(''))

############################################################################################################## NUEVO BBBB
# Guardar userId junto con la oración a vectorizar
movies_sentences_from_user_pref = user_pref[['userId', 'movies_sentences_to_embed']].dropna().astype(str)
series_sentences_from_user_pref = user_pref[['userId', 'series_sentences_to_embed']].dropna().astype(str)
############################################################################################################## NUEVO BBBB CIERRO

# We split the sentences in batches as we did previously with filtered_data
movies_batches_user_pref = split_into_batches(movies_sentences_from_user_pref, 75)
series_batches_user_pref = split_into_batches(series_sentences_from_user_pref, 75)


In [None]:
############################################################################################################## NUEVO CCCC
movies_embeddings_dict = {}  # Diccionario para almacenar {userId: embedding}
for batch in movies_batches_user_pref:
    print(f"Processing movie batch with {len(batch)} sentences...")
    
    batch_user_ids = batch['userId'].tolist()
    batch_sentences = batch['movies_sentences_to_embed'].tolist()
    
    embeddings = get_embeddings_from_api(batch_sentences)
    if embeddings:
        movies_embeddings_dict.update({uid: emb for uid, emb in zip(batch_user_ids, embeddings)})

print("Movies Embeddings processed successfully.")
############################################################################################################## NUEVO CCCC CIERRO

In [None]:
############################################################################################################## NUEVO DDDD
series_embeddings_dict = {}  # Diccionario para almacenar {userId: embedding}
for batch in series_batches_user_pref:
    print(f"Processing series batch with {len(batch)} sentences...")
    
    batch_user_ids = batch['userId'].tolist()
    batch_sentences = batch['series_sentences_to_embed'].tolist()
    
    embeddings = get_embeddings_from_api(batch_sentences)
    if embeddings:
        series_embeddings_dict.update({uid: emb for uid, emb in zip(batch_user_ids, embeddings)})

print("Series Embeddings processed successfully.")
############################################################################################################## NUEVO DDDD CIERRO

### Movies Similarities 

In [None]:
# All this cell content is about movies
# We have to convert the embeddings list to numpy arrays in order to calculate cosine similarities wiht sklearn
movies_embeddings_from_user_pref_array = np.array(movies_embeddings_from_user_pref)
all_embeddings_from_filtered_data_array = np.array(all_embeddings_from_filtered_data)

user_for_example = 2

# Taking first user as example to calculate the cosine_similarity
movies_user_embedding_example = movies_embeddings_from_user_pref_array[user_for_example].reshape(1, -1)  # Asegurar forma correcta para cosine_similarity

# To calculate similarity between the user example embeding and the whole content from filtered data
movies_content_similarities = cosine_similarity(movies_user_embedding_example, all_embeddings_from_filtered_data_array).flatten()

# Sort indexes by similarity
movies_most_similar_indexes = movies_content_similarities.argsort()[::-1]

# Top-10
movies_topten_most_similar_indexes = movies_most_similar_indexes[:10]

In [None]:
# To display the movies most similar indexes and their similarity scores
print("Movies Most similar indexes:", movies_most_similar_indexes[:10])
print("Movies Highest similarities:", movies_content_similarities[movies_most_similar_indexes[:10]])

### Tv Shows (Series) Similarities

In [None]:
# All this cell content is about series 
# We have to convert the embeddings list to numpy arrays in order to calculate cosine similarities wiht sklearn
series_embeddings_from_user_pref_array = np.array(series_embeddings_from_user_pref)
'''Estas lineas comentadas ya las hicen en movies, las dejo para verlas nada mas.
all_embeddings_from_filtered_data_array = np.array(all_embeddings_from_filtered_data) esta linea ya la hice en movies

user_for_example = 1'''

# Taking first user as example to calculate the cosine_similarity
series_user_embedding_example = series_embeddings_from_user_pref_array[user_for_example].reshape(1, -1)  # forma correcta para cosine_similarity

# To calculate similarity between the user example embeding and the whole content from filtered data
series_content_similarities = cosine_similarity(series_user_embedding_example, all_embeddings_from_filtered_data_array).flatten()

# Sort indexes by similarity
series_most_similar_indexes = series_content_similarities.argsort()[::-1]

# Top-10
series_topten_most_similar_indexes = series_most_similar_indexes[:10]

In [None]:
# To display the series most similar indexes and their similarity scores
print("Series Most similar indexes:", series_most_similar_indexes[:10])
print("Series Highest similarities:", series_content_similarities[series_most_similar_indexes[:10]])

# Recomendations

In [None]:
# search most_similar_indexes, and preferences, and get recommendations
user_id = user_pref.loc[user_for_example]['userId']
movies_preferred = user_pref[user_pref['userId']==user_id]['Movies_Titles']
series_preferred = user_pref[user_pref['userId']==user_id]['Series_Titles']

# Displaying Preferences & Recommendations
print(f'''User {user_id} Preferences:
      ''')
print(f'''Movies preference:''')
for movie in movies_preferred.iloc[0]:# .split(';'):
    print(f'      {movie.strip()}')

print(f'''TV Shows preference:''')
for series in series_preferred.iloc[0]:# .split(';'):
    print(f'''      {series.strip()}
          ''')

print(f'''Recomendations for user: 
      {user_id}
      ''')

################################################################################################################# NUEVO AAAA
# Convertimos los índices más similares en IDs reales
movies_recommended_ids = [ids_from_filtered_data[i] for i in movies_topten_most_similar_indexes]
series_recommended_ids = [ids_from_filtered_data[i] for i in series_topten_most_similar_indexes]
################################################################################################################# CIERRO NUEVO AAAA

# Ahora buscamos los títulos usando los IDs reales
movies_recomendations_user = filtered_data[filtered_data['ID'].isin(movies_recommended_ids)]['CleanTitle']
series_recomendations_user = filtered_data[filtered_data['ID'].isin(series_recommended_ids)]['CleanTitle']


print('Movies Recommendations:')
for recommendation in movies_recomendations_user:
    print(f'      {recommendation}')

print('Tv Shows Recommendations:')
for recommendation in series_recomendations_user:
    print(f'      {recommendation}')