In [99]:
import ast
import boto3
import firebase_admin
import json
import math
import numpy as np
import os
import pandas as pd
import requests
import time
import torch
import torch.nn.functional as F

from botocore.exceptions import ClientError
from decimal import Decimal
from dotenv import load_dotenv
from firebase_admin import credentials, firestore
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
#from torch.nn.functional import cosine_similarity

In [100]:
# Load the variables declared in the .env file, then read the credentials from the environment
load_dotenv()
HUGGINGFACE_API_KEY, AWS_ACCESS_KEY, AWS_SECRET_KEY = (
    os.environ.get(variable_name)
    for variable_name in ("HUGGINGFACE_API_KEY", "AWS_ACCESS_KEY", "AWS_SECRET_KEY")
)

In [101]:
# Genre catalogue, read from a local comma-separated file for the time being
df_genres = pd.read_csv('generos.csv')


### Content Data from Firebase (the connection to the Firebase collection is still pending)

In [102]:
# Initialise the Firebase Admin SDK only once per kernel (re-initialising the default app fails).
# The service-account key location is taken from the FIREBASE_CREDENTIALS_PATH environment
# variable (it can live in the same .env file as the other secrets); the previous hard-coded
# location is kept as the fallback so existing setups keep working.
if not firebase_admin._apps:
    cred_path = os.getenv(
        'FIREBASE_CREDENTIALS_PATH',
        r'C:\Users\Agustín\OneDrive\Formación\2. Practicas\Data Scientist & ML Engineer\bubbo-dfba0-firebase-adminsdk-fbsvc-79dc4511e7.json',
    )  # make sure this path points to a valid service-account key
    cred = credentials.Certificate(cred_path)
    firebase_admin.initialize_app(cred)

In [103]:
# Firestore client and the collection that holds the content catalogue
db = firestore.client()
collection_ref = db.collection('Data_Clean')
# Stream every document of the collection
docs = collection_ref.stream()
# One plain dict per document, with the Firestore document id stored under 'id'
data = [dict(doc.to_dict(), id=doc.id) for doc in docs]
df = pd.DataFrame(data)

In [104]:
# Treat empty strings as missing values, then drop incomplete rows and exact duplicates
filtered_data = (
    df
    .replace("", pd.NA)
    .dropna()
    .drop_duplicates()
)
filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 141207 entries, 0 to 141441
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   CleanTitle    141207 non-null  object
 1   PlatformName  141207 non-null  object
 2   ID            141207 non-null  object
 3   Genre         141207 non-null  object
 4   Type          141207 non-null  object
 5   Synopsis      141207 non-null  object
 6   Cast          141207 non-null  object
 7   Directors     141207 non-null  object
 8   id            141207 non-null  object
dtypes: object(9)
memory usage: 10.8+ MB


### User Preferences from DynamoDB

In [183]:
# Settings used to read the user preferences from DynamoDB
CONFIG = dict(
    aws=dict(
        access_key=AWS_ACCESS_KEY,
        secret_key=AWS_SECRET_KEY,
        region='eu-west-3',
        table='User-7kkcm5dn2rb77hst5nh7gbdisa-staging',
    ),
    columns=['userId', 'favoriteMoviesIds', 'favoriteGenresIds', 'favoriteSeriesIds'],
)

# boto3 session built from the credentials and region configured above
session = boto3.Session(
    region_name=CONFIG['aws']['region'],
    aws_access_key_id=CONFIG['aws']['access_key'],
    aws_secret_access_key=CONFIG['aws']['secret_key'],
)

# Handle to the configured DynamoDB table
table = session.resource('dynamodb').Table(CONFIG['aws']['table'])

# Convert a scanned attribute value to text
def _process_value(value):
    """Return ``value`` as a string.

    ``Decimal`` values are converted with ``int()`` first, so they are rendered
    as whole numbers; every other value goes through ``str()`` unchanged.
    """
    return str(int(value)) if isinstance(value, Decimal) else str(value)

# Retrieve the user preferences from DynamoDB and return them as a DataFrame
def fetch_preferences():
    """Scan the configured DynamoDB table and return one row per item.

    The scan is paginated: it is repeated with ``ExclusiveStartKey`` until the
    response no longer carries a ``LastEvaluatedKey``.

    Returns:
        pandas.DataFrame with the columns ``userId``, ``favoriteMoviesIds``,
        ``favoriteGenresIds`` and ``favoriteSeriesIds``. The three favourites
        columns hold the ids joined with ``';'``. The columns are present even
        when the table yields no items or the scan fails, so the cleaning steps
        that follow can always select them.
    """
    output_columns = ['userId', 'favoriteMoviesIds', 'favoriteGenresIds', 'favoriteSeriesIds']

    def _joined_ids(item, key):
        # ``or []`` also covers an attribute that is present but stored as NULL
        return ';'.join(map(_process_value, item.get(key) or []))

    try:
        # The projection does not change between pages, so it is built once
        scan_params = {'ProjectionExpression': ', '.join(CONFIG['columns'])}
        items = []

        while True:
            response = table.scan(**scan_params)
            items.extend(response.get('Items', []))

            # A LastEvaluatedKey means there are more pages to read
            start_key = response.get('LastEvaluatedKey')
            if not start_key:
                break
            scan_params['ExclusiveStartKey'] = start_key

    except ClientError as e:
        print(f"Error al conectar con DynamoDB: {e}")
        return pd.DataFrame(columns=output_columns)

    processed_data = [{
        'userId': _process_value(item.get('userId', '')),
        'favoriteMoviesIds': _joined_ids(item, 'favoriteMoviesIds'),
        'favoriteGenresIds': _joined_ids(item, 'favoriteGenresIds'),
        'favoriteSeriesIds': _joined_ids(item, 'favoriteSeriesIds'),
    } for item in items]

    return pd.DataFrame(processed_data, columns=output_columns)

# Call the function to get the preferences DataFrame
user_pref = fetch_preferences()


In [184]:
# Clean the frame: keep only users with a 36-character userId that have genres,
# favourite movies and favourite series all filled in
user_pref = (
    user_pref
    .loc[lambda frame: frame['userId'].str.len() == 36]
    .replace("", pd.NA)
    .dropna()
    .reset_index(drop=True)
)
print(f'Duplicates: {user_pref.duplicated().sum()}')
user_pref.info()

Duplicates: 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8144 entries, 0 to 8143
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   userId             8144 non-null   object
 1   favoriteMoviesIds  8144 non-null   object
 2   favoriteGenresIds  8144 non-null   object
 3   favoriteSeriesIds  8144 non-null   object
dtypes: object(4)
memory usage: 254.6+ KB


In [185]:
# Work-in-progress cell (not yet finished for production use)
# The synopsis of every title is brought in as well so the embeddings can be built properly
# NOTE(review): this cell overwrites the three favourites columns of 'user_pref' with lists,
# so running it a second time on the same frame calls .split on lists and fails.


# Turn the ';'-separated strings into lists so they can be expanded with explode
user_pref['favoriteGenresIds'] = user_pref['favoriteGenresIds'].apply(lambda x: x.split(';'))
user_pref['favoriteMoviesIds'] = user_pref['favoriteMoviesIds'].apply(lambda x: x.split(';'))
user_pref['favoriteSeriesIds'] = user_pref['favoriteSeriesIds'].apply(lambda x: x.split(';'))

# Expand favoriteMoviesIds, favoriteGenresIds and favoriteSeriesIds into one row per (userId, id)
user_fav_genres = user_pref[['userId','favoriteGenresIds']].explode('favoriteGenresIds')
user_fav_movies = user_pref[['userId','favoriteMoviesIds']].explode('favoriteMoviesIds')
user_fav_series = user_pref[['userId','favoriteSeriesIds']].explode('favoriteSeriesIds')


# Merges that bring in the genre names and the CleanTitle / Synopsis of each favourite.
# No 'how' is passed, so pandas performs inner joins: ids without a match are dropped.
user_fav_genres['favoriteGenresIds'] = user_fav_genres['favoriteGenresIds'].astype(int)
user_fav_genres = user_fav_genres.merge(df_genres[['genero_id','genero_name']], left_on='favoriteGenresIds', right_on='genero_id')

# NOTE(review): both merges below match against the whole catalogue and never look at its
# 'Type' column — confirm that movie ids and series ids cannot collide in 'ID'.
user_fav_movies = user_fav_movies.merge(filtered_data[['ID','CleanTitle','Synopsis']], left_on='favoriteMoviesIds', right_on='ID')  # Synopsis was added to this merge and to the next one
user_fav_series = user_fav_series.merge(filtered_data[['ID','CleanTitle','Synopsis']], left_on='favoriteSeriesIds', right_on='ID')

# Drop the join keys coming from the right-hand frames and give the text columns per-type names
user_fav_genres = user_fav_genres.drop(columns='genero_id')
user_fav_genres.rename(columns={'genero_name':'Genres'}, inplace=True)
user_fav_movies = user_fav_movies.drop(columns='ID')
user_fav_movies.rename(columns={'CleanTitle':'Movies_Titles', 'Synopsis':'Movies_Synopsis'}, inplace=True)
user_fav_series = user_fav_series.drop(columns='ID')
user_fav_series.rename(columns={'CleanTitle':'Series_Titles', 'Synopsis':'Series_Synopsis'}, inplace=True)

# Regroup by userId so each user ends up with list-valued columns (ids, names/titles, synopses)
user_fav_genres = user_fav_genres.groupby('userId')[['favoriteGenresIds','Genres']].agg(list).reset_index()
user_fav_movies = user_fav_movies.groupby('userId')[['favoriteMoviesIds','Movies_Titles', 'Movies_Synopsis']].agg(list).reset_index()
user_fav_series = user_fav_series.groupby('userId')[['favoriteSeriesIds','Series_Titles', 'Series_Synopsis' ]].agg(list).reset_index()


In [186]:
# Preview of the per-user favourite movies
user_fav_movies.head(n=5)

Unnamed: 0,userId,favoriteMoviesIds,Movies_Titles,Movies_Synopsis
0,0109009e-50f1-703a-d2d7-a8cb2bc6ba4f,"[293660, 13, 672]","[Deadpool, Forrest Gump, Harry Potter et la ch...",[Opplev den opprinnelige historien om Wade Wil...
1,0109106e-f0e1-70e3-5f43-ff611d12dfff,"[157336, 19995, 293660, 24428, 299536]","[Interstellar, Avatar, Deadpool, Marvel's The ...","[From director Christopher Nolan (Inception, T..."
2,0109206e-4071-7016-6dc0-bc6bb9535f19,"[27205, 157336, 155, 293660, 24428]","[Inception, Interstellar, The Dark Knight, Dea...",[Dom Cobb (Leonardo DiCaprio) is a skilled thi...
3,010920ae-b001-7061-2bf2-c4d4f436aba1,"[293660, 24428, 475557, 299534, 106646]","[Deadpool, Marvel's The Avengers, Joker, Aveng...",[Opplev den opprinnelige historien om Wade Wil...
4,010920de-1051-7006-0e91-5a0db48ec63a,"[24428, 299534, 475557, 122, 68721]","[Marvel's The Avengers, Avengers: Endgame, Jok...",[Marvel makes cinematic history as it unites t...


In [187]:
# Preview of the per-user favourite series
user_fav_series.head(n=5)

Unnamed: 0,userId,favoriteSeriesIds,Series_Titles,Series_Synopsis
0,0109009e-50f1-703a-d2d7-a8cb2bc6ba4f,[1399],[Game of Thrones],[Il y a de l'orage dans l'air au royaume des S...
1,0109106e-f0e1-70e3-5f43-ff611d12dfff,"[60735, 60574]","[The Flash, Peaky Blinders]",[Fast-paced superhero drama following Central ...
2,0109206e-4071-7016-6dc0-bc6bb9535f19,[1396],[The Mirror],[Andrej Tarkovskijs løst selvbiografiske klass...
3,010920ae-b001-7061-2bf2-c4d4f436aba1,[1418],[Te Doy Mis Ojos],"[Una noche de invierno, Pilar sale huyendo de ..."
4,010920de-1051-7006-0e91-5a0db48ec63a,"[63174, 60574]","[Lucifer, Peaky Blinders]",[Lucifer kjeder seg og er ikke lykkelig som He...


In [188]:
# Finish arranging 'user_pref' before the embedding step: attach the genre names and the
# movie/series titles and synopses gathered per user. The id lists already stored in
# 'user_pref' are the ones kept, so the id columns of the right-hand frames are left out.
user_pref = (
    user_pref
    .merge(user_fav_genres.drop(columns='favoriteGenresIds'), on='userId')
    .merge(user_fav_movies.drop(columns='favoriteMoviesIds'), on='userId')
    .merge(user_fav_series.drop(columns='favoriteSeriesIds'), on='userId')
    .reindex(
        columns=[
            'userId',
            'favoriteGenresIds', 'Genres',
            'favoriteMoviesIds', 'Movies_Titles', 'Movies_Synopsis',
            'favoriteSeriesIds', 'Series_Titles', 'Series_Synopsis',
        ]
    )
)

In [191]:
# Preview of the merged user preferences
user_pref.head(n=5)

Unnamed: 0,userId,favoriteGenresIds,Genres,favoriteMoviesIds,Movies_Titles,Movies_Synopsis,favoriteSeriesIds,Series_Titles,Series_Synopsis
0,d17900ae-a001-70a3-d3bc-9463452af02c,"[28, 10759, 12, 16, 10762, 80, 99, 10764, 1076...","[Acción, Action & Adventure, Aventura, Animaci...","[155, 293660, 24428, 299536, 118340]","[The Dark Knight, Deadpool, Marvel's The Aveng...",[Having struck his first blow against the crim...,"[1399, 71446, 1402, 76479, 100088]",[Game of Thrones],[Il y a de l'orage dans l'air au royaume des S...
1,c1d9a05e-50c1-7041-a60e-9a56c092e612,"[28, 10759, 12, 16, 10762, 80, 35, 99, 10764, ...","[Acción, Action & Adventure, Aventura, Animaci...","[27205, 157336, 155, 19995, 293660]","[Inception, Interstellar, The Dark Knight, Ava...",[Dom Cobb (Leonardo DiCaprio) is a skilled thi...,"[1399, 71446, 66732, 1402, 93405]",[Game of Thrones],[Il y a de l'orage dans l'air au royaume des S...
2,c1f9d00e-6001-70e0-28b6-9fcaee14754c,"[16, 10762, 14, 10765, 878, 12, 10759, 10751]","[Animación, Kids, Fantasía, Sci-Fi & Fantasy, ...","[24428, 299536, 118340, 150540, 283995]","[Marvel's The Avengers, Avengers: Infinity War...",[Marvel makes cinematic history as it unites t...,"[66732, 93405, 456, 119051, 65334]","[Miraculous, les aventures de Ladybug et Chat ...","[Marinette et Adrien, deux collégiens parisien..."
3,f139102e-50c1-70e7-9ccb-063a02d6fe53,"[28, 10759, 12, 99, 10764, 10763, 80, 18, 27, ...","[Acción, Action & Adventure, Aventura, Documen...","[157336, 155, 293660, 24428, 299536]","[Interstellar, The Dark Knight, Deadpool, Marv...","[From director Christopher Nolan (Inception, T...","[1399, 66732, 71712, 84958, 60735]","[Game of Thrones, The Flash]",[Il y a de l'orage dans l'air au royaume des S...
4,c1c9308e-60c1-704f-d3a7-2b6cbae8986f,"[35, 10749, 10770, 10766, 10767, 9648, 14, 107...","[Comedia, Romance, Película de TV, Soap, Talk,...","[671, 13, 278, 68718, 475557]","[Forrest Gump, Frihetens Regn, Django Unchaine...",[Tom Hanks gives an astonishing performance as...,"[1399, 71446, 66732, 1402, 93405]",[Game of Thrones],[Il y a de l'orage dans l'air au royaume des S...


### Sentences to vectorize from 'filtered_data' (now loaded from Firebase)

In [158]:
# Build the sentence to embed for every catalogue row: title, synopsis and genres.
def _genre_to_text(genre):
    """Render a Genre cell as plain text.

    A value written as a Python list literal (``"['Drama', 'Crime']"``) becomes
    ``"Drama, Crime"``. Anything that cannot be parsed as a list is returned
    as-is (converted to ``str``) instead of aborting the whole cell.
    """
    if isinstance(genre, (list, tuple)):
        return ', '.join(map(str, genre))
    text = str(genre)
    if text.startswith('[') and text.endswith(']'):
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            return text
        if isinstance(parsed, (list, tuple)):
            return ', '.join(map(str, parsed))
    return text


# The three fields are joined with '. ' so the last word of one field is not fused
# with the first word of the next one.
filtered_data['sentences_to_embed'] = (
    filtered_data.CleanTitle.fillna('').astype(str).str.strip()
    + '. '
    + filtered_data.Synopsis.fillna('').astype(str).str.strip()
    + '. '
    + filtered_data.Genre.fillna('').apply(_genre_to_text)
)

# Plain list of strings, in row order, ready to be sent to the model
sentences_from_filtered_data = filtered_data.sentences_to_embed.astype(str).tolist()


In [193]:
# The hosted model only serves requests that finish within 60 seconds, so the sentences
# have to be sent in smaller batches.
def split_into_batches(sentences, batch_size):
    """Cut ``sentences`` into consecutive slices of at most ``batch_size`` items.

    Returns a list of slices in the original order; the last one may be shorter.
    """
    batches_out = []
    for start in range(0, len(sentences), batch_size):
        stop = start + batch_size
        batches_out.append(sentences[start:stop])
    return batches_out

# 100 sentences per request is the largest batch size that was found to get a successful response
batches = split_into_batches(sentences=sentences_from_filtered_data, batch_size=100)

In [194]:
# Report whether the Hugging Face API key was found
print(
    "Clave de API cargada correctamente."
    if HUGGINGFACE_API_KEY is not None
    else "Error: No se encontró la clave de API de Hugging Face."
)

# Endpoint of the feature-extraction pipeline for the sentence-transformers model
API_URL = "https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/all-MiniLM-L6-v2"

# Authorization header carrying the API key
headers = {"Authorization": "Bearer {}".format(HUGGINGFACE_API_KEY)}

# Function to get embeddings from the Hugging Face API
def get_embeddings_from_api(sentences, max_retries=3, backoff_seconds=5.0, timeout=90):
    """POST ``sentences`` to ``API_URL`` and return the decoded JSON response.

    Args:
        sentences: list of strings sent as the ``inputs`` field of the payload.
        max_retries: extra attempts made after a transient failure
            (HTTP 429/500/502/503/504 or a network error).
        backoff_seconds: base wait between attempts; the wait grows linearly
            with the attempt number.
        timeout: seconds ``requests`` waits before giving up on one attempt,
            so a stalled connection cannot hang the notebook forever.

    Returns:
        The parsed JSON body on HTTP 200, otherwise ``None`` once the attempts
        are exhausted or a non-retryable status is received.

    Raises:
        requests.exceptions.RequestException: if the last attempt fails at the
            network level.
    """
    retryable_statuses = (429, 500, 502, 503, 504)
    payload = {"inputs": sentences}
    total_attempts = max(1, int(max_retries) + 1)

    for attempt in range(1, total_attempts + 1):
        try:
            response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            if attempt == total_attempts:
                raise
            print(f"Request failed ({exc}); retrying...")
        else:
            if response.status_code == 200:
                return response.json()
            print(f"Error {response.status_code}: {response.text}")
            if response.status_code not in retryable_statuses:
                # Retrying cannot fix e.g. a bad key or a malformed payload
                return None

        if attempt < total_attempts:
            time.sleep(backoff_seconds * attempt)

    return None
    

# Embed the catalogue batch by batch, in order.
# Later cells look embeddings up by row position, so a batch must never be skipped:
# a failed batch is retried a few times and, if it still fails, the cell stops with an
# error instead of silently producing a list that no longer lines up with 'filtered_data'.
all_embeddings_from_filtered_data = []
for batch in batches:
    print(f"Processing batch with {len(batch)} sentences...")
    embeddings = None
    for _try in range(3):
        embeddings = get_embeddings_from_api(batch)
        if embeddings:
            break
        time.sleep(5)  # short pause before asking the service again
    if not embeddings or len(embeddings) != len(batch):
        raise RuntimeError(
            f"Embedding request failed for the batch starting at sentence "
            f"{len(all_embeddings_from_filtered_data)}; re-run this cell."
        )
    all_embeddings_from_filtered_data.extend(embeddings)

print("Embeddings processed successfully:")
print(all_embeddings_from_filtered_data[:2])

Clave de API cargada correctamente.
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 sentences...
Processing batch with 100 se

### Sentences to vectorize from 'user_preferences' (now loaded from DynamoDB)

In [195]:
## Original note: the code has to build the embeddings separately for
## genres + series + synopses on one side and genres + movies + synopses on the other.

def _cell_to_text(cell, separator=', '):
    """Flatten one cell (a list of values, a scalar or a missing value) into plain text."""
    if isinstance(cell, (list, tuple, np.ndarray)):
        values = list(cell)
    elif pd.isna(cell):
        values = []
    else:
        values = [cell]
    cleaned = (str(value).strip() for value in values)
    return separator.join(text for text in cleaned if text)


def _profile_sentence(titles, genres, synopses):
    """Join titles, genres and synopses into a single sentence, skipping empty parts."""
    parts = (_cell_to_text(titles), _cell_to_text(genres), _cell_to_text(synopses, ' '))
    return '. '.join(part for part in parts if part)


# The title/genre/synopsis columns hold Python lists. They are joined into real text here;
# adding the columns and casting to str would send list reprs (brackets, quotes, escape
# sequences) to the model instead of natural language.

# Sentences we want to be embedded from user_preferences MOVIES
user_pref['movies_sentences_to_embed'] = [
    _profile_sentence(titles, genres, synopses)
    for titles, genres, synopses in zip(user_pref.Movies_Titles, user_pref.Genres, user_pref.Movies_Synopsis)
]

# Sentences we want to be embedded from user_preferences SERIES
user_pref['series_sentences_to_embed'] = [
    _profile_sentence(titles, genres, synopses)
    for titles, genres, synopses in zip(user_pref.Series_Titles, user_pref.Genres, user_pref.Series_Synopsis)
]

# Plain lists of strings, one entry per user and in row order, to send to the model
movies_sentences_from_user_pref = user_pref.movies_sentences_to_embed.astype(str).tolist()
series_sentences_from_user_pref = user_pref.series_sentences_to_embed.astype(str).tolist()

# We split the sentences in batches as we did previously with filtered_data
movies_batches_user_pref = split_into_batches(movies_sentences_from_user_pref, 100)
series_batches_user_pref = split_into_batches(series_sentences_from_user_pref, 100)


In [197]:
# Sequential batch processing for the users' MOVIES sentences.
# The embeddings are matched to users by position, so a failed batch is retried and, if it
# still fails, the cell stops instead of skipping it (a skipped batch would shift every
# later user's embedding).
movies_embeddings_from_user_pref = []
for batch in movies_batches_user_pref:
    print(f"Processing movie batch with {len(batch)} sentences...")
    embeddings = None
    for _try in range(3):
        embeddings = get_embeddings_from_api(batch)
        if embeddings:
            break
        time.sleep(5)  # short pause before asking the service again
    if not embeddings or len(embeddings) != len(batch):
        raise RuntimeError(
            f"Embedding request failed for the movie batch starting at user position "
            f"{len(movies_embeddings_from_user_pref)}; re-run this cell."
        )
    movies_embeddings_from_user_pref.extend(embeddings)

print("Movies Embeddings processed successfully:")
print(movies_embeddings_from_user_pref[:2])

Processing movie batch with 100 sentences...
Error 503: {"error":"Service Unavailable"}
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing movie batch with 100 sentences...
Processing m

In [198]:
# Sequential batch processing for the users' SERIES sentences.
# The embeddings are matched to users by position, so a failed batch is retried and, if it
# still fails, the cell stops instead of skipping it (a skipped batch would shift every
# later user's embedding).
series_embeddings_from_user_pref = []
for batch in series_batches_user_pref:
    print(f"Processing series batch with {len(batch)} sentences...")
    embeddings = None
    for _try in range(3):
        embeddings = get_embeddings_from_api(batch)
        if embeddings:
            break
        time.sleep(5)  # short pause before asking the service again
    if not embeddings or len(embeddings) != len(batch):
        raise RuntimeError(
            f"Embedding request failed for the series batch starting at user position "
            f"{len(series_embeddings_from_user_pref)}; re-run this cell."
        )
    series_embeddings_from_user_pref.extend(embeddings)

print("Series Embeddings processed successfully:")
print(series_embeddings_from_user_pref[:2])

Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 sentences...
Processing series batch with 100 s

### Similarities

In [None]:
# Original note: check that this step is now done in terms of movies and of series.

# The embedding lists are converted to numpy arrays so sklearn can compute cosine similarities
movies_user_embeddings_array = np.asarray(movies_embeddings_from_user_pref, dtype=float)
series_user_embeddings_array = np.asarray(series_embeddings_from_user_pref, dtype=float)
all_embeddings_from_filtered_data_array = np.asarray(all_embeddings_from_filtered_data, dtype=float)

# Everything below is matched by position, so the arrays must line up with their frames
if not (len(movies_user_embeddings_array) == len(series_user_embeddings_array) == len(user_pref)):
    raise ValueError(
        "User embeddings are not aligned with 'user_pref': "
        f"{len(movies_user_embeddings_array)} movie vectors, "
        f"{len(series_user_embeddings_array)} series vectors, {len(user_pref)} users."
    )
if len(all_embeddings_from_filtered_data_array) != len(filtered_data):
    raise ValueError(
        "Content embeddings are not aligned with 'filtered_data': "
        f"{len(all_embeddings_from_filtered_data_array)} vectors, {len(filtered_data)} rows."
    )

# One profile vector per user: the mean of the user's movie and series embeddings.
# The two separate arrays above remain available for per-type rankings.
all_embeddings_from_user_pref_array = (movies_user_embeddings_array + series_user_embeddings_array) / 2.0

# Row position (in 'user_pref') of the user taken as the example
user_for_example = 1

# cosine_similarity expects 2-D inputs, hence the reshape to a single-row matrix
user_embedding_example = all_embeddings_from_user_pref_array[user_for_example].reshape(1, -1)

# Similarity between the example user's profile and every row of the content catalogue
content_similarities = cosine_similarity(user_embedding_example, all_embeddings_from_filtered_data_array).flatten()

# Row positions sorted from most to least similar
most_similar_indexes = content_similarities.argsort()[::-1]

# Top-10
topten_most_similar_indexes = most_similar_indexes[:10]

In [None]:
# Show the ten most similar positions followed by their similarity scores
for label, values in (
    ("Most similar indexes:", most_similar_indexes[:10]),
    ("Highest similarities:", content_similarities[most_similar_indexes[:10]]),
):
    print(label, values)

In [None]:
# Look up the example user's preferences and the recommended titles.
# 'user_for_example' is a row position, so the user is selected with .iloc.
example_user_row = user_pref.iloc[user_for_example]
user_id = example_user_row['userId']
# Titles are stored per user as lists in 'Movies_Titles' / 'Series_Titles'
movies_preferred = example_user_row['Movies_Titles']
series_preferred = example_user_row['Series_Titles']

# Displaying Preferences & Recommendations
print(f'''User {user_id} Preferences:
      ''')
print(f'''Movies preference:''')
for movie in movies_preferred:
    print(f'      {str(movie).strip()}')

print(f'''TV Shows preference:''')
for series in series_preferred:
    print(f'      {str(series).strip()}')

print(f'''Recomendations for user: {user_id}
      ''')

# The similarity ranking holds row positions (argsort output), while the index labels of
# 'filtered_data' have gaps after dropna/drop_duplicates, so .iloc is required here;
# .loc would pick different rows or raise KeyError.
recomendations_user = filtered_data.iloc[topten_most_similar_indexes]['CleanTitle']
for recommendation in recomendations_user:
    print(f'      {recommendation}')
