In [1]:
import os
import pandas as pd
import numpy as np
from glob import glob
from boto3 import session
from io import BytesIO

### Constantes

In [7]:
# Connection settings for the S3-compatible endpoint (DigitalOcean Spaces).
# Every value must be a plain string: a trailing comma after the literal would
# silently turn it into a one-element tuple, and the client factory below
# forwards these values as string keyword arguments.
REGION = 'nyc3'
ENDPOINT = 'https://nyc3.digitaloceanspaces.com'
# Credentials come from the environment so they are never saved in the
# notebook; both fall back to an empty string when the variable is unset.
AWS_KEY = os.environ.get('AWS_ACCESS_KEY_ID', '')
AWS_SECRET = os.environ.get('AWS_SECRET_ACCESS_KEY', '')
BUCKET = 'bb-bubbo'

### Conectar S3

In [8]:
def client_s3():
    """Create an S3 client from the module-level connection constants.

    Returns:
        The object returned by ``session.Session().client('s3', ...)``,
        configured with ``REGION``, ``ENDPOINT``, ``AWS_KEY`` and
        ``AWS_SECRET``.
    """
    # Collect the keyword arguments first so the factory call stays short.
    settings = {
        'region_name': REGION,
        'endpoint_url': ENDPOINT,
        'aws_access_key_id': AWS_KEY,
        'aws_secret_access_key': AWS_SECRET,
    }
    return session.Session().client('s3', **settings)

In [3]:
# Instantiate the S3 client used by the listing and download cells.
# NOTE(review): the recorded execution counts are out of order (this cell is 3,
# the cell defining client_s3 is 8) — confirm the notebook runs top to bottom
# on a fresh kernel.
client = client_s3()

### Obtener Objetos

In [10]:
# Walk every page of the bucket listing and record one dict per object.
paginator = client.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=BUCKET)

list_paginas = []
for page in pages:
    # A page with no 'Contents' entry (or an empty one) contributes nothing.
    for obj in page.get('Contents') or ():
        key = obj['Key']
        # 'Type' is the first '/'-separated segment of the object key.
        list_paginas.append(
            dict(Key=key, LastModified=obj['LastModified'], Type=key.split('/')[0])
        )

### Crear DataFrame con la lista de archivos

In [11]:
# One row per listed object. The columns are pinned explicitly so that an empty
# listing still yields a frame with 'Key', 'LastModified' and 'Type'; without
# them the later groupby('Type') / ['Key'] lookups would raise KeyError.
df_files = pd.DataFrame(list_paginas, columns=['Key', 'LastModified', 'Type'])

In [13]:
# Number of listed objects per 'Type' (the first path segment of each key).
df_files.groupby('Type')['Key'].count()

Type
ComingSoon       2
Content       3968
Episodes      3968
Sources          4
Stats         3969
Name: Key, dtype: int64

### Obtener Key

In [41]:
# Keep only 'Content' objects whose key mentions es_hbomax, most recent first.
is_hbomax = df_files['Key'].str.contains('es_hbomax')
is_content = df_files['Type'] == 'Content'
df_select = df_files.loc[is_hbomax & is_content].sort_values(
    'LastModified', ascending=False
)

In [42]:
# (rows, columns) of the selection — shows how many objects matched the filter.
df_select.shape

(1, 3)

In [43]:
# Preview the first rows of the selection (sorted by LastModified, newest first).
df_select.head()

Unnamed: 0,Key,LastModified,Type
1385,Content/latest/es_hbomax.jsonl,2025-04-07 14:45:55.646000+00:00,Content


In [44]:
# Distinct object keys to download, in the order they appear in df_select.
file_list = df_select['Key'].unique().tolist()

In [47]:
# Number of distinct keys that will be downloaded.
len(file_list)

1

### Cargar Archivos

In [48]:
# Download every selected object, parse each one as JSON Lines, and stack the
# results. Parsing happens inside the loop so that no download is discarded
# when file_list holds more than one key.
frame = []

for _key in file_list:
    obj = client.get_object(Bucket=BUCKET, Key=_key)
    data = obj['Body'].read()
    frame.append(pd.read_json(BytesIO(data), lines=True))

if not frame:
    # Fail here with an explicit message rather than with an opaque error in a
    # later cell.
    raise ValueError('file_list is empty: no objects matched the selection')

# ignore_index=True produces a single 0..n-1 index across all files, which is
# also what a lone file gets from read_json.
df = pd.concat(frame, ignore_index=True)

In [51]:
# (rows, columns) of the loaded content table.
df.shape

(2254, 43)

In [53]:
# Count of non-null PlatformId values per (PlatformName, PlatformCountry) pair.
df.groupby(['PlatformName','PlatformCountry'])['PlatformId'].count()

PlatformName  PlatformCountry
Max           ES                 2254
Name: PlatformId, dtype: int64

### Abrir External Ids

In [59]:
# Give each element of 'ExternalIds' its own row, repeating the other columns.
# Presumably each cell holds a list of dicts (later cells call .get on the
# exploded values) — verify against the source data.
df_external_ids = df.explode('ExternalIds')

In [60]:
# Columns kept for the external-id lookup table.
_EXTERNAL_ID_COLUMNS = [
    'PlatformName', 'PlatformCountry', 'Id', 'Title', 'Type', 'Year',
    'Directors', 'Cast', 'ExternalIds', 'Deeplinks',
]
# .copy() makes the subset an independent frame, so the column assignments in
# the following cell do not trigger pandas' SettingWithCopyWarning.
df_external_ids = df_external_ids[_EXTERNAL_ID_COLUMNS].copy()

In [66]:
def _dict_get(value, key):
    """Return ``value.get(key)`` when ``value`` is a dict, otherwise ``None``.

    A plain truthiness check is not enough here: the NaN that ``explode``
    leaves for an empty list is truthy, and a missing 'Deeplinks' value has no
    ``.get`` either, so both would raise AttributeError.
    """
    return value.get(key) if isinstance(value, dict) else None


# Flatten the nested dict columns into one scalar column per field of interest.
df_external_ids['UniversalLink'] = df_external_ids['Deeplinks'].apply(_dict_get, args=('Web',))
df_external_ids['ExternalId'] = df_external_ids['ExternalIds'].apply(_dict_get, args=('ID',))
df_external_ids['ExternalProvider'] = df_external_ids['ExternalIds'].apply(_dict_get, args=('Provider',))
df_external_ids['ExternalType'] = df_external_ids['ExternalIds'].apply(_dict_get, args=('ContentType',))

In [67]:
# Remove the raw nested columns; the fields needed from them were already
# copied into their own columns in the previous cell.
df_external_ids = df_external_ids.drop(columns=['Deeplinks','ExternalIds'])

### Buscar por External Ids

In [77]:
# Look a title up by its external identifier: id, provider and content type
# must all match.
matches_id = df_external_ids['ExternalId'].eq('250307')
matches_provider = df_external_ids['ExternalProvider'].eq('tmdb')
matches_type = df_external_ids['ExternalType'].eq('Tv Show')
df_external_ids[matches_id & matches_provider & matches_type]

Unnamed: 0,PlatformName,PlatformCountry,Id,Title,Type,Year,Directors,Cast,UniversalLink,ExternalId,ExternalProvider,ExternalType
2033,Max,ES,e6e7bad9-d48d-4434-b334-7c651ffc4bdf,The Pitt,Tv Show,2024.0,[R Scott Gemmill],"[Tracy Ifeachor, Fiona Dourif, Gerran Howell, ...",https://play.max.com/show/e6e7bad9-d48d-4434-b...,250307,tmdb,Tv Show


### Buscar por Títulos

In [82]:
# Find titles containing the literal text, one row per platform entry.
# na=False treats a missing Title as "no match" (a NaN in the mask would make
# the boolean indexing fail), and regex=False keeps the needle from being
# interpreted as a regular expression.
(
    df_external_ids[
        df_external_ids['Title'].str.contains('The Pitt', na=False, regex=False)
    ]
    .drop(columns=['ExternalProvider', 'ExternalType', 'ExternalId'])
    .drop_duplicates(subset=['PlatformName', 'PlatformCountry', 'Id'])
)

Unnamed: 0,PlatformName,PlatformCountry,Id,Title,Type,Year,Directors,Cast,UniversalLink
2033,Max,ES,e6e7bad9-d48d-4434-b334-7c651ffc4bdf,The Pitt,Tv Show,2024.0,[R Scott Gemmill],"[Tracy Ifeachor, Fiona Dourif, Gerran Howell, ...",https://play.max.com/show/e6e7bad9-d48d-4434-b...
