# Realizar consultas por fecha como argumento

In [1]:
# Se importan las librerías
# Para más información de datetime: https://docs.python.org/3/library/datetime.html

import boto3
import pandas as pd
from io import StringIO, BytesIO
from datetime import datetime, timedelta

In [2]:
# Se establece el argumento
arg_date = '2022-12-30'

In [4]:
# strptime: https://www.geeksforgeeks.org/python-datetime-strptime-function/
arg_date_dt = datetime.strptime(arg_date, '%Y-%m-%d').date() - timedelta(days=1)

In [5]:
# Se establece el servicio y se accede al bucket
s3 = boto3.resource('s3')
bucket = s3.Bucket('xetra-1234')

Se obtienen todos los objetos cuya fecha sea mayor o igual que el argumento de fecha:

In [6]:
objects = [obj for obj in bucket.objects.all() if datetime.strptime(obj.key.split("/")[0], '%Y-%m-%d').date() >= arg_date_dt]

In [7]:
# La línea anterior paso a paso:
for obj in bucket.objects.all():    
    if datetime.strptime(obj.key.split("/")[0], '%Y-%m-%d').date() >= arg_date_dt:
        print(obj.key)
        # print(obj.key.split("/"))
        # print(obj.key.split("/")[0])
        # Con datetime.strptime se transforma a un objeto datetime
        # print(datetime.strptime(obj.key.split("/")[0], '%Y-%m-%d'))
        # print(datetime.strptime(obj.key.split("/")[0], '%Y-%m-%d').date())

2022-12-29/2022-12-29_BINS_XETR00.csv
2022-12-29/2022-12-29_BINS_XETR01.csv
2022-12-29/2022-12-29_BINS_XETR02.csv
2022-12-29/2022-12-29_BINS_XETR03.csv
2022-12-29/2022-12-29_BINS_XETR04.csv
2022-12-29/2022-12-29_BINS_XETR05.csv
2022-12-29/2022-12-29_BINS_XETR06.csv
2022-12-29/2022-12-29_BINS_XETR07.csv
2022-12-29/2022-12-29_BINS_XETR08.csv
2022-12-29/2022-12-29_BINS_XETR09.csv
2022-12-29/2022-12-29_BINS_XETR10.csv
2022-12-29/2022-12-29_BINS_XETR11.csv
2022-12-29/2022-12-29_BINS_XETR12.csv
2022-12-29/2022-12-29_BINS_XETR13.csv
2022-12-29/2022-12-29_BINS_XETR14.csv
2022-12-29/2022-12-29_BINS_XETR15.csv
2022-12-29/2022-12-29_BINS_XETR16.csv
2022-12-29/2022-12-29_BINS_XETR17.csv
2022-12-29/2022-12-29_BINS_XETR18.csv
2022-12-29/2022-12-29_BINS_XETR19.csv
2022-12-29/2022-12-29_BINS_XETR20.csv
2022-12-29/2022-12-29_BINS_XETR21.csv
2022-12-29/2022-12-29_BINS_XETR22.csv
2022-12-29/2022-12-29_BINS_XETR23.csv
2022-12-30/2022-12-30_BINS_XETR00.csv
2022-12-30/2022-12-30_BINS_XETR01.csv
2022-12-30/2

In [8]:
objects

[s3.ObjectSummary(bucket_name='xetra-1234', key='2022-12-29/2022-12-29_BINS_XETR00.csv'),
 s3.ObjectSummary(bucket_name='xetra-1234', key='2022-12-29/2022-12-29_BINS_XETR01.csv'),
 s3.ObjectSummary(bucket_name='xetra-1234', key='2022-12-29/2022-12-29_BINS_XETR02.csv'),
 s3.ObjectSummary(bucket_name='xetra-1234', key='2022-12-29/2022-12-29_BINS_XETR03.csv'),
 s3.ObjectSummary(bucket_name='xetra-1234', key='2022-12-29/2022-12-29_BINS_XETR04.csv'),
 s3.ObjectSummary(bucket_name='xetra-1234', key='2022-12-29/2022-12-29_BINS_XETR05.csv'),
 s3.ObjectSummary(bucket_name='xetra-1234', key='2022-12-29/2022-12-29_BINS_XETR06.csv'),
 s3.ObjectSummary(bucket_name='xetra-1234', key='2022-12-29/2022-12-29_BINS_XETR07.csv'),
 s3.ObjectSummary(bucket_name='xetra-1234', key='2022-12-29/2022-12-29_BINS_XETR08.csv'),
 s3.ObjectSummary(bucket_name='xetra-1234', key='2022-12-29/2022-12-29_BINS_XETR09.csv'),
 s3.ObjectSummary(bucket_name='xetra-1234', key='2022-12-29/2022-12-29_BINS_XETR10.csv'),
 s3.Object

In [9]:
# Se obtiene el objeto inicial por posición en el bucket objects
csv_obj_init = bucket.Object(key=objects[0].key).get().get('Body').read().decode('utf-8')
data = StringIO(csv_obj_init)
df_init = pd.read_csv(data, delimiter=',')

In [10]:
df_init.columns

Index(['ISIN', 'Mnemonic', 'SecurityDesc', 'SecurityType', 'Currency',
       'SecurityID', 'Date', 'Time', 'StartPrice', 'MaxPrice', 'MinPrice',
       'EndPrice', 'TradedVolume', 'NumberOfTrades'],
      dtype='object')

In [11]:
# Concatenar todos los objetos con pandas, como son más, tarda más
df_all = pd.DataFrame(columns=df_init.columns)
for obj in objects:
    csv_obj = bucket.Object(key=obj.key).get().get('Body').read().decode('utf-8')
    data = StringIO(csv_obj)
    df = pd.read_csv(data, delimiter=',')
    df_all = pd.concat([df,df_all], ignore_index=True)

In [12]:
# Se obtienen los primeros 10
df_all.head(10)

Unnamed: 0,ISIN,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades
0,US98956P1021,ZIM,ZIMMER BIOMET HLDGS DL-01,Common stock,EUR,4582018,2022-12-31,20:30,113.1,113.1,113.1,113.1,0,1
1,US9224171002,VEO,"VEECO INSTRUMENTS DL-,01",Common stock,EUR,6198311,2022-12-31,20:30,24.6,24.6,24.6,24.6,0,1
2,IT0005143547,EM8,ENERGICA MOTOR CO.S.P.A.,Common stock,EUR,7026075,2022-12-31,20:30,3.1,3.1,3.1,3.1,0,1
3,CA0679011084,ABR,BARRICK GOLD CORP.,Common stock,EUR,2504196,2022-12-31,16:00,20.215,20.215,20.185,20.185,60,2
4,CA32076V1031,FMV,FIRST MAJESTIC SILVER,Common stock,EUR,2504197,2022-12-31,16:00,10.06,10.06,10.06,10.06,11,1
5,DE000ETFL011,EL4A,DK DAX,ETF,EUR,2504258,2022-12-31,16:00,129.4,129.4,129.4,129.4,52,1
6,DE0005933931,EXS1,ISHS CORE DAX UC.ETF EOA,ETF,EUR,2504265,2022-12-31,16:00,122.26,122.26,122.2,122.24,4796,4
7,FR0010869495,LYQL,LY.D.SHORTDAX X2 UETF A,ETF,EUR,2504266,2022-12-31,16:00,1.6134,1.6134,1.6134,1.6134,1083,1
8,LU0252633754,LYY7,MUL-LYXOR DAX DR UC.ETF A,ETF,EUR,2504267,2022-12-31,16:00,134.78,134.78,134.78,134.78,25,1
9,LU0252634307,LYY8,MUL-LYX.DLY.LEVDAX U.E. A,ETF,EUR,2504268,2022-12-31,16:00,115.48,115.48,115.48,115.48,23,1


In [13]:
# Filtrar y extraer sólo algunas columnas
columns = ["ISIN", "Mnemonic", "Date", "Time", "StartPrice", "EndPrice", "MinPrice", "MaxPrice", "TradedVolume"]
df_all = df_all.loc[:, columns]

In [14]:
df_all

Unnamed: 0,ISIN,Mnemonic,Date,Time,StartPrice,EndPrice,MinPrice,MaxPrice,TradedVolume
0,US98956P1021,ZIM,2022-12-31,20:30,113.100,113.100,113.100,113.100,0
1,US9224171002,VEO,2022-12-31,20:30,24.600,24.600,24.600,24.600,0
2,IT0005143547,EM8,2022-12-31,20:30,3.100,3.100,3.100,3.100,0
3,CA0679011084,ABR,2022-12-31,16:00,20.215,20.185,20.185,20.215,60
4,CA32076V1031,FMV,2022-12-31,16:00,10.060,10.060,10.060,10.060,11
...,...,...,...,...,...,...,...,...,...
395752,IE00BLH3CV30,YODA,2022-12-29,08:59,5.564,5.564,5.564,5.564,392
395753,FR0004056851,AYJ,2022-12-29,08:59,14.550,14.600,14.550,14.600,2057
395754,IE0003Z9E2Y3,4COP,2022-12-29,08:59,26.135,26.135,26.135,26.135,12
395755,DE000DTR0CK8,DTG,2022-12-29,08:59,27.050,27.140,27.050,27.230,7130


In [15]:
# Eliminar missing values: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html
# inplace=True elimina los NaN y modifica el df original, si setea como false, creará un nuevo df sin los NaN eliminados.
df_all.dropna(inplace=True)

In [16]:
# Consultar la forma del df para verificar si se borró alguna fila
df_all.shape

(395757, 9)

## Obtener precio de apertura por ISIN y día

In [17]:
# Se creará una nueva columna en el df con el precio de apertura.
df_all['opening_price'] = df_all.sort_values(by=['Time']).groupby(['ISIN', 'Date'])['StartPrice'].transform('first')

In [18]:
# Código anterior paso a paso
df_sorted = df_all.sort_values(by=['Time'])
print("df_sorted\n", df_sorted)
df_grouped = df_sorted.groupby(['ISIN', 'Date'])
print("df_grouped\n", df_grouped)
df_start = df_grouped['StartPrice']
print("df_start\n", df_start)
df_transformed = df_start.transform('first')
print("df_transformed\n", df_transformed)

df_sorted
                 ISIN Mnemonic        Date   Time  StartPrice  EndPrice  \
377392  DE000SYM9999      SY1  2022-12-29  08:00      104.60    104.75   
377396  DE0005493365      HYQ  2022-12-29  08:00      347.80    347.80   
377397  DE0005550602     DRW8  2022-12-29  08:00       46.35     46.35   
377398  DE0005659700      EUZ  2022-12-29  08:00       64.00     64.20   
377399  DE0005664809      EVT  2022-12-29  08:00       25.46     25.67   
...              ...      ...         ...    ...         ...       ...   
263839  US9224171002      VEO  2022-12-29  20:30       24.60     24.60   
263838  US98956P1021      ZIM  2022-12-29  20:30      113.10    113.10   
131921  IT0005143547      EM8  2022-12-30  20:30        3.10      3.10   
131919  US98956P1021      ZIM  2022-12-30  20:30      113.10    113.10   
0       US98956P1021      ZIM  2022-12-31  20:30      113.10    113.10   

        MinPrice  MaxPrice TradedVolume  opening_price  
377392    104.45    104.80        10909    

In [19]:
# Consultar por clave ISIN
df_all[df_all['ISIN']=='AT0000A0E9W5']

Unnamed: 0,ISIN,Mnemonic,Date,Time,StartPrice,EndPrice,MinPrice,MaxPrice,TradedVolume,opening_price
1405,AT0000A0E9W5,SANT,2022-12-31,16:03,14.01,14.06,14.01,14.06,817,13.88
1711,AT0000A0E9W5,SANT,2022-12-31,16:04,14.06,14.06,14.06,14.06,43,13.88
2173,AT0000A0E9W5,SANT,2022-12-31,16:05,14.04,14.04,14.04,14.04,2733,13.88
2991,AT0000A0E9W5,SANT,2022-12-31,16:08,14.06,14.06,14.06,14.06,44,13.88
3240,AT0000A0E9W5,SANT,2022-12-31,16:09,14.04,14.04,14.04,14.04,441,13.88
...,...,...,...,...,...,...,...,...,...,...
393685,AT0000A0E9W5,SANT,2022-12-29,08:52,14.16,14.18,14.16,14.18,500,13.88
393932,AT0000A0E9W5,SANT,2022-12-29,08:53,14.08,14.17,14.08,14.17,384,13.88
394480,AT0000A0E9W5,SANT,2022-12-29,08:55,14.14,14.18,14.14,14.18,1455,13.88
394705,AT0000A0E9W5,SANT,2022-12-29,08:56,14.13,14.13,14.13,14.14,2289,13.88


## Obtener el precio de cierre por ISIN y día

In [20]:
df_all['closing_price'] = df_all.sort_values(by=['Time']).groupby(['ISIN', 'Date'])['EndPrice'].transform('last')

In [21]:
df_all[df_all['ISIN']=='AT0000A0E9W5']

Unnamed: 0,ISIN,Mnemonic,Date,Time,StartPrice,EndPrice,MinPrice,MaxPrice,TradedVolume,opening_price,closing_price
1405,AT0000A0E9W5,SANT,2022-12-31,16:03,14.01,14.06,14.01,14.06,817,13.88,14.08
1711,AT0000A0E9W5,SANT,2022-12-31,16:04,14.06,14.06,14.06,14.06,43,13.88,14.08
2173,AT0000A0E9W5,SANT,2022-12-31,16:05,14.04,14.04,14.04,14.04,2733,13.88,14.08
2991,AT0000A0E9W5,SANT,2022-12-31,16:08,14.06,14.06,14.06,14.06,44,13.88,14.08
3240,AT0000A0E9W5,SANT,2022-12-31,16:09,14.04,14.04,14.04,14.04,441,13.88,14.08
...,...,...,...,...,...,...,...,...,...,...,...
393685,AT0000A0E9W5,SANT,2022-12-29,08:52,14.16,14.18,14.16,14.18,500,13.88,14.08
393932,AT0000A0E9W5,SANT,2022-12-29,08:53,14.08,14.17,14.08,14.17,384,13.88,14.08
394480,AT0000A0E9W5,SANT,2022-12-29,08:55,14.14,14.18,14.14,14.18,1455,13.88,14.08
394705,AT0000A0E9W5,SANT,2022-12-29,08:56,14.13,14.13,14.13,14.14,2289,13.88,14.08


## Aggregations

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.aggregate.html

In [22]:
df_all = df_all.groupby(['ISIN', 'Date'], as_index=False).agg(opening_price_eur=('opening_price', 'min'), closing_price_eur=('closing_price', 'min'), minimum_price_eur=('MinPrice', 'min'), maximum_price_eur=('MaxPrice', 'max'), daily_traded_volume=('TradedVolume', 'sum'))

In [23]:
df_all

Unnamed: 0,ISIN,Date,opening_price_eur,closing_price_eur,minimum_price_eur,maximum_price_eur,daily_traded_volume
0,AT000000STR1,2022-12-29,36.6000,36.700,35.7500,36.700,1773
1,AT000000STR1,2022-12-30,36.6000,36.700,35.7500,36.700,1773
2,AT000000STR1,2022-12-31,36.6000,36.700,35.7500,36.700,1773
3,AT00000FACC2,2022-12-29,8.0500,8.570,7.8700,8.570,10205
4,AT00000FACC2,2022-12-30,8.0500,8.570,7.8700,8.570,10205
...,...,...,...,...,...,...,...
9691,XS2376095068,2022-12-30,34.2880,36.500,34.0640,36.500,1000
9692,XS2376095068,2022-12-31,34.2880,36.500,34.0640,36.500,1000
9693,XS2434891219,2022-12-29,3.4412,3.662,3.4184,3.662,0
9694,XS2434891219,2022-12-30,3.4412,3.662,3.4184,3.662,0


### Porcentaje Cambio Anterior Cierre

In [24]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.shift.html

df_all['prev_closing_price'] = df_all.sort_values(by=['Date']).groupby(['ISIN'])['closing_price_eur'].shift(1)

In [25]:
df_all

Unnamed: 0,ISIN,Date,opening_price_eur,closing_price_eur,minimum_price_eur,maximum_price_eur,daily_traded_volume,prev_closing_price
0,AT000000STR1,2022-12-29,36.6000,36.700,35.7500,36.700,1773,
1,AT000000STR1,2022-12-30,36.6000,36.700,35.7500,36.700,1773,36.700
2,AT000000STR1,2022-12-31,36.6000,36.700,35.7500,36.700,1773,36.700
3,AT00000FACC2,2022-12-29,8.0500,8.570,7.8700,8.570,10205,
4,AT00000FACC2,2022-12-30,8.0500,8.570,7.8700,8.570,10205,8.570
...,...,...,...,...,...,...,...,...
9691,XS2376095068,2022-12-30,34.2880,36.500,34.0640,36.500,1000,36.500
9692,XS2376095068,2022-12-31,34.2880,36.500,34.0640,36.500,1000,36.500
9693,XS2434891219,2022-12-29,3.4412,3.662,3.4184,3.662,0,
9694,XS2434891219,2022-12-30,3.4412,3.662,3.4184,3.662,0,3.662


In [26]:
df_all['change_prev_closing_%'] = (df_all['closing_price_eur'] - df_all['prev_closing_price']) / df_all['prev_closing_price'] * 100

In [27]:
df_all.drop(columns=['prev_closing_price'], inplace=True)

In [28]:
df_all = df_all.round(decimals=2)

In [29]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reset_index.html
df_all.reset_index(inplace=True)

Obtener el df final que cumple con un criterio de fecha

In [30]:
df_all = df_all[df_all.Date >= arg_date]

In [31]:
df_all

Unnamed: 0,index,ISIN,Date,opening_price_eur,closing_price_eur,minimum_price_eur,maximum_price_eur,daily_traded_volume,change_prev_closing_%
1,1,AT000000STR1,2022-12-30,36.60,36.70,35.75,36.70,1773,0.0
2,2,AT000000STR1,2022-12-31,36.60,36.70,35.75,36.70,1773,0.0
4,4,AT00000FACC2,2022-12-30,8.05,8.57,7.87,8.57,10205,0.0
5,5,AT00000FACC2,2022-12-31,8.05,8.57,7.87,8.57,10205,0.0
7,7,AT0000606306,2022-12-30,14.51,15.00,13.65,15.28,107836,0.0
...,...,...,...,...,...,...,...,...,...
9689,9689,XS2314660700,2022-12-31,22.26,21.92,21.92,22.28,0,0.0
9691,9691,XS2376095068,2022-12-30,34.29,36.50,34.06,36.50,1000,0.0
9692,9692,XS2376095068,2022-12-31,34.29,36.50,34.06,36.50,1000,0.0
9694,9694,XS2434891219,2022-12-30,3.44,3.66,3.42,3.66,0,0.0


## Salvar a S3 en formato .parquet

Apache Parquet es un formato de almacenamiento en columnas disponible para cualquier proyecto en el ecosistema de Hadoop, independientemente de la elección del framework de procesamiento de datos, el modelo de datos o el lenguaje de programación.

https://parquet.apache.org/docs/overview/

In [32]:
key = 'xetra_daily_report_' + datetime.today().strftime("%Y%m%d_%H%M%S") + '.parquet'

In [39]:
'''
BytesIO() se utiliza para guardar los datos como bytes en un búfer en memoria. 
Con este bloque de código se está almacenando el archivo .parquet en el bucket target. Es decir, se guarda la 
información directamente en la nube de amazon y no en el disco duro local.
'''

out_buffer = BytesIO()
df_all.to_parquet(out_buffer, index=False)
bucket_target = s3.Bucket('xetra-ajlj')
bucket_target.put_object(Body=out_buffer.getvalue(), Key=key)

s3.Object(bucket_name='xetra-ajlj', key='xetra_daily_report_20230217_232455.parquet')

In [40]:
s3.Object(bucket_name='xetra-vcl-itspr', key='xetra_daily_report_20220218_134804.parquet')

s3.Object(bucket_name='xetra-vcl-itspr', key='xetra_daily_report_20220218_134804.parquet')

## Leer el archivo subido a s3

Esto se realiza para verificar que el reporte fue guardado correctamente.

In [41]:
for obj in bucket_target.objects.all():
    print(obj.key)

xetra_daily_report_20230217_232455.parquet


In [43]:
# Se lee el archivo parquet
prq_obj = bucket_target.Object(key='xetra_daily_report_20230217_232455.parquet').get().get('Body').read()
data = BytesIO(prq_obj)
df_report = pd.read_parquet(data)

In [44]:
df_report

Unnamed: 0,index,ISIN,Date,opening_price_eur,closing_price_eur,minimum_price_eur,maximum_price_eur,daily_traded_volume,change_prev_closing_%
0,1,AT000000STR1,2022-12-30,36.60,36.70,35.75,36.70,1773,0.0
1,2,AT000000STR1,2022-12-31,36.60,36.70,35.75,36.70,1773,0.0
2,4,AT00000FACC2,2022-12-30,8.05,8.57,7.87,8.57,10205,0.0
3,5,AT00000FACC2,2022-12-31,8.05,8.57,7.87,8.57,10205,0.0
4,7,AT0000606306,2022-12-30,14.51,15.00,13.65,15.28,107836,0.0
...,...,...,...,...,...,...,...,...,...
6459,9689,XS2314660700,2022-12-31,22.26,21.92,21.92,22.28,0,0.0
6460,9691,XS2376095068,2022-12-30,34.29,36.50,34.06,36.50,1000,0.0
6461,9692,XS2376095068,2022-12-31,34.29,36.50,34.06,36.50,1000,0.0
6462,9694,XS2434891219,2022-12-30,3.44,3.66,3.42,3.66,0,0.0
