## Hacemos la importación de las librerías necesarias

In [7]:
import boto3
import pandas as pd
from io import StringIO, BytesIO
from datetime import datetime, timedelta
from scipy.stats import linregress
import matplotlib.pyplot as plt


In [8]:
def read_csv_to_df(bucket, objects):
    """Download every listed CSV object and stack them into one DataFrame.

    Parameters
    ----------
    bucket
        Bucket handle exposing ``Object(key=...)``; the returned object's
        ``get()`` must yield a mapping whose ``'Body'`` entry can be ``read()``
        into UTF-8 encoded bytes.
    objects
        Iterable of items exposing a ``key`` attribute naming the CSV to fetch.

    Returns
    -------
    pandas.DataFrame
        All rows of all files under a fresh ``0..n-1`` index. Files are stacked
        in reverse listing order (the last object's rows come first), which is
        the order this function has always produced. When ``objects`` is empty
        an empty DataFrame (no rows, no columns) is returned.
    """
    frames = []
    for obj in objects:
        # Each object is downloaded exactly once; the column names come from
        # the parsed files themselves, so no separate "header" fetch is needed.
        csv_text = bucket.Object(key=obj.key).get().get('Body').read().decode('utf-8')
        frames.append(pd.read_csv(StringIO(csv_text), delimiter=','))

    if not frames:
        # Nothing matched: hand back an empty frame rather than failing on objects[0].
        return pd.DataFrame()

    # A single concat avoids re-copying the accumulated rows on every iteration.
    # The list is reversed to keep the historical "latest object first" row order.
    return pd.concat(frames[::-1], ignore_index=True)

def write_df_to_s3(df_all, key, bucket_target):
    """Serialise ``df_all`` to Parquet in memory and upload it under ``key``.

    Parameters
    ----------
    df_all
        Object exposing ``to_parquet(buffer, index=False)``.
    key
        Object key the file is stored under in the target bucket.
    bucket_target
        Bucket handle exposing ``put_object(Body=..., Key=...)``.

    Returns
    -------
    None
    """
    # The Parquet bytes are produced in an in-memory buffer, never on disk.
    with BytesIO() as parquet_buffer:
        df_all.to_parquet(parquet_buffer, index=False)
        payload = parquet_buffer.getvalue()

    bucket_target.put_object(Body=payload, Key=key)

def return_objects(bucket, arg_date_dt):
    """List the bucket objects whose date prefix is on or after ``arg_date_dt``.

    The text before the first ``/`` of each key is parsed as ``YYYY-MM-DD``;
    a key whose prefix does not match that format raises ``ValueError``.

    Parameters
    ----------
    bucket
        Bucket handle exposing ``objects.all()``.
    arg_date_dt : datetime.date
        Earliest date (inclusive) to keep.

    Returns
    -------
    list
        The matching objects, in listing order.
    """
    selected = []
    for obj in bucket.objects.all():
        date_prefix = obj.key.split("/")[0]
        key_date = datetime.strptime(date_prefix, '%Y-%m-%d').date()
        if key_date >= arg_date_dt:
            selected.append(obj)
    return selected

In [9]:
# Application Layer

def extract(bucket, args):
    """Select the objects to process and load them into one DataFrame.

    Parameters
    ----------
    bucket
        Source bucket handle, forwarded to ``return_objects`` and
        ``read_csv_to_df``.
    args
        Forwarded unchanged to ``return_objects`` as its date threshold.

    Returns
    -------
    The DataFrame produced by ``read_csv_to_df`` for the selected objects.
    """
    return read_csv_to_df(bucket, return_objects(bucket, args))

def transform_report(df_all, arg_date):
    """Drop incomplete rows and group the remaining ones by ISIN and Date.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Input rows; must contain the ``ISIN`` and ``Date`` columns. The frame
        is NOT modified: rows with nulls are dropped on a copy.
    arg_date
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas GroupBy object
        ``df_all`` without null-containing rows, grouped by ``['ISIN', 'Date']``
        with ``as_index=False``. No aggregation is applied yet, so this is not
        a DataFrame.
        NOTE(review): ``load`` hands its input to ``to_parquet``; an
        aggregation step presumably still has to be added here before the
        result can be written - confirm the intended metrics.
    """
    # dropna() without inplace=True leaves the caller's DataFrame untouched,
    # so running this step again on the same input gives the same result.
    complete_rows = df_all.dropna()

    return complete_rows.groupby(['ISIN', 'Date'], as_index=False)

def load(df_all, bucket_target):
    """Upload ``df_all`` to the target bucket under a timestamped key.

    The key has the form ``xetra_daily_report_<YYYYmmdd_HHMMSS>.parquet``,
    where the timestamp is the local time at the moment of the call.

    Parameters
    ----------
    df_all
        Data to store; forwarded to ``write_df_to_s3``.
    bucket_target
        Destination bucket handle; forwarded to ``write_df_to_s3``.

    Returns
    -------
    None
    """
    timestamp = datetime.today().strftime("%Y%m%d_%H%M%S")
    target_key = 'xetra_daily_report_' + timestamp + '.parquet'

    write_df_to_s3(df_all, target_key, bucket_target)

def etl_report(bucket_target, key):
    """Download a Parquet object from the bucket and load it as a DataFrame.

    Parameters
    ----------
    bucket_target
        Bucket handle exposing ``Object(key=...)``.
    key
        Key of the Parquet object to read.

    Returns
    -------
    pandas.DataFrame
        The content of the Parquet file.
    """
    response = bucket_target.Object(key=key).get()
    raw_bytes = response.get('Body').read()

    # Wrap the bytes in a file-like object for pandas.
    return pd.read_parquet(BytesIO(raw_bytes))

In [10]:
def main(src_bucket_name='xetra-1234', arg_date='2022-12-31'):
    """Run the extract step and return the raw rows for the report.

    Only extraction is wired up at the moment; see the TODO at the end of the
    body for the steps that are still disabled.

    Parameters
    ----------
    src_bucket_name : str
        Name of the S3 bucket holding the source CSV files.
    arg_date : str
        Report date as ``YYYY-MM-DD``. Extraction starts one day earlier.

    Returns
    -------
    The DataFrame returned by ``extract``.
    """
    # Establish the connection to the source bucket
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(src_bucket_name)

    # Start one day before the requested date so the previous day's files are
    # part of the extract as well.
    arg_date_dt = datetime.strptime(arg_date, '%Y-%m-%d').date() - timedelta(days=1)

    # TODO: the remaining pipeline steps are disabled for now:
    #   df_transformed = transform_report(df_all, arg_date)
    #   load(df_transformed, s3.Bucket('xetra-cdhm'))
    #   report = etl_report(s3.Bucket('xetra-cdhm'), key)
    # where `key` names the stored report, e.g.
    # 'xetra_daily_report_20230223_223956.parquet'. Check whether the key of
    # the most recently written report can be looked up instead of hard-coded.
    return extract(bucket, arg_date_dt)

In [11]:
# Run the pipeline entry point and keep its return value in `df` for the cells below
df = main()

In [12]:
# Bare last expression: the notebook renders `df` as the cell output
df

Unnamed: 0,ISIN,Mnemonic,SecurityDesc,SecurityType,Currency,SecurityID,Date,Time,StartPrice,MaxPrice,MinPrice,EndPrice,TradedVolume,NumberOfTrades
0,US98956P1021,ZIM,ZIMMER BIOMET HLDGS DL-01,Common stock,EUR,4582018,2022-12-31,20:30,113.100,113.100,113.100,113.100,0,1
1,US9224171002,VEO,"VEECO INSTRUMENTS DL-,01",Common stock,EUR,6198311,2022-12-31,20:30,24.600,24.600,24.600,24.600,0,1
2,IT0005143547,EM8,ENERGICA MOTOR CO.S.P.A.,Common stock,EUR,7026075,2022-12-31,20:30,3.100,3.100,3.100,3.100,0,1
3,CA0679011084,ABR,BARRICK GOLD CORP.,Common stock,EUR,2504196,2022-12-31,16:00,20.215,20.215,20.185,20.185,60,2
4,CA32076V1031,FMV,FIRST MAJESTIC SILVER,Common stock,EUR,2504197,2022-12-31,16:00,10.060,10.060,10.060,10.060,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263833,IE00BLH3CV30,YODA,HANETFPROCSPACE DLA,ETF,EUR,6498298,2022-12-30,08:59,5.564,5.564,5.564,5.564,392,1
263834,FR0004056851,AYJ,"VALNEVA SE EO -,15",Common stock,EUR,6769323,2022-12-30,08:59,14.550,14.600,14.550,14.600,2057,9
263835,IE0003Z9E2Y3,4COP,GLXETFS-CP.MIN. DLA,ETF,EUR,7058231,2022-12-30,08:59,26.135,26.135,26.135,26.135,12,1
263836,DE000DTR0CK8,DTG,DAIMLER TRUCK HLDG JGE NA,Common stock,EUR,7126155,2022-12-30,08:59,27.050,27.230,27.050,27.140,7130,54


In [13]:
# Number of rows in the extracted frame
df.shape[0]

263838

In [14]:
# Number of distinct ISIN values (a missing value, if any, counts as one)
df['ISIN'].nunique(dropna=False)

3232

In [15]:
# Bounds of the intraday window, as 'HH:MM' strings
start_time, end_time = "08:00", "12:00"

# Lower bound parsed into a datetime.time
start_time_dt = datetime.strptime(start_time, '%H:%M').time()

# Last expression of the cell, so the parsed value is displayed
start_time_dt

datetime.time(8, 0)

In [16]:
# Keep the rows traded after the window start. The 'Time' column holds 'HH:MM'
# strings, so it is parsed into datetime.time values before the comparison;
# the comparison is done on the column (a boolean mask), not on the literal 'Time'.
df_times = df[pd.to_datetime(df['Time'], format='%H:%M').dt.time > start_time_dt]

In [17]:
# One group per (ISIN, Date) combination
df_grouped = df.groupby(by=['ISIN', 'Date'])

In [18]:
# len() of a GroupBy is its number of groups, i.e. the (ISIN, Date) combinations present
len(df_grouped)

6464

In [53]:
#df = da.groupby(["ISIN"])

In [22]:

#Plotear el end price de cada isin

# # se selecciona la ventana de tiempo
# start_date = '2022-12-29'
# end_date = '2022-12-31'

# # se filtran los datos dentro de la ventana de tiempo
# times_window = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]

# # se seleccionan las columnas a utilizar
# x = times_window['StartPrice']
# y = times_window['EndPrice']

# # se calcula la regresión lineal
# slope, intercept, r_value, p_value, std_err = linregress(x, y)

# # se define una lista con las fechas de los próximos 2 días
# next_days = pd.date_range(start_date, periods=3, freq='D').strftime('%Y-%m-%d')

# print('NEXT DAYS:')
# print(next_days)

# print('VENTANA DE TIEMPO:')

# # se imprimen los datos de la ventana de tiempo
# print(times_window)


# # se hace una predicción del EndPrice para cada uno de los próximos 2 días
# predictions = []
# for day in next_days:
#     if times_window.loc[times_window['Date']==day, 'StartPrice'].shape[0] > 0:
#         prediction = slope * times_window.loc[times_window['Date']==day, 'StartPrice'].values[0] + intercept
#         predictions.append(prediction)
#     else:
#         print(f"No hay datos disponibles para el día {day}")

# se imprimen las predicciones
# for i, prediction in enumerate(predictions):
#     print(f"El EndPrice predicho para el día {next_days[i]} es {prediction:.2f}")

KeyboardInterrupt: 