## Hacemos la importación de las librerías necesarias

In [53]:
import boto3
import pandas as pd
import numpy as np
from io import StringIO, BytesIO
from datetime import datetime, timedelta
from sklearn.linear_model import LinearRegression


In [54]:
def read_csv_to_df(bucket, objects):
    """Download every CSV object and stack them into a single DataFrame.

    Args:
        bucket: Bucket-like object. ``bucket.Object(key=...).get()`` must
            return a mapping whose ``'Body'`` entry has a ``read()`` method
            yielding UTF-8 encoded, comma-separated CSV bytes.
        objects: Iterable of items exposing a ``key`` attribute.

    Returns:
        A DataFrame with a fresh ``0..n-1`` index. Rows of objects read later
        are placed before rows of objects read earlier. An empty DataFrame is
        returned when ``objects`` is empty.
    """
    frames = []
    for obj in objects:
        csv_text = bucket.Object(key=obj.key).get().get('Body').read().decode('utf-8')
        frames.append(pd.read_csv(StringIO(csv_text), delimiter=','))

    # Nothing matched: return an empty frame instead of failing on objects[0]
    if not frames:
        return pd.DataFrame()

    # Each new frame used to be prepended to the accumulated one; reversing
    # before the single concat keeps that row order without the repeated copies
    frames.reverse()
    return pd.concat(frames, ignore_index=True)

def write_df_to_s3(df_all, key, bucket_target):
    """Upload ``df_all`` to ``bucket_target`` as a parquet object named ``key``.

    The DataFrame is written to an in-memory buffer without its index, and
    the buffer's bytes are handed to ``bucket_target.put_object``.
    """
    # Write the parquet file into memory
    parquet_buffer = BytesIO()
    df_all.to_parquet(parquet_buffer, index=False)

    # Upload the buffered bytes under the given key
    payload = parquet_buffer.getvalue()
    bucket_target.put_object(Body=payload, Key=key)

def return_objects(bucket, arg_date_dt):
    """Return the bucket objects whose key is dated on or after ``arg_date_dt``.

    The date is read from the part of the key before the first ``/`` and must
    have the form ``YYYY-MM-DD``.

    Args:
        bucket: Bucket-like object; ``bucket.objects.all()`` must yield items
            exposing a ``key`` string.
        arg_date_dt: ``datetime.date`` used as the inclusive lower bound.

    Returns:
        List of the matching objects, in the order the bucket yields them.
        Objects whose key prefix is not a valid date are skipped.
    """
    selected = []
    for obj in bucket.objects.all():
        prefix = obj.key.split("/")[0]
        try:
            key_date = datetime.strptime(prefix, '%Y-%m-%d').date()
        except ValueError:
            # Not a dated key: ignore it rather than abort the whole listing
            continue
        if key_date >= arg_date_dt:
            selected.append(obj)
    return selected

In [55]:
# Application Layer

def extract(bucket, args):
    """Build one DataFrame from the CSV objects selected in ``bucket``.

    ``args`` is forwarded unchanged as the second argument of
    ``return_objects``; the objects it returns are then read and combined by
    ``read_csv_to_df``.
    """
    return read_csv_to_df(bucket, return_objects(bucket, args))

def transform_report(df_all, *, eur_to_mxn=19.08):
    """Aggregate trade rows into one report row per ISIN and date.

    Args:
        df_all: DataFrame with the columns ``ISIN``, ``Date``, ``Time``,
            ``StartPrice``, ``EndPrice``, ``MinPrice``, ``MaxPrice`` and
            ``TradedVolume``. It is not modified.
        eur_to_mxn: Factor applied to ``end_price`` to produce
            ``end_price_mx``. Keyword-only; defaults to 19.08.

    Returns:
        A new DataFrame with the columns ``ISIN``, ``Date``, ``start_price``,
        ``end_price``, ``minimum_price``, ``maximum_price``,
        ``daily_traded_volume``, ``end_price_mx`` and ``standard_deviation``.
    """
    # Drop rows with nulls into a new frame so the caller's data stays intact
    df_clean = df_all.dropna()

    # First StartPrice / last EndPrice per (ISIN, Date) in time order. These
    # are taken over every remaining row, before the time-window filter below.
    by_time = df_clean.sort_values(by=['Time']).groupby(['ISIN', 'Date'])
    df_clean = df_clean.assign(
        start_price=by_time['StartPrice'].transform('first'),
        end_price=by_time['EndPrice'].transform('last'),
    )

    # Only rows strictly between 08:00 and 12:00 feed the min/max/volume
    # figures. start_price and end_price are constant within a group, so
    # 'min' simply picks that value.
    report = (
        df_clean.query('"08:00" < Time < "12:00"')
        .groupby(['ISIN', 'Date'], as_index=False)
        .agg(
            start_price=('start_price', 'min'),
            end_price=('end_price', 'min'),
            minimum_price=('MinPrice', 'min'),
            maximum_price=('MaxPrice', 'max'),
            daily_traded_volume=('TradedVolume', 'sum'),
        )
    )

    report["end_price_mx"] = report["end_price"] * eur_to_mxn

    # Row-wise standard deviation of the two values start_price and end_price
    report["standard_deviation"] = report[['start_price', 'end_price']].std(axis=1)

    return report

def load(df_all, bucket_target):
    """Store ``df_all`` in ``bucket_target`` under a timestamped parquet key.

    The key has the form ``xetra_daily_report_<YYYYmmdd_HHMMSS>.parquet``,
    using the local time at the moment of the call.
    """
    # Timestamp that makes the key unique per run
    timestamp = datetime.today().strftime("%Y%m%d_%H%M%S")

    # Write the file in the cloud
    write_df_to_s3(df_all, f'xetra_daily_report_{timestamp}.parquet', bucket_target)

def etl_report(bucket_target, key):
    """Read the parquet object ``key`` from ``bucket_target`` into a DataFrame."""
    # Fetch the stored object and pull its raw bytes
    response = bucket_target.Object(key=key).get()
    raw_bytes = response.get('Body').read()

    # Decode the parquet payload from memory
    return pd.read_parquet(BytesIO(raw_bytes))

In [56]:
def main():
    """Run the extract-transform-load job and return the transformed report.

    Reads the CSV objects of the source bucket dated from the day before
    ``arg_date`` onwards, builds the report, uploads it to the target bucket,
    then reads back and prints the stored report identified by ``key``.

    Returns:
        The transformed DataFrame that was uploaded.
    """
    # Parameters/Configurations
    #--------------------------

    # Establish the connection to the source bucket
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('xetra-1234')

    # Establish the bucket target
    bucket_target = s3.Bucket('xetra-cdhm')

    # Set the date parameter; one day is subtracted so the previous day's
    # objects are selected as well
    arg_date = '2022-12-31'
    arg_date_dt = datetime.strptime(arg_date, '%Y-%m-%d').date() - timedelta(days=1)

    # Key of the stored report to read back.
    # NOTE(review): this is a fixed key, not the key generated by load() in
    # this run - check whether the latest uploaded report should be used.
    key = 'xetra_daily_report_20230223_223956.parquet'

    # Run application
    #----------------------------------------------
    # Extract and transform exactly once; transform_report takes only the
    # DataFrame as a positional argument
    df_all = extract(bucket, arg_date_dt)
    df_transformed = transform_report(df_all)
    load(df_transformed, bucket_target)
    report = etl_report(bucket_target, key)
    print(report)

    return df_transformed

In [57]:
# Run the ETL job and keep the transformed report it returns
df = main()

In [58]:
# Notebook display expression: shows the DataFrame returned by main()
df

Unnamed: 0,ISIN,Date,start_price,end_price,minimum_price,maximum_price,daily_traded_volume,end_price_mx,standard_deviation
0,AT000000STR1,2022-12-30,36.6000,36.700,35.7500,36.6000,1666,700.23600,0.070711
1,AT000000STR1,2022-12-31,36.6000,36.700,35.7500,36.6000,1666,700.23600,0.070711
2,AT00000FACC2,2022-12-30,8.0500,8.570,7.8700,8.0500,5507,163.51560,0.367696
3,AT00000FACC2,2022-12-31,8.0500,8.570,7.8700,8.0500,5507,163.51560,0.367696
4,AT0000606306,2022-12-30,14.5100,15.000,13.8700,14.9100,67471,286.20000,0.346482
...,...,...,...,...,...,...,...,...,...
5725,XS2314660700,2022-12-31,22.2600,21.918,22.2600,22.2600,0,418.19544,0.241831
5726,XS2376095068,2022-12-30,34.2880,36.500,34.2880,34.2880,0,696.42000,1.564120
5727,XS2376095068,2022-12-31,34.2880,36.500,34.2880,34.2880,0,696.42000,1.564120
5728,XS2434891219,2022-12-30,3.4412,3.662,3.4412,3.4412,0,69.87096,0.156129


In [70]:
# Keep only the rows dated between '2022-12-30' and '2022-12-31' (inclusive)
window = df.query('Date >= "2022-12-30" and Date <= "2022-12-31"')

# Target: the 'end_price' column of those rows
y = window['end_price']

# Feature matrix X: the 'start_price' values as a single column
x = np.array(window['start_price']).reshape(-1, 1)

# Create the LinearRegression model and fit it on the data
model = LinearRegression()
model.fit(x, y)

# Print the model coefficients
print('COEFICIENTES:', model.coef_, sep='\n')

# Print the score computed on the same data the model was fitted on
print('SCORE:', model.score(x, y), sep='\n')

# Print the predictions for the fitted inputs
y_pred = model.predict(x)
print('PREDICCION:', y_pred, sep='\n')

# Print the model intercept
print('INTERCEPTO DE MODELO:', model.intercept_, sep='\n')



COEFICIENTES:
[0.99738756]
SCORE:
0.999995224330338
PREDICCION:
[37.47320384 37.47320384  8.99778898 ... 35.1672438   4.40102919
  4.40102919]
INTERCEPTO DE MODELO:
0.9688191182257953


In [None]:
# plot = df.groupby(['ISIN'])['end_price'].plot(legend=True)
# plot.set_xlabel('Date')
# plot.set_ylabel('End Price')
# plot.set_title('End Price per ISIN')
# plt.show()




# # se selecciona la ventana de tiempo
# start_date = '2022-12-29'
# end_date = '2022-12-31'

# # se filtran los datos dentro de la ventana de tiempo
# times_window = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]

# # se seleccionan las columnas a utilizar
# x = times_window['StartPrice']
# y = times_window['EndPrice']

# # se calcula la regresión lineal
# slope, intercept, r_value, p_value, std_err = linregress(x, y)

# # se define una lista con las fechas de los próximos 2 días
# next_days = pd.date_range(start_date, periods=3, freq='D').strftime('%Y-%m-%d')

# print('NEXT DAYS:')
# print(next_days)

# print('VENTANA DE TIEMPO:')

# # se imprimen los datos de la ventana de tiempo
# print(times_window)


# # se hace una predicción del EndPrice para cada uno de los próximos 2 días
# predictions = []
# for day in next_days:
#     if times_window.loc[times_window['Date']==day, 'StartPrice'].shape[0] > 0:
#         prediction = slope * times_window.loc[times_window['Date']==day, 'StartPrice'].values[0] + intercept
#         predictions.append(prediction)
#     else:
#         print(f"No hay datos disponibles para el día {day}")

# se imprimen las predicciones
# for i, prediction in enumerate(predictions):
#     print(f"El EndPrice predicho para el día {next_days[i]} es {prediction:.2f}")