# data retrieval

In [None]:
# importing
import pandas as pd
import numpy as np
import glob, os

In [None]:
# download data from AWS
def download_multi_dates (from_date, until_date):
    """Receiving the data from the AWS Bucket from Deutsche Börse

    Parameters
    ----------
    from_date : str
        the date it should start
    until_date: str
        the date until you want the data
    """  
    # create list for required dates
    dates = list(pd.date_range(from_date, until_date, freq="D").strftime("%Y-%m-%d"))
    
    # download
    number_days = len (dates)
    i = 0
    while i < number_days:
        date = dates[i]
        ! aws s3 ls s3://deutsche-boerse-xetra-pds/{date}/ --no-sign-request
        ! mkdir "../data/deutsche-boerse-xetra-pds/{date}"
        ! aws s3 sync s3://deutsche-boerse-xetra-pds/{date} "../data/deutsche-boerse-xetra-pds/{date}" --no-sign-request
        ! ls "../data/deutsche-boerse-xetra-pds/{date}"
        # increase i
        i += 1

In [None]:
# load the files from a date directory
def load_csv_dir_multi(from_date, until_date, data_dir):
    """Load all CSV files of a date range into one DataFrame.

    Parameters
    ----------
    from_date : str
        the first date to load, e.g. "2017-07-01"
    until_date : str
        the last date to load (included)
    data_dir : str
        directory pattern with one ``%s`` placeholder that is replaced by
        each date as "YYYY-MM-DD", e.g. "../data/deutsche-boerse-xetra-pds/%s/"

    Returns
    -------
    pandas.DataFrame
        the rows of all CSV files, dates in ascending order and files sorted
        by name within a date; the index of each file is kept as read.
        An empty DataFrame if no CSV file was found.

    Raises
    ------
    FileNotFoundError
        if the directory of one of the dates does not exist
    """
    # one "YYYY-MM-DD" string per calendar day in the range
    dates = pd.date_range(from_date, until_date, freq="D").strftime("%Y-%m-%d")

    # frames are collected and concatenated once at the end; concatenating
    # onto a growing DataFrame inside the loop copies all earlier rows again
    # for every date
    frames = []
    for date in dates:
        data_dir_merge = data_dir % date

        # empty folders (weekends etc.) are skipped; os.listdir still raises
        # FileNotFoundError for a date folder that does not exist at all
        if not os.listdir(data_dir_merge):
            continue

        # sorted: glob returns files in arbitrary order, sorting makes the
        # row order reproducible
        csv_paths = sorted(glob.glob(os.path.join(data_dir_merge, "*.csv")))
        if not csv_paths:
            # folder only holds non-CSV files; pd.concat would raise on an
            # empty list
            continue

        frames.extend(pd.read_csv(csv_path) for csv_path in csv_paths)
        print("Loading and merging from {}".format(data_dir_merge))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
# date range (both ends included) to download from the bucket
from_date, until_date = "2017-07-01", "2021-01-31"
download_multi_dates(from_date=from_date, until_date=until_date)

In [None]:
# adjust to your local data folder; "%s" is replaced by each date
data_dir = "../data/deutsche-boerse-xetra-pds/%s/"
print(f"Loading from {data_dir}")
merged_df = load_csv_dir_multi(
    from_date=from_date,
    until_date=until_date,
    data_dir=data_dir,
)

In [None]:
# write the complete merged data set to a single CSV file
merged_df.to_csv(path_or_buf="../data/07_17_to_01_21_full.csv")