In [6]:
# This notebook unpacks the raw archives, builds a single resampled DataFrame for every asset, and saves it under the data folder.
import datetime
import gzip
import os
import shutil
import tarfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xlrd

In [7]:
# Return a list of strings, one per entry (file or sub-folder) found in the folder passed as argument.
def get_file_names(folder):
    """Return the names of all entries in ``folder``, sorted alphabetically.

    Args:
    - folder: Path of the directory to list.

    Returns:
    A new list holding the name (not the full path) of every entry that
    os.listdir reports for ``folder``; files and sub-folders are both included.

    Raises:
    OSError (e.g. FileNotFoundError) propagated from os.listdir when
    ``folder`` cannot be listed.
    """
    # os.listdir gives no ordering guarantee; sorting makes every run walk the
    # entries in the same order.
    return sorted(os.listdir(folder))

# Create (or overwrite) the file at the path file_name and write the string content into it.
def create_file(file_name, content):
    """Write the string ``content`` to ``file_name``.

    The file is opened in text mode 'w', so an existing file at that path is
    overwritten.
    """
    with open(file_name, 'w') as out_handle:
        out_handle.write(content)

# Extract the tar archive located at file_path and store its contents in the directory output_path.
def extract_tar(file_path, output_path):
    """Extract the contents of a .tar file to the specified output path.

    The outcome is reported with a printed message. Archive errors and
    OS-level errors (for example a missing archive) are printed instead of
    raised, so a loop over many archives keeps going after one failure.

    Args:
    - file_path: The path to the .tar file.
    - output_path: The path where the contents of the .tar file will be extracted.

    Returns:
    None.
    """
    try:
        with tarfile.open(file_path, 'r') as tar:
            # Use the 'data' extraction filter when this Python version provides
            # it (it rejects members that would be written outside output_path);
            # on older versions fall back to a pass-through filter, i.e. the
            # previous behaviour.
            tar.extraction_filter = getattr(
                tarfile, 'data_filter', lambda member, path: member
            )
            tar.extractall(output_path)
        print(f"Extraction of {file_path} successful.")
    except (tarfile.TarError, OSError) as e:
        # OSError covers a missing or unreadable archive, which previously
        # escaped this handler and aborted the caller.
        print(f"Error extracting {file_path}: {e}")

def extract_csv_gz_file(source_file, destination_directory):
    """
    Extracts a .csv.gz file to a specified directory.

    The output file keeps the source file name without its last extension
    (``ticks.csv.gz`` becomes ``ticks.csv``). Errors raised while decompressing
    are printed rather than propagated.

    Args:
    - source_file: The path to the .csv.gz file to be extracted.
    - destination_directory: The directory where the extracted file will be saved.
      It is created (including parents) when it does not exist.

    Raises:
    - FileExistsError: if destination_directory exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, so there is no gap
    # between checking for the directory and creating it.
    os.makedirs(destination_directory, exist_ok=True)
    try:
        file_name = os.path.basename(source_file)
        output_file = os.path.join(destination_directory, os.path.splitext(file_name)[0])
        with gzip.open(source_file, 'rb') as f_in, open(output_file, 'wb') as f_out:
            # Copy in chunks so the whole decompressed file never has to fit in memory.
            shutil.copyfileobj(f_in, f_out)
        print(f"File extracted to: {output_file}")
    except Exception as e:
        print(f"Error extracting file: {e}")

def xl_to_datetime(xltime):
    """Convert an Excel serial date number into a datetime.datetime.

    The integer part of ``xltime`` is handed to xlrd to obtain the calendar
    date; the fractional part is treated as a fraction of a day and added on
    top as seconds.
    """
    whole_days = int(xltime)
    # fraction of a day -> seconds
    seconds_into_day = (xltime - whole_days) * 24 * 60 * 60
    # datemode 0 selects the 1900-based date system; the result unpacks into
    # (year, month, day, hour, minute, second).
    day_start = datetime.datetime(*xlrd.xldate_as_tuple(whole_days, 0))
    return day_start + datetime.timedelta(seconds=seconds_into_day)

def convert_to_float(value):
    """Return ``value`` as a float, or NaN when that is not possible.

    NaN is returned when ``float(value)`` raises ValueError or TypeError, and
    also when the converted number is not finite (inf, -inf, nan).
    """
    try:
        number = float(value)
    except (ValueError, TypeError):
        return np.nan
    if np.isfinite(number):
        return number
    return np.nan
    
def resample_df(df):
    """Resample tick-level quotes to 1-minute frequency.

    Args:
    - df: DataFrame with the columns 'xltime', 'bid-price', 'ask-price',
      'bid-volume' and 'ask-volume'. It is not modified.

    Returns:
    A new DataFrame indexed by 'datetime' in 1-minute bins, holding the mean
    of 'bid-price' and 'ask-price' and the sum of 'bid-volume' and
    'ask-volume' for each bin. Any other input column is left out.
    """
    value_columns = ['bid-price', 'ask-price', 'bid-volume', 'ask-volume']
    # Apply xl_to_datetime to every xltime value; pd.to_datetime ensures a
    # datetime64 result even when df has no rows.
    timestamps = pd.to_datetime(df['xltime'].apply(xl_to_datetime))
    # drop / astype / assign each return a new frame, so the caller's df keeps
    # its original columns and dtypes.
    ticks = (
        df.drop(columns=['xltime'])
        .astype({column: float for column in value_columns})
        .assign(datetime=timestamps)
        .set_index('datetime')
    )
    # 'min' is the minute alias; the older 'T' spelling is deprecated in
    # recent pandas releases.
    return ticks.resample('min').agg({
        'bid-price': 'mean',
        'ask-price': 'mean',
        'bid-volume': 'sum',
        'ask-volume': 'sum'
    })

def create_folder(directory_path, folder_name):
    """Create the folder ``folder_name`` inside ``directory_path`` if it is missing.

    Args:
    - directory_path: Parent directory; missing parents are created as well.
    - folder_name: Name of the folder to create inside directory_path.

    Raises:
    - FileExistsError: if the target path exists but is not a directory.
    """
    # Combine directory path and folder name to create the full path for the new folder
    new_folder_path = os.path.join(directory_path, folder_name)

    # exist_ok=True makes the call a no-op for an existing folder without a
    # separate exists() check beforehand.
    os.makedirs(new_folder_path, exist_ok=True)


In [11]:
# stock_tickers holds one entry per stock: the name of each sub-folder of the raw bbo directory.
# Only directories are kept, so stray files that some systems add to the folder
# (e.g. macOS '.DS_Store') are skipped whether or not they are present.
bbo_dir = 'data/raw/sp100_2004-8/bbo'
stock_tickers = [
    name for name in get_file_names(bbo_dir)
    if os.path.isdir(os.path.join(bbo_dir, name))
]
# The original author's run listed 87 entries here.


In [12]:
# Unpack the per-stock tar archive of every ticker.
# Each archive is extracted into the ticker's own folder, which afterwards holds
# all the data of that stock (one file per date).
for file_name in stock_tickers:
    output_path = f"data/raw/sp100_2004-8/bbo/{file_name}/"
    # the archive sits inside the ticker's folder and is named <ticker>_bbo.tar
    file_path = f"{output_path}{file_name}_bbo.tar"
    extract_tar(file_path, output_path)

Extraction of data/raw/sp100_2004-8/bbo/AA.N/AA.N_bbo.tar successful.
Extraction of data/raw/sp100_2004-8/bbo/ABT.N/ABT.N_bbo.tar successful.
Extraction of data/raw/sp100_2004-8/bbo/AEP.N/AEP.N_bbo.tar successful.
Extraction of data/raw/sp100_2004-8/bbo/ALL.N/ALL.N_bbo.tar successful.
Extraction of data/raw/sp100_2004-8/bbo/APA.N/APA.N_bbo.tar successful.
Extraction of data/raw/sp100_2004-8/bbo/AVP.N/AVP.N_bbo.tar successful.
Extraction of data/raw/sp100_2004-8/bbo/AXP.N/AXP.N_bbo.tar successful.
Extraction of data/raw/sp100_2004-8/bbo/BA.N/BA.N_bbo.tar successful.
Extraction of data/raw/sp100_2004-8/bbo/BAC.N/BAC.N_bbo.tar successful.
Extraction of data/raw/sp100_2004-8/bbo/BAX.N/BAX.N_bbo.tar successful.
Extraction of data/raw/sp100_2004-8/bbo/BHI.N/BHI.N_bbo.tar successful.
Extraction of data/raw/sp100_2004-8/bbo/BK.N/BK.N_bbo.tar successful.
Extraction of data/raw/sp100_2004-8/bbo/BMY.N/BMY.N_bbo.tar successful.
Extraction of data/raw/sp100_2004-8/bbo/C.N/C.N_bbo.tar successful.
Ex

In [15]:
# Build one DataFrame per stock, resampled to 1-minute frequency, and save it as
# parquet under data/cleaned/<asset>/.
# The daily .csv.gz files are not decompressed to disk: pandas reads the
# compressed files directly.
bbo_dir = 'data/raw/sp100_2004-8/bbo'

# Keep only directories: a stray file in the bbo folder (e.g. macOS '.DS_Store')
# is not an asset and would make the per-asset listing below fail.
asset_names = [
    name for name in get_file_names(bbo_dir)
    if os.path.isdir(os.path.join(bbo_dir, name))
]
for asset_name in asset_names:
    names = get_file_names(f"{bbo_dir}/{asset_name}")
    # keep only the names ending with .csv.gz
    names = [file for file in names if file.endswith('.csv.gz')]
    # Collect the per-file frames and concatenate once at the end; concatenating
    # inside the loop copies all previously accumulated rows on every iteration.
    daily_frames = []
    for file_name in names:
        df = pd.read_csv(f"{bbo_dir}/{asset_name}/{file_name}")
        # coerce every cell to float; unparsable or non-finite values become NaN
        df = df.apply(lambda x: x.apply(convert_to_float))
        df = resample_df(df)
        daily_frames.append(df)
    # An asset folder without any .csv.gz file still produces an empty frame.
    resampled_df = pd.concat(daily_frames) if daily_frames else pd.DataFrame()
    # sort the index so the rows are in chronological order
    resampled_df = resampled_df.sort_index()
    create_folder('data/cleaned', asset_name)
    resampled_df.to_parquet(f"data/cleaned/{asset_name}/{asset_name}.parquet")

