In [2]:
from enum import Enum
from datetime import datetime, timedelta
import pandas as pd 
import os 
import re 

class Granularity(Enum):
    """ The possible Granularity to build the OHLC old_data from lob """
    Sec1 = "1S"
    Sec5 = "5S"
    Sec15 = "15S"
    Sec30 = "30S"
    Min1 = "1Min"
    Min5 = "5Min"
    Min15 = "15Min"
    Min30 = "30Min"
    Hour1 = "1H"
    Hour2 = "2H"
    Hour6 = "6H"
    Hour12 = "12H"
    Day1 = "1D"
    Day2 = "2D"
    Day5 = "7D"
    Month1 = "30D"

class OrderEvent(Enum):
    """event types of orderbook"""
    submission = 1
    cancellation = 2
    deletion = 3
    execution_visible = 4
    execution_hidden = 5
    cross_trade = 6
    halt = 7

In [3]:
def orderbook_columns(level: int):
    """ return the column names for the LOBSTER orderbook, acording the input level """
    orderbook_columns = []
    for i in range(1, level + 1):
        orderbook_columns += ["psell" + str(i), "vsell" + str(i), "pbuy" + str(i), "vbuy" + str(i)]
    return orderbook_columns

def message_columns():
    """ return the message columns for the LOBSTER orderbook """
    return ["time", "event_type", "order_id", "size", "price", "direction", "unk"]

In [4]:
def lobster_to_sec_df(message_df, orderbook_df,
                      datetime_start: datetime,
                      granularity: Granularity = Granularity.Sec1,
                      level: int = 10, 
                      add_messages=True):
    """ create a dataframe with midprices, sell and buy for each second

        message_df : a csv df with the messages (lobster old_data format) without initial start lob
        ordebook_df : a csv df with the orderbook (lobster old_data format) without initial start lob
        datetime_start : should be a start date in the message file and orderbook file
        granularity : the granularity to use in the mid-prices computation
        plot : whether print or not the mid_prices
        level : the level of the old_data
        add_messages : if True keep messages along the orderbook data. It does not work with granularity != None
    """
    start_date = datetime_start

    # to be sure that columns are okay
    orderbook_df.columns = orderbook_columns(level)
    message_df.columns = message_columns()

    # convert the time to seconds and structure the df to the input granularity
    orderbook_df["seconds"] = message_df["time"]

    if add_messages and granularity is not None:
        orderbook_df[message_df.columns] = message_df[message_df.columns]
        accepted_orders = [o.value for o in (OrderEvent.execution_visible, OrderEvent.submission, OrderEvent.execution_hidden)]
        orderbook_df = orderbook_df[orderbook_df["event_type"].isin(accepted_orders)]

    orderbook_df["date"] = [start_date + timedelta(seconds=i) for i in orderbook_df["seconds"]]

    if granularity is not None:
        orderbook_df.set_index("date", inplace=True)
        orderbook_df = orderbook_df.resample(granularity.value).first()
        orderbook_df.reset_index(inplace=True)

    orderbook_df = orderbook_df.sort_values(by="date").reset_index(drop=True).copy()
    orderbook_df.drop(columns=['seconds'], inplace=True)

    return orderbook_df.set_index('date')

In [5]:
def read_sub_routine(file_7z: str, first_date: str = "1990-01-01",
                     last_date: str = "2100-01-01",
                     type_file: str = "orderbook",
                     level: int = 10,
                     path: str = "") -> dict:
    """
        :param file_7z: the input file where the csv with old_data are stored
        :param first_date: the first day to load from the input file
        :param last_date: the last day to load from the input file
        :param type_file: the kind of old_data to read. type_file in ("orderbook", "message")
        :param level: the LOBSTER level of the orderbook
        :param path: data path
        :return: a dictionary with {day : dataframe}
    """
    assert type_file in ("orderbook", "message"), "The input type_file: {} is not valid".format(type_file)

    columns = message_columns() if type_file == "message" else orderbook_columns(level)
    # if both none then we automatically detect the dates from the files
    first_date = datetime.strptime(first_date, "%Y-%m-%d")
    last_date = datetime.strptime(last_date, "%Y-%m-%d")

    all_period = {}  # day :  df

    path = path + file_7z
    for file in sorted(os.listdir(path)):
        # read only the selected type of file
        if type_file not in str(file):
            continue

        # read only the old_data between first_ and last_ input dates
        m = re.search(r".*([0-9]{4}-[0-9]{2}-[0-9]{2}).*", str(file))
        if m:
            entry_date = datetime.strptime(m.group(1), "%Y-%m-%d")
            if entry_date < first_date or entry_date > last_date:
                continue
        else:
            print("error for file: {}".format(file))
            continue

        curr = path + '/' + file
        df = pd.read_csv(curr, names=columns)
        # put types
        all_period[entry_date] = df

    return all_period

In [5]:
!ls /home/ema/dev/shocks/data/lobster/_data_dwn_48_332__AAPL_2021-11-01_2022-04-30_10.7z

/home/ema/dev/shocks/data/lobster/_data_dwn_48_332__AAPL_2021-11-01_2022-04-30_10.7z


In [6]:
def from_folder_to_unique_df(file_7z: str, 
                             first_date: str = "1990-01-01",
                             last_date: str = "2100-01-01",
                             plot: bool = False, 
                             level: int = 10,
                             path: str = "",
                             granularity: Granularity = Granularity.Sec1,
                             add_messages = True):
    """ return a unique df with also the label

        add_messages : if True keep messages along the orderbook data. It does not work with granularity != None

    """
    message_dfs = read_sub_routine(file_7z, first_date, last_date, "message", level=level, path=path)
    orderbook_dfs = read_sub_routine(file_7z, first_date, last_date, "orderbook", level=level, path=path)
    frames = []

    assert list(message_dfs.keys()) == list(orderbook_dfs.keys()), "the messages and orderbooks have different days!!"
    
    for d in message_dfs.keys():
        tmp_df = lobster_to_sec_df(
            message_dfs[d], orderbook_dfs[d], d, granularity=granularity,
            level=level, add_messages=add_messages)
        frames.append(tmp_df)

    result = pd.concat(frames, ignore_index=False)

    return result, orderbook_dfs, message_dfs

In [7]:
df, orderbook, message = from_folder_to_unique_df("/home/ema/dev/shocks/data/lobster/AAPL", level=10)


  df = pd.read_csv(curr, names=columns)
  df = pd.read_csv(curr, names=columns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orderbook_df["date"] = [start_date + timedelta(seconds=i) for i in orderbook_df["seconds"]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orderbook_df["date"] = [start_date + timedelta(seconds=i) for i in orderbook_df["seconds"]]


In [56]:
m = pd.concat(message, ignore_index=False)
o = pd.concat(orderbook, ignore_index=False)

In [8]:
df

Unnamed: 0_level_0,psell1,vsell1,pbuy1,vbuy1,psell2,vsell2,pbuy2,vbuy2,psell3,vsell3,...,vsell10,pbuy10,vbuy10,time,event_type,order_id,size,price,direction,unk
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-11-04 09:30:00,1516600.0,100.0,1516000.0,2306.0,1516700.0,100.0,1515600.0,14.0,1516800.0,1.0,...,32.0,1514600.0,4.0,34200.005664,1.0,16985769.0,100.0,1516600.0,-1.0,
2021-11-04 09:30:01,1516600.0,100.0,1515900.0,56.0,1516700.0,100.0,1515800.0,130.0,1516800.0,1.0,...,30.0,1514800.0,38.0,34201.003166,4.0,17481517.0,85.0,1516100.0,1.0,
2021-11-04 09:30:02,1516000.0,100.0,1515600.0,101.0,1516200.0,100.0,1515500.0,13.0,1516300.0,100.0,...,429.0,1514500.0,21.0,34202.026983,1.0,17975229.0,100.0,1515600.0,1.0,UBSS
2021-11-04 09:30:03,1515900.0,100.0,1515700.0,300.0,1516000.0,100.0,1515600.0,100.0,1516100.0,828.0,...,26.0,1514800.0,112.0,34203.000064,1.0,18288397.0,100.0,1515900.0,-1.0,UBSS
2021-11-04 09:30:04,1516400.0,101.0,1515900.0,100.0,1516500.0,100.0,1515800.0,200.0,1516600.0,186.0,...,21.0,1515000.0,5311.0,34204.029662,5.0,0.0,81.0,1516350.0,1.0,UBSS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-11-05 15:59:55,1512900.0,114.0,1512700.0,100.0,1513000.0,20861.0,1512600.0,600.0,1513100.0,553.0,...,861.0,1511800.0,110.0,57595.005644,4.0,470972721.0,97.0,1512800.0,-1.0,
2021-11-05 15:59:56,1512900.0,99.0,1512600.0,500.0,1513000.0,20961.0,1512500.0,200.0,1513100.0,553.0,...,861.0,1511700.0,202.0,57596.007355,4.0,471073561.0,20.0,1512900.0,-1.0,
2021-11-05 15:59:57,1512800.0,400.0,1512600.0,100.0,1513000.0,20941.0,1512500.0,200.0,1513100.0,553.0,...,1761.0,1511700.0,202.0,57597.001966,4.0,471167605.0,23.0,1512800.0,-1.0,UBSS
2021-11-05 15:59:58,1512100.0,300.0,1511600.0,500.0,1512200.0,200.0,1511500.0,105.0,1512400.0,6600.0,...,453.0,1510700.0,250.0,57598.000533,1.0,471289429.0,100.0,1511300.0,1.0,UBSS
