# Storing Financial Data Efficiently

In [55]:
import pandas as pd
from helpers.sample_generate import generate_sample_data

In [56]:
print(generate_sample_data(rows=5, cols=4, freq='1min'))

                            No0         No1         No2         No3
2023-01-01 00:00:00  100.000000  100.000000  100.000000  100.000000
2023-01-01 00:01:00   99.928362  100.063330  100.104386   99.975458
2023-01-01 00:02:00   99.938943  100.026508  100.132050  100.048829
2023-01-01 00:03:00   99.826146   99.971623  100.166635  100.050730
2023-01-01 00:04:00   99.778683  100.009084  100.186006  100.060818


## HDFStore

### .h5 for immutable data

In [57]:
%time data = generate_sample_data(rows=1e3, cols=10, freq='D').round(4)

Wall time: 12.2 ms


In [58]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1000 entries, 2023-01-01 to 2025-09-26
Freq: D
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   No0     1000 non-null   float64
 1   No1     1000 non-null   float64
 2   No2     1000 non-null   float64
 3   No3     1000 non-null   float64
 4   No4     1000 non-null   float64
 5   No5     1000 non-null   float64
 6   No6     1000 non-null   float64
 7   No7     1000 non-null   float64
 8   No8     1000 non-null   float64
 9   No9     1000 non-null   float64
dtypes: float64(10)
memory usage: 85.9 KB


### HDFStore write to .h5

In [59]:
h5 = pd.HDFStore('data/data.h5', 'w')

In [60]:
%time h5['data'] = data

Wall time: 5.03 ms


In [61]:
h5

<class 'pandas.io.pytables.HDFStore'>
File path: data/data.h5

In [62]:
h5.close()

In [63]:
ls -n data

 Volume in drive C is OS
 Volume Serial Number is 8A40-DA6B

 Directory of C:\Users\Michael\source\algo-trading-examples


 Directory of C:\Users\Michael\source\algo-trading-examples\data

2023/12/25  18:27    <DIR>          .
2023/12/25  18:27    <DIR>          ..
2023/12/25  18:19           176,760 aapl.json
2023/12/25  18:19           129,024 aapl.xls
2023/12/25  18:19            58,639 aapl.xlsx
2023/12/13  22:41        29,580,549 all_stocks_5yr.csv
2023/12/25  18:27            95,240 data.h5
2023/12/25  18:18           152,878 data2.h5
               6 File(s)     30,193,090 bytes
               2 Dir(s)  62,634,717,184 bytes free


File Not Found


### HDFStore read from .h5

In [64]:
h5 = pd.HDFStore('data/data.h5', 'r')
%time data_copy = h5['data']

Wall time: 4.99 ms


In [65]:
data_copy.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1000 entries, 2023-01-01 to 2025-09-26
Freq: D
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   No0     1000 non-null   float64
 1   No1     1000 non-null   float64
 2   No2     1000 non-null   float64
 3   No3     1000 non-null   float64
 4   No4     1000 non-null   float64
 5   No5     1000 non-null   float64
 6   No6     1000 non-null   float64
 7   No7     1000 non-null   float64
 8   No8     1000 non-null   float64
 9   No9     1000 non-null   float64
dtypes: float64(10)
memory usage: 85.9 KB


In [66]:
h5.close()

### Pandas write to .h5 with to_hdf()

In [67]:
data.to_hdf('data/data2.h5', 'data', format='table')

In [68]:
data_copy2 = pd.read_hdf('data/data2.h5', 'data')

In [69]:
data_copy2.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1000 entries, 2023-01-01 to 2025-09-26
Freq: D
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   No0     1000 non-null   float64
 1   No1     1000 non-null   float64
 2   No2     1000 non-null   float64
 3   No3     1000 non-null   float64
 4   No4     1000 non-null   float64
 5   No5     1000 non-null   float64
 6   No6     1000 non-null   float64
 7   No7     1000 non-null   float64
 8   No8     1000 non-null   float64
 9   No9     1000 non-null   float64
dtypes: float64(10)
memory usage: 85.9 KB


### Reading .h5 using PyTables (tables)

In [70]:
import tables as tb

In [71]:
h5 = tb.open_file('data/data.h5.', 'r')

In [72]:
h5

File(filename=data/data.h5., title='', mode='r', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) ''
/data (Group) ''
/data/axis0 (Array(10,)) ''
  atom := StringAtom(itemsize=3, shape=(), dflt=b'')
  maindim := 0
  flavor := 'numpy'
  byteorder := 'irrelevant'
  chunkshape := None
/data/axis1 (Array(1000,)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None
/data/block0_items (Array(10,)) ''
  atom := StringAtom(itemsize=3, shape=(), dflt=b'')
  maindim := 0
  flavor := 'numpy'
  byteorder := 'irrelevant'
  chunkshape := None
/data/block0_values (Array(1000, 10)) ''
  atom := Float64Atom(shape=(), dflt=0.0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

In [73]:
h5.close()

## TsTables .h5ts for mutable and time series data

In [74]:
%%time
data = generate_sample_data(rows=2.5e6, cols=5,
                            freq='1s').round(4)

Wall time: 1.23 s


In [75]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2500000 entries, 2023-01-01 00:00:00 to 2023-01-29 22:26:39
Freq: S
Data columns (total 5 columns):
 #   Column  Dtype  
---  ------  -----  
 0   No0     float64
 1   No1     float64
 2   No2     float64
 3   No3     float64
 4   No4     float64
dtypes: float64(5)
memory usage: 114.4 MB


In [76]:
# pip install git+https://github.com/yhilpisch/tstables.git
import tstables
import tables as tb

In [77]:
class desc(tb.IsDescription):
    '''Description of TsTables table structure
    '''
    timestamp = tb.Int64Col(pos=0)
    N0 = tb.Float64Col(pos=1)
    N1 = tb.Float64Col(pos=2)
    N2 = tb.Float64Col(pos=3)
    N3 = tb.Float64Col(pos=4)
    N4 = tb.Float64Col(pos=5)

In [78]:
h5ts = tb.open_file('data/data.h5ts', 'w')
ts = h5ts.create_ts('/', 'data', desc)

In [79]:
h5ts

File(filename=data/data.h5ts, title='', mode='w', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) ''
/data (Group/Timeseries) ''
/data/y2023 (Group) ''
/data/y2023/m12 (Group) ''
/data/y2023/m12/d25 (Group) ''
/data/y2023/m12/d25/ts_data (Table(0,)) ''
  description := {
  "timestamp": Int64Col(shape=(), dflt=0, pos=0),
  "N0": Float64Col(shape=(), dflt=0.0, pos=1),
  "N1": Float64Col(shape=(), dflt=0.0, pos=2),
  "N2": Float64Col(shape=(), dflt=0.0, pos=3),
  "N3": Float64Col(shape=(), dflt=0.0, pos=4),
  "N4": Float64Col(shape=(), dflt=0.0, pos=5)}
  byteorder := 'little'
  chunkshape := (1365,)

In [80]:
%time ts.append(data)

Wall time: 433 ms


In [81]:
h5ts

File(filename=data/data.h5ts, title='', mode='w', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) ''
/data (Group/Timeseries) ''
/data/y2023 (Group) ''
/data/y2023/m01 (Group) ''
/data/y2023/m12 (Group) ''
/data/y2023/m12/d25 (Group) ''
/data/y2023/m12/d25/ts_data (Table(0,)) ''
  description := {
  "timestamp": Int64Col(shape=(), dflt=0, pos=0),
  "N0": Float64Col(shape=(), dflt=0.0, pos=1),
  "N1": Float64Col(shape=(), dflt=0.0, pos=2),
  "N2": Float64Col(shape=(), dflt=0.0, pos=3),
  "N3": Float64Col(shape=(), dflt=0.0, pos=4),
  "N4": Float64Col(shape=(), dflt=0.0, pos=5)}
  byteorder := 'little'
  chunkshape := (1365,)
/data/y2023/m01/d01 (Group) ''
/data/y2023/m01/d01/ts_data (Table(86400,)) ''
  description := {
  "timestamp": Int64Col(shape=(), dflt=0, pos=0),
  "N0": Float64Col(shape=(), dflt=0.0, pos=1),
  "N1": Float64Col(shape=(), dflt=0.0, pos=2),
  "N2": Float64Col(shape=(), dflt=0.

### Read .h5ts into DataFrame

In [82]:
import datetime
start = datetime.datetime(2023, 1, 2)
end = datetime.datetime(2023, 1, 3)

In [83]:
%time subset = ts.read_range(start, end)

Wall time: 23 ms


In [84]:
start = datetime.datetime(2023, 1, 2, 12, 30, 0)
end = datetime.datetime(2023, 1, 5, 17, 15, 30)
%time subset2 = ts.read_range(start, end)

Wall time: 53.4 ms


In [85]:
subset2.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 276331 entries, 2023-01-02 12:30:00 to 2023-01-05 17:15:30
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   N0      276331 non-null  float64
 1   N1      276331 non-null  float64
 2   N2      276331 non-null  float64
 3   N3      276331 non-null  float64
 4   N4      276331 non-null  float64
dtypes: float64(5)
memory usage: 12.6 MB


In [86]:
h5ts.close()

In [89]:
# In conda prompt, run "del data\data.h5ts" to remove file

## Storing Data with SQLite3

#### Note: SQLite3 is a relational database, where the SQL query language can be applied to implement more sophisticated analyses. But it is slower than the binary storage format like HDF5. 

In [90]:
%time data = generate_sample_data(1e6, 5, '1min').round(4)

Wall time: 502 ms


In [91]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1000000 entries, 2023-01-01 00:00:00 to 2024-11-25 10:39:00
Freq: T
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   No0     1000000 non-null  float64
 1   No1     1000000 non-null  float64
 2   No2     1000000 non-null  float64
 3   No3     1000000 non-null  float64
 4   No4     1000000 non-null  float64
dtypes: float64(5)
memory usage: 45.8 MB


In [93]:
import sqlite3 as sq3
con = sq3.connect('data/data.sql')

In [94]:
%time data.to_sql('data', con)

Wall time: 5.45 s


1000000

In [95]:
query = 'SELECT * FROM data WHERE No1 > 105 and No2 < 108'

In [96]:
%time res = con.execute(query).fetchall()

Wall time: 469 ms


In [102]:
res[-20:]

[('2024-11-25 10:20:00', 55.3471, 140.4204, 24.5429, 41.2253, 69.7958),
 ('2024-11-25 10:21:00', 55.3155, 140.4411, 24.5369, 41.24, 69.7973),
 ('2024-11-25 10:22:00', 55.3045, 140.4083, 24.5157, 41.2141, 69.7778),
 ('2024-11-25 10:23:00', 55.2729, 140.5022, 24.5228, 41.2386, 69.7287),
 ('2024-11-25 10:24:00', 55.2248, 140.3836, 24.5315, 41.202, 69.6891),
 ('2024-11-25 10:25:00', 55.2271, 140.3174, 24.5551, 41.1153, 69.6743),
 ('2024-11-25 10:26:00', 55.2695, 140.1287, 24.586, 41.1239, 69.7369),
 ('2024-11-25 10:27:00', 55.2541, 140.1091, 24.5885, 41.1293, 69.7174),
 ('2024-11-25 10:28:00', 55.266, 140.1666, 24.578, 41.1564, 69.8169),
 ('2024-11-25 10:29:00', 55.294, 140.2414, 24.5835, 41.1425, 69.799),
 ('2024-11-25 10:30:00', 55.2612, 140.1413, 24.5864, 41.1468, 69.8397),
 ('2024-11-25 10:31:00', 55.2687, 140.3201, 24.5969, 41.0867, 69.8298),
 ('2024-11-25 10:32:00', 55.2736, 140.3591, 24.5787, 41.0774, 69.82),
 ('2024-11-25 10:33:00', 55.249, 140.5965, 24.6005, 41.0677, 69.8356),
 ('

In [103]:
con.close()

In [104]:
# In conda prompt, run "del data\data.sql" to remove file

### See also: SQLAlchemy, which allows for the use of MySQL as the relational database backend.