# 1. Multiple symbols in one file

In [1]:
import pandas as pd

In [8]:
# Read one symbol's frame back out of the multi-symbol HDF5 file.
# Open read-only: the default mode ("a") opens the file for writing and would
# silently create an empty file if the path were mistyped, after which the
# lookup below would fail with a confusing KeyError.
with pd.HDFStore("daily_universe-2.h5", mode="r") as f:
    print(f["META"])
    # print(list(f.keys()))  # the store behaves like a dictionary


                                 Open        High         Low       Close  \
Date                                                                        
2012-05-18 00:00:00-04:00   42.049999   45.000000   38.000000   38.230000   
2012-05-21 00:00:00-04:00   36.529999   36.660000   33.000000   34.029999   
2012-05-22 00:00:00-04:00   32.610001   33.590000   30.940001   31.000000   
2012-05-23 00:00:00-04:00   31.370001   32.500000   31.360001   32.000000   
2012-05-24 00:00:00-04:00   32.950001   33.209999   31.770000   33.029999   
...                               ...         ...         ...         ...   
2022-10-10 00:00:00-04:00  133.550003  136.110001  131.869995  133.789993   
2022-10-11 00:00:00-04:00  131.619995  132.660004  126.989998  128.539993   
2022-10-12 00:00:00-04:00  128.320007  129.669998  126.250000  127.500000   
2022-10-13 00:00:00-04:00  123.529999  131.139999  122.529999  130.289993   
2022-10-14 00:00:00-04:00  131.000000  131.789993  128.429993  129.229904   

# How to create the file

In [2]:
import yfinance as yf
import pandas as pd

In [7]:
# Download each ticker and store it under its own key in a single HDF5 file,
# printing a running count as progress.
for counter, sym in enumerate(["TSLA", "GOOG", "META"], start=1):
    # Fetch this ticker's price history for the maximum available period.
    data = yf.Ticker(sym).history(period="max")

    # mode="a" keeps whatever is already in the file; the symbol is the key.
    # Default is format="fixed"; format="table" can also be used.
    data.to_hdf("daily_universe-2.h5", key=sym, mode="a")
    print(counter)

1
2
3


# 2. Dealing with large files

## I can't load this CSV without crashing the kernel

In [None]:
# Deliberately naive: read the whole CSV (175,214,455 lines) into memory in one call.
# This is the load that the section heading says crashes the kernel.
big_df = pd.read_csv(filepath_or_buffer="BTCUSDT-trades-2022-09.csv")

## But I can load the HDF!

In [4]:
# Query a slice of the trades table straight from disk.
# Read-only access is all a query needs; the default mode ("a") opens the file
# for writing and would create an empty file if the path were wrong.
with pd.HDFStore("BTCUSDT-trades-2022-09.h5", mode="r") as f:
    # The `where` terms are combined with AND. Terms may reference the index
    # and any column declared via `data_columns` when the table was written.
    # NOTE(review): the bounds look like epoch milliseconds spanning one hour — confirm.
    print(f.select("BTCUSDT", where=["index > 1662768000000", "index < 1662771600000"]))

                       id     price   amount  quote_amount  isMakerBuyer  \
timestamp                                                                  
1662768000006  1785608194  21360.11  0.00313     66.857144          True   
1662768000007  1785608195  21361.62  0.00124     26.488409         False   
1662768000008  1785608196  21361.62  0.00123     26.274793         False   
1662768000011  1785608197  21360.11  0.00125     26.700138          True   
1662768000011  1785608198  21360.11  0.00346     73.905981          True   
...                   ...       ...      ...           ...           ...   
1662771599998  1785905771  21258.39  0.00077     16.368960         False   
1662771599999  1785905772  21258.39  0.02285    485.754211         False   
1662771599999  1785905773  21258.69  0.00397     84.396999         False   
1662771599999  1785905774  21258.73  0.07390   1571.020147         False   
1662771599999  1785905775  21258.74  0.40690   8650.181306         False   

           

In [5]:
# Show the underlying PyTables table object; its repr includes the row count.
# Opened read-only because nothing is written here (the default mode is "a").
with pd.HDFStore("BTCUSDT-trades-2022-09.h5", mode="r") as f:
    print(f.get_storer("BTCUSDT").table)

/BTCUSDT/table (Table(175214455,)) ''


## Chunking down and saving large CSV

In [None]:
import pandas as pd

# Stream the CSV in 10-million-row chunks instead of loading it all at once.
# Column names are supplied explicitly — assumes the CSV has no header row (TODO confirm).
big_df_chunked = pd.read_csv(
    "BTCUSDT-trades-2022-09.csv",
    chunksize=10_000_000,
    names=["id", "price", "amount", "quote_amount", "timestamp", "isMakerBuyer", "bestPrice"],
    index_col="timestamp",
)

# mode="w" starts a fresh file; complevel=9 requests the highest compression level.
# The context manager guarantees the store is flushed and closed even if
# appending a chunk raises part-way through.
with pd.HDFStore("BTCUSDT-trades-2-2022-09.h5", mode="w", complevel=9) as store:
    for chunk in big_df_chunked:
        # Each chunk is appended to the same table under the "BTCUSDT" key.
        store.append("BTCUSDT", chunk)

# Vs. RDBMS

Pros
 * No maintenance
 * Archival
 * Portable
 * quicker?
 
Cons

 * Can't change schema
 * Concurrency more difficult

# Vs. CSV

Pros

 * Doesn't have to fit in memory, file size can be TBs
 * Multiple, potentially unrelated data sets in one container
 * Partial IO


Cons

 * Not plain text
 * Larger learning curve
 * More thought required up front when saving data

# Useful links

DataFrame.to_hdf
```
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_hdf.html
```

HDFStore functions
```
https://pandas.pydata.org/docs/reference/api/pandas.HDFStore.put.html
```

Cookbook (lots of other less-documented stuff)
```
https://pandas.pydata.org/pandas-docs/stable/user_guide/cookbook.html#cookbook-hdf
```

HDFStore class definition
```
https://github.com/pandas-dev/pandas/blob/3d6b36557582279a8a3bc45a49aa15c5cf44bdd9/pandas/io/pytables.py#L478
```


pandas uses PyTables under the hood. To get the absolute best performance, you'll want to tweak things there:
```
https://www.pytables.org/
```

Helpful tutorial series
```
https://www.youtube.com/watch?v=S74Kc8QYDac&list=PLPyhR4PdEeGYWHRhzmCP5stzfIha8bqVg
```