# Requirements
- Python 3.11.9
    - pyenv (https://github.com/pyenv/pyenv)
- `pip install pyarrow`

In [None]:
import pandas as pd
import numpy as np

## Creating a fake dataset

In [None]:
def get_dataset(size, seed=None):
    """Build a random DataFrame with ``size`` rows for the storage benchmarks.

    Columns, in order:
        size -- one of "big", "medium", "small"
        age  -- integer drawn from [1, 50)
        team -- one of "red", "blue", "yellow", "green"
        win  -- "yes" or "no"
        date -- a day sampled from 2020-01-01 through 2022-12-31
        prob -- float drawn uniformly from [0, 1)

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        When given, values are drawn from a dedicated
        ``np.random.RandomState(seed)`` so the frame is reproducible.
        When None (the default), NumPy's global random state is used,
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        Frame with untyped (string / default numeric) columns.
    """
    # ``np.random`` and ``RandomState`` expose the same choice/randint/uniform
    # API, so one code path serves both the seeded and the unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    df = pd.DataFrame()
    # The draw order below is kept fixed so that callers who seed the global
    # state still get the same stream as before.
    df["size"] = rng.choice(["big", "medium", "small"], size)
    df["age"] = rng.randint(1, 50, size)
    df["team"] = rng.choice(["red", "blue", "yellow", "green"], size)
    df["win"] = rng.choice(["yes", "no"], size)
    dates = pd.date_range("2020-01-01", "2022-12-31")
    df["date"] = rng.choice(dates, size)
    df["prob"] = rng.uniform(0, 1, size)
    return df


def set_dtypes(df):
    """Convert the benchmark frame's columns to compact dtypes, in place.

    ``size`` and ``team`` become ``category``, ``age`` becomes ``int16``,
    ``win`` is mapped from "yes"/"no" to booleans, and ``prob`` becomes
    ``float32``. The ``date`` column is left untouched.

    The frame passed in is modified and also returned, so both
    ``set_dtypes(df)`` and ``df = set_dtypes(df)`` work. Calling the function
    again on an already-converted frame is a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``size``, ``team``, ``age``, ``win`` and ``prob`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    df["size"] = df["size"].astype("category")
    df["team"] = df["team"].astype("category")
    df["age"] = df["age"].astype("int16")
    # Map only while the column still holds the "yes"/"no" labels: pushing an
    # already-boolean column through the dict would turn every value into NaN
    # (e.g. when a notebook cell is re-run on the same frame).
    if not pd.api.types.is_bool_dtype(df["win"]):
        df["win"] = df["win"].map({"yes": True, "no": False})
    df["prob"] = df["prob"].astype("float32")
    return df

## Testing out CSV storage format

In [8]:
print('Reading and writing CSV')
df = get_dataset(5_000_000)
df = set_dtypes(df)
%time df.to_csv('test.csv')
%time df_csv = pd.read_csv('test.csv')

Reading and writing CSV
CPU times: user 8.94 s, sys: 439 ms, total: 9.38 s
Wall time: 9.71 s
CPU times: user 1.51 s, sys: 161 ms, total: 1.67 s
Wall time: 1.73 s


## Testing out Pickle storage format

In [7]:
print('Reading and writing Pickle')
df = get_dataset(5_000_000)
df = set_dtypes(df)
%time df.to_pickle('test.pickle')
%time df_pickle = pd.read_pickle('test.pickle')

Reading and writing Pickle
CPU times: user 1.81 ms, sys: 8.25 ms, total: 10.1 ms
Wall time: 24.8 ms
CPU times: user 817 μs, sys: 8.2 ms, total: 9.01 ms
Wall time: 13.5 ms


## Testing out Parquet storage format

In [6]:
print('Reading and writing Parquet')
df = get_dataset(5_000_000)
df = set_dtypes(df)
%time df.to_parquet('test.parquet')
%time df_parquet = pd.read_parquet('test.parquet')

Reading and writing Parquet
CPU times: user 366 ms, sys: 18 ms, total: 384 ms
Wall time: 361 ms
CPU times: user 109 ms, sys: 36.6 ms, total: 146 ms
Wall time: 52.2 ms


## Testing out Feather file format

In [5]:
print('Reading and writing Feather')
df = get_dataset(5_000_000)
df = set_dtypes(df)
%time df.to_feather('test.feather')
%time df_feather = pd.read_feather('test.feather')

Reading and writing Feather
CPU times: user 198 ms, sys: 29.6 ms, total: 228 ms
Wall time: 137 ms
CPU times: user 60.9 ms, sys: 36.6 ms, total: 97.5 ms
Wall time: 48.4 ms


In [9]:
!ll

.[1;33mr[31mw[90m-[0m[33mr[1;90m--[0m[33mr[1;90m--[0m  [1;32m11k[0m [1;33maquawolf[0m [34m16 Aug 11:22[0m [1;90m-[0m[34mM[0m [38;2;166;227;161m backtesting.ipynb[0m
.[1;33mr[31mw[90m-[0m[33mr[1;90m--[0m[33mr[1;90m--[0m [33m1.0M[0m [1;33maquawolf[0m [34m14 Aug 13:35[0m [1;90m--[0m [38;2;249;226;175m BTCUSDT_15m_20240216_20240816.csv[0m
.[1;33mr[31mw[90m-[0m[33mr[1;90m--[0m[33mr[1;90m--[0m [1;32m4.7k[0m [1;33maquawolf[0m [34m16 Aug 11:53[0m [1;90m-[0m[32mN[0m [38;2;166;227;161m compare-datatypes.ipynb[0m
.[1;33mr[31mw[90m-[0m[33mr[1;90m--[0m[33mr[1;90m--[0m [1;32m1.5k[0m [1;33maquawolf[0m [34m15 Aug 18:11[0m [1;90m--[0m [38;2;249;226;175m readme.md[0m
.[1;33mr[31mw[90m-[0m[33mr[1;90m--[0m[33mr[1;90m--[0m [1;32m1.3k[0m [1;33maquawolf[0m [34m14 Aug 14:40[0m [1;90m--[0m [38;2;249;226;175m requirments.txt[0m
.[1;33mr[31mw[90m-[0m[33mr[1;90m--[0m[33mr[1;90m--[0m [33m244M[0m

In [10]:
# Count the lines in the BTCUSDT 15-minute CSV that sits in the working directory.
!wc -l BTCUSDT_15m_20240216_20240816.csv

   17277 BTCUSDT_15m_20240216_20240816.csv
