# Dataframe Performance Benchmarking

Comparing various dataframe libraries for performance. The focus is on pandas-compatible APIs, so that the same code can be reused across libraries.

## Candidates

- [Pandas](https://pandas.pydata.org/docs/index.html)
- [Modin](https://modin.readthedocs.io/en/stable/)
- [Polars](https://pola-rs.github.io/polars-book/user-guide/index.html)
- [Mars](https://docs.pymars.org/en/latest/index.html)
- [Dask Dataframe](https://docs.dask.org/en/stable/dataframe.html)

## Scaffold

In [None]:
import math
import os
import timeit
import urllib.request

import dask.dataframe as dask_dataframe
import mars.dataframe as mars_dataframe
import modin.pandas as modin_pandas
import numpy as np
import pandas as pandas
import polars as polars
from distributed import Client

In [None]:
# show relevant python versions
!pip freeze | egrep "pandas|modin|polars|pymars|dask"

In [None]:
# shared locations referenced by the cells in this notebook:
# the directory holding benchmark data and the sample CSV inside it
data_dir = "./data"
data_file = "/".join((data_dir, "sample.csv"))

In [None]:
# create the data dir; exist_ok=True makes this a no-op when the directory is
# already there and avoids the check-then-create race of a separate exists() test
os.makedirs(data_dir, exist_ok=True)

In [None]:
# create a timeit function which we'll keep standard throughout benchmarking
def dataframe_timeit(stmt: str, number: int = 5) -> float:
    """Time a statement and return the total elapsed seconds.

    Args:
        stmt: Python source to execute. It is run against this module's
            globals, so it can call the notebook's functions by name.
        number: How many times ``stmt`` is executed. Defaults to 5, the
            value used for every benchmark in this notebook.

    Returns:
        The total time in seconds for all ``number`` executions combined,
        as reported by ``timeit.timeit`` (it is not a per-run average).
    """
    print("Timing: ", stmt)
    return timeit.timeit(stmt, globals=globals(), number=number)

In [None]:
# Dask execution environment initialization
# A `distributed.Client` is constructed with no arguments and kept in the
# module-level name `client` for the rest of the notebook.
# NOTE(review): with no address given this presumably starts a local cluster
# and becomes the default scheduler for later Dask calls - confirm.
client = Client()

In [None]:
# results table: one row per candidate library, no columns yet
# (the benchmark cells below attach their own columns)
results = pandas.DataFrame(
    index=pandas.Index(["pandas", "modin", "polars", "mars", "dask"])
)
results

In [None]:
# create sample data
# reference: https://modin.readthedocs.io/en/stable/#modin-is-a-dataframe-for-datasets-from-1mb-to-1tb
# The upper bound (1e11) does not fit in 32 bits, so int64 is requested
# explicitly: where NumPy's default integer is 32-bit (e.g. Windows with
# NumPy 1.x) the call would otherwise raise ValueError.
dataframe_data = np.random.randint(
    0, 100_000_000_000, size=(2**12, 2**12), dtype=np.int64
)

# only write the CSV when it is missing, then report its size in MB
if not os.path.isfile(data_file):
    df = pandas.DataFrame(dataframe_data)
    df.to_csv(data_file)
    filesize = round(os.path.getsize(data_file) / 1024 / 1024)
    print(f"{data_file} is ~{filesize} MB")

In [None]:
# write the generated data to the sample CSV and report the file size in MB
# NOTE(review): this repeats the write from the guarded cell above, but
# without the existence check, so the file is rewritten on every run.
df = pandas.DataFrame(dataframe_data)
df.to_csv(data_file)
filesize = round(os.stat(data_file).st_size / (1024 * 1024))
print(f"{data_file} is ~{filesize} MB")

## Data Reads

In [None]:
def pandas_csv_read():
    """Read the sample CSV at ``data_file`` with pandas and return the frame."""
    frame = pandas.read_csv(data_file)
    return frame


def modin_csv_read():
    """Read the sample CSV at ``data_file`` with Modin and return the frame."""
    frame = modin_pandas.read_csv(data_file)
    return frame


def polars_csv_read():
    """Read the sample CSV at ``data_file`` with Polars and return the frame."""
    frame = polars.read_csv(data_file)
    return frame


def mars_csv_read():
    """Read the sample CSV at ``data_file`` with Mars.

    The object produced by ``read_csv`` has ``.execute()`` called on it, and
    whatever that call returns is handed back to the caller.
    """
    pending = mars_dataframe.read_csv(data_file)
    return pending.execute()


def dask_csv_read():
    """Read the sample CSV at ``data_file`` with Dask and wait for the data.

    ``dask.dataframe.read_csv`` is lazy: on its own it only builds a task
    graph, so timing it would not measure the actual read. ``persist()``
    starts the computation and ``wait`` blocks until it has finished, so the
    benchmark covers the real work, as the Mars variant does via
    ``.execute()``.

    Returns:
        A Dask DataFrame (not a pandas one) whose partitions are already
        computed, so callers can still use Dask-only options such as
        ``to_csv(..., single_file=True)``.
    """
    # imported locally: only this function needs it
    from distributed import wait

    frame = dask_dataframe.read_csv(data_file).persist()
    # with a distributed client persist() returns immediately; block until done
    wait(frame)
    return frame


# statement strings (one per library, in row order) that the timing cell
# will execute to benchmark CSV reads
results["read_csv_func"] = [
    f"{library}_csv_read()"
    for library in ("pandas", "modin", "polars", "mars", "dask")
]
results["read_csv_func"]

In [None]:
# time every read statement; each value is what dataframe_timeit returns
# for that row's statement string
read_statements = results["read_csv_func"]
results["read_csv"] = read_statements.apply(dataframe_timeit)
results["read_csv"]

## Data Writes

In [None]:
# pre-prepare df csv reads to isolate only writes
pandas_df = pandas_csv_read()
modin_df = modin_csv_read()
polars_df = polars_csv_read()
mars_df = mars_csv_read()
dask_df = dask_csv_read()

In [None]:
def pandas_csv_write():
    """Write the preloaded ``pandas_df`` to the sample CSV path ``data_file``."""
    destination = data_file
    pandas_df.to_csv(destination)


def modin_csv_write():
    """Write the preloaded ``modin_df`` to the sample CSV path ``data_file``."""
    destination = data_file
    modin_df.to_csv(destination)


def polars_csv_write():
    """Write the preloaded ``polars_df`` to the sample CSV path ``data_file``.

    Polars renamed ``DataFrame.to_csv`` to ``write_csv`` and later removed the
    old name, so ``write_csv`` is used when it exists. The fallback to
    ``to_csv`` keeps the benchmark working on old Polars releases.
    """
    writer = getattr(polars_df, "write_csv", None) or polars_df.to_csv
    writer(data_file)


def mars_csv_write():
    """Write the preloaded ``mars_df`` to the sample CSV path ``data_file``.

    ``to_csv`` is followed by ``.execute()`` on the object it returns.
    """
    pending_write = mars_df.to_csv(data_file)
    pending_write.execute()


def dask_csv_write():
    """Write the preloaded ``dask_df`` to ``data_file`` as one CSV file.

    NOTE(review): ``data_file`` is also the path ``dask_df`` was read from;
    if ``dask_df`` is still lazy at this point the write may overwrite the
    file while it is being read - confirm.
    """
    destination = data_file
    dask_df.to_csv(destination, single_file=True)


# statement strings (one per library, in row order) that the timing cell
# will execute to benchmark CSV writes
results["write_csv_func"] = [
    f"{library}_csv_write()"
    for library in ("pandas", "modin", "polars", "mars", "dask")
]
results["write_csv_func"]

In [None]:
# time every write statement; each value is what dataframe_timeit returns
# for that row's statement string
write_statements = results["write_csv_func"]
results["write_csv"] = write_statements.apply(dataframe_timeit)
results["write_csv"]