In [1]:
# main libs
import numpy as np
import pandas as pd
import os

# testing formats
import feather
import pyarrow

In [2]:
# set folder for files and make sure it exists, otherwise the first
# benchmark write fails on a fresh checkout (Restart & Run All)
folder = 'bench_files'
os.makedirs(folder, exist_ok=True)

# Creating dataset

In [3]:
class SynthDF(pd.DataFrame):
    """
    DataFrame subclass that builds a synthetic "tournament" dataset and
    benchmarks writing/reading it with different file formats.
    """

    @property
    def _constructor(self):
        # make pandas operations return SynthDF instead of a plain DataFrame
        return SynthDF

    @classmethod
    def create_data(cls, size, seed=None):
        """
        Create a synthetic DataFrame with random data.

        Parameters
        ----------
        size : int
            Number of rows in the DataFrame.
        seed : int, optional
            Seed for a private ``np.random.RandomState``. When omitted the
            global NumPy random state is used, exactly as before.

        Returns
        -------
        SynthDF
            Frame with the columns 'date', 'tournament_id', 'team_id',
            'members', 'location', 'importance', 'avg_age', 'prize', 'prob'
            and 'win'.
        """
        # np.random (module) and RandomState expose the same choice/randint/uniform API
        rng = np.random if seed is None else np.random.RandomState(seed)

        df = cls()

        # dates: random days of 2024
        dates = pd.date_range('2024-01-01', '2024-12-31')
        df['date'] = rng.choice(dates, size)
        # int data
        df['tournament_id'] = np.arange(size)
        df['team_id'] = rng.randint(1, 1000, size)
        df['members'] = rng.randint(1, 10, size)
        # categorical data
        df['location'] = rng.choice(['Asia', 'Europe', 'Africa', 'America', 'Oceania'], size)
        df['importance'] = rng.choice(['local', 'minor', 'major'], size)

        # float data (integer draws scaled down to one / two decimals)
        df['avg_age'] = rng.randint(100, 500, size) / 10
        df['prize'] = rng.randint(10000, 10000000, size) / 100
        df['prob'] = rng.uniform(0, 1, size)
        # bool data
        df['win'] = rng.choice([True, False], size)

        return df

    def dtypes_setter(self):
        """
        Set compact data types for the columns of a synthetic DataFrame (in place).

        - Integers: 'tournament_id' ('int32'), 'team_id' ('int16'), 'members' ('int8').
        - Categorical: 'location', 'importance'.
        - Floats: 'avg_age' ('float32'), 'prize' ('float32'), 'prob' ('float32').

        'date' and 'win' are left untouched. Returns None.
        """
        target_dtypes = {
            # int data
            'tournament_id': 'int32',
            'team_id': 'int16',
            'members': 'int8',
            # categorical data
            'location': 'category',
            'importance': 'category',
            # float data
            'avg_age': 'float32',
            'prize': 'float32',
            'prob': 'float32',
        }
        for column, dtype in target_dtypes.items():
            self[column] = self[column].astype(dtype)

    def benchmark(self, f_name, write_method, read_method, kwargs_write=None, kwargs_read=None):
        """
        Benchmark the performance and space requirements of writing and reading a DataFrame.

        Must run inside IPython/Jupyter (uses the ``%timeit`` magic) and writes
        into the directory named by the module-level ``folder`` variable.

        Parameters
        ----------
        f_name : str
            The name of the file to be used for benchmarking, expected in the
            form '<size>_<anything>.<extension>'.
        write_method : callable
            The method used to write the DataFrame to a file; called as
            ``write_method(self, path, **kwargs_write)``.
        read_method : callable
            The method used to read the DataFrame from a file; called as
            ``read_method(path, **kwargs_read)``.
        kwargs_write : dict, optional
            Additional keyword arguments for the write method.
        kwargs_read : dict, optional
            Additional keyword arguments for the read method.

        Returns
        -------
        dict
            A dictionary containing benchmark results.
            - 'format': The file format extension (text after the last '.').
            - 'df_size': The size identifier of the DataFrame (text before the first '_').
            - 'write': Average time of the write operation.
            - 'read': Average time of the read operation.
            - 'size': The size of the file on disk, in bytes.
            - 'metadata': Whether column names and data types are preserved by the read operation.
        """
        # avoid shared mutable default arguments
        kwargs_write = {} if kwargs_write is None else kwargs_write
        kwargs_read = {} if kwargs_read is None else kwargs_read

        # set path to file
        file = os.path.join(folder, f_name)

        # Explicit form of `x = %timeit -o <expr>`; this is what IPython runs for
        # the magic, but it is valid plain Python. The expressions are evaluated
        # in this frame, so they must use the local names defined above.
        shell = get_ipython()

        # bench write time
        write = shell.run_line_magic('timeit', '-o write_method(self, file, **kwargs_write)')

        # bench read time
        read = shell.run_line_magic('timeit', '-o read_method(file, **kwargs_read)')

        # bench size of file
        space = os.path.getsize(file)
        print(f'{space} bytes (required space)')

        # metadata: comparing dtypes of differently-labelled frames raises, so
        # check the column labels first and treat a mismatch as "not saved"
        new_df = read_method(file, **kwargs_read)
        same_columns = list(new_df.columns) == list(self.columns)
        metadata_saved = bool(same_columns and (new_df.dtypes == self.dtypes).all())
        print('metadata is saved' if metadata_saved else 'metadata is NOT saved')

        result = {
            'format': f_name.split('.')[-1],
            'df_size': f_name.split('_')[0],
            'write': write.average,
            'read': read.average,
            'size': space,
            'metadata': metadata_saved
        }

        return result

In [4]:
# fix the global NumPy seed so the synthetic data (and therefore the
# measured file sizes) are reproducible across Restart & Run All
np.random.seed(42)

# data synthesis: 1e5, 1e6 and 1e7 rows
df_small = SynthDF.create_data(100_000)
df_medium = SynthDF.create_data(1_000_000)
df_large = SynthDF.create_data(10_000_000)

# change dtypes (in place) to the compact types used for the benchmarks
for frame in (df_small, df_medium, df_large):
    frame.dtypes_setter()

In [5]:
# empty frame that collects one row of benchmark results per run
result = pd.DataFrame(
    columns=['format', 'df_size', 'write', 'read', 'size', 'metadata'],
)

# CSV

In [6]:
# CSV round trip of the small frame; the index is not written so the
# columns read back match the original ones
temp = df_small.benchmark(
    's_csv.csv',
    pd.DataFrame.to_csv,
    pd.read_csv,
    kwargs_write={'index': False},
)
# turn the result dict into a single row and append it to the summary
result = pd.concat(
    [result, pd.DataFrame.from_dict(temp, orient='index').T],
    ignore_index=True,
)

1.72 s ± 291 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
168 ms ± 12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
6670787 bytes (required space)
metadata is NOT saved


# JSON

In [7]:
# JSON round trip of the small frame
temp = df_small.benchmark(
    's_json.json',
    pd.DataFrame.to_json,
    pd.read_json,
)
# turn the result dict into a single row and append it to the summary
result = pd.concat(
    [result, pd.DataFrame.from_dict(temp, orient='index').T],
    ignore_index=True,
)

189 ms ± 3.95 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
941 ms ± 24.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
16626447 bytes (required space)
metadata is NOT saved


# Pickle

In [8]:
# pickle round trip of the small frame
temp = df_small.benchmark(
    's_pickle.pickle',
    pd.DataFrame.to_pickle,
    pd.read_pickle,
)
# turn the result dict into a single row and append it to the summary
result = pd.concat(
    [result, pd.DataFrame.from_dict(temp, orient='index').T],
    ignore_index=True,
)

2.55 ms ± 155 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.1 ms ± 193 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3002037 bytes (required space)
metadata is saved


# Feather

In [9]:
# Feather round trip of the small frame
temp = df_small.benchmark(
    's_feather.feather',
    pd.DataFrame.to_feather,
    pd.read_feather,
)
# turn the result dict into a single row and append it to the summary
result = pd.concat(
    [result, pd.DataFrame.from_dict(temp, orient='index').T],
    ignore_index=True,
)

8.07 ms ± 90.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
6.86 ms ± 106 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2189170 bytes (required space)
metadata is saved


# Parquet
`
!pip install pyarrow
`

In [None]:
# temp = df_small.benchmark(
#     f_name='s_parquet.parquet', 
#     write_method=pd.DataFrame.to_parquet, 
#     read_method=pd.read_parquet,
# )
# result = pd.concat([result, pd.DataFrame.from_dict(temp, orient='index').transpose()], ignore_index=True)