In [1]:
import pyarrow.parquet as parquet
import pyarrow.feather as feather
import pyarrow.orc as orc
import pyarrow.csv as csv
import pyarrow as pa
from pathlib import Path
import gc
import time
import os
import psutil
import json
from timeit import default_timer as timer
from datetime import timedelta
import pandas as pd
from filebench import constants as c

def time_func(func, *args, **kwargs):
    """Invoke ``func`` with the given arguments and time the call.

    Args:
        func: Callable to execute.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, seconds)`` tuple: whatever ``func`` returned, and the
        elapsed time of the call as measured by ``timeit.default_timer``.
    """
    started_at = timer()
    outcome = func(*args, **kwargs)
    # Read the clock immediately after the call so only `func` is measured.
    elapsed_seconds = timer() - started_at
    return outcome, elapsed_seconds

def write_file_from_df(df, filename, filetype):
    """Write a pandas DataFrame to ``filename`` in the requested format.

    Parquet, ORC and Feather output is written with compression disabled;
    CSV is written with pyarrow's default CSV writer settings.

    Args:
        df: pandas DataFrame to serialise.
        filename: Destination path of the file to write.
        filetype: One of ``'csv'``, ``'parquet'``, ``'orc'`` or ``'feather'``.

    Raises:
        ValueError: If ``filetype`` is not one of the supported formats.
            Previously an unknown format silently wrote nothing.
    """
    # Validate first so an unsupported format fails before any conversion work.
    if filetype not in ('csv', 'parquet', 'orc', 'feather'):
        raise ValueError(
            f"Unsupported filetype {filetype!r}; "
            "expected one of 'csv', 'parquet', 'orc', 'feather'"
        )

    if filetype == 'feather':
        # The Feather writer accepts the DataFrame directly.
        feather.write_feather(df, filename, compression='uncompressed')
        return

    # The remaining writers all take an Arrow table.
    table = pa.Table.from_pandas(df)
    if filetype == 'csv':
        csv.write_csv(table, filename)
    elif filetype == 'parquet':
        parquet.write_table(table, filename, compression='none')
    else:  # 'orc'
        orc.write_table(table, filename, compression='uncompressed')

## Write Test

In [None]:
# Write benchmark: serialise the same DataFrame to each file format several
# times and record the wall-clock write time and resulting file size.

# Load the original CSV once (pyarrow CSV reader) and convert it to pandas;
# every write below uses this same DataFrame.
df = csv.read_csv(c.ORIGINAL_CSV_FILE).to_pandas()
# Output paths and format labels are kept in matching order so they can be zipped.
filenames = [c.PARQUET_FILE, c.ORC_FILE, c.FEATHER_FILE, c.CSV_FILE]
filetypes = ['parquet', 'orc', 'feather', 'csv']

# One dict per (run, format) write, accumulated across all runs.
all_results = []
# Repeat the whole set of writes 10 times.
for i in range(10):
    # NOTE(review): `results` is reset on every run but is never appended to
    # or read in this cell — confirm whether any later cell depends on it.
    results = []
    for filename, filetype in zip(filenames, filetypes):
        # Remove any output left by a previous run so each write creates the file fresh.
        filepath = Path(filename)
        if filepath.is_file():
            filepath.unlink()

        # write_file_from_df has no return value; only the elapsed time is kept.
        _, time_in_seconds = time_func(
            write_file_from_df,
            df,
            filename,
            filetype
        )

        # Size on disk of the file that was just written.
        filesize_in_bytes = Path(filename).stat().st_size

        all_results.append({
            'test_number': i,
            'func': write_file_from_df.__name__,
            'filename': filename,
            'filetype': filetype,
            'time_in_seconds': time_in_seconds,
            'filesize_in_bytes': filesize_in_bytes,
            # Decimal megabytes (10**6 bytes), not MiB.
            'filesize_in_MB': filesize_in_bytes / 1000000
        })

## Query Test

In [2]:
def entire_file_to_df(filename, filetype):
    """Read an entire file into a pandas DataFrame and return its shape.

    Args:
        filename: Path of the file to read.
        filetype: One of ``'parquet'``, ``'orc'``, ``'feather'`` or ``'csv'``.

    Returns:
        The DataFrame's ``shape`` tuple, ``(rows, columns)``.

    Raises:
        ValueError: If ``filetype`` is not one of the supported formats.
            Previously this surfaced as an ``UnboundLocalError`` on ``df``.
    """
    if filetype == 'parquet':
        df = parquet.read_table(filename).to_pandas()
    elif filetype == 'orc':
        df = orc.read_table(filename).to_pandas()
    elif filetype == 'feather':
        # read_feather already returns a pandas DataFrame.
        df = feather.read_feather(filename)
    elif filetype == 'csv':
        df = csv.read_csv(filename).to_pandas()
    else:
        raise ValueError(
            f"Unsupported filetype {filetype!r}; "
            "expected one of 'parquet', 'orc', 'feather', 'csv'"
        )

    return df.shape


def get_amount_participants(filename, filetype):
    """Count the distinct ``user_id`` values stored in a file.

    ``user_id`` is a column with high variety. Only that column is loaded,
    for every format: the CSV branch previously materialised all columns
    while the other branches projected just ``user_id``.

    Args:
        filename: Path of the file to read.
        filetype: One of ``'parquet'``, ``'orc'``, ``'feather'`` or ``'csv'``.

    Returns:
        Number of unique values in the ``user_id`` column, as counted by
        ``pandas.unique``.

    Raises:
        ValueError: If ``filetype`` is not one of the supported formats.
            Previously this surfaced as an ``UnboundLocalError`` on ``df``.
    """
    columns = ['user_id']
    if filetype == 'parquet':
        df = parquet.read_table(filename, columns=columns).to_pandas()
    elif filetype == 'orc':
        df = orc.read_table(filename, columns=columns).to_pandas()
    elif filetype == 'feather':
        df = feather.read_feather(filename, columns=columns)
    elif filetype == 'csv':
        # The CSV reader still has to scan the file, but include_columns
        # restricts the resulting table to the requested column.
        convert_options = csv.ConvertOptions(include_columns=columns)
        df = csv.read_csv(filename, convert_options=convert_options).to_pandas()
    else:
        raise ValueError(
            f"Unsupported filetype {filetype!r}; "
            "expected one of 'parquet', 'orc', 'feather', 'csv'"
        )

    return pd.unique(df['user_id']).shape[0]


def get_amount_colors_used(filename, filetype):
    """Count the distinct ``pixel_color`` values stored in a file.

    ``pixel_color`` is a low-variety column (16 unique values). Only that
    column is loaded, for every format: the CSV branch previously
    materialised all columns while the other branches projected just
    ``pixel_color``.

    Args:
        filename: Path of the file to read.
        filetype: One of ``'parquet'``, ``'orc'``, ``'feather'`` or ``'csv'``.

    Returns:
        Number of unique values in the ``pixel_color`` column, as counted by
        ``pandas.unique``.

    Raises:
        ValueError: If ``filetype`` is not one of the supported formats.
            Previously this surfaced as an ``UnboundLocalError`` on ``df``.
    """
    columns = ['pixel_color']
    if filetype == 'parquet':
        df = parquet.read_table(filename, columns=columns).to_pandas()
    elif filetype == 'orc':
        df = orc.read_table(filename, columns=columns).to_pandas()
    elif filetype == 'feather':
        df = feather.read_feather(filename, columns=columns)
    elif filetype == 'csv':
        # The CSV reader still has to scan the file, but include_columns
        # restricts the resulting table to the requested column.
        convert_options = csv.ConvertOptions(include_columns=columns)
        df = csv.read_csv(filename, convert_options=convert_options).to_pandas()
    else:
        raise ValueError(
            f"Unsupported filetype {filetype!r}; "
            "expected one of 'parquet', 'orc', 'feather', 'csv'"
        )

    return pd.unique(df['pixel_color']).shape[0]


def get_number_of_rows(filename, filetype):
    """Return the number of rows stored in a file.

    Parquet and ORC report the count from file metadata without decoding
    column data; Feather and CSV are read into an Arrow table first.

    Args:
        filename: Path of the file to inspect.
        filetype: One of ``'parquet'``, ``'orc'``, ``'feather'`` or ``'csv'``.

    Returns:
        The row count as an integer.

    Raises:
        ValueError: If ``filetype`` is not one of the supported formats.
            Previously this surfaced as an ``UnboundLocalError`` on ``num_rows``.
    """
    if filetype == 'parquet':
        num_rows = parquet.read_metadata(filename).num_rows
    elif filetype == 'orc':
        # ORCFile exposes the row count directly, matching the metadata-only
        # approach of the Parquet branch instead of decoding the whole table.
        num_rows = orc.ORCFile(filename).nrows
    elif filetype == 'feather':
        num_rows = feather.read_table(filename).num_rows
    elif filetype == 'csv':
        # CSV carries no row-count metadata, so the file has to be parsed.
        num_rows = csv.read_csv(filename).num_rows
    else:
        raise ValueError(
            f"Unsupported filetype {filetype!r}; "
            "expected one of 'parquet', 'orc', 'feather', 'csv'"
        )

    return num_rows