# Benchmarks

We create two files, each of which is larger than available RAM, and then join them. We try this using three methods:

* Tab-delimited text file
* numpy ndarray
* Pandas HDF5 table

## Setup

In [1]:
import os
import shutil
import tempfile

# Scale of the benchmark: make_file writes one row for every value in
# 0..NUM_ROWS inclusive.
NUM_ROWS = 120 * 1000 * 1000

# Scratch directory holding every file this notebook generates.
# tempfile.gettempdir() honours TMPDIR when it is set and otherwise falls back
# to the platform default, so the notebook no longer raises KeyError on
# systems where TMPDIR is not exported.
TEMP_DIR = os.path.join(tempfile.gettempdir(), 'flatutils')

# exist_ok keeps this cell safe to re-run and avoids the check-then-create race.
os.makedirs(TEMP_DIR, exist_ok=True)

def file_name(index):
    """Return the path of benchmark file number ``index`` inside ``TEMP_DIR``.

    The file is named ``file<index>.txt``.
    """
    base_name = 'file{0}.txt'.format(index)
    return os.path.join(TEMP_DIR, base_name)

def make_file(index):
    """Write tab-delimited benchmark file number ``index`` and return its path.

    One row is written for every ``i`` in ``0..NUM_ROWS`` inclusive, so the
    file holds ``NUM_ROWS + 1`` rows. Each row has four tab-separated columns
    built from the descending value ``NUM_ROWS - i``: the bare number, then
    three fixed letter prefixes with that number appended.

    Args:
        index: Number passed to ``file_name`` to build the output path.

    Returns:
        The path of the file that was written.
    """
    fn = file_name(index)
    print("making {0}".format(fn))
    # Row layout, hoisted out of the loop for readability.
    row_template = ("{0}\t"
                    "abcdefghijlmnopqrstuv{0}\t"
                    "abcdefghijlmnopqrs{0}\t"
                    "abcdefghijlmn{0}\n")
    with open(fn, 'w') as f:
        for i in range(NUM_ROWS + 1):
            # Progress report every 40 million rows (skipping row 0).
            if i > 0 and i % 40000000 == 0:
                print("At row {0}".format(i))
            f.write(row_template.format(NUM_ROWS - i))
    # Return the path that was written; previously the file_name function
    # object itself was returned by mistake.
    return fn

In [4]:
# Generate both input files, timing each call separately.
%time make_file(0)
%time make_file(1)

# Display the size in bytes of the first generated file.
os.stat(file_name(0)).st_size

making /var/folders/h4/vtwhb_7s4c74lrk93lhr35cc0000gn/T/flatutils/file0.txt
At row 40000000
At row 80000000
At row 120000000
CPU times: user 3min 21s, sys: 13.5 s, total: 3min 34s
Wall time: 3min 35s
making /var/folders/h4/vtwhb_7s4c74lrk93lhr35cc0000gn/T/flatutils/file1.txt
At row 40000000
At row 80000000
At row 120000000
CPU times: user 3min 26s, sys: 13.6 s, total: 3min 39s
Wall time: 3min 41s


10595555652

The goal for each approach is to join the two files on their third column (`col2`).

## Tab-delimited text file approach

In [5]:
import sys

sys.path.append('/Users/aduston/dev/flatutils/flatutils')

from flatutils import Field, Schema, FlatFile, FIELD_INT, FIELD_STRING

schema = Schema([Field("col0", FIELD_INT, 0), 
                 Field("col1", FIELD_STRING, 1),
                 Field("col2", FIELD_STRING, 2),
                 Field("col3", FIELD_STRING, 3)])

file0 = FlatFile(file_name(0), schema)
file1 = FlatFile(file_name(0), schema)

%time sorted0 = file0.output_sorted(file_name(2), "col2")

CPU times: user 26min 41s, sys: 1min 37s, total: 28min 18s
Wall time: 28min 28s


In [6]:
# Run output_sorted on file1 with key "col2" and destination file_name(3),
# timing the call; the result is the second input of the join below.
%time sorted1 = file1.output_sorted(file_name(3), "col2")

CPU times: user 26min 40s, sys: 1min 36s, total: 28min 16s
Wall time: 28min 26s


In [None]:
def join():
    """Merge-join ``sorted0`` and ``sorted1`` on ``col2`` into ``file_name(4)``.

    Both row iterators are advanced in lockstep. When the current ``col2``
    values are equal, one tab-separated line is written holding ``col0``,
    ``col1`` and ``col2`` from the first file and ``col3`` from the second,
    and both iterators advance. Otherwise only the iterator with the smaller
    key advances. The loop ends as soon as either input is exhausted.

    Progress is printed every 20 million loop iterations and every
    20 million joined rows.

    NOTE(review): this assumes both inputs are ordered consistently with
    Python's ``<`` on the ``col2`` values, and that keys are unique within
    each file (duplicate keys are only paired one-to-one) — confirm against
    ``output_sorted``.
    """
    progress_interval = 20000000
    rowiter0 = sorted0.iterate_rows()
    rowiter1 = sorted1.iterate_rows()
    row0 = next(rowiter0, None)
    row1 = next(rowiter1, None)
    total_count, join_count = 0, 0
    with open(file_name(4), 'w') as outf:
        while row0 is not None and row1 is not None:
            val0 = row0['col2']
            val1 = row1['col2']
            total_count += 1
            matched = val0 == val1
            if matched:
                join_count += 1
                # Terminate every record with a newline; without it all joined
                # rows ran together on a single line of the output file.
                outf.write("{0}\t{1}\t{2}\t{3}\n".format(
                    row0['col0'], row0['col1'],
                    row0['col2'], row1['col3']))
                row0 = next(rowiter0, None)
                row1 = next(rowiter1, None)
            elif val0 < val1:
                row0 = next(rowiter0, None)
            else:
                row1 = next(rowiter1, None)
            if total_count % progress_interval == 0:
                print("Processed {0} total rows".format(total_count))
            # Report joined progress only on the iteration where join_count
            # actually advanced; otherwise the message repeats (starting with
            # "Processed 0 joined rows") for every non-matching step.
            if matched and join_count % progress_interval == 0:
                print("Processed {0} joined rows".format(join_count))

# Time the join over the two sorted inputs.
%time join()

Processed 20000000 total rows
Processed 20000000 joined rows
Processed 40000000 total rows
Processed 40000000 joined rows
Processed 60000000 total rows
Processed 60000000 joined rows
Processed 80000000 total rows
Processed 80000000 joined rows
Processed 100000000 total rows
Processed 100000000 joined rows


## Pandas approach
In the Pandas approach, we emulate the two files being partitioned by org, with 8 million rows per org: each file is read in chunks of 8 million rows, and the corresponding chunks from the two files are "joined" individually using Pandas DataFrames.

In [None]:
import numpy as np
import pandas as pd

def join_dataframes():
    """Inner-join the two benchmark files chunk by chunk with pandas.

    Both files are read side by side in chunks of 8,000,000 rows, each chunk
    indexed by the column at position 2 (``col2``). Every pair of chunks is
    inner-joined on that index and saved as ``joined<N>.csv`` in ``TEMP_DIR``,
    with ``N`` counting from 1. Processing stops once either file runs out of
    chunks.
    """
    read_options = {
        'chunksize': 8000000,
        'header': None,
        'names': ['col0', 'col1', 'col2', 'col3'],
        'index_col': 2,
    }
    left_chunks = pd.read_table(file_name(0), **read_options)
    right_chunks = pd.read_table(file_name(1), **read_options)
    # Pair up the n-th chunk of each file; zip ends with the shorter reader.
    chunk_pairs = zip(left_chunks, right_chunks)
    for count, (left_df, right_df) in enumerate(chunk_pairs, start=1):
        print("count at {0}".format(count))
        joined = left_df.join(right_df, how='inner',
                              lsuffix="_left", rsuffix="_right")
        out_path = os.path.join(TEMP_DIR, "joined{0}.csv".format(count))
        joined.to_csv(out_path)

# Time the chunked pandas join.
%time join_dataframes()