# Interim Storage

- Store interim results in different formats.  
- Uses **Pickle+BZ2** as default.
- Other formats: JSON, JSON+BZ2, Pickle.
- Uses **system temporary directory** as default.
- JSON disadvantages: non-string dictionary keys are converted to strings when read back (see the *Formats* example), and many Python types cannot be serialized at all (see the *JSON disadvantage* example).

In [None]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from code root directory
import sys; sys.path.insert(0, '../..')

from data.interim_storage import InterimStorage

from data.amazon_pickle_splitter import AmazonPickleSplitter
import timeit
import os

## Examples

### Minimal

In [None]:
# Smallest possible round trip: write a dict under the id 'test-id' and read it straight back.
# The keys deliberately mix int and str; the JSON examples further down show why that matters.
test_data = {1:'one', 'two':2}
print(InterimStorage('test-id').write(test_data).read())

### Write, read, info

In [3]:
# Write: write() returns the storage object, so it can be kept for the calls below
print(test_data, type(test_data))
storage = InterimStorage('test-id').write(test_data)

# Read: with the default format the dict comes back with its original key types
read_data = storage.read()
print(read_data, type(read_data))

# File location: full file path, its directory, and the id given to the constructor
print('get_filepath ', storage.get_filepath())
print('get_directory', storage.get_directory())
print('get_id       ', storage.get_id())

{1: 'one', 'two': 2} <class 'dict'>
{1: 'one', 'two': 2} <class 'dict'>
get_filepath  /tmp/InterimStorage/test-id.pickle.bz2
get_directory /tmp/InterimStorage
get_id        test-id


### Formats

In [4]:
# Round-trip the same dictionary through every supported format and print
# what comes back, its type, and the file that was written.
print(test_data, type(test_data))

for storage_type in (InterimStorage.JSON,
                     InterimStorage.JSON_BZ2,
                     InterimStorage.PICKLE,
                     InterimStorage.PICKLE_BZ):
    storage = InterimStorage('test-id', type_=storage_type).write(test_data)
    read_data = storage.read()
    print(read_data, type(read_data), storage.get_filepath())

{1: 'one', 'two': 2} <class 'dict'>
{'1': 'one', 'two': 2} <class 'dict'> /tmp/InterimStorage/test-id.json
{'1': 'one', 'two': 2} <class 'dict'> /tmp/InterimStorage/test-id.json.bz2
{1: 'one', 'two': 2} <class 'dict'> /tmp/InterimStorage/test-id.pickle
{1: 'one', 'two': 2} <class 'dict'> /tmp/InterimStorage/test-id.pickle.bz2


### Storage directory, check and delete file

In [5]:
# File path in the default location versus an explicitly chosen directory
print(InterimStorage('test-id').get_filepath())
print(InterimStorage('test-id', directory='/mnt').get_filepath())
print()

# isfile() reports whether the storage file exists:
# True after write(), False again after delete_file()
storage = InterimStorage('test-id')
storage.write(test_data)
print(storage.isfile())
storage.delete_file()
print(storage.isfile())

/tmp/InterimStorage/test-id.pickle.bz2
/mnt/test-id.pickle.bz2

True
False


### JSON disadvantage

In [6]:
# The pickle format can store this NumPy object array with mixed element types.
import numpy as np
numpy_array = np.array([1, 'one', 'two', 2], dtype='O')
InterimStorage('test-id', type_=InterimStorage.PICKLE).write(numpy_array)

# The same call with the JSON format fails; it is left commented out so the
# notebook still runs from top to bottom:
# InterimStorage('test-id', type_=InterimStorage.JSON).write(numpy_array)
# -> TypeError: Object of type ndarray is not JSON serializable

<data.interim_storage.InterimStorage at 0x7f5cb809e790>

## Small benchmark

### Get data

In [7]:
# Directory handed to AmazonPickleSplitter as the source of the benchmark data.
# NOTE(review): absolute, machine-specific path - adjust it before running elsewhere.
directory_pickle = '/home/eml4u/EML4U/data/amazon-complete/'

In [8]:
# Benchmark sample: 10 years x 2 star values x 10,000 items = 200,000 items
years = list(range(2001, 2011))  # 2001 through 2010 inclusive
stars = [0, 4]
max_items = 10000
data = AmazonPickleSplitter(directory_pickle)

In [9]:
# Get indexes: select items for the configured years and stars and time the call.
# The result is used below as a nested mapping: indexes[year][star] -> iterable of items.
start = timeit.default_timer()
indexes = data.get_data_splits(years=years, stars=stars, max_items=max_items)
time_read = timeit.default_timer() - start
print('Time:', time_read)  

Time: 33.969751394120976


In [10]:
# Get data: collect the bow50 vector of every selected item as a plain Python list.
# The loop variables are deliberately NOT named `years` / `stars`: reusing those names
# would overwrite the configuration lists and break a re-run of the index cell.
start = timeit.default_timer()
bench_data = []
for year in indexes.keys():
    for star in indexes[year].keys():
        for item in indexes[year][star]:
            # .tolist() yields a plain list (presumably from a NumPy array - confirm),
            # which every benchmarked format, including JSON, can serialize.
            bench_data.append(data.get_bow50(item).tolist())
print(len(bench_data))

time_read_data = timeit.default_timer() - start
print('Time:', time_read_data)
print('Example:', bench_data[0])

200000
Time: 17.573948554927483
Example: [0.34047234058380127, 0.2043135166168213, 0.3105195164680481, -0.1342063695192337, -0.7848458290100098, 0.19274359941482544, 0.13591687381267548, 0.2334279716014862, 0.4503822326660156, -0.44136351346969604, 0.064661905169487, 0.739519476890564, 0.25940191745758057, -0.0029060891829431057, 0.27662014961242676, -0.17198514938354492, -0.036287810653448105, -0.16849054396152496, 0.3470377027988434, 0.27252212166786194, -0.25882798433303833, 0.04169261083006859, -0.3211864233016968, 0.511594295501709, 0.11867643147706985, 0.11629629135131836, 0.901910662651062, 0.021725205704569817, -0.020022975280880928, -0.22301003336906433, -0.8718155026435852, 0.027210205793380737, 0.19232884049415588, -0.19159847497940063, -0.44342997670173645, -0.48215731978416443, -0.6705529093742371, 0.22982566058635712, -0.2696513533592224, -0.005211083684116602, 0.5327435731887817, 0.6535655856132507, -0.4448152780532837, 0.3689725995063782, 0.1422746181488037, 0.637244403

### Run

In [11]:
def run_bench(type_, data):
    """Benchmark one storage format: time a write and a read and record the file size.

    Parameters
    ----------
    type_ : str
        Format identifier handed to ``InterimStorage`` (e.g. ``InterimStorage.JSON``).
        It is also appended to the storage id, so it has to be a string.
    data : object
        Payload that is written and then read back.

    Returns
    -------
    tuple
        ``(type_, time_write, time_read, size)``: the two durations are
        ``timeit.default_timer`` differences in seconds, ``size`` is the
        size of the written file in bytes.

    Notes
    -----
    The benchmark file is removed before returning, also when writing,
    reading or measuring the size raises; the exception is then propagated.
    """
    storage = InterimStorage('bench'+type_, type_=type_)

    try:
        start = timeit.default_timer()
        storage.write(data)
        time_write = timeit.default_timer() - start

        start = timeit.default_timer()
        storage.read()
        time_read = timeit.default_timer() - start

        size = os.path.getsize(storage.get_filepath())
    finally:
        # Always clean up so a failed run does not leave a large file behind.
        # The isfile() guard keeps a failed write from raising a second error here.
        if storage.isfile():
            storage.delete_file()

    return (type_, time_write, time_read, size)

In [12]:
# Benchmark every storage format on the same payload; one result tuple per format.
bench_results = [
    run_bench(storage_type, bench_data)
    for storage_type in (
        InterimStorage.JSON,
        InterimStorage.JSON_BZ2,
        InterimStorage.PICKLE,
        InterimStorage.PICKLE_BZ,
    )
]
print(bench_results)

[('json', 8.338726856047288, 4.976031355094165, 210338619), ('json.bz2', 29.061390546150506, 16.871615912998095, 77956243), ('pickle', 0.3476801458746195, 0.7958648079074919, 90812879), ('pickle.bz2', 7.979426169069484, 11.232359538087621, 38766345)]


In [13]:
def print_result(title, bench_results, index):
    """Print one column of the benchmark results relative to its extremes.

    Parameters
    ----------
    title : str
        Heading shown above the first column.
    bench_results : sequence of tuples
        Result tuples whose first element is the format name.
    index : int
        Position inside each tuple of the value to report.

    For every result a line is printed with the format name, the value divided
    by the largest value ('max' column), the value divided by the smallest
    value ('min' column) and the raw value.
    """
    values = [row[index] for row in bench_results]

    # Find the extremes by hand so that an empty result list is fine.
    largest = float('-inf')
    smallest = float('inf')
    for value in values:
        if value > largest:
            largest = value
        if value < smallest:
            smallest = value

    print(title.ljust(12), 'max'.ljust(8), 'min'.ljust(8), 'value')

    for row, value in zip(bench_results, values):
        name = row[0].ljust(12)
        relative_to_max = format(value/largest, ".4f").ljust(8)
        relative_to_min = format(value/smallest, ".4f").rjust(8)
        print(name, relative_to_max, relative_to_min, value)

In [14]:
# Index 1 of each result tuple is the write time in seconds
print_result('Writing', bench_results, 1)

Writing      max      min      value
json         0.2869    23.9839 8.338726856047288
json.bz2     1.0000    83.5866 29.061390546150506
pickle       0.0120     1.0000 0.3476801458746195
pickle.bz2   0.2746    22.9505 7.979426169069484


In [15]:
# Index 2 of each result tuple is the read time in seconds
print_result('Reading', bench_results, 2)

Reading      max      min      value
json         0.2949     6.2524 4.976031355094165
json.bz2     1.0000    21.1991 16.871615912998095
pickle       0.0472     1.0000 0.7958648079074919
pickle.bz2   0.6658    14.1134 11.232359538087621


In [16]:
# Index 3 of each result tuple is the file size in bytes
print_result('Size', bench_results, 3)

Size         max      min      value
json         1.0000     5.4258 210338619
json.bz2     0.3706     2.0109 77956243
pickle       0.4317     2.3426 90812879
pickle.bz2   0.1843     1.0000 38766345


```
Writing      max      min      value
json         0.2869    23.9839 8.338726856047288  <-  24x slower
json.bz2     1.0000    83.5866 29.061390546150506 <-   worst, 84x slower
pickle       0.0120     1.0000 0.3476801458746195 <- best
pickle.bz2   0.2746    22.9505 7.979426169069484  <-   23x slower

Reading      max      min      value
json         0.2949     6.2524 4.976031355094165  <-   6x slower
json.bz2     1.0000    21.1991 16.871615912998095 <-  21x slower
pickle       0.0472     1.0000 0.7958648079074919 <- best
pickle.bz2   0.6658    14.1134 11.232359538087621 <-  14x slower

Size         max      min      value
json         1.0000     5.4258 210338619          <-   worst, 5x larger
json.bz2     0.3706     2.0109 77956243           <-  2x larger
pickle       0.4317     2.3426 90812879           <-  2x larger
pickle.bz2   0.1843     1.0000 38766345           <- best

JSON also cannot serialize many Python types (e.g. NumPy arrays).
JSON can easily be read by many other tools and languages.
```