In [33]:
# from memory_profiler import profile
import pickle
import time
import warnings

import psutil
import os
import sys
import json
import utils
import pickle
from db import *
from glob import glob
from joblib import Parallel, delayed


In [34]:
import collections


def compare(x, y):
    """Return True when ``x`` and ``y`` produce equal ``collections.Counter`` objects.

    For plain iterables this is an order-insensitive comparison that still
    respects how many times each element occurs. For mappings, ``Counter``
    takes the mapping's values as the counts, so the key/value pairs are
    compared. Elements (or keys) must be hashable.
    """
    left = collections.Counter(x)
    right = collections.Counter(y)
    return left == right

In [35]:
import redis

# Client for the Redis server at localhost:6379, database index 0.
r = redis.Redis(
    host='localhost',
    port=6379,
    db=0,
)

## Loading the job objects

In [37]:
import utils

# Fetch the two record collections the rest of the notebook compares against.
# NOTE(review): the helper name suggests it unpickles data — presumably only
# safe on trusted files; confirm against utils.
jobs, blobs = utils.load_pickle_data_batch(size=100_000)

The runtime for load_pickle_data_batch was 3.41266131401062 seconds.


# Terms of comparison:
    - Redis: A cache with 100 job objects
    - Mongo: Database instance with 100000 job objects
    - Postgres: Database instance with 100000 job objects        

In [38]:
# Redis
import json
import random

times_to_test = 10
# random.randint includes both endpoints, so indices come from 0..100 and may repeat.
rand_idx = [random.randint(0, 100) for _ in range(times_to_test)]

for idx in rand_idx:
    # Job record: stored as a JSON document keyed by its own jid.
    job_record = jobs[idx]
    r.set(job_record['jid'], json.dumps(job_record))

    # Blob record: bytes cannot be passed to json.dumps, so every non-int value
    # is decoded to text and the result is stored as a list headed by the jid.
    blob_record = blobs[idx]
    decoded_fields = [
        value.decode("utf-8")
        for value in blob_record.values()
        if type(value) != int
    ]
    r.set(
        f"blob-{blob_record['jid']}",
        json.dumps([blob_record['jid']] + decoded_fields),
    )

In [6]:
# for idx in rand_idx:
#     # Below causes TypeError: Object of type bytes is not JSON serializable
# #     r.set(f"blob-{jobs[idx]['jid']}", json.dumps(blobs[idx]))
    


In [39]:
%%time
# Verify that every sampled job round-trips through Redis unchanged.
for idx in rand_idx:
    # Read with the key the value was written under (the job's jid), not the
    # list index; the two only coincide when jid == idx.
    job_key = jobs[idx]['jid']
    cached_job = r.get(job_key)
    # r.get returns None for a missing key; fail with a clear message instead
    # of letting json.loads(None) raise an unrelated TypeError.
    assert cached_job is not None, f"job {job_key!r} not found in Redis"
    assert jobs[idx] == json.loads(cached_job)

CPU times: user 3.5 ms, sys: 519 µs, total: 4.02 ms
Wall time: 4.92 ms


In [40]:
%%time
# Verify that every sampled blob round-trips through Redis unchanged.
for idx in rand_idx:
    blob_record = blobs[idx]
    blob_jid = blob_record['jid']
    # Rebuild the list form that was serialized: jid first, then every
    # non-int value decoded from bytes to text.
    expected_blob = [blob_jid] + [
        value.decode("utf-8")
        for value in blob_record.values()
        if type(value) != int
    ]
    # Read with the key the value was written under ("blob-<jid>"), not the
    # list index; the two only coincide when jid == idx.
    cached_blob = r.get(f"blob-{blob_jid}")
    # r.get returns None for a missing key; fail with a clear message instead
    # of letting json.loads(None) raise an unrelated TypeError.
    assert cached_blob is not None, f"blob {blob_jid!r} not found in Redis"
    assert expected_blob == json.loads(cached_blob)

CPU times: user 14.1 ms, sys: 3.67 ms, total: 17.8 ms
Wall time: 19.4 ms


In [9]:
# Mongo

In [10]:
#  Starting the mongo docker instance in the background

In [60]:
# `size` is forwarded to utils.load_pickle_data_batch; `batch_size` is
# forwarded to the add_*_mongo / add_*_postgres helpers.
size = 100_000
batch_size = 100

# Handle through which the Mongo cells reach db.jobs and db.blobs.
db = connect_mongo()

The runtime for connect_mongo was 0.00901341438293457 seconds.


In [56]:
# Write the loaded job records to Mongo via the project helper.
add_jobs_mongo(db, jobs=jobs, batch_size=batch_size)

100%|██████████| 1000/1000.0 [00:03<00:00, 320.57it/s]

The runtime for add_jobs_mongo was 3.1218960285186768 seconds.





In [57]:
# Write the loaded blob records to Mongo via the project helper.
add_blobs_mongo(db, blobs=blobs, batch_size=batch_size)

100%|██████████| 1000/1000.0 [01:22<00:00, 12.08it/s]


The runtime for add_blobs_mongo was 82.80983376502991 seconds.


In [14]:
# Postgres

In [62]:
# NOTE(review): these literals look like hard-coded connection settings
# (possibly including a password) — confirm the argument meaning and consider
# reading them from the environment instead.
con, meta = connect_postgres("postgres", "ganga", "jobs")
JOBS, BLOBS = create_tables_postgres(con, meta)

# Loads the dataset again and rebinds jobs/blobs; requires `size` to be
# defined by an earlier cell.
jobs, blobs = utils.load_pickle_data_batch(size=size)

The runtime for connect_postgres was 0.02215433120727539 seconds.
The runtime for create_tables_postgres was 0.042023420333862305 seconds.
The runtime for load_pickle_data_batch was 3.583921194076538 seconds.


In [63]:
# Write the loaded job records to Postgres via the project helper.
add_jobs_postgres(con, JOBS=JOBS, jobs=jobs, batch_size=batch_size)


99900-100000: 100%|██████████| 1000/1000 [00:25<00:00, 38.83it/s]

The runtime for add_jobs_postgres was 25.760742902755737 seconds.





In [64]:
# Write the loaded blob records to Postgres via the project helper.
add_blobs_postgres(con, BLOBS=BLOBS, blobs=blobs, batch_size=batch_size)


99900-100000: 100%|██████████| 1000/1000 [03:19<00:00,  5.02it/s]

The runtime for add_blobs_postgres was 199.09867978096008 seconds.





In [65]:
# Names paired positionally with the values of a row from the jobs table.
# NOTE(review): assumes this order matches the table's column order — confirm.
job_headers = [
    'jid',
    'status',
    'name',
    'subjobs',
    'application',
    'backend',
    'backend_actualCE',
    'comment',
]
# Blob field names, taken from the first loaded blob record.
blob_headers = list(blobs[0])

# Test on Mongo

In [108]:
%%time
# Fetch each sampled job from Mongo and check it against the in-memory record.
for idx in rand_idx:
    # The projection drops Mongo's `_id` field, which the in-memory job lacks.
    # NOTE(review): the filter uses the list index as the jid — assumes
    # jobs[idx]['jid'] == idx.
    matching_jobs = list(db.jobs.find({"jid": idx}, {'_id': False}))
    job = matching_jobs[0]
    assert compare(jobs[idx], job)

CPU times: user 9.48 ms, sys: 0 ns, total: 9.48 ms
Wall time: 563 ms


In [107]:
%%time
# Fetch each sampled blob from Mongo and check it against the in-memory record.
for idx in rand_idx:
    # The projection drops Mongo's `_id` field, which the in-memory blob lacks.
    # NOTE(review): the filter uses the list index as the jid — assumes
    # blobs[idx]['jid'] == idx.
    matching_blobs = list(db.blobs.find({"jid": idx}, {'_id': False}))
    blob = matching_blobs[0]
    assert compare(blobs[idx], blob)

CPU times: user 18 ms, sys: 8.36 ms, total: 26.3 ms
Wall time: 32.4 s


# Test on Postgres

In [110]:
%%time
# Fetch each sampled job from Postgres and check it against the in-memory record.
for idx in rand_idx:
    # NOTE(review): the filter uses the list index as the jid — assumes
    # jobs[idx]['jid'] == idx.
    job_query = f"SELECT * from jobs where jid={idx}"
    job_rows = list(con.execute(job_query))
    job = job_rows[0]
    # Pair the row's values with the header names to rebuild a dict.
    job_as_dict = dict(zip(job_headers, job))
    assert compare(jobs[idx], job_as_dict)


CPU times: user 4.78 ms, sys: 0 ns, total: 4.78 ms
Wall time: 21.1 ms


In [109]:
%%time
# Fetch each sampled blob from Postgres and check it against the in-memory record.
for idx in rand_idx:
    # NOTE(review): the filter uses the list index as the jid — assumes
    # blobs[idx]['jid'] == idx.
    result = list(con.execute(f"SELECT * from blobs where jid={idx}"))[0]
    # Values that come back as memoryview are converted to bytes (not decoded
    # to text); everything else is kept as returned.
    blob = [
        bytes(item) if isinstance(item, memoryview) else item
        for item in result
    ]
    assert compare(blobs[idx], dict(zip(blob_headers, blob)))

CPU times: user 12.7 ms, sys: 0 ns, total: 12.7 ms
Wall time: 2.28 s
