In [1]:
from cassandra import ConsistencyLevel
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement

from joblib import Parallel, delayed                                                                                                                                

from tqdm import tqdm

In [2]:
# Benchmark switch: when True, the cell that prepares the INSERT statement also
# issues a CREATE INDEX (name_ind) on the name column before any rows are written.
index=False

In [4]:
"""
sudo docker run --name cassandra -p 127.0.0.1:9042:9042 -e CASSANDRA_CLUSTER_NAME=GangaTest -e CASSANDRA_ENDPOINT_SNITCH=GossipingPropertyFileSnitch -e CASSANDRA_DC=datacenter1 -d cassandra
"""
def cassandra_connection(hosts=None, port=9042):
    """
    Open a Cassandra session bound to the ``fakehealthcareorg`` keyspace.

    The keyspace is created first (SimpleStrategy, replication factor 1) when
    it does not exist yet, then selected as the session's working keyspace.

    :param hosts: contact points to connect to; defaults to ``['127.0.0.1']``
    :param port: native-protocol port of the contact points (default 9042)
    :return: session, cluster
    :raises Exception: any driver error raised while connecting or preparing
        the keyspace; the cluster is shut down before the error propagates
    """
    cluster = Cluster(hosts or ['127.0.0.1'], port=port)
    try:
        session = cluster.connect()
        session.execute("""
        CREATE KEYSPACE IF NOT EXISTS fakehealthcareorg
        WITH REPLICATION =
        { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }
        """)
        session.set_keyspace('fakehealthcareorg')
    except Exception:
        # Do not leave driver connections/threads behind when setup fails:
        # the caller never receives `cluster`, so nobody else could close it.
        cluster.shutdown()
        raise
    return session, cluster


In [5]:
# Connect once for the whole notebook; `session` is the handle every later
# cell executes its CQL against.
session, cluster = cassandra_connection()

# Creating the keyspace

In [6]:
# Create the `dev` keyspace used by the benchmark tables.
# IF NOT EXISTS keeps this cell re-runnable (a plain CREATE KEYSPACE raises once
# the keyspace exists); the stray cqlsh continuation prompt ("...") that had
# been pasted into the statement text is removed.
session.execute("""CREATE KEYSPACE IF NOT EXISTS dev
WITH REPLICATION = {'class':'SimpleStrategy','replication_factor':1};""")

<cassandra.cluster.ResultSet at 0x7f98062d1510>

In [33]:
# Make `dev` the session's working keyspace so the unqualified JOB table name
# in the statements that follow refers to dev.JOB.
session.execute("""use dev;""")
# Start from a clean slate; IF EXISTS keeps the drop from failing when the
# table has not been created yet.
session.execute("""DROP TABLE IF EXISTS JOB""")

<cassandra.cluster.ResultSet at 0x7f980730ded0>

In [9]:
# DDL for the JOB table: integer primary key `id`, an integer `subjobs`
# counter, and text columns for the remaining job attributes.
job_table_ddl = """CREATE TABLE IF NOT EXISTS JOB(
        id INT PRIMARY KEY, status varchar, name varchar, subjobs INT, application varchar,
        backend varchar, backend_actualCE varchar, comment varchar
    )"""

try:
    session.execute(job_table_ddl)
except Exception as err:
    # Best-effort: report the driver error and let the notebook continue.
    print(err)

In [10]:
# Recreate the JOB table from scratch so every benchmark run starts empty.
# IF EXISTS makes the drop safe on a fresh database (a plain DROP TABLE raises
# when dev.JOB is missing), matching the DROP TABLE IF EXISTS used earlier.
session.execute("""DROP TABLE IF EXISTS dev.JOB;""")
session.execute("""CREATE TABLE IF NOT EXISTS JOB(
        id INT PRIMARY KEY, status varchar, name varchar, subjobs INT, application varchar,
        backend varchar, backend_actualCE varchar, comment varchar
    )""")

<cassandra.cluster.ResultSet at 0x7f98062c62d0>

# Inserting the data:
1. 10K
2. 100K
3. 1000K

In [12]:
# 1. 10K
# NOTE(review): the insert cell below sets size = 100000 — confirm which run
# this label refers to.
import pickle

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load rows.pkl / blobs.pkl when they come from a trusted source.
# The context managers close both file handles as soon as the data is read.
with open("rows.pkl", "rb") as rows_file:
    jobs = pickle.load(rows_file)
with open("blobs.pkl", "rb") as blobs_file:
    blobs = pickle.load(blobs_file)

In [13]:
# Prepare the INSERT once; each row is later bound to the eight `?` markers in
# column order (id, status, name, subjobs, application, backend,
# backend_actualCE, comment).
query = session.prepare("""
    INSERT INTO JOB(id, status, name, subjobs, application, backend, backend_actualCE, comment)
    VALUES          (?,      ?,    ?,       ?,           ?,       ?,                ?,       ?)
    """)

if index:
    # Optional secondary index on the name column, to measure its cost on writes.
    # The index must target the JOB table created above (the statement used to
    # name `devices`, a table this notebook never creates), and CREATE INDEX
    # takes no consistency clause. IF NOT EXISTS keeps the cell re-runnable.
    session.execute("""CREATE INDEX IF NOT EXISTS name_ind ON JOB (name);""")


In [15]:
%%time
size = 100000
_jobs = jobs[:size]
_blobs = blobs[:size]
with tqdm(total=size) as progress:
    for i, (row, blob) in enumerate(zip(_jobs, _blobs)):
        row[0] = i+1
        blob['jid'] = i+1
        try:
            session.execute(query.bind(row))        
        except Exception as e:
            print(e)
        progress.update(1)

100%|██████████| 100000/100000 [01:09<00:00, 1446.08it/s]

CPU times: user 35.7 s, sys: 7.14 s, total: 42.9 s
Wall time: 1min 9s





In [32]:
# Input for the parallel insert: a fresh list per job with the id field replaced.
# Ids are numbered from 1 so the parallel run writes exactly the same primary
# keys as the sequential loop (which uses i+1); 0-based ids would add a row 0
# and skip the last id. Parallel/delayed are already imported in the first cell.
rows = [[job_id] + row[1:] for job_id, row in enumerate(_jobs, start=1)]


def insert(data):
    """
    Insert one job row through the prepared INSERT statement.

    :param data: sequence of column values in the order expected by `query`
    :return: the value returned by ``session.execute`` for the bound statement
    """
    return session.execute(query.bind(data))

In [19]:
%%time
result = Parallel(n_jobs=-1, verbose=0, backend="multiprocessing")(
    map(delayed(insert), rows)
)

NameError: name 'insert' is not defined

In [30]:
%%time
# Issue the full-table read asynchronously, then block on the returned future
# until the result is available.
future = session.execute_async("SELECT * FROM JOB")
# NOTE(review): this rebinds `rows`, the name the parallel-insert cell uses for
# its input list — re-running that cell afterwards would iterate this result instead.
rows = future.result()

CPU times: user 9.1 ms, sys: 155 µs, total: 9.26 ms
Wall time: 47.7 ms


In [31]:
%%time
len(list(rows))

CPU times: user 308 ms, sys: 31.1 ms, total: 339 ms
Wall time: 1.05 s


100000

In [26]:
%%time
# Synchronous counterpart of the execute_async read above, timed for comparison.
rows = session.execute("SELECT * FROM JOB")

CPU times: user 14.2 ms, sys: 4.37 ms, total: 18.6 ms
Wall time: 60 ms


In [27]:
%%time
len(list(rows))

CPU times: user 362 ms, sys: 11.3 ms, total: 373 ms
Wall time: 1.01 s


100000