In [17]:
from cassandra import ConsistencyLevel
from cassandra.cluster import Cluster
from cassandra.query import SimpleStatement

from tqdm import tqdm

In [2]:
def cassandra_connection(hosts=('127.0.0.1',), port=9042, keyspace='fakehealthcareorg'):
    """
    Connect to Cassandra, make sure the working keyspace exists, and select it.

    :param hosts: contact points handed to ``Cluster``; defaults to the local node
    :param port: port handed to ``Cluster``
    :param keyspace: keyspace to create (if missing) and set on the session
    :return: session, cluster
    :raises ValueError: if ``keyspace`` is not a plain alphanumeric/underscore name
    """
    # The keyspace name is interpolated into the CQL text (identifiers cannot
    # be passed as bound values), so restrict it to a plain identifier first.
    if not keyspace or not keyspace.replace('_', '').isalnum():
        raise ValueError("invalid keyspace name: %r" % (keyspace,))

    cluster = Cluster(list(hosts), port=port)
    try:
        session = cluster.connect()
        session.execute("""
        CREATE KEYSPACE IF NOT EXISTS %s
        WITH REPLICATION =
        { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }
        """ % keyspace)
        session.set_keyspace(keyspace)
    except Exception:
        # Do not leak the cluster's resources when connecting or keyspace
        # setup fails; release them and let the caller see the original error.
        cluster.shutdown()
        raise
    return session, cluster


In [3]:
# Open the connection once; `session` is the handle every later cell executes
# statements through, and `cluster` is kept alongside it.
session, cluster = cassandra_connection()

# Creating the keyspace

In [8]:
# Create the `dev` keyspace with a single replica. IF NOT EXISTS keeps this
# cell re-runnable (Restart & Run All) once the keyspace already exists.
session.execute("""create keyspace if not exists dev
with replication = {'class':'SimpleStrategy','replication_factor':1};""")

<cassandra.cluster.ResultSet at 0x7f5f6868b410>

In [9]:
# Issue a CQL USE statement for the `dev` keyspace on this session.
session.execute("""use dev;""")

<cassandra.cluster.ResultSet at 0x7f5f686b1110>

In [11]:
# Create the JOB table, keyed by the integer `id`. IF NOT EXISTS makes the
# statement a no-op when the table is already present, so re-running is safe.
session.execute("""CREATE TABLE IF NOT EXISTS JOB(
    id INT PRIMARY KEY, status varchar, name varchar, subjobs INT, application varchar,
    backend varchar, backend_actualCE varchar, comment varchar
)""")

<cassandra.cluster.ResultSet at 0x7f5f67ac2590>

# Inserting the data:
1. 10K
2. 100K
3. 1000K

In [20]:
# 1. 10K
import pickle

# Load the pre-generated job rows and blobs used as insert payloads.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.
# The `with` blocks close each file handle as soon as it has been read.
with open("rows.pkl", "rb") as rows_file:
    jobs = pickle.load(rows_file)
with open("blobs.pkl", "rb") as blobs_file:
    blobs = pickle.load(blobs_file)

In [26]:
# Prepare the INSERT once so it can be bound and executed per row later.
# The eight `?` bind markers line up, in order, with the eight listed columns.
query = session.prepare("""
    INSERT INTO JOB(id, status, name, subjobs, application, backend, backend_actualCE, comment)
    VALUES          (?,      ?,    ?,       ?,           ?,       ?,                ?,       ?)
    """)

In [44]:
%%time
size = 10000
_jobs = jobs[:size]
_blobs = blobs[:size]

with tqdm(total=size) as progress:
    for i, (row, blob) in enumerate(zip(_jobs, _blobs)):
        row[0] = i+1
        blob['jid'] = i+1
        try:
            session.execute(query.bind(row))        
        except Exception as e:
            print(e)
        progress.update(1)

100%|██████████| 10000/10000 [00:06<00:00, 1564.32it/s]

CPU times: user 3.2 s, sys: 610 ms, total: 3.81 s
Wall time: 6.4 s





In [50]:
%%time
future = session.execute_async("SELECT * FROM JOB")
rows = future.result()

CPU times: user 15.5 ms, sys: 3.14 ms, total: 18.6 ms
Wall time: 75 ms


In [51]:
# Materialise the result into a list and count the rows returned.
# NOTE(review): iterating `rows` may consume it, so re-running this cell
# without re-running the SELECT could report a different count — verify.
len(list(rows))

10000