In [None]:
from cassandra.cluster import Cluster
import pandas as pd
import datetime

In [None]:
# Load the Kickstarter projects CSV export into a DataFrame. The path is
# relative, so the file must sit in the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='ks-projects-201801.csv')
# Preview the first rows to sanity-check the parsed columns.
data.head(n=5)

In [None]:
# Display pandas' default summary statistics for the loaded frame.
data.describe()

In [None]:
# Prerequisites (run once in `cqlsh` before executing this notebook).
#
# 1. The keyspace must already exist:
# ```
# cqlsh> CREATE KEYSPACE kickstarter WITH replication = {'class':'SimpleStrategy', 'replication_factor':1};
# ```
# 2. The `projects` table must already exist, with these column types:
# ```
# cqlsh> CREATE TABLE IF NOT EXISTS kickstarter.projects (
#                ... ID bigint PRIMARY KEY,
#                ... name varchar,
#                ... category varchar,
#                ... main_category varchar,
#                ... currency varchar,
#                ... deadline date,
#                ... goal decimal,
#                ... launched timestamp,
#                ... pledged decimal,
#                ... state varchar,
#                ... backers int,
#                ... country varchar,
#                ... usd_pledged decimal,
#                ... usd_pledged_real decimal,
#                ... usd_goal_real decimal);
# ```
# Connect on port 9042. No contact points are passed, so the driver's default
# host applies (presumably localhost -- confirm against the driver docs).
cluster = Cluster(port=9042)
# Open a session on the `kickstarter` keyspace; later cells refer to the
# `projects` table without a keyspace prefix.
session = cluster.connect('kickstarter')

In [None]:
# Column names of the loaded frame, in file order. They are later spliced
# unquoted into the CQL INSERT statement, so each one must be a valid column
# identifier of the `projects` table.
column_names = list(data.columns.values)
# Comma-separated column list for the INSERT statement (no spaces after commas).
names_str = ','.join(column_names)
names_str

In [None]:
# Build and prepare the INSERT statement used for every row: one `?` bind
# marker per column, written as a leading `?` followed by `,?` for each
# remaining column.
placeholders = '?' + ',?' * (len(column_names) - 1)
insert_str = 'INSERT INTO projects ({}) VALUES ({})'.format(names_str, placeholders)
insert_stmt = session.prepare(insert_str)
# Pivot the frame into {column name: list of values} so whole columns can be
# cleaned before the rows are assembled for insertion.
items = data.to_dict('list')
# Convert columns to the correct data types.
# `name` (varchar): keep real strings and replace anything else -- such as the
# float NaN pandas uses for a missing value -- with an empty string.
# isinstance() is required: `v is str` compares the value with the type object
# itself, is never true, and therefore blanked every name.
items['name'] = [v if isinstance(v, str) else '' for v in items['name']]
# `launched` (timestamp): parse the text into a full datetime. The time of day
# is kept; truncating with .date() would discard it even though the column is
# declared as a timestamp.
items['launched'] = [datetime.datetime.strptime(v, '%Y-%m-%d %H:%M:%S') for v in items['launched']]
# `usd_pledged` (decimal): send present numbers as decimal text and fall back
# to '0.00' for missing (NaN) or non-numeric values. `v is float` was never
# true, so every amount used to become '0.00'. The value is deliberately left
# as text, like the '0.00' fallback, rather than encoded to bytes.
items['usd_pledged'] = [
    str(v) if isinstance(v, (int, float)) and not pd.isna(v) else '0.00'
    for v in items['usd_pledged']
]
# Insert every row into the table, one synchronous execute per row, keeping
# each call's result so the number of executed inserts can be displayed.
projects = []
row_total = len(items[column_names[0]])
for row_index in range(row_total):
    # Gather this row's values in the same column order as the INSERT statement.
    row_values = [items[column][row_index] for column in column_names]
    projects.append(session.execute(insert_stmt, row_values))
len(projects)

In [None]:
# Count the rows now stored in the `projects` table and display the returned
# rows as a check on the load.
count_query = 'SELECT COUNT(*) FROM projects;'
result = session.execute(count_query)
result.current_rows