In [49]:
import pandas as pd
from clickhouse_driver import Client

In [50]:
# Open a client against the ClickHouse server on this machine; no other
# connection options are passed, so the driver's defaults apply.
CLICKHOUSE_HOST = "localhost"
client = Client(host=CLICKHOUSE_HOST)

In [51]:
# Make sure the test_db database exists. IF NOT EXISTS turns the statement
# into a no-op when the database is already there, so the cell can be re-run.
client.execute(query="CREATE DATABASE IF NOT EXISTS test_db")

[]

In [52]:
# Create the table that receives the salary rows, unless it already exists.
#
# All text columns are declared as String: ClickHouse treats TEXT as an alias
# of String, so spelling them the same way only makes the DDL consistent.
# Rows are sorted (and indexed) by Id.
# NOTE(review): Benefits is a String while the other pay columns are Float64;
# confirm that this is intended before relying on numeric comparisons on it.
client.execute(
    """
    CREATE TABLE IF NOT EXISTS test_db.test_table (
        Id               UInt64  NOT NULL,
        EmployeeName     String  NOT NULL,
        JobTitle         String  NOT NULL,
        BasePay          Float64 NOT NULL,
        OvertimePay      Float64 NOT NULL,
        OtherPay         Float64 NOT NULL,
        Benefits         String  NOT NULL,
        TotalPay         Float64 NOT NULL,
        TotalPayBenefits Float64 NOT NULL,
        Year             UInt32  NOT NULL,
        Agency           String  NOT NULL,
        Status           String  NOT NULL
    )
    ENGINE = MergeTree()
    ORDER BY Id
    PRIMARY KEY Id
    """
)

[]

In [53]:
# Load the salaries CSV into a DataFrame. low_memory=False makes pandas read
# the file without chunked type inference, so each column gets a single dtype.
SALARIES_CSV = "./data/sf_salaries.csv"
df = pd.read_csv(
    SALARIES_CSV,
    sep=",",
    encoding="utf-8",
    low_memory=False,
)

In [54]:
# Clean the loaded frame and flatten it into a list of rows (`data`).
#
# Steps:
#   1. Drop the Notes column.
#   2. Replace the literal "Not Provided" with an empty string (text columns)
#      or 0.0 (pay columns).
#   3. Fill remaining missing values with per-column defaults.
#   4. Convert the frame to a list of row lists.

# errors="ignore" keeps the cell re-runnable: `df` is overwritten here, so on
# a second execution the Notes column is already gone and a plain drop would
# raise KeyError.
df = df.drop(columns="Notes", errors="ignore")

# Substitute used for the "Not Provided" placeholder, per column.
placeholder_substitutes = {
    "EmployeeName": "",
    "JobTitle": "",
    "BasePay": 0.0,
    "OvertimePay": 0.0,
    "OtherPay": 0.0,
    "Benefits": "",
}
for column_name, substitute in placeholder_substitutes.items():
    df[column_name] = df[column_name].replace("Not Provided", substitute)

# Defaults for values that are missing (NaN) in the frame.
missing_value_defaults = {
    "EmployeeName": "",
    "JobTitle": "",
    "BasePay": 0.0,
    "OvertimePay": 0.0,
    "OtherPay": 0.0,
    "Benefits": "",
    "TotalPay": 0.0,
    "TotalPayBenefits": 0.0,
    "Year": 0,
    "Agency": "",
    "Status": "",
}
data: list = df.fillna(value=missing_value_defaults).values.tolist()

In [55]:
# Rebuild a DataFrame from the cleaned rows, labelling the positional values
# with the table's column names, and bulk-insert it. use_numpy is enabled for
# this query only; the call's return value is shown as the cell output.
table_columns = [
    "Id",
    "EmployeeName",
    "JobTitle",
    "BasePay",
    "OvertimePay",
    "OtherPay",
    "Benefits",
    "TotalPay",
    "TotalPayBenefits",
    "Year",
    "Agency",
    "Status",
]
rows_frame = pd.DataFrame(data=data, columns=table_columns)
client.insert_dataframe(
    query="INSERT INTO test_db.test_table VALUES",
    dataframe=rows_frame,
    settings={"use_numpy": True},
)

148654

In [66]:
%%timeit
# Times fetching one row (all columns) from test_db.test_table through the driver.
client.execute("SELECT * FROM test_db.test_table LIMIT 1")

2.7 ms ± 122 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [69]:
%%timeit
# Times counting the rows with OvertimePay > 30000. In this notebook the cell
# comes before the ALTER TABLE ... ADD INDEX statement on OvertimePay.
client.execute("SELECT count(*) FROM test_db.test_table WHERE OvertimePay > 30000")

2.38 ms ± 146 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [70]:
# Add a data-skipping index for the range filter `OvertimePay > 30000`.
#
# - minmax is used because range comparisons can be answered from per-block
#   min/max values; a bloom_filter index is only consulted for equality / IN
#   style predicates and is never used for `>`.
# - GRANULARITY counts table granules per index entry, not rows. 1 gives one
#   min/max pair per granule; a very large value collapses the whole table
#   into a single entry that can never be skipped.
# - IF NOT EXISTS lets the cell be re-run without an "index already exists" error.
client.execute(
    "ALTER TABLE test_db.test_table "
    "ADD INDEX IF NOT EXISTS IdxOvertimePay OvertimePay TYPE minmax GRANULARITY 1"
)

# ADD INDEX only applies to parts written afterwards. The rows were inserted
# before this cell, so build the index for the existing parts as well.
# mutations_sync=1 waits for that mutation to finish, so a timing taken right
# after this cell measures the indexed table.
client.execute(
    "ALTER TABLE test_db.test_table MATERIALIZE INDEX IdxOvertimePay",
    settings={"mutations_sync": 1},
)

[]

In [73]:
%%timeit
# Same count query as timed earlier, repeated after the ALTER TABLE ... ADD INDEX
# statement so the two timings can be compared.
client.execute("SELECT count(*) FROM test_db.test_table WHERE OvertimePay > 30000")

2.35 ms ± 49.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
