### Using KDB+ for a Financial Ticking

#### Install qPython

In [1]:
!pip install qPython
!pip install plotly
!pip install seaborn



#### Before run next (Create sesion, last one to test the connection)

First go to kdb folder  @It is necesary to load the csv files

cd path to kdb

Execute in terminal:

q -p 5000

h:hopen `:localhost:5000

h"2+2" 

In [147]:
from qpython import qconnection
import pandas as pd
import datetime 

def create_connection(pandas=False):
    q = qconnection.QConnection(host='localhost', port=5050, pandas = pandas)
    # initialize connection
    q.open()
    
    return q

def close_sconnection(q):
    q.close()



In [105]:
import pandas as pd
import numpy as np
import os, gc
from glob import glob
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt


# Create database and tables


In [106]:

def create_database(q, 
                    tick_csv_path, tickenum_folder_path, 
                    base_csv_path, baseenum_folder_path):
    
    print('IPC version: %s. Is connected: %s' % (q.protocol_version, q.is_connected()))

    # Load price tick file
    q.sendSync(f'tick:("SIDTFIFIFIS"; enlist"|")0:`:{tick_csv_path}')
    
    # Create enumeration for table (this is required to create a splayed table and then a partitioned table)
    q.sendSync(f'tickenum: .Q.en[`:{tickenum_folder_path}] tick')
    # Save table
    q.sendSync('rsave `tickenum')


    # Load base tick file
    q.sendSync(f'base:("SSSSS"; enlist"|")0:`:{base_csv_path}')
    # Create enumeration for table (this is required to create a splayed table and then a partitioned table)
    q.sendSync(f'baseenum: .Q.en[`:{baseenum_folder_path}] base')
    # Save table
    q.sendSync('rsave `baseenum')

    q.sendSync('baseenum2:get `baseenum ')
    q.sendSync('priceenum2:get `tickenum')


In [170]:
import csv

def save_list_results(url, data):
    print("save_list_results")
    df = pd.DataFrame(data, index=[0])
    df.to_csv(url)
    
def save_stats(url, data):
    print("save_stats")
    df = pd.DataFrame(data)
    df.to_csv(url)


In [6]:
!pip install joblib



In [108]:
from joblib import Parallel, delayed
from multiprocessing.pool import Pool
import traceback
import time
NUM_THREADS = 5
NUM_POOLS = 10

def load_queries(path_to_queries) -> list:
    queries=[]
    for file in glob(path_to_queries+'*.q'):
        with open(file, 'r') as file:
            data = file.read().replace('\n', ';')
            queries.append(data)
    return queries

In [156]:
def run_query(q,run_id, query_number, queries, path_to_save_results, data_size, print_result=False):
    print(f"Running query {query_number} for scale factor {data_size}, saving results at {path_to_save_results}")
    try:
        start = time.time()
        temp=np.array(q(queries[query_number-1], qtype=1, adjust_dtype=False))
        print(queries[query_number-1], temp)
        result = q(queries[query_number-1])
        df = pd.DataFrame(result)
        #df = pd.DataFrame(data=q(queries[query_number-1], qtype=1, adjust_dtype=False))
        result=df
        count = df.shape[0]
        end = time.time()
        print(df)
        result.to_csv(path_to_save_results.format(size=data_size, query_number=query_number))
        stats = {
            "run_id": run_id,
            "query_id": query_number,
            "start_time": start,
            "end_time": end,
            "elapsed_time": end-start,
            "row_count": count,
            'error': False
        }
        return stats
    except Exception:
        print(traceback.format_exc())
        return {
            "run_id": run_id,
            "query_id": query_number,
            "start_time": time.time(),
            "end_time": time.time(),
            "elapsed_time": 0.0,
            "row_count": 0,
            "error": True
        }

In [160]:
def run_queries(q,run_id, queries, path_to_save_results, path_to_save_stats, data_size, print_result=False):
    stats = Parallel(n_jobs=NUM_THREADS, prefer="threads")(delayed(run_query)(q,run_id, i+1, queries, path_to_save_results, data_size, print_result) for i in range(len(queries)))
    print(stats)
    save_list_results(path_to_save_stats, stats)
    
def run_queries_iter(q,run_id, queries, path_to_save_results, path_to_save_stats, data_size, print_result=False):
    for i, query in enumerate(queries):
        stats = run_query(q,run_id, i+1, queries, path_to_save_results, data_size, print_result)
        print(stats)
        save_list_results(path_to_save_stats, stats)


In [161]:
import math
def run(data_sizes, q):    
    for i, data_size in enumerate(data_sizes):
        queries_path = "./queries/"
        result_path = "../kdb/results/result_Q{query_number}_{size}.csv"
        stats_path ="../kdb/results/test_run_stats_csv_{size}.csv".format(size=data_size)
        start_create_db = time.time()
        folder = "/Users/alfredo.leon/Desktop/findata/data/scale_1000"
        # Create metastore for the given size
        create_database(q, 
                        tick_csv_path=f"{folder}/tick_price_file_no_spaces.csv", tickenum_folder_path=f"{folder}/", 
                        base_csv_path=f"{folder}/tick_base_file_no_spaces.csv", baseenum_folder_path=f"{folder}/")
        end_create_db = time.time()
        
        # Load queries for the given size
        queries = load_queries(queries_path)
        start_run = time.time()
        run_queries_iter(q, i+1, queries, result_path, stats_path, data_size)
        end_run = time.time()
        
        df = pd.read_csv(stats_path)   
        response_t= math.sqrt(df[['elapsed_time']].prod().tolist()[0])

        # Saving the overall stats to csv file
        overall_stats = [{
            'batch_id': i+1,
            'create_db_time': end_create_db - start_create_db,
            'run_query_time': end_run - start_run,
            'Response Time Metric': response_t
            
        }]

        
        overall_stats_path = "../kdb/results/{size}_overall_stats.csv".format(size=data_size)
        save_stats(overall_stats_path, overall_stats)

In [174]:
#data_sizes=['10', '100','1000']
q=create_connection(pandas=False)
#result = q('0!select[10] from `TradeSize xdesc select from select sum(TradeSize) by Id from priceenum2 where TradeDate=2022.11.23')
#df = pd.DataFrame(result)
#print(df.head())
#print(result, type(result))
run(data_sizes=['1000'],q=q)
close_sconnection(q)

IPC version: 3. Is connected: True
Running query 1 for scale factor 1000, saving results at ../kdb/results/result_Q{query_number}_{size}.csv
0!select[10] from `TradeSize xdesc select from select sum(TradeSize) by Id from priceenum2 where TradeDate=2022.11.23 [(b'Security_701', 438800) (b'Security_248', 413200)
 (b'Security_871', 410500) (b'Security_660', 402400)
 (b'Security_76', 398900) (b'Security_182', 398200)
 (b'Security_680', 395300) (b'Security_602', 394300)
 (b'Security_770', 391000) (b'Security_324', 390400)]
                Id  TradeSize
0  b'Security_701'     438800
1  b'Security_248'     413200
2  b'Security_871'     410500
3  b'Security_660'     402400
4   b'Security_76'     398900
5  b'Security_182'     398200
6  b'Security_680'     395300
7  b'Security_602'     394300
8  b'Security_770'     391000
9  b'Security_324'     390400
{'run_id': 1, 'query_id': 1, 'start_time': 1669129988.81906, 'end_time': 1669129989.040801, 'elapsed_time': 0.2217409610748291, 'row_count': 10, '