# TPCDS: Query Performance Test Script

In [2]:
import os
import numpy as np
import pandas as pd
from utils import connect_postgres, get_files_absolute_path_from_dir, execute_sql
from datetime import datetime
from IPython.display import clear_output
import shutil

In [2]:
# scale factor under test; later cells also use it to build the
# performance_test/<sf>/ output paths
sf = "sf_1"

In [3]:
# open the benchmark database; the later cells run their SQL through `cur`
db_name = 'tpcds'
cur = connect_postgres(db_name)

PostgreSQL server information
{'user': 'postgres', 'channel_binding': 'prefer', 'dbname': 'tpcds', 'host': 'localhost', 'port': '5432', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'sslcertmode': 'allow', 'sslsni': '1', 'ssl_min_protocol_version': 'TLSv1.2', 'gssencmode': 'disable', 'krbsrvname': 'postgres', 'gssdelegation': '0', 'target_session_attrs': 'any', 'load_balance_hosts': 'disable'} 

You are connected to -  ('PostgreSQL 16.4, compiled by Visual C++ build 1940, 64-bit',) 



In [4]:
# drop all indexes by running the index_setup/drop_all_indexes.sql script
drop_indexes_sql = os.path.join("index_setup", "drop_all_indexes.sql")
execute_sql(cur, drop_indexes_sql)

SQL Status Output:
 DO


In [5]:
# create indexes by running the index_setup/index_setup.sql script
create_indexes_sql = os.path.join("index_setup", "index_setup.sql")
execute_sql(cur, create_indexes_sql)

SQL Status Output:
 CREATE INDEX


In [4]:
# directories
queries_root = os.path.join(os.getcwd(), 'all_queries')
prev_dir = os.path.join(queries_root, 'prev_optimized_queries')
our_dir = os.path.join(queries_root, 'our_optimized_queries')
final_dir = os.path.join(queries_root, 'optimized_queries_final')

# create the final directory if it does not exist yet; exist_ok replaces the
# separate exists() check, so there is no gap between checking and creating
os.makedirs(final_dir, exist_ok=True)

# list of .sql files in each directory
prev_files = [f for f in os.listdir(prev_dir) if f.endswith('.sql')]
our_files = [f for f in os.listdir(our_dir) if f.endswith('.sql')]

# set of all files (our_file_set also gives O(1) membership tests below)
our_file_set = set(our_files)
all_files = set(prev_files) | our_file_set

# populate the final directory: when a query exists in both source
# directories, the version from our_optimized_queries wins.
# sorted() only makes the copy order deterministic; the result is the same.
for filename in sorted(all_files):
    src_dir = our_dir if filename in our_file_set else prev_dir
    shutil.copyfile(os.path.join(src_dir, filename), os.path.join(final_dir, filename))

In [6]:
# get dir path

# directory containing the query scripts used by the following cells
queries_root = os.path.join(os.getcwd(), 'all_queries')
path = os.path.join(queries_root, 'prev_optimized_queries')
files = os.listdir(path)
print(path)

D:\BDMA\Data Warehouses\tpcds-benchmark\all_queries\prev_optimized_queries


In [7]:
# collect the query script paths found under `path`; this is the list the
# benchmark loop iterates over and executes
# NOTE(review): judging by the captured output, the helper also prints the
# file count and a preview of the first paths — confirm in utils
files_abs_path = get_files_absolute_path_from_dir(path)

Total files: 99
First few files...
['D:/BDMA/Data Warehouses/tpcds-benchmark/all_queries/prev_optimized_queries/query-01.sql', 'D:/BDMA/Data Warehouses/tpcds-benchmark/all_queries/prev_optimized_queries/query-02.sql', 'D:/BDMA/Data Warehouses/tpcds-benchmark/all_queries/prev_optimized_queries/query-03.sql', 'D:/BDMA/Data Warehouses/tpcds-benchmark/all_queries/prev_optimized_queries/query-04.sql', 'D:/BDMA/Data Warehouses/tpcds-benchmark/all_queries/prev_optimized_queries/query-05.sql']


In [8]:
# setup dataframe for recording query execution run times

# Build the labels from files_abs_path — the same list the benchmark loop
# executes — rather than from a separate os.listdir() call. Two independent
# listings are not guaranteed to agree in order or content, which would
# attach timings and result files to the wrong query label.
# The [-6:-4] slice takes the two characters before the 4-character
# extension (e.g. "01" from ".../query-01.sql").
query_name_list = ["Q" + script_path[-6:-4] for script_path in files_abs_path]

# one row per query; the timing columns are added by the benchmark loop
query_name_dict = {'query': query_name_list}
exec_details_df = pd.DataFrame(query_name_dict)

In [9]:
# get the date-time before all 99 queries have run (with iterations if chosen)

# keep the raw datetime for the elapsed-time calculation later on
run_start_default = datetime.now()

# human-readable form: dd/mm/YYYY HH:MM:SS
TIMESTAMP_FORMAT = "%d/%m/%Y %H:%M:%S"
run_start = run_start_default.strftime(TIMESTAMP_FORMAT)
print("Overall Run Start:", run_start)

Overall Run Start: 23/10/2024 14:33:24


In [10]:
# run all 99 queries in sequence, and multiple iterations if chosen
# save query result table output (first iteration only)
# save query execution run time (for all iterations)

# counts failed executions across ALL iterations (one per failing run)
q_errors = 0
# choose number of iterations to run
n_iterations = 3

# make sure the output folder for this scale factor exists before any csv is written
os.makedirs(f'performance_test/{sf}', exist_ok = True)

for i in range(1, n_iterations + 1):

    clear_output(wait = True)
    print(f'Iteration {i}\n')
    # formatted run times for this iteration, in the same order as files_abs_path
    exec_details = []
    iteration_start = datetime.now()
    for q_index, sql_script in enumerate(files_abs_path):

        query_num = query_name_list[q_index]

        # read the script up front inside a context manager: the file handle
        # is closed deterministically and the file I/O is no longer counted
        # as part of the query execution time
        with open(sql_script, "r") as sql_file:
            sql_text = sql_file.read()

        error_text = None
        exec_start = datetime.now()
        try:
            cur.execute(sql_text)
        except Exception as e:
            q_errors += 1
            outcome = "Error"
            # keep the message: `e` is unbound once the except block ends
            error_text = f'{type(e).__name__}: {e}'
        else:
            outcome = "Success"
        exec_end = datetime.now()

        if outcome == "Error":
            # NOTE(review): assumes `cur` is a DB-API cursor exposing
            # `.connection` (psycopg2 does). Without autocommit a failed
            # statement leaves the transaction aborted and every following
            # query would fail too, so roll back before continuing.
            # Confirm against connect_postgres in utils.
            conn = getattr(cur, "connection", None)
            if conn is not None:
                conn.rollback()

        exec_run_time = "{:.2f}".format((exec_end - exec_start).total_seconds())
        # report the real outcome instead of an unconditional "Success"
        print(f'{query_num}: {outcome}, Execution Time: {exec_run_time}s')
        if error_text is not None:
            print(f'    {error_text}')
        exec_details.append(exec_run_time)

        # load table output to csv file (on first iteration only);
        # skipped when the query failed or produced no result set, because
        # fetchall()/description are not usable in those cases
        if i == 1 and outcome == "Success" and cur.description is not None:
            df = pd.DataFrame(cur.fetchall(), columns = [desc[0] for desc in cur.description])
            df.to_csv(f'performance_test/{sf}/{query_num}.csv', index = False)

    iteration_end = datetime.now()
    # seconds -> hours
    iteration_run_time = "{:.2f}".format(((iteration_end - iteration_start).total_seconds()) / 3600)
    print(f'\n{sf.upper()}, Iteration {i}, Total run time for the 99 queries: {iteration_run_time}hr')

    # append iteration execution details to dataframe
    exec_details_df[f'exec_time_iter_{i}'] = np.array(exec_details)

Iteration 3

Q01: Success, Execution Time: 0.28s
Q02: Success, Execution Time: 0.91s
Q03: Success, Execution Time: 0.03s
Q04: Success, Execution Time: 3279.80s
Q05: Success, Execution Time: 2.61s
Q06: Success, Execution Time: 0.43s
Q07: Success, Execution Time: 3.91s
Q08: Success, Execution Time: 0.17s
Q09: Success, Execution Time: 4.62s
Q10: Success, Execution Time: 8.58s
Q11: Success, Execution Time: 39.40s
Q12: Success, Execution Time: 0.14s
Q13: Success, Execution Time: 1.15s
Q14: Success, Execution Time: 202.82s
Q15: Success, Execution Time: 0.54s
Q16: Success, Execution Time: 9.33s
Q17: Success, Execution Time: 2.95s
Q18: Success, Execution Time: 0.57s
Q19: Success, Execution Time: 0.26s
Q20: Success, Execution Time: 0.30s
Q21: Success, Execution Time: 0.73s
Q22: Success, Execution Time: 43.28s
Q23: Success, Execution Time: 43.73s
Q24: Success, Execution Time: 0.15s
Q25: Success, Execution Time: 1.99s
Q26: Success, Execution Time: 2.20s
Q27: Success, Execution Time: 2.98s
Q28: Su

In [11]:
# check total amount of query errors
# (q_errors is incremented once per failed execution, across every iteration)

error_summary = f"We have a total of {q_errors} queries with error"
print(error_summary)

We have a total of 0 queries with error


In [12]:
# get the date-time after all 99 queries have run (with iterations if chosen)

# keep the raw datetime for the elapsed-time calculation
run_end_default = datetime.now()

# human-readable form: dd/mm/YYYY HH:MM:SS
TIMESTAMP_FORMAT = "%d/%m/%Y %H:%M:%S"
run_end = run_end_default.strftime(TIMESTAMP_FORMAT)
print(f"Overall Run End (with {n_iterations} iterations):", run_end)

Overall Run End (with 3 iterations): 23/10/2024 17:35:41


In [13]:
# get the total run time (in hours) for all 99 queries to complete (with iterations if chosen)

# wall-clock seconds between the recorded run start and run end
elapsed_seconds = (run_end_default - run_start_default).total_seconds()
# seconds -> hours, kept as a two-decimal string for display
total_run_time = "{:.2f}".format(elapsed_seconds / 3600)
print(f'Total run time for the 99 queries (with {n_iterations} iterations): {total_run_time}hr')

Total run time for the 99 queries (with 3 iterations): 3.04hr


In [14]:
# full details on query execution times (including iterations & average)
# load execution details to csv

# Select the per-iteration timing columns by name instead of by position:
# with iloc[:, 1:] a re-run of this cell would fold the already-present
# avg_exec_time column into the new average.
# The timings were stored as formatted strings, hence the to_numeric step.
iter_times = exec_details_df.filter(like = 'exec_time_iter_').apply(pd.to_numeric)
exec_details_df['avg_exec_time'] = np.round(iter_times.mean(axis = 1), 2)
exec_details_df.to_csv(f'performance_test/{sf}/exec_time_details_{sf}.csv', index = False)
exec_details_df

Unnamed: 0,query,exec_time_iter_1,exec_time_iter_2,exec_time_iter_3,avg_exec_time
0,Q01,1.25,0.28,0.28,0.60
1,Q02,1.03,0.92,0.91,0.95
2,Q03,0.12,0.03,0.03,0.06
3,Q04,2835.68,2867.35,3279.80,2994.28
4,Q05,1.50,1.39,2.61,1.83
...,...,...,...,...,...
94,Q95,18.81,18.72,35.14,24.22
95,Q96,0.33,0.34,0.57,0.41
96,Q97,1.57,1.57,2.88,2.01
97,Q98,0.27,0.28,0.54,0.36


In [15]:
# close connection to db

# Closing only the cursor leaves the underlying connection open, so close
# the connection as well.
# NOTE(review): relies on the cursor exposing `.connection` (psycopg2 does);
# the getattr guard keeps this a no-op otherwise — confirm against
# connect_postgres in utils.
db_connection = getattr(cur, "connection", None)
cur.close()
if db_connection is not None:
    db_connection.close()

#### End of script.