# TPC-DS: Query Performance Test Script

In [68]:
import os
import numpy as np
import pandas as pd
from utils import connect_postgres, get_files_absolute_path_from_dir, execute_sql
from datetime import datetime
from IPython.display import clear_output
import shutil

In [69]:
# label of the scale factor under test; it is used below to build the
# output paths under performance_test/ and in the printed summaries
sf = "sf_4"

In [70]:
# name of the database to benchmark; the object returned by connect_postgres
# is used as the query cursor for the rest of the notebook
db_name = 'tpcds'
cur = connect_postgres(db_name)

PostgreSQL server information
{'user': 'postgres', 'channel_binding': 'prefer', 'dbname': 'tpcds', 'host': 'localhost', 'port': '5432', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'sslcertmode': 'allow', 'sslsni': '1', 'ssl_min_protocol_version': 'TLSv1.2', 'gssencmode': 'disable', 'krbsrvname': 'postgres', 'gssdelegation': '0', 'target_session_attrs': 'any', 'load_balance_hosts': 'disable'} 

You are connected to -  ('PostgreSQL 16.4, compiled by Visual C++ build 1940, 64-bit',) 



In [71]:
# run the script that drops all indexes
drop_indexes_script = os.path.join("index_setup", "drop_all_indexes.sql")
execute_sql(cur, drop_indexes_script)

SQL Status Output:
 DO


In [72]:
# run the script that (re)creates the generated indexes
create_indexes_script = os.path.join("index_setup", "generated_indexes.sql")
execute_sql(cur, create_indexes_script)

SQL Status Output:
 CREATE INDEX


In [73]:
import tempfile

# directory holding the partition-creation SQL scripts
partition_dir = os.path.join(".", "all_queries", "partitions_creation")

# os.listdir() returns entries in arbitrary order; sort them so the scripts
# are always executed in the same (alphabetical) sequence
sql_files = sorted(f for f in os.listdir(partition_dir) if f.endswith(".sql"))

for sql_file in sql_files:
    sql_file_path = os.path.join(partition_dir, sql_file)

    with open(sql_file_path, 'r') as f:
        sql_commands = f.read()

    # write SQL commands to a temporary file and pass its path to execute_sql
    temp_sql_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=".sql")
    temp_sql_file_path = temp_sql_file.name
    try:
        # the file must be closed before it is handed over (and before it can
        # be removed on Windows); the with-block guarantees that
        with temp_sql_file:
            temp_sql_file.write(sql_commands)
        execute_sql(cur, temp_sql_file_path)
    finally:
        # delete=False means nobody else cleans up: remove the file even when
        # writing or executing the script raised
        os.remove(temp_sql_file_path)
    print(f"Executed {sql_file} via temporary file {temp_sql_file_path}")

SQL Status Output:
 DO
Executed catalog_sales_partitioned.sql via temporary file C:\Users\Kristof\AppData\Local\Temp\tmphdnkpzhu.sql
SQL Status Output:
 DO
Executed store_sales_partitioned.sql via temporary file C:\Users\Kristof\AppData\Local\Temp\tmp_d3iark9.sql
SQL Status Output:
 DO
Executed web_sales_partitioned.sql via temporary file C:\Users\Kristof\AppData\Local\Temp\tmpzxd66a80.sql


In [74]:
# # directories
# prev_dir = os.path.join(os.getcwd(), 'all_queries', 'prev_optimized_queries')
# our_dir = os.path.join(os.getcwd(), 'all_queries', 'our_optimized_queries')
# final_dir = os.path.join(os.getcwd(), 'all_queries', 'optimized_queries_final')

# # create the final directory if not exists
# if not os.path.exists(final_dir):
#     os.makedirs(final_dir)

# # list of .sql files in each directory
# prev_files = [f for f in os.listdir(prev_dir) if f.endswith('.sql')]
# our_files = [f for f in os.listdir(our_dir) if f.endswith('.sql')]

# # set of all files
# all_files = set(prev_files).union(set(our_files))

# for filename in all_files:
#     if filename in our_files:
#         # take from our_optimized_queries
#         src = os.path.join(our_dir, filename)
#     else:
#         # take from prev_optimized_queries
#         src = os.path.join(prev_dir, filename)
#     dst = os.path.join(final_dir, filename)
#     shutil.copyfile(src, dst)

In [75]:
# build the absolute path of the directory that holds the final queries
# (resolved against the current working directory) and list its entries

queries_subdir = os.path.join('all_queries', 'optimized_queries_final')
path = os.path.join(os.getcwd(), queries_subdir)
files = os.listdir(path)
print(path)

D:\BDMA\Data Warehouses\tpcds-benchmark\all_queries\optimized_queries_final


In [76]:
# collect the query files found in `path`; the result is the list that the
# benchmark loop iterates over and passes to open()
# NOTE(review): ordering and filtering are decided inside the helper - confirm
# it returns the files sorted by query number
files_abs_path = get_files_absolute_path_from_dir(path)

Total files: 99
First few files...
['D:/BDMA/Data Warehouses/tpcds-benchmark/all_queries/optimized_queries_final/query-01.sql', 'D:/BDMA/Data Warehouses/tpcds-benchmark/all_queries/optimized_queries_final/query-02.sql', 'D:/BDMA/Data Warehouses/tpcds-benchmark/all_queries/optimized_queries_final/query-03.sql', 'D:/BDMA/Data Warehouses/tpcds-benchmark/all_queries/optimized_queries_final/query-04.sql', 'D:/BDMA/Data Warehouses/tpcds-benchmark/all_queries/optimized_queries_final/query-05.sql']


In [77]:
# setup dataframe for recording query execution run times

# Build the labels from files_abs_path - the list the benchmark loop actually
# iterates - instead of from os.listdir(), whose order is arbitrary and which
# may contain extra entries. The loop matches labels to queries by position,
# so both must come from the same sequence.
# The label is "Q" plus the two characters in front of the 4-character
# extension, e.g. ".../query-01.sql" -> "Q01".
query_name_list = ["Q" + sql_path[-6:-4] for sql_path in files_abs_path]

query_name_dict = {'query':query_name_list}
exec_details_df = pd.DataFrame(query_name_dict)

In [78]:
# capture the wall-clock time right before the benchmark starts
# (covers all queries and every iteration)

run_start_default = datetime.now()
# human-readable form: dd/mm/YYYY HH:MM:SS
run_start = format(run_start_default, "%d/%m/%Y %H:%M:%S")
print(f"Overall Run Start: {run_start}")

Overall Run Start: 27/10/2024 21:53:13


In [79]:
# run all 99 queries in sequence, and multiple iterations if chosen
# save query result table output (first iteration only)
# save query execution run time (for all iterations)

# number of failed query executions, accumulated over all iterations
q_errors = 0
# choose number of iterations to run
n_iterations = 3

# the per-query result tables are written below this directory; create it up
# front so the first to_csv call cannot fail on a missing folder
os.makedirs(f'performance_test/{sf}', exist_ok = True)

for i in range(1, n_iterations + 1):

    # only the output of the iteration currently running stays visible
    clear_output(wait = True)
    print(f'Iteration {i}\n')
    # run times of this iteration (formatted strings), one entry per query
    exec_details = []
    iteration_start = datetime.now()
    for q_index, sql_script in enumerate(files_abs_path):

        # labels and query files are matched by position
        query_num = query_name_list[q_index]
        error_message = None

        exec_start = datetime.now()
        try:
            # context manager so the file handle is closed after every query
            with open(sql_script, "r") as sql_file:
                cur.execute(sql_file.read())
        except Exception as e:
            q_errors += 1
            outcome = "Error"
            error_message = str(e).strip()
            # NOTE(review): if the connection is not in autocommit mode,
            # PostgreSQL rejects all following statements until a rollback -
            # confirm how connect_postgres configures the connection
        else:
            outcome = "Success"

        exec_end = datetime.now()
        exec_run_time = "{:.2f}".format((exec_end - exec_start).total_seconds())
        # report the real outcome instead of an unconditional "Success"
        print(f'{query_num}: {outcome}, Execution Time: {exec_run_time}s')
        if error_message is not None:
            print(f'    {error_message}')
        # the time until failure is still recorded for a failed query so that
        # every iteration yields one value per query
        exec_details.append(exec_run_time)

        # load table output to csv file (on first iteration only);
        # skipped when the query failed or produced no result set, because
        # fetchall() would raise in both cases
        if i == 1 and outcome == "Success" and cur.description is not None:
            df = pd.DataFrame(cur.fetchall(), columns = [desc[0] for desc in cur.description])
            df.to_csv(f'performance_test/{sf}/{query_num}.csv', index = False)

    iteration_end = datetime.now()
    iteration_run_time = "{:.2f}".format(((iteration_end - iteration_start).total_seconds()) / 3600)
    print(f'\n{sf.upper()}, Iteration {i}, Total run time for the 99 queries: {iteration_run_time}hr')

    # append iteration execution details to dataframe
    exec_details_df[f'exec_time_iter_{i}'] = np.array(exec_details)

Iteration 3

Q01: Success, Execution Time: 0.50s
Q02: Success, Execution Time: 0.99s
Q03: Success, Execution Time: 0.03s
Q04: Success, Execution Time: 12.71s
Q05: Success, Execution Time: 0.99s
Q06: Success, Execution Time: 0.23s
Q07: Success, Execution Time: 0.93s
Q08: Success, Execution Time: 0.09s
Q09: Success, Execution Time: 3.32s
Q10: Success, Execution Time: 1.23s
Q11: Success, Execution Time: 20.72s
Q12: Success, Execution Time: 0.08s
Q13: Success, Execution Time: 1.06s
Q14: Success, Execution Time: 23.97s
Q15: Success, Execution Time: 0.19s
Q16: Success, Execution Time: 4.70s
Q17: Success, Execution Time: 1.57s
Q18: Success, Execution Time: 0.89s
Q19: Success, Execution Time: 0.31s
Q20: Success, Execution Time: 0.14s
Q21: Success, Execution Time: 0.93s
Q22: Success, Execution Time: 19.65s
Q23: Success, Execution Time: 12.53s
Q24: Success, Execution Time: 0.06s
Q25: Success, Execution Time: 3.04s
Q26: Success, Execution Time: 1.08s
Q27: Success, Execution Time: 0.50s
Q28: Succe

In [80]:
# report how many query executions ended with an error

print("We have a total of {} queries with error".format(q_errors))

We have a total of 0 queries with error


In [81]:
# capture the wall-clock time right after the benchmark has finished
# (covers all queries and every iteration)

run_end_default = datetime.now()
# human-readable form: dd/mm/YYYY HH:MM:SS
run_end = format(run_end_default, "%d/%m/%Y %H:%M:%S")
print(f"Overall Run End (with {n_iterations} iterations): {run_end}")

Overall Run End (with 3 iterations): 27/10/2024 22:31:13


In [82]:
# overall duration of the benchmark in hours (all queries, all iterations),
# kept as a string with two decimals

elapsed_seconds = (run_end_default - run_start_default).total_seconds()
total_run_time = f"{elapsed_seconds / 3600:.2f}"
print('Total run time for the 99 queries (with {} iterations): {}hr'.format(n_iterations, total_run_time))

Total run time for the 99 queries (with 3 iterations): 0.63hr


In [83]:
# full details on query execution times (including iterations & average)
# load execution details to csv

# Average only the per-iteration timing columns. Selecting "everything after
# the first column" would fold an already computed 'avg_exec_time' into the
# mean whenever this cell is run a second time.
# The timings were stored as formatted strings, hence the numeric conversion.
iteration_times = exec_details_df.filter(regex = r'^exec_time_iter_\d+$').apply(pd.to_numeric)
exec_details_df['avg_exec_time'] = np.round(iteration_times.mean(axis = 1), 2)
exec_details_df.to_csv(f'performance_test/{sf}/exec_time_details_{sf}.csv', index = False)
exec_details_df

Unnamed: 0,query,exec_time_iter_1,exec_time_iter_2,exec_time_iter_3,avg_exec_time
0,Q01,1.98,0.49,0.50,0.99
1,Q02,0.97,0.95,0.99,0.97
2,Q03,0.13,0.03,0.03,0.06
3,Q04,12.34,12.87,12.71,12.64
4,Q05,1.30,0.98,0.99,1.09
...,...,...,...,...,...
94,Q95,514.19,522.73,515.46,517.46
95,Q96,0.08,0.08,0.06,0.07
96,Q97,1.63,1.67,1.64,1.65
97,Q98,0.25,0.25,0.24,0.25


In [84]:
# close connection to db
# NOTE(review): this closes the object returned by connect_postgres, which is
# used as a cursor throughout the notebook; whether the underlying connection
# is released as well depends on that helper - confirm

cur.close()

#### End of script.