# TPCDS: Query Performance Test Script

In [1]:
import os
import numpy as np
import pandas as pd
from utils import connect_postgres, get_files_absolute_path_from_dir, execute_sql
from datetime import datetime
from IPython.display import clear_output

In [2]:
# Scale factor under test; this label is also embedded in the output paths
# (performance_test/<sf>/...) used when results are written further down.
sf = "sf_1"

In [3]:
# Name of the database passed to the connection helper.
db_name = "tpcds"
# `cur` is used by the rest of the notebook as a cursor-like object
# (execute / fetchall / description / close are called on it).
# NOTE(review): connect_postgres comes from utils and is not visible here;
# presumably it opens a PostgreSQL connection and returns a cursor on it —
# confirm, including whether autocommit is enabled.
cur = connect_postgres(db_name)

PostgreSQL server information
{'user': 'postgres', 'channel_binding': 'prefer', 'dbname': 'tpcds', 'host': 'localhost', 'port': '25433', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'sslcertmode': 'allow', 'sslsni': '1', 'ssl_min_protocol_version': 'TLSv1.2', 'gssencmode': 'prefer', 'krbsrvname': 'postgres', 'gssdelegation': '0', 'target_session_attrs': 'any', 'load_balance_hosts': 'disable'} 

You are connected to -  ('PostgreSQL 17.0 (Debian 17.0-1.pgdg120+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit',) 



In [6]:
# Run the drop_all_indexes.sql script from the index_setup directory.
# NOTE(review): the saved execution counts show this cell (In [6]) was run after
# the index-creation cell (In [5]) — confirm which index state the benchmark
# results below were actually measured against.
drop_indexes_script = os.path.join("index_setup", "drop_all_indexes.sql")
execute_sql(cur, drop_indexes_script)

SQL Status Output:
 DO


In [5]:
# Run the index_setup.sql script from the index_setup directory to create the indexes.
create_indexes_script = os.path.join("index_setup", "index_setup.sql")
execute_sql(cur, create_indexes_script)

SQL Status Output:
 CREATE INDEX


In [3]:
# Build the path of the query-script directory (relative to the current working
# directory), list its entries, and echo the resolved path.

queries_subdir = ('all_queries', 'optimized_queries_final')
path = os.path.join(os.getcwd(), *queries_subdir)
files = os.listdir(path)
print(path)

/home/alfio/Desktop/DataWarehouse/TPC_DS/tpcds-benchmark/all_queries/optimized_queries_final


In [4]:
# Collect the query-script paths for the directory in `path`. The benchmark loop
# further down iterates this list, opening and executing each entry.
# NOTE(review): get_files_absolute_path_from_dir comes from utils and is not
# visible here; presumably it returns absolute paths in the same order as
# os.listdir(path) and prints the summary shown below — confirm.
files_abs_path = get_files_absolute_path_from_dir(path)

Total files: 99
First few files...
['/home/alfio/Desktop/DataWarehouse/TPC_DS/tpcds-benchmark/all_queries/optimized_queries_final/query-08.sql', '/home/alfio/Desktop/DataWarehouse/TPC_DS/tpcds-benchmark/all_queries/optimized_queries_final/query-79.sql', '/home/alfio/Desktop/DataWarehouse/TPC_DS/tpcds-benchmark/all_queries/optimized_queries_final/query-06.sql', '/home/alfio/Desktop/DataWarehouse/TPC_DS/tpcds-benchmark/all_queries/optimized_queries_final/query-81.sql', '/home/alfio/Desktop/DataWarehouse/TPC_DS/tpcds-benchmark/all_queries/optimized_queries_final/query-30.sql']


In [11]:
# setup dataframe for recording query execution run times

# Build the query labels from files_abs_path — the list the benchmark loop
# actually iterates — instead of from a separate os.listdir() call. The loop
# pairs labels and scripts by position, so deriving both from the same list
# guarantees each label belongs to the script executed at that position and
# that both sequences have the same length.
# The two characters before the ".sql" extension are the query number,
# e.g. ".../query-08.sql" -> "Q08".
query_name_list = [
    "Q" + os.path.basename(sql_path)[-6:-4]
    for sql_path in files_abs_path
]

# one row per query; the per-iteration timing columns are added by the benchmark loop
query_name_dict = {'query': query_name_list}
exec_details_df = pd.DataFrame(query_name_dict)

In [12]:
# Capture the wall-clock time just before the benchmark loop starts
# (covers every query and every iteration).

run_start_default = datetime.now()
# human-readable form: dd/mm/YYYY HH:MM:SS
run_start = datetime.strftime(run_start_default, "%d/%m/%Y %H:%M:%S")
print("Overall Run Start:", run_start)

Overall Run Start: 17/10/2024 18:03:02


In [15]:
# Run every query script in sequence, repeated for the chosen number of iterations.
# - save each query's result table to CSV (first iteration only)
# - record each query's execution run time (for all iterations)
# - report and count queries that fail instead of labelling them as successful

q_errors = 0
# (iteration, query, error message) for every failed execution; kept because
# clear_output() wipes the printed log at the start of each iteration
failed_queries = []
exec_details = []
# choose number of iterations to run
n_iterations = 3

# Labels and scripts are paired by position, so a length mismatch would mislabel
# results; fail here rather than part-way through (or after) a long run.
if len(query_name_list) != len(files_abs_path):
    raise ValueError(
        f'{len(query_name_list)} query names but {len(files_abs_path)} query scripts'
    )

# make sure the directory the result tables are written to exists
os.makedirs(f'performance_test/{sf}', exist_ok = True)

for i in range(1, n_iterations + 1):

    clear_output(wait = True)
    print(f'Iteration {i}\n')
    # formatted run times for this iteration, one entry per query, in script order
    exec_details = []
    iteration_start = datetime.now()
    for query_num, sql_script in zip(query_name_list, files_abs_path):

        exec_start = datetime.now()
        try:
            # context manager closes the script file even if reading fails
            with open(sql_script, "r") as sql_file:
                sql_text = sql_file.read()
            cur.execute(sql_text)
        except Exception as e:
            # NOTE(review): if the connection is not in autocommit mode, PostgreSQL
            # presumably rejects every later statement until the transaction is
            # rolled back — confirm how connect_postgres configures the connection.
            q_errors += 1
            outcome = "Error"
            failed_queries.append((i, query_num, str(e)))
        else:
            outcome = "Success"

        exec_end = datetime.now()
        exec_run_time = "{:.2f}".format((exec_end - exec_start).total_seconds())
        print(f'{query_num}: {outcome}, Execution Time: {exec_run_time}s')
        if outcome == "Error":
            print(f'    {failed_queries[-1][2]}')
        # the time is recorded for failed queries too, so every iteration column
        # keeps one entry per query
        exec_details.append(exec_run_time)

        # load table output to csv file (on first iteration only); skipped when the
        # query failed or produced no result set, where there is nothing to fetch
        if i == 1 and outcome == "Success" and cur.description is not None:
            df = pd.DataFrame(cur.fetchall(), columns = [desc[0] for desc in cur.description])
            df.to_csv(f'performance_test/{sf}/{query_num}.csv', index = False)

    iteration_end = datetime.now()
    iteration_run_time = "{:.2f}".format(((iteration_end - iteration_start).total_seconds()) / 3600)
    print(f'\n{sf.upper()}, Iteration {i}, Total run time for the 99 queries: {iteration_run_time}hr')

    # append iteration execution details to dataframe
    exec_details_df[f'exec_time_iter_{i}'] = np.array(exec_details)

Iteration 3

Q08: Success, Execution Time: 0.06s
Q79: Success, Execution Time: 0.33s
Q06: Success, Execution Time: 0.14s
Q81: Success, Execution Time: 0.06s
Q30: Success, Execution Time: 11.42s
Q83: Success, Execution Time: 0.10s
Q46: Success, Execution Time: 0.44s
Q90: Success, Execution Time: 0.10s
Q62: Success, Execution Time: 0.14s
Q16: Success, Execution Time: 2.47s
Q82: Success, Execution Time: 0.04s
Q89: Success, Execution Time: 0.31s
Q72: Success, Execution Time: 0.31s
Q75: Success, Execution Time: 1.26s
Q17: Success, Execution Time: 2.30s
Q77: Success, Execution Time: 0.48s
Q32: Success, Execution Time: 0.09s
Q11: Success, Execution Time: 5.40s
Q19: Success, Execution Time: 0.21s
Q64: Success, Execution Time: 0.34s
Q66: Success, Execution Time: 0.20s
Q61: Success, Execution Time: 0.15s
Q87: Success, Execution Time: 1.21s
Q57: Success, Execution Time: 1.08s
Q27: Success, Execution Time: 0.49s
Q60: Success, Execution Time: 1.42s
Q53: Success, Execution Time: 0.25s
Q88: Success, 

In [16]:
# Report how many query executions raised an error during the benchmark loop.

error_summary = f"We have a total of {q_errors} queries with error"
print(error_summary)

We have a total of 0 queries with error


In [17]:
# Capture the wall-clock time once the benchmark loop has finished
# (covers every query and every iteration).

run_end_default = datetime.now()
# human-readable form: dd/mm/YYYY HH:MM:SS
run_end = datetime.strftime(run_end_default, "%d/%m/%Y %H:%M:%S")
print(f"Overall Run End (with {n_iterations} iterations):", run_end)

Overall Run End (with 3 iterations): 17/10/2024 18:17:35


In [18]:
# Overall elapsed time between the recorded run start and run end, in hours
# (covers every query and every iteration).

elapsed = run_end_default - run_start_default
elapsed_hours = elapsed.total_seconds() / 3600
total_run_time = "{:.2f}".format(elapsed_hours)
print(f'Total run time for the 99 queries (with {n_iterations} iterations): {total_run_time}hr')

Total run time for the 99 queries (with 3 iterations): 0.24hr


In [19]:
# full details on query execution times (including iterations & average)
# load execution details to csv

# Average only the per-iteration timing columns. Selecting them by name (rather
# than "every column after the first") keeps this cell idempotent: on a re-run
# the existing avg_exec_time column is not folded back into its own average.
iteration_columns = [
    col for col in exec_details_df.columns
    if str(col).startswith('exec_time_iter_')
]
# run times are stored as formatted strings, so convert them before averaging
iteration_times = exec_details_df[iteration_columns].apply(pd.to_numeric)
exec_details_df['avg_exec_time'] = np.round(iteration_times.mean(axis = 1), 2)
exec_details_df.to_csv(f'performance_test/{sf}/exec_time_details_{sf}.csv', index = False)
exec_details_df

Unnamed: 0,query,exec_time_iter_1,exec_time_iter_2,exec_time_iter_3,avg_exec_time
0,Q08,0.06,0.07,0.06,0.06
1,Q79,0.47,0.34,0.33,0.38
2,Q06,0.14,0.14,0.14,0.14
3,Q81,0.07,0.06,0.06,0.06
4,Q30,10.90,11.16,11.42,11.16
...,...,...,...,...,...
94,Q45,0.07,0.08,0.09,0.08
95,Q74,1.72,1.97,1.85,1.85
96,Q12,0.08,0.05,0.07,0.07
97,Q29,1.38,1.41,1.48,1.42


In [6]:
# close the cursor and the database connection behind it

# Closing the cursor alone leaves the connection open. Look the connection up
# before closing the cursor, then close both.
# NOTE(review): presumably `cur` is a psycopg2 cursor, which exposes its parent
# connection as `.connection` — confirm against connect_postgres in utils. The
# getattr guard keeps the original behaviour if no such attribute exists.
connection = getattr(cur, "connection", None)
cur.close()
if connection is not None:
    connection.close()

#### End of script.