# Data Processing for Query Execution Times

We will process and merge execution time data from different sets of queries to prepare it for analysis in a Power BI dashboard. The final dataset will have the following columns:

- **Query Number**
- **Scale Factor**
- **Type** (Base Queries / Referenced Queries / Our Queries)
- **Average Execution Time**

In [29]:
import os, re
import pandas as pd
from utils import get_files_absolute_path_from_dir, exclude_non_csv_files

In [30]:
base_dir = os.path.join(os.getcwd(), 'performance_test', 'quieries_prior_to_optimization')
referenced_dir = os.path.join(os.getcwd(), 'performance_test', 'queries_optimized_in_referenced_work')
our_dir = os.path.join('.', 'performance_test', 'queries_optimized_by_us')

In [38]:
# sf for base queries
base_sfs = [1, 2, 3]

base_csv_files = []
for sf in base_sfs:
    sf_dir = os.path.join(base_dir, f'sf_{sf}')
    file_name = f'exec_time_details_sf_{sf}.csv'
    file_path = os.path.join(sf_dir, file_name)
    base_csv_files.append(file_path)

# sf for referenced queries
referenced_sfs = [1, 2, 3, 4]

referenced_csv_files = []
for sf in referenced_sfs:
    sf_dir = os.path.join(referenced_dir, f'sf_{sf}')
    file_name = f'exec_time_details_sf_{sf}.csv'
    file_path = os.path.join(sf_dir, file_name)
    referenced_csv_files.append(file_path)

# sf for our queries
our_sfs = [1, 2, 3, 4]

our_csv_files = []
for sf in our_sfs:
    sf_dir = os.path.join(our_dir, f'sf_{sf}')
    file_name = f'exec_time_details_sf_{sf}.csv'
    file_path = os.path.join(sf_dir, file_name)
    our_csv_files.append(file_path)

for file in base_csv_files:
    print(file)
for file in referenced_csv_files:
    print(file)
for file in our_csv_files:
    print(file)

D:\BDMA\Data Warehouses\tpcds-benchmark\performance_test\quieries_prior_to_optimization\sf_1\exec_time_details_sf_1.csv
D:\BDMA\Data Warehouses\tpcds-benchmark\performance_test\quieries_prior_to_optimization\sf_2\exec_time_details_sf_2.csv
D:\BDMA\Data Warehouses\tpcds-benchmark\performance_test\quieries_prior_to_optimization\sf_3\exec_time_details_sf_3.csv
D:\BDMA\Data Warehouses\tpcds-benchmark\performance_test\queries_optimized_in_referenced_work\sf_1\exec_time_details_sf_1.csv
D:\BDMA\Data Warehouses\tpcds-benchmark\performance_test\queries_optimized_in_referenced_work\sf_2\exec_time_details_sf_2.csv
D:\BDMA\Data Warehouses\tpcds-benchmark\performance_test\queries_optimized_in_referenced_work\sf_3\exec_time_details_sf_3.csv
D:\BDMA\Data Warehouses\tpcds-benchmark\performance_test\queries_optimized_in_referenced_work\sf_4\exec_time_details_sf_4.csv
.\performance_test\queries_optimized_by_us\sf_1\exec_time_details_sf_1.csv
.\performance_test\queries_optimized_by_us\sf_2\exec_time_det

In [32]:
def process_csv_files(file_list, query_type):
    dfs = []
    for file in file_list:
        if not os.path.isfile(file):
            print(f"File not found: {file}")
            continue
        
        try:
            df = pd.read_csv(file)
        except Exception as e:
            print(f"Error reading {file}: {e}")
            continue
        
        if 'avg_exec_time' in df.columns:
            df = df[['query', 'avg_exec_time']]
        else:
            print(f"'avg_exec_time' column not found in {file}")
            continue
        
        # sf from the file path
        sf_dir = os.path.basename(os.path.dirname(file))
        sf = int(sf_dir.split('_')[-1])
        df['Scale Factor'] = sf
        # Add the 'Type' column
        df['Type'] = query_type
        dfs.append(df)
    if dfs:
        combined_df = pd.concat(dfs, ignore_index=True)
        return combined_df
    else:
        return pd.DataFrame(columns=['query', 'avg_exec_time', 'Scale Factor', 'Type'])

In [33]:
print("Processing Base Queries...")
base_queries_df = process_csv_files(base_csv_files, 'Base Queries')

print("Processing Referenced Queries...")
referenced_queries_df = process_csv_files(referenced_csv_files, 'Referenced Queries')

print("Processing Our Queries...")
our_queries_df = process_csv_files(our_csv_files, 'Our Queries')

Processing Base Queries...
Processing Referenced Queries...
Processing Our Queries...


In [34]:
# merge
final_df = pd.concat([base_queries_df, referenced_queries_df, our_queries_df], ignore_index=True)
final_df.head()

Unnamed: 0,query,avg_exec_time,Scale Factor,Type
0,Q01,1269.67,1,Base Queries
1,Q02,0.55,1,Base Queries
2,Q03,0.27,1,Base Queries
3,Q04,3106.44,1,Base Queries
4,Q05,0.42,1,Base Queries


In [35]:
# rename
final_df.rename(columns={
    'query': 'Query Number',
    'avg_exec_time': 'Average Execution Time'
}, inplace=True)

# reorder
final_df = final_df[['Query Number', 'Scale Factor', 'Type', 'Average Execution Time']]

In [36]:
final_df.head()

Unnamed: 0,Query Number,Scale Factor,Type,Average Execution Time
0,Q01,1,Base Queries,1269.67
1,Q02,1,Base Queries,0.55
2,Q03,1,Base Queries,0.27
3,Q04,1,Base Queries,3106.44
4,Q05,1,Base Queries,0.42


In [37]:
final_df.to_csv('performance_test/merged_query_execution_times.csv', index=False)