# TPCDS: Preprocessing, DB Setup and Data Load Script

In [3]:
import os, re
from utils import connect_postgres, get_full_absolute_path, exclude_non_csv_files
from dotenv import load_dotenv

In [4]:
cur = connect_postgres("postgres")

PostgreSQL server information
{'user': 'postgres', 'channel_binding': 'prefer', 'dbname': 'postgres', 'host': 'localhost', 'port': '25433', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'sslcertmode': 'allow', 'sslsni': '1', 'ssl_min_protocol_version': 'TLSv1.2', 'gssencmode': 'prefer', 'krbsrvname': 'postgres', 'gssdelegation': '0', 'target_session_attrs': 'any', 'load_balance_hosts': 'disable'} 

You are connected to -  ('PostgreSQL 17.0 (Debian 17.0-1.pgdg120+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit',) 



In [3]:
# drop tpcds db

db_name = "tpcds"

cur.execute(
    f"DROP DATABASE IF EXISTS {db_name} WITH (FORCE);"
)
print("SQL Status Output:\n", cur.statusmessage)

SQL Status Output:
 DROP DATABASE


In [4]:
# change win1252 encoding temp db to normal before drop
try:
    cur.execute(
        "ALTER DATABASE win1252_temp is_template false;"
    )
except Exception as e:
    print(e)
else:
    print("SQL Status Output:\n", cur.statusmessage)

SQL Status Output:
 ALTER DATABASE


In [5]:
# drop win1252 encoding temp db (after set to normal db)
cur.execute(
    "DROP DATABASE IF EXISTS win1252_temp WITH (FORCE);"
)
print("SQL Status Output:\n", cur.statusmessage)

SQL Status Output:
 DROP DATABASE


In [6]:
# create win1252 encoding temp db
cur.execute(
    """
    
    CREATE DATABASE win1252_temp
        WITH
        OWNER = postgres
        TEMPLATE = template0
        ENCODING = 'WIN1252'
        CONNECTION LIMIT = -1
        IS_TEMPLATE = True;

    """
)
print("SQL Status Output:\n", cur.statusmessage)

SQL Status Output:
 CREATE DATABASE


In [7]:
# create tpcds db
cur.execute(
    f"""

    CREATE DATABASE {db_name}
        WITH
        OWNER = postgres
        TEMPLATE = win1252_temp
        ENCODING = 'WIN1252'
        CONNECTION LIMIT = -1
        IS_TEMPLATE = False;
        
    """
)
print("SQL Status Output:\n", cur.statusmessage)

SQL Status Output:
 CREATE DATABASE


In [8]:
# connect to tpcds db
cur = connect_postgres(db_name)

PostgreSQL server information
{'user': 'postgres', 'channel_binding': 'prefer', 'dbname': 'tpcds', 'host': 'localhost', 'port': '25433', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'sslcertmode': 'allow', 'sslsni': '1', 'ssl_min_protocol_version': 'TLSv1.2', 'gssencmode': 'prefer', 'krbsrvname': 'postgres', 'gssdelegation': '0', 'target_session_attrs': 'any', 'load_balance_hosts': 'disable'} 

You are connected to -  ('PostgreSQL 17.0 (Debian 17.0-1.pgdg120+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit',) 



In [9]:
# create tables for db
cur.execute(open("DSGen-software-code-3.2.0rc1/tools/tpcds.sql", "r").read())
print("SQL Status Output:\n", cur.statusmessage)
cur.execute(open("DSGen-software-code-3.2.0rc1/tools/tpcds_source.sql", "r").read())
print("SQL Status Output:\n", cur.statusmessage)

SQL Status Output:
 CREATE TABLE
SQL Status Output:
 CREATE TABLE


In [5]:
# get dir path

path = os.path.join(os.getcwd(), 'data', 'tmp_1')
files = os.listdir(path)
print(path)

/home/alfio/Desktop/DataWarehouse/TPC_DS/tpcds-benchmark/data/tmp_1


In [6]:
files_abs_path = get_full_absolute_path(path)
files = exclude_non_csv_files(files)
files_abs_path = exclude_non_csv_files(files_abs_path)

Total files: 31
First few files...
['/home/alfio/Desktop/DataWarehouse/TPC_DS/tpcds-benchmark/data/tmp_1/customer_demographics_2_4.csv', '/home/alfio/Desktop/DataWarehouse/TPC_DS/tpcds-benchmark/data/tmp_1/customer_demographics_4_4.csv', '/home/alfio/Desktop/DataWarehouse/TPC_DS/tpcds-benchmark/data/tmp_1/dbgen_version_1_4.csv', '/home/alfio/Desktop/DataWarehouse/TPC_DS/tpcds-benchmark/data/tmp_1/store_1_4.csv', '/home/alfio/Desktop/DataWarehouse/TPC_DS/tpcds-benchmark/data/tmp_1/ship_mode_1_4.csv']


In [8]:
# Assume files_abs_path is defined and contains the list of file paths
file_count = 0

for iteration in range(0, 1):
    for file in files_abs_path:
        if 'dbgen_version' in file:
            with open(file, 'r', encoding='latin-1') as f:
                lines = f.readlines()
            new_lines = []
            for line in lines:
                # Remove the last '^' and any spaces after it
                if(line.count('^') == 4):
                    line = re.sub(r'\^(?!.*\^)\s*', '', line)
                    new_lines.append(line)
                    with open(file, 'w', encoding='latin-1') as f:
                        f.writelines(new_lines)
                    file_count += 1
    print(f'\nIteration {iteration + 1} done!')
    print(f'{file_count} file(s) updated for extra column exclusion.')
    file_count = 0

3

Iteration 1 done!
0 file(s) updated for extra column exclusion.


In [14]:
# generate sql commands for loading data from csv to postgres db
# considers that csv files were generated in parallel stream

sql_commands_file = open('data_load_script.sql','w')

for file in files:
    underscore_index = [underscore_ind.start() for underscore_ind in re.finditer('_', file)]
    file_name = file[:underscore_index[-2]]
    file_path = os.path.join(path, file)
    sql_command = "COPY public."+file_name+" FROM '"+file_path+"' delimiter '^' CSV;\n"
    sql_commands_file.write(sql_command)

sql_commands_file.close()

In [15]:
#insert

import subprocess
import os

db_name = "tpcds"

# psql command
command = [
    "psql",
    "-d", db_name,      # dbname
    "-f", "data_load_script.sql"  # SQL file to execute
]

# copy the env
load_dotenv()
env = os.environ.copy()

# run command as subprocess
subprocess.run(command, env=env)

COPY 480200
COPY 480200
COPY 1
COPY 12
COPY 20
COPY 144067
COPY 50000
COPY 2936250
COPY 2936250
COPY 71763
COPY 1441548
COPY 480200
COPY 86400
COPY 5
COPY 18000
COPY 100000
COPY 60
COPY 719384
COPY 35
COPY 2936250
COPY 300
COPY 2936250
COPY 20
COPY 11718
COPY 480200
COPY 7200
COPY 287514
COPY 6
COPY 30
COPY 73049
COPY 2880404


CompletedProcess(args=['psql', '-d', 'tpcds', '-f', 'data_load_script.sql'], returncode=0)

In [114]:
# add constraints to db

cur.execute(open("tpcds_ri.sql", "r").read())
print("SQL Status Output:\n", cur.statusmessage)


SQL Status Output:
 ALTER TABLE


In [115]:
# close connection to db

cur.close()

#### End of script.