In [12]:
import numpy as np
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine, text
import pandas as pd
import json
# from naivebayes import NaiveBayesTextClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
import time
import math

In [13]:
load_dotenv()

True

In [14]:
# username = os.getenv('SNOWFLAKE_USER')
# password = os.getenv('SNOWFLAKE_PASSWORD')
# account = os.getenv('SNOWFLAKE_ACCOUNT')
# warehouse = os.getenv('SNOWFLAKE_WAREHOUSE')
# database = os.getenv('SNOWFLAKE_DATABASE_TPC')
# schema = os.getenv('SNOWFLAKE_SCHEMA_TPC')

In [15]:
# engine = create_engine(
#     f'snowflake://{username}:{password}@{account}/{database}/{schema}?warehouse={warehouse}'
# )

In [16]:
Query1 = """
SELECT
    l_returnflag,
    l_linestatus,
    SUM(l_quantity) AS sum_qty,
    SUM(l_extendedprice) AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    AVG(l_quantity) AS avg_qty,
    AVG(l_extendedprice) AS avg_price,
    AVG(l_discount) AS avg_disc,
    COUNT(*) AS count_order
FROM
    lineitem
WHERE
    l_shipdate <= DATEADD(DAY, -90, DATE '1998-12-01')
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
"""


In [17]:
Query5 = """
SELECT
    n_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue
FROM
    customer,
    orders,
    lineitem,
    supplier,
    nation,
    region
WHERE
    c_custkey = o_custkey
    AND l_orderkey = o_orderkey
    AND l_suppkey = s_suppkey
    AND c_nationkey = s_nationkey
    AND s_nationkey = n_nationkey
    AND n_regionkey = r_regionkey
    AND r_name = 'ASIA'
    AND o_orderdate >= DATE '1994-01-01'
    AND o_orderdate < DATEADD(YEAR, 1, DATE '1994-01-01')
GROUP BY
    n_name
ORDER BY
    revenue DESC;
"""


In [18]:
Query18 = """
SELECT
    c_name,
    c_custkey,
    o_orderkey,
    o_orderdate,
    o_totalprice,
    SUM(l_quantity) AS total_quantity
FROM
    customer,
    orders,
    lineitem
WHERE
    o_orderkey IN (
        SELECT
            l_orderkey
        FROM
            lineitem
        GROUP BY
            l_orderkey
        HAVING
            SUM(l_quantity) > 300
    )
    AND c_custkey = o_custkey
    AND o_orderkey = l_orderkey
GROUP BY
    c_name,
    c_custkey,
    o_orderkey,
    o_orderdate,
    o_totalprice
ORDER BY
    o_totalprice DESC,
    o_orderdate;
"""


In [19]:
def query_run(query, number, warehouse_size=None):
    profiling_folder = os.path.join('results', f'query_{number}')
    os.makedirs(profiling_folder, exist_ok=True)
    execution_times = []  
    
    engine = create_engine(
        'snowflake://{user}:{password}@{account}/{database}/{schema}?warehouse={warehouse}'.format(
            user=os.getenv('SNOWFLAKE_USER'),
            password=os.getenv('SNOWFLAKE_PASSWORD'),
            account=os.getenv('SNOWFLAKE_ACCOUNT'),
            database = os.getenv('SNOWFLAKE_DATABASE_TPC'),
            schema = os.getenv('SNOWFLAKE_SCHEMA_TPC'),
            warehouse = os.getenv('SNOWFLAKE_WAREHOUSE')
        )
    )
    
    try:
        with engine.connect() as connection:
            if warehouse_size:
                try:
                    alter_warehouse_query = f"ALTER WAREHOUSE ANIMAL_TASK_WH SET WAREHOUSE_SIZE = '{warehouse_size}'"
                    connection.execute(text(alter_warehouse_query))
                    print(f"Warehouse size updated to {warehouse_size}.")
                except Exception as e:
                    print(f"Error setting warehouse size: {e}")
                    return 

            for i in range(100):
                start_time = time.time()
                result = connection.execute(text(query))
                end_time = time.time()
                elapsed_time = end_time - start_time
                execution_times.append(elapsed_time)
                print(f"Iteration {i + 1}: Query executed in {elapsed_time:.4f} seconds.")
    except Exception as e:
        print(f"Error executing query: {e}")
        return

    file_path = os.path.join(profiling_folder, "execution_times.txt")
    try:
        with open(file_path, "w") as file:
            for exec_time in execution_times:
                file.write(f"{exec_time:.4f}\n")
        print(f"Execution times saved to {file_path}")
    except Exception as e:
        print(f"Error saving execution times to file: {e}")


In [20]:
query_run(Query1,'1','SMALL')

Warehouse size updated to SMALL.
Iteration 1: Query executed in 0.2489 seconds.
Iteration 2: Query executed in 0.2365 seconds.
Iteration 3: Query executed in 0.2524 seconds.
Iteration 4: Query executed in 0.2599 seconds.
Iteration 5: Query executed in 0.2464 seconds.
Iteration 6: Query executed in 0.2572 seconds.
Iteration 7: Query executed in 0.2369 seconds.
Iteration 8: Query executed in 0.2744 seconds.
Iteration 9: Query executed in 0.2748 seconds.
Iteration 10: Query executed in 0.2568 seconds.
Iteration 11: Query executed in 0.3136 seconds.
Iteration 12: Query executed in 0.2968 seconds.
Iteration 13: Query executed in 0.2567 seconds.
Iteration 14: Query executed in 0.2400 seconds.
Iteration 15: Query executed in 0.2640 seconds.
Iteration 16: Query executed in 0.2342 seconds.
Iteration 17: Query executed in 0.2401 seconds.
Iteration 18: Query executed in 0.2550 seconds.
Iteration 19: Query executed in 0.2580 seconds.
Iteration 20: Query executed in 0.3350 seconds.
Iteration 21: Qu

In [21]:
query_run(Query5,'5','SMALL')

Warehouse size updated to SMALL.
Iteration 1: Query executed in 0.2517 seconds.
Iteration 2: Query executed in 0.2614 seconds.
Iteration 3: Query executed in 0.2895 seconds.
Iteration 4: Query executed in 0.2376 seconds.
Iteration 5: Query executed in 0.2427 seconds.
Iteration 6: Query executed in 0.2381 seconds.
Iteration 7: Query executed in 0.2334 seconds.
Iteration 8: Query executed in 0.2544 seconds.
Iteration 9: Query executed in 0.2538 seconds.
Iteration 10: Query executed in 0.2444 seconds.
Iteration 11: Query executed in 0.2561 seconds.
Iteration 12: Query executed in 0.6375 seconds.
Iteration 13: Query executed in 0.2275 seconds.
Iteration 14: Query executed in 0.9591 seconds.
Iteration 15: Query executed in 0.2667 seconds.
Iteration 16: Query executed in 0.2315 seconds.
Iteration 17: Query executed in 0.2454 seconds.
Iteration 18: Query executed in 0.2520 seconds.
Iteration 19: Query executed in 0.2547 seconds.
Iteration 20: Query executed in 0.2471 seconds.
Iteration 21: Qu

In [22]:
query_run(Query18,'18','SMALL')

Warehouse size updated to SMALL.
Iteration 1: Query executed in 0.2862 seconds.
Iteration 2: Query executed in 0.3543 seconds.
Iteration 3: Query executed in 0.2545 seconds.
Iteration 4: Query executed in 0.2603 seconds.
Iteration 5: Query executed in 0.2780 seconds.
Iteration 6: Query executed in 0.2622 seconds.
Iteration 7: Query executed in 0.2645 seconds.
Iteration 8: Query executed in 0.2668 seconds.
Iteration 9: Query executed in 0.2583 seconds.
Iteration 10: Query executed in 0.2617 seconds.
Iteration 11: Query executed in 0.2588 seconds.
Iteration 12: Query executed in 0.2803 seconds.
Iteration 13: Query executed in 0.2571 seconds.
Iteration 14: Query executed in 0.2975 seconds.
Iteration 15: Query executed in 0.2571 seconds.
Iteration 16: Query executed in 0.3107 seconds.
Iteration 17: Query executed in 0.3340 seconds.
Iteration 18: Query executed in 0.2946 seconds.
Iteration 19: Query executed in 0.2508 seconds.
Iteration 20: Query executed in 0.2568 seconds.
Iteration 21: Qu