In [34]:
import numpy as np
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine, text
import pandas as pd
import json
# from naivebayes import NaiveBayesTextClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
import time
import math

In [35]:
load_dotenv()

True

In [36]:
# username = os.getenv('SNOWFLAKE_USER')
# password = os.getenv('SNOWFLAKE_PASSWORD')
# account = os.getenv('SNOWFLAKE_ACCOUNT')
# warehouse = os.getenv('SNOWFLAKE_WAREHOUSE')
# database = os.getenv('SNOWFLAKE_DATABASE_TPC')
# schema = os.getenv('SNOWFLAKE_SCHEMA_TPC')

In [37]:
# engine = create_engine(
#     f'snowflake://{username}:{password}@{account}/{database}/{schema}?warehouse={warehouse}'
# )

In [38]:
Query1 = """
SELECT
    l_returnflag,
    l_linestatus,
    SUM(l_quantity) AS sum_qty,
    SUM(l_extendedprice) AS sum_base_price,
    SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
    SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
    AVG(l_quantity) AS avg_qty,
    AVG(l_extendedprice) AS avg_price,
    AVG(l_discount) AS avg_disc,
    COUNT(*) AS count_order
FROM
    lineitem
WHERE
    l_shipdate <= DATEADD(DAY, -90, DATE '1998-12-01')
GROUP BY
    l_returnflag,
    l_linestatus
ORDER BY
    l_returnflag,
    l_linestatus;
"""


In [39]:
Query5 = """
SELECT
    n_name,
    SUM(l_extendedprice * (1 - l_discount)) AS revenue
FROM
    customer,
    orders,
    lineitem,
    supplier,
    nation,
    region
WHERE
    c_custkey = o_custkey
    AND l_orderkey = o_orderkey
    AND l_suppkey = s_suppkey
    AND c_nationkey = s_nationkey
    AND s_nationkey = n_nationkey
    AND n_regionkey = r_regionkey
    AND r_name = 'ASIA'
    AND o_orderdate >= DATE '1994-01-01'
    AND o_orderdate < DATEADD(YEAR, 1, DATE '1994-01-01')
GROUP BY
    n_name
ORDER BY
    revenue DESC;
"""


In [40]:
Query18 = """
SELECT
    c_name,
    c_custkey,
    o_orderkey,
    o_orderdate,
    o_totalprice,
    SUM(l_quantity) AS total_quantity
FROM
    customer,
    orders,
    lineitem
WHERE
    o_orderkey IN (
        SELECT
            l_orderkey
        FROM
            lineitem
        GROUP BY
            l_orderkey
        HAVING
            SUM(l_quantity) > 300
    )
    AND c_custkey = o_custkey
    AND o_orderkey = l_orderkey
GROUP BY
    c_name,
    c_custkey,
    o_orderkey,
    o_orderdate,
    o_totalprice
ORDER BY
    o_totalprice DESC,
    o_orderdate;
"""


In [41]:
def query_run(query, number, warehouse_size=None):
    profiling_folder = os.path.join('results', f'query_{number}')
    os.makedirs(profiling_folder, exist_ok=True)
    execution_times = []  
    
    engine = create_engine(
        'snowflake://{user}:{password}@{account}/{database}/{schema}?warehouse={warehouse}'.format(
            user=os.getenv('SNOWFLAKE_USER'),
            password=os.getenv('SNOWFLAKE_PASSWORD'),
            account=os.getenv('SNOWFLAKE_ACCOUNT'),
            database = os.getenv('SNOWFLAKE_DATABASE_TPC'),
            schema = os.getenv('SNOWFLAKE_SCHEMA_TPC'),
            warehouse = os.getenv('SNOWFLAKE_WAREHOUSE')
        )
    )
    
    try:
        with engine.connect() as connection:
            if warehouse_size:
                try:
                    alter_warehouse_query = f"ALTER WAREHOUSE ANIMAL_TASK_WH SET WAREHOUSE_SIZE = '{warehouse_size}'"
                    connection.execute(text(alter_warehouse_query))
                    print(f"Warehouse size updated to {warehouse_size}.")
                except Exception as e:
                    print(f"Error setting warehouse size: {e}")
                    return 

            for i in range(100):
                start_time = time.time()
                result = connection.execute(text(query))
                end_time = time.time()
                elapsed_time = end_time - start_time
                execution_times.append(elapsed_time)
                print(f"Iteration {i + 1}: Query executed in {elapsed_time:.4f} seconds.")
    except Exception as e:
        print(f"Error executing query: {e}")
        return

    file_path = os.path.join(profiling_folder, "execution_times.txt")
    try:
        with open(file_path, "w") as file:
            for exec_time in execution_times:
                file.write(f"{exec_time:.4f}\n")
        print(f"Execution times saved to {file_path}")
    except Exception as e:
        print(f"Error saving execution times to file: {e}")


In [42]:
query_run(Query1,'1','LARGE')

Warehouse size updated to LARGE.
Iteration 1: Query executed in 0.2402 seconds.
Iteration 2: Query executed in 0.2765 seconds.
Iteration 3: Query executed in 0.2622 seconds.
Iteration 4: Query executed in 0.2544 seconds.
Iteration 5: Query executed in 0.2452 seconds.
Iteration 6: Query executed in 0.4966 seconds.
Iteration 7: Query executed in 0.2988 seconds.
Iteration 8: Query executed in 0.2569 seconds.
Iteration 9: Query executed in 0.2413 seconds.
Iteration 10: Query executed in 0.2509 seconds.
Iteration 11: Query executed in 0.2546 seconds.
Iteration 12: Query executed in 0.2603 seconds.
Iteration 13: Query executed in 0.2560 seconds.
Iteration 14: Query executed in 0.2415 seconds.
Iteration 15: Query executed in 0.2427 seconds.
Iteration 16: Query executed in 0.2534 seconds.
Iteration 17: Query executed in 0.5145 seconds.
Iteration 18: Query executed in 0.2353 seconds.
Iteration 19: Query executed in 0.2823 seconds.
Iteration 20: Query executed in 0.2689 seconds.
Iteration 21: Qu

In [43]:
query_run(Query5,'5','LARGE')

Warehouse size updated to LARGE.
Iteration 1: Query executed in 0.2590 seconds.
Iteration 2: Query executed in 0.2766 seconds.
Iteration 3: Query executed in 0.2595 seconds.
Iteration 4: Query executed in 0.2359 seconds.
Iteration 5: Query executed in 0.2428 seconds.
Iteration 6: Query executed in 0.3210 seconds.
Iteration 7: Query executed in 0.2353 seconds.
Iteration 8: Query executed in 0.2689 seconds.
Iteration 9: Query executed in 0.2705 seconds.
Iteration 10: Query executed in 0.2642 seconds.
Iteration 11: Query executed in 0.2568 seconds.
Iteration 12: Query executed in 0.2466 seconds.
Iteration 13: Query executed in 0.2508 seconds.
Iteration 14: Query executed in 0.2384 seconds.
Iteration 15: Query executed in 0.2476 seconds.
Iteration 16: Query executed in 0.2614 seconds.
Iteration 17: Query executed in 0.2622 seconds.
Iteration 18: Query executed in 0.2503 seconds.
Iteration 19: Query executed in 0.3168 seconds.
Iteration 20: Query executed in 0.2565 seconds.
Iteration 21: Qu

In [44]:
query_run(Query18,'18','LARGE')

Warehouse size updated to LARGE.
Iteration 1: Query executed in 0.4209 seconds.
Iteration 2: Query executed in 0.2461 seconds.
Iteration 3: Query executed in 0.2778 seconds.
Iteration 4: Query executed in 0.2552 seconds.
Iteration 5: Query executed in 0.2526 seconds.
Iteration 6: Query executed in 0.3194 seconds.
Iteration 7: Query executed in 0.2452 seconds.
Iteration 8: Query executed in 0.2440 seconds.
Iteration 9: Query executed in 0.2505 seconds.
Iteration 10: Query executed in 0.3348 seconds.
Iteration 11: Query executed in 0.2735 seconds.
Iteration 12: Query executed in 0.2413 seconds.
Iteration 13: Query executed in 0.2373 seconds.
Iteration 14: Query executed in 0.2609 seconds.
Iteration 15: Query executed in 0.2605 seconds.
Iteration 16: Query executed in 0.2557 seconds.
Iteration 17: Query executed in 0.2348 seconds.
Iteration 18: Query executed in 0.2709 seconds.
Iteration 19: Query executed in 0.2631 seconds.
Iteration 20: Query executed in 0.2502 seconds.
Iteration 21: Qu