## Latest versions

### Data Read in

In [None]:
data_ref = {
    key_id_cols = [],
    key_dt_col = null,
    dt_cols = [],
    numeric_cols = []
}

In [None]:
class Dataset:
    def __init__(self, data_source=None):
        """Initialize the Dataset with an optional data source."""

        if not data_source or not isinstance(data_source, str) or not data_source.endswith('.csv'):
            raise ValueError("The data_source must be a valid .csv file")


        self.data_source = data_source  # Source of the data (e.g., file path, database connection)
        self.attributes = {}  # Dictionary to store dataset attributes

    def load_data(self):
        """Load data from the data source and populate attributes."""
        if not self.data_source:
            raise ValueError("No data source provided")
        
        # Example of populating the attributes dictionary
        self.attributes['rows'] = self._read_data()
        self.attributes['columns'] = self._extract_columns()
        self.attributes['data_type'] = self._infer_data_type()
        # Add more attributes as needed

    def _read_data(self):
        """
        Private method to read data from the data source.
        Identifies if data source is file or database (SF)
        """

        if 'csv.' in data_source:
            return pd.read_csv(data_source)
        
        else:
            print('Snowflake/SQL support coming later....')
            break
        
        
        # Implement logic to read data from self.data_source
        # This could be reading from a CSV file, database, etc.
        # For this example, let's assume it reads and returns the number of rows
        return 1000  # Placeholder value, replace with actual data reading logic

    def _extract_columns(self):
        """Private method to extract columns from the data."""
        # Implement logic to extract column names or metadata
        return ['column1', 'column2', 'column3']  # Placeholder values

    def _infer_data_type(self):
        """Private method to infer the data type of the dataset."""
        # Implement logic to infer the data type
        return 'tabular'  # Placeholder value

    def get_attributes(self):
        """Public method to retrieve the attributes dictionary."""
        return self.attributes

# usage:
# dataset = Dataset(data_source='path/to/data.csv')
# dataset.load_data()
# attributes = dataset.get_attributes()
# print(attributes)


## Working/Scrap Area

In [None]:
import os

def read_sql_files(directory):
    sql_files = []
    for file in os.listdir(directory):
        if file.endswith(".sql"):
            file_path = os.path.join(directory, file)
            with open(file_path, "r") as f:
                sql_files.append(f.read())
    return sql_files

In [None]:
import time

class Query:
    def __init__(self, query_text, dependencies=None):
        self.query_text = query_text
        self.dependencies = dependencies if dependencies else []

    def add_dependency(self, dependency):
        self.dependencies.append(dependency)

    def performance_measure(self, connector):

        start_time = time.time()

        # Execute the query using the SQL connector
        connector.execute(self.query_text)

        end_time = time.time()
        execution_time = end_time - start_time

        return execution_time

In [None]:
def flip_value(x):
    return -x
    

In [1]:
# Specify the directory containing the .sql files
directory = "path/to/sql/files"

# Read the .sql files and store the contents in a list
sql_files = read_sql_files(directory)

# Create Query objects for each SQL query
queries = []
for sql_text in sql_files:
    query = Query(sql_text)
    queries.append(query)

# Print the queries
for query in queries:
    print("Query Text:", query.query_text)
    print("Dependencies:", query.dependencies)
    print()

NameError: name 'read_sql_files' is not defined

In [None]:
# Create a Query object for 'query53'
query53 = Query('query53')

# Pass the query to the connect function
connect(query53)

# Measure the performance of query53
execution_time = query53.performance_measure(connector)

# Print the execution time
print("Execution time for query53:", execution_time, "seconds")

## Query recognizer

### Identify dependencies

In [24]:
import re


def list_references(text):
    
    #identify tables (not CTE's)
    matches = re.findall(r'FROM\s+(.*?)\s', text, re.IGNORECASE)

    result = {'type1': [], 'type2': []}
    
    for match in matches:
        
        print(f'printing match....: |{match}|')
        if match.startswith('('):
            result['type1'].append(match + ' type1')
        else:
            result['type2'].append(match + ' type2')
    
    return result

sample_queries = [
    "SELECT * FROM table1 t1",
    "SELECT column1, column2 FROM table2 t2",
    "SELECT COUNT(*) FROM table3",
    "SELECT AVG(column1) FROM table4",
    "SELECT column1, column2 FROM table5 WHERE column3 = 'value'",
    "WITH cte5 AS (SELECT * FROM BIG_TABLE75) WHERE X=35"
]

for s in sample_queries:
    print(list_references(s))

printing match....: |table1|
{'type1': [], 'type2': ['table1 type2']}
printing match....: |table2|
{'type1': [], 'type2': ['table2 type2']}
{'type1': [], 'type2': []}
{'type1': [], 'type2': []}
printing match....: |table5|
{'type1': [], 'type2': ['table5 type2']}
printing match....: |BIG_TABLE75)|
{'type1': [], 'type2': ['BIG_TABLE75) type2']}


In [9]:
sample_queries = [
    "SELECT * FROM table1",
    "SELECT column1, column2 FROM table2",
    "SELECT COUNT(*) FROM table3",
    "SELECT AVG(column1) FROM table4",
    "SELECT column1, column2 FROM table5 WHERE column3 = 'value'"
]

for s in sample_queries:
    print(extract_and_classify(s))

{'type1': [], 'type2': ['table1 type2']}
{'type1': [], 'type2': ['table2 type2']}
{'type1': [], 'type2': ['table3 type2']}
{'type1': [], 'type2': ['table4 type2']}
{'type1': [], 'type2': ["table5 WHERE column3 = 'value' type2"]}


SQL Alchemy


In [None]:
from sqlalchemy import create_engine, text
import os

# Specify the directory containing the .sql files
directory = "path/to/sql/files"

# Read the .sql files and store the contents in a list
sql_files = []
for file in os.listdir(directory):
    if file.endswith(".sql"):
        file_path = os.path.join(directory, file)
        with open(file_path, "r") as f:
            sql_files.append(f.read())

# Create a SQLAlchemy engine
engine = create_engine("your_database_connection_string")

# Execute the SQL queries
for sql_text in sql_files:
    query = text(sql_text)
    result = engine.execute(query)
    # Process the query result as needed
    for row in result:
        print(row)

# Close the SQLAlchemy engine
engine.dispose()