## 1. Connect to the Database

In [2]:
import os
import sys
from pathlib import Path

# Root of the checkout that provides the `src` package imported below.
# Defaults to the original hard-coded location; set TEXT_TO_SQL_AGENT_ROOT to
# run the notebook from another machine or checkout without editing this cell.
PROJECT_ROOT = Path(os.environ.get("TEXT_TO_SQL_AGENT_ROOT", "D:/code/text-to-sql-agent"))

# Guard the append so re-running this cell does not pile up duplicate
# sys.path entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.connect_db import get_db_connection

# Despite the helper's name, the returned object is used as a cursor
# (`.execute()` / `.fetchall()`) by the cells below.
cursor = get_db_connection()

## 2. Discover Database Structure

### 2.1 List all user tables

In [None]:
# Names of everything information_schema reports for the `public` schema,
# sorted alphabetically.
# NOTE(review): information_schema.tables may also list views - confirm
# whether those should count as "user tables" here.
LIST_TABLES_SQL = """
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public'
ORDER BY table_name;
"""
cursor.execute(LIST_TABLES_SQL)

# The query selects a single column, so each fetched row unpacks to one name.
tables = [table_name for (table_name,) in cursor.fetchall()]
tables


['aisles',
 'departments',
 'order_products_prior',
 'order_products_train',
 'orders',
 'products']

### 2.2 Extract columns for all tables

In [4]:
# One row per column in the `public` schema:
# (table_name, column_name, data_type, is_nullable, ordinal_position),
# grouped by table and kept in each table's column order.
LIST_COLUMNS_SQL = """
SELECT
    table_name,
    column_name,
    data_type,
    is_nullable,
    ordinal_position
FROM information_schema.columns
WHERE table_schema = 'public'
ORDER BY table_name, ordinal_position;
"""
cursor.execute(LIST_COLUMNS_SQL)

# Keep the full result for the normalization step; show only a short preview.
columns_raw = cursor.fetchall()
columns_raw[:10]  # sanity check

[('aisles', 'aisle_id', 'bigint', 'NO', 1),
 ('aisles', 'aisle', 'text', 'YES', 2),
 ('departments', 'department_id', 'bigint', 'NO', 1),
 ('departments', 'department', 'text', 'YES', 2),
 ('order_products_prior', 'order_id', 'bigint', 'YES', 1),
 ('order_products_prior', 'product_id', 'bigint', 'YES', 2),
 ('order_products_prior', 'add_to_cart_order', 'bigint', 'YES', 3),
 ('order_products_prior', 'reordered', 'bigint', 'YES', 4),
 ('order_products_train', 'order_id', 'bigint', 'YES', 1),
 ('order_products_train', 'product_id', 'bigint', 'YES', 2)]

### 2.3 Extract primary keys

In [5]:
# Primary-key columns for every table in `public`, as (table_name, column_name)
# rows ordered by table and then by position within the key.
#
# The join matches on schema and table as well as constraint name: constraint
# names alone are not guaranteed to be unique (e.g. in PostgreSQL two tables,
# or the same table name in two schemas, can both own an "<x>_pkey"
# constraint), and a name-only join would attach another table's key columns.
cursor.execute("""
SELECT
    tc.table_name,
    kcu.column_name
FROM information_schema.table_constraints tc
JOIN information_schema.key_column_usage kcu
  ON tc.constraint_name = kcu.constraint_name
 AND tc.constraint_schema = kcu.constraint_schema
 AND tc.table_schema = kcu.table_schema
 AND tc.table_name = kcu.table_name
WHERE tc.constraint_type = 'PRIMARY KEY'
  AND tc.table_schema = 'public'
ORDER BY tc.table_name, kcu.ordinal_position;
""")

primary_keys = cursor.fetchall()
primary_keys

[('aisles', 'aisle_id'),
 ('departments', 'department_id'),
 ('orders', 'order_id'),
 ('products', 'product_id')]

### 2.4 Extract foreign keys

In [6]:
# Foreign-key relationships for every table in `public`, as
# (table_name, column_name, foreign_table, foreign_column) rows.
#
# Both joins are qualified by constraint schema (and, for the referencing
# side, by table) because constraint names alone are not guaranteed unique;
# a name-only join can pair a constraint with columns of an unrelated table.
# The ORDER BY is extended so the row order within a table is deterministic.
#
# NOTE(review): constraint_column_usage carries no column position, so a
# multi-column foreign key would still yield every referencing/referenced
# column combination. All keys shown in this database are single-column;
# revisit if composite foreign keys are introduced.
cursor.execute("""
SELECT
    tc.table_name,
    kcu.column_name,
    ccu.table_name AS foreign_table,
    ccu.column_name AS foreign_column
FROM information_schema.table_constraints tc
JOIN information_schema.key_column_usage kcu
  ON tc.constraint_name = kcu.constraint_name
 AND tc.constraint_schema = kcu.constraint_schema
 AND tc.table_schema = kcu.table_schema
 AND tc.table_name = kcu.table_name
JOIN information_schema.constraint_column_usage ccu
  ON ccu.constraint_name = tc.constraint_name
 AND ccu.constraint_schema = tc.constraint_schema
WHERE tc.constraint_type = 'FOREIGN KEY'
  AND tc.table_schema = 'public'
ORDER BY tc.table_name, tc.constraint_name, kcu.ordinal_position;
""")

foreign_keys = cursor.fetchall()
foreign_keys

[('order_products_prior', 'order_id', 'orders', 'order_id'),
 ('order_products_prior', 'product_id', 'products', 'product_id'),
 ('order_products_train', 'order_id', 'orders', 'order_id'),
 ('order_products_train', 'product_id', 'products', 'product_id'),
 ('products', 'aisle_id', 'aisles', 'aisle_id'),
 ('products', 'department_id', 'departments', 'department_id')]

## 3. Normalize schema into a dict

In [11]:
from collections import defaultdict

# Resulting layout:
#   {"tables": {<table>: {"columns": {<column>: {...}},
#                         "primary_key": [<column>, ...],
#                         "foreign_keys": [{"column": ..., "references": {...}}, ...]}}}
schema_dict = {"tables": {}}
table_entries = schema_dict["tables"]  # alias; writes land in schema_dict

# Columns: the first row seen for a table creates its (empty) entry, then
# every row registers one column description under that entry.
for table, column, dtype, nullable, position in columns_raw:
    entry = table_entries.setdefault(
        table,
        {"columns": {}, "primary_key": [], "foreign_keys": []},
    )
    entry["columns"][column] = {
        "data_type": dtype,
        "nullable": nullable == "YES",  # information_schema reports 'YES'/'NO'
        "ordinal_position": position,
    }

# Primary keys: appended in the order the query returned them.
for table, column in primary_keys:
    table_entries[table]["primary_key"].append(column)

# Foreign keys: one record per (column -> referenced table.column) pair.
for table, column, foreign_table, foreign_column in foreign_keys:
    reference = {"table": foreign_table, "column": foreign_column}
    table_entries[table]["foreign_keys"].append(
        {"column": column, "references": reference}
    )

In [None]:
# Sanity check: show the top-level keys, the table names that were collected,
# and the complete normalized entry for the `orders` table.

from IPython.display import display

display(schema_dict.keys())

schema_tables = schema_dict["tables"]
display(schema_tables.keys())
display(schema_tables["orders"])

dict_keys(['tables'])

dict_keys(['aisles', 'departments', 'order_products_prior', 'order_products_train', 'orders', 'products'])

{'columns': {'order_id': {'data_type': 'bigint',
   'nullable': False,
   'ordinal_position': 1},
  'user_id': {'data_type': 'bigint', 'nullable': True, 'ordinal_position': 2},
  'eval_set': {'data_type': 'text', 'nullable': True, 'ordinal_position': 3},
  'order_number': {'data_type': 'bigint',
   'nullable': True,
   'ordinal_position': 4},
  'order_dow': {'data_type': 'bigint',
   'nullable': True,
   'ordinal_position': 5},
  'order_hour_of_day': {'data_type': 'bigint',
   'nullable': True,
   'ordinal_position': 6},
  'days_since_prior_order': {'data_type': 'double precision',
   'nullable': True,
   'ordinal_position': 7}},
 'primary_key': ['order_id'],
 'foreign_keys': []}