In [1]:
import os

os.getcwd()

'c:\\Users\\dsteinec\\Documents\\Jupyter_Notebooks\\nl_to_data'

### SQL Database

In [6]:
import sqlite3
import pandas as pd

# Connect to the database
db_path = "data/chinook.db"
conn = sqlite3.connect(db_path)

# --- Tables ---
tables = pd.read_sql_query("""
SELECT name FROM sqlite_master
WHERE type='table' AND name NOT LIKE 'sqlite_%'
ORDER BY name;
""", conn)

# --- Columns ---
columns = pd.read_sql_query("""
SELECT
    m.name AS table_name,
    p.name AS column_name,
    p.type AS data_type,
    p.pk AS primary_key
FROM sqlite_master m
JOIN pragma_table_info(m.name) p
WHERE m.type = 'table'
  AND m.name NOT LIKE 'sqlite_%'
ORDER BY m.name, p.cid;
""", conn)

tables, columns

(              name
 0           albums
 1          artists
 2        customers
 3        employees
 4           genres
 5    invoice_items
 6         invoices
 7      media_types
 8   playlist_track
 9        playlists
 10          tracks,
    table_name   column_name      data_type  primary_key
 0      albums       AlbumId        INTEGER            1
 1      albums         Title  NVARCHAR(160)            0
 2      albums      ArtistId        INTEGER            0
 3     artists      ArtistId        INTEGER            1
 4     artists          Name  NVARCHAR(120)            0
 ..        ...           ...            ...          ...
 59     tracks       GenreId        INTEGER            0
 60     tracks      Composer  NVARCHAR(220)            0
 61     tracks  Milliseconds        INTEGER            0
 62     tracks         Bytes        INTEGER            0
 63     tracks     UnitPrice  NUMERIC(10,2)            0
 
 [64 rows x 4 columns])

### Generate Schema Documents

In [2]:
from generate_schema_documents import make_schema_documents

# Generate schema documents
documents = make_schema_documents(conn)
print(documents[0].text)

KeyError: 'samples'

### Embed Documents in a Vector Database

In [19]:
from embed_documents_into_vector_db import upsert_schema_docs_to_lancedb, SchemaDoc

vector_db = upsert_schema_docs_to_lancedb(
                db_dir="chinook_schema_docs",
                documents=documents)
print(vector_db.count_rows())

86


In [20]:
results = vector_db.search("How is the customer table linked?").limit(5).to_pydantic(SchemaDoc)

In [22]:
results

[SchemaDoc(id='table:customers', doc_type='table', table='customers', column='', ref_table='', ref_column='', text='Table: customers\nPrimary key: CustomerId\nColumns:\n - CustomerId (INTEGER) [PK NOT NULL]\n - FirstName (NVARCHAR(40)) [NOT NULL]\n - LastName (NVARCHAR(20)) [NOT NULL]\n - Company (NVARCHAR(80))\n - Address (NVARCHAR(70))\n - City (NVARCHAR(40))\n - State (NVARCHAR(40))\n - Country (NVARCHAR(40))\n - PostalCode (NVARCHAR(10))\n - Phone (NVARCHAR(24))\n - Fax (NVARCHAR(24))\n - Email (NVARCHAR(60)) [NOT NULL]\n - SupportRepId (INTEGER)\nRelationships (foreign keys):\n - customers.SupportRepId → employees.EmployeeId', vector=FixedSizeList(dim=384)),
 SchemaDoc(id='table:invoices', doc_type='table', table='invoices', column='', ref_table='', ref_column='', text='Table: invoices\nPrimary key: InvoiceId\nColumns:\n - InvoiceId (INTEGER) [PK NOT NULL]\n - CustomerId (INTEGER) [NOT NULL]\n - InvoiceDate (DATETIME) [NOT NULL]\n - BillingAddress (NVARCHAR(70))\n - BillingCity (N

In [None]:
import sqlite3

from generate_schema_documents import make_schema_documents
from embed_documents_into_vector_db import upsert_schema_docs_to_lancedb, \
    get_relevant_documents
from create_sql_query import generate_sql_cpu


if __name__ == "__main__":
    query = "Who are the top 3 artists?"

    # --- Connect to the database
    db_path = "data/chinook.db"
    conn = sqlite3.connect(db_path)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:

    # --- Take SQL tables and columns, then generate "documents" for each
    # NOTE: Later add LLM-generated descriptions for each table/column
    documents = make_schema_documents(conn)
    print(f'---Example Document---\n{documents[0].text}\n')


In [None]:

    # --- Embed documents in a vector database
    vector_db = upsert_schema_docs_to_lancedb(
                    db_dir="chinook_schema_docs",
                    documents=documents)
    print(f'{vector_db.count_rows()} documents in vector database')

    # --- Validate document relevance
    # NOTE: Later, use another model to validate that documents match the query
    sql_context = get_relevant_documents(vector_db, query)
    print(f'{len(sql_context)} relevant documents found for query "{query}"')
    

In [None]:

    # --- 
    sql_query = generate_sql_cpu("Who are the top 3 artists?", sql_context)
    print(sql_query)

---Example Document---
Table: albums
Primary key: AlbumId
Columns:
 - AlbumId (INTEGER) [PK NOT NULL] (ex: 1, 4, 2)
 - Title (NVARCHAR(160)) [NOT NULL] (ex: For Those About To Rock We Salute You, Balls to the Wall, Restless and Wild)
 - ArtistId (INTEGER) [NOT NULL] (ex: 1, 2, 3)
Relationships (foreign keys):
 - albums.ArtistId → artists.ArtistId


In [None]:
query = "What are the names of all the people here?"
top_docs = get_relevant_documents(vector_db, query)
top_docs

Unnamed: 0,id,doc_type,table,column,ref_table,ref_column,text,vector,_distance
0,column:customers.LastName,column,customers,LastName,,,Column: customers.LastName\nData type: NVARCHA...,"[0.020133438, -0.005432637, -0.053388007, 0.00...",1.173347
1,column:employees.FirstName,column,employees,FirstName,,,Column: employees.FirstName\nData type: NVARCH...,"[-0.012548125, 0.013465421, -0.00900397, 0.012...",1.204338
2,column:customers.FirstName,column,customers,FirstName,,,Column: customers.FirstName\nData type: NVARCH...,"[-0.01519112, -0.020656675, -0.009289459, 0.01...",1.20581
3,column:employees.LastName,column,employees,LastName,,,Column: employees.LastName\nData type: NVARCHA...,"[-0.0051865526, 0.028851336, -0.009485228, -0....",1.230712


In [None]:
# NOTE: Try GPU version later

[DOCUMENT_START]
Column: customers.LastName
Data type: NVARCHAR(20)
Nullable: no
Primary key: no
Default: None
Sample values: Gonçalves, Köhler, Tremblay
[DOCUMENT_END]

[DOCUMENT_START]
Column: employees.FirstName
Data type: NVARCHAR(20)
Nullable: no
Primary key: no
Default: None
Sample values: Andrew, Nancy, Jane
[DOCUMENT_END]

[DOCUMENT_START]
Column: customers.FirstName
Data type: NVARCHAR(40)
Nullable: no
Primary key: no
Default: None
Sample values: Luís, Leonie, François
[DOCUMENT_END]

[DOCUMENT_START]
Column: employees.LastName
Data type: NVARCHAR(20)
Nullable: no
Primary key: no
Default: None
Sample values: Adams, Edwards, Peacock
[DOCUMENT_END]




In [44]:
from create_sql_query import generate_sql_cpu

Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]


In [45]:
 
# Usage
top_docs = get_top_docs(vector_db, query)
sql_query = generate_sql_cpu("Who are the top 3 artists?", top_docs)
print(sql_query)

Generating SQL...
 SELECT c.FirstName, c.LastName FROM customers c ORDER BY COUNT(*) DESC NULLS LAST LIMIT 3
