[Reference](https://medium.com/firebird-technologies/building-sql-trainer-ais-backend-a-full-walkthrough-cea7789bda9f)

In [2]:
!pip install dspy

Collecting dspy
  Downloading dspy-2.6.27-py3-none-any.whl.metadata (7.0 kB)
Collecting backoff>=2.2 (from dspy)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting ujson>=5.8.0 (from dspy)
  Downloading ujson-5.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting optuna>=3.4.0 (from dspy)
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting magicattr>=0.1.6 (from dspy)
  Downloading magicattr-0.1.6-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting litellm>=1.60.3 (from dspy)
  Downloading litellm-1.75.0-py3-none-any.whl.metadata (40 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.7/40.7 kB 2.8 MB/s eta 0:00:00
Collecting diskcache>=5.6.0 (from dspy)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting json-repair>=0.30.0 (from dspy)
  Downloading json_repair-0.48.0-py3-none-any.whl.metadata (12 kB)
Collecting asyncer==0.0.8 (from dspy)

In [4]:
import os

import dspy

# Read the OpenAI key from the environment so no credential is ever stored in the
# notebook file. Export OPENAI_API_KEY before starting the kernel; if it is missing,
# this line raises KeyError immediately instead of sending a placeholder key.
lm = dspy.LM("openai/gpt-4o-mini", api_key=os.environ["OPENAI_API_KEY"])

# Register the model as the LM used by the DSPy predictors created in later cells.
dspy.configure(lm=lm)

In [5]:
import dspy
# DSPy Signature for the create-schema LLM program.
# NOTE: the class docstring is the instruction text given to the model, not just
# documentation, so every edit to it changes what the model is asked to do.
# It previously asked for foreign key relationships and then forbade them; the
# instructions now consistently ask for plain id columns without constraints.
class create_schema(dspy.Signature):
    """
    You are a schema generation assistant. Given a natural language description of the data or entities
    the user wants to store, generate a SQL CREATE TABLE schema that defines appropriate tables, columns,
    and data types. Use sensible names, appropriate data types, and include primary keys. If multiple tables
    are needed, relate them through plain id columns (for example, a book row carrying a publisher_id)
    rather than through constraints. Return only the SQL schema.

    Example:
    User Prompt: "I want to store information about books, authors, and publishers. Each book has a title,
    publication year, genre, and is written by one or more authors. Each author has a name and birth year.
    Each publisher has a name and address."

    Output: A valid SQL schema containing CREATE TABLE statements to represent this data model.

    You are generating DuckDB SQL:
    - Do NOT add FOREIGN KEY constraints or REFERENCES clauses.


    """
    # Input: the user's free-text description of the data they want to store.
    user_prompt = dspy.InputField(desc="The prompt the user has given on what schema they want you to generate")
    # Output: the generated CREATE TABLE statements, as SQL text.
    schema_sql = dspy.OutputField(desc="The SCHEMA SQL for the requested prompt")

# Build a predictor from the create_schema signature.
schema_gen = dspy.Predict(create_schema)

# Example request used to exercise the predictor.
schema_prompt = "Generate a schema for an online pet store"
response = schema_gen(user_prompt=schema_prompt)

# Show the SQL text the model produced for the request.
print(response.schema_sql)

In [6]:
# DSPy signature whose output is Python source code that, when executed, inserts
# simulated rows into the described table(s).
# The docstring is written as the prompt for the model: a list of requirements followed
# by a one-shot example. It tells the model that a DuckDB connection named `conn`
# already exists, so the returned code is expected to use `conn` without importing
# duckdb or connecting itself.
# NOTE(review): the requirements ask the code to create the table, but the one-shot
# example output only inserts rows -- confirm which behaviour is intended.
class populate_table(dspy.Signature):
    """
    You are provided with a DuckDB SQL table schema.

    Your task is to write complete Python code that:
    - Uses DuckDB in Python.
    - Generates 250 rows of realistic simulated data based on column types and names.
    - Uses libraries such as `faker`, `random`, or `numpy` for data generation.
    - Creates the table using the exact schema provided.
    - Inserts the generated rows using DuckDB SQL INSERT statements (no DataFrame insertion).
    - Uses parameterized queries to avoid SQL injection and ensure clean formatting.
    - No need to import duckdb or connect it is already connected as conn
    - Do not do conn = duckdb.connect(), it is already connected
    - Take care of the foreign key relations, ensuring you add in good sequence!

    Do not return anything except the Python code.

    One-shot Example:

    Input
    table_schema = '''
    CREATE TABLE users (
        user_id INTEGER,
        full_name VARCHAR,
        email VARCHAR,
        age INTEGER,
        join_date DATE,
        is_active BOOLEAN
    );
    '''

    Output
    python_code = '''
    from faker import Faker
    import random
    from datetime import datetime, timedelta

    # Initialize
    fake = Faker()


    # Insert 250 rows
    insert_query = "INSERT INTO users VALUES (?, ?, ?, ?, ?, ?)"
    for i in range(1, 251):
        full_name = fake.name()
        email = fake.email()
        age = random.randint(18, 70)
        join_date = fake.date_between(start_date='-3y', end_date='today').isoformat()
        is_active = random.choice([True, False])
        conn.execute(insert_query, (i, full_name, email, age, join_date, is_active))
    '''
    """
    # Input: text describing the table schema the data should be generated for.
    table_schema = dspy.InputField(desc="The DuckDB SQL schema for the table")
    # Output: the generated Python source, returned as text (it is not executed here).
    python_code = dspy.OutputField(desc="Python code that generates simulated data & adds it via DuckDB SQL")

# Predictor that turns a schema description into data-population code.
pop_table_gen = dspy.Predict(populate_table)

# NOTE(review): no visible cell creates `conn` (execution counts skip In [3]). A DuckDB
# connection must exist, and the generated schema must already have been executed on it,
# before this cell can list any tables. Fail with an explicit message if it is missing.
if "conn" not in globals():
    raise NameError(
        "`conn` is not defined: open a DuckDB connection and run the generated schema on it first"
    )

# Collect the column description of every table in the database and print it.
# DESCRIBE returns per-column metadata rows, not the original CREATE TABLE text.
tables = [row[0] for row in conn.execute("SHOW TABLES").fetchall()]
schema_result = []
for table in tables:
    # Quote the identifier (doubling embedded quotes) so table names that are reserved
    # words or contain spaces/special characters do not break the statement.
    quoted_table = '"' + table.replace('"', '""') + '"'
    ddl = conn.execute(f"DESCRIBE {quoted_table}").fetchall()
    schema_result.append((table, ddl))
    print(f"Schema for table '{table}':\n{ddl}\n{'-'*40}")

# Use the predictor defined at the top of this cell; the earlier call to the undefined
# name `populate_agent` raised NameError.
response = pop_table_gen(table_schema=str(schema_result))

print(response.python_code)