In [1]:
# Install Groq library
!pip install groq

Collecting groq
  Downloading groq-1.0.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-1.0.0-py3-none-any.whl (138 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/138.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.3/138.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-1.0.0


In [2]:
#Load secrets
import os
from google.colab import userdata

os.environ["NEON_HOST"] = userdata.get("NEON_HOST")
os.environ["NEON_DB"] = userdata.get("NEON_DB")
os.environ["NEON_USER"] = userdata.get("NEON_USER")
os.environ["NEON_PASSWORD"] = userdata.get("NEON_PASSWORD")
os.environ["NEON_PORT"] = userdata.get("NEON_PORT")
os.environ["GROQ_API_KEY"] = userdata.get("GROQ_API_KEY")

In [34]:
#Connect to Neon PostgreSQL
import psycopg2
import os

conn = psycopg2.connect(
    host=os.environ["NEON_HOST"],
    dbname=os.environ["NEON_DB"],
    user=os.environ["NEON_USER"],
    password=os.environ["NEON_PASSWORD"],
    port=os.environ["NEON_PORT"],
    sslmode="require"
)

print("Connected to NeonDB")

Connected to NeonDB


In [21]:
#Extract Database Schema
def extract_schema(conn):
    cur = conn.cursor()
    cur.execute("""
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'public'
        ORDER BY table_name, ordinal_position;
    """)

    rows = cur.fetchall()
    schema = {}

    for table, column, dtype in rows:
        schema.setdefault(table, []).append(f"{column} ({dtype})")

    return schema





In [22]:
SYSTEM_PROMPT="""
You are an expert PostgreSQL NL2SQL agent.

Your task is to convert a user’s natural language business question
(English or Hinglish) into a correct, safe PostgreSQL SELECT query.

Return ONLY the SQL query.
Do NOT add explanations, comments, or markdown.

--------------------------------------------------
DATABASE SCHEMA
--------------------------------------------------

table_first (line-level / product-level data):
- date (DATE)
- firm (TEXT)
- firm_product (TEXT)
- voucher_type (TEXT)
- voucher_no (TEXT)
- gstin_uin (TEXT)
- quantity (NUMERIC)
- rate (NUMERIC)
- value (NUMERIC)
- brand (TEXT)
- sub_category (TEXT)
- category (TEXT)
- client_type (TEXT)

table_second (voucher-level / accounting totals):
- date (DATE)
- firm (TEXT)
- voucher_no (TEXT)
- quantity (NUMERIC)
- value (NUMERIC)
- gstin_uin (TEXT)
- total_tax_amount (NUMERIC)
- gross_total (NUMERIC)

--------------------------------------------------
BUSINESS SEMANTIC LAYER
--------------------------------------------------

Business Definitions:
- Sales / business / turnover / revenue → gross_total (table_second)
- Net sales → value
- GST / tax → total_tax_amount

Rules:
- Default meaning of "sales" is gross_total
- Use table_second for monetary totals
- Use table_first for product / brand / category / quantity analysis

--------------------------------------------------
GENERAL SAFETY RULES
--------------------------------------------------

- Generate SELECT queries only
- NEVER generate INSERT, UPDATE, DELETE, DROP, TRUNCATE
- Use valid PostgreSQL syntax only
- Use only columns present in the schema
- Do not hallucinate columns or tables
- Prefer the simplest correct query

--------------------------------------------------
JOIN RULES (CRITICAL)
--------------------------------------------------

- Do NOT use JOIN unless it is strictly required
- JOIN table_first and table_second ONLY using voucher_no
- Use JOIN ONLY when the requested columns do not exist in a single table
- If a question can be answered using table_second alone,
  using JOIN is considered INCORRECT
- Do NOT join table_first merely to access firm, date, or voucher_no
  because these already exist in table_second
- Avoid joins that cause double counting

--------------------------------------------------
AGGREGATION RULES
--------------------------------------------------

- Always use COALESCE(SUM(column), 0) for aggregate totals
- Do NOT aggregate unless the question asks for totals, sales, business, etc.


GROUPING (“WISE”) RULES (STRICT)


If the question contains:
- "firm wise"
- "brand wise"
- "category wise"
- "product wise"
- "client type wise"

Then ALL of the following MUST be true:
1) The column must appear in SELECT
2) The column must appear in GROUP BY
3) Returning a single aggregated row is INCORRECT


DATE & TIME RULES


GENERAL:
- Use EXTRACT(YEAR FROM date) and EXTRACT(MONTH FROM date)
- When filtering by MONTH, ALWAYS include YEAR

CURRENT:
- "is month", "this month", "current month":
  - Filter by CURRENT_DATE month and year
  - Do NOT group unless explicitly asked

- "is saal", "this year", "current year":
  - Filter by CURRENT_DATE year
  - Do NOT group unless explicitly asked

PAST:
- "pichle saal", "last year":
  - EXTRACT(YEAR FROM date) = EXTRACT(YEAR FROM CURRENT_DATE) - 1

- "pichle N saal", "last N years":
  - Filter:
    EXTRACT(YEAR FROM date) BETWEEN
    EXTRACT(YEAR FROM CURRENT_DATE) - N
    AND EXTRACT(YEAR FROM CURRENT_DATE) - 1
  - Do NOT include current year

MONTHLY GROUPING:
- "har month", "month wise", "monthly":
  - GROUP BY year and month
  - Return year, month, and aggregate

FUTURE:
- "next year", "aane wala saal":
  - EXTRACT(YEAR FROM date) = EXTRACT(YEAR FROM CURRENT_DATE) + 1

- "next N years", "aane wale N saal":
  - EXTRACT(YEAR FROM date) BETWEEN
    EXTRACT(YEAR FROM CURRENT_DATE) + 1
    AND EXTRACT(YEAR FROM CURRENT_DATE) + N

- "future" without range:
  - date > CURRENT_DATE

IMPORTANT:
- Do NOT predict or estimate future data
- If no data exists, return 0 or empty results

--------------------------------------------------
LIMIT & ORDER RULES
--------------------------------------------------

- Do NOT add LIMIT unless explicitly requested
- If LIMIT is used, ORDER BY must be present
- "Top / highest / max" → ORDER BY aggregate DESC
- "Lowest / minimum" → ORDER BY aggregate ASC
- "Latest / recent" → ORDER BY date DESC

--------------------------------------------------
OUTPUT RULES
--------------------------------------------------

- Use clear column aliases (total_sales, total_business, total_tax)
- Do NOT add explanations, comments, markdown, or extra text
- Return ONLY the SQL query"""

In [7]:
# LLM SQL Generator
from groq import Groq
client = Groq()
def nl_to_sql(question):
    response = client.chat.completions.create(
        model="groq/compound",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": question}
        ],
        temperature=0
    )
    return response.choices[0].message.content.strip()


In [8]:
#SQL Safety Validator
def is_safe_sql(sql):
    sql = sql.lower().strip()

    if not sql.startswith("select"):
        return False

    blocked = ["insert", "update", "delete", "drop", "alter", "truncate"]
    return not any(word in sql for word in blocked)


In [9]:
#Row Limit Safety
def enforce_limit(sql, limit=1000):
    sql = sql.strip()
    if "limit" not in sql.lower():
        return sql.rstrip(";") + f" LIMIT {limit};"
    return sql


In [10]:
#Empty-Result Handling
def execute_sql(conn, sql):
    cur = conn.cursor()
    cur.execute(sql)

    columns = [desc[0] for desc in cur.description]
    rows = cur.fetchall()

    results = [dict(zip(columns, row)) for row in rows]
    return results



Testing

In [13]:
question = "December 2025 ka firm wise total sales dikhao"

sql = nl_to_sql(question)
print("Generated SQL:")
print(sql)

if not is_safe_sql(sql):
    raise ValueError("Unsafe SQL blocked")

data = execute_sql(conn, sql)
print("Query Result:")
print(data)


Generated SQL:
SELECT firm, COALESCE(SUM(gross_total), 0) AS total_sales
FROM table_second
WHERE EXTRACT(YEAR FROM date) = 2025 AND EXTRACT(MONTH FROM date) = 12
GROUP BY firm;
Query Result:
[{'firm': '(cancelled )', 'total_sales': Decimal('0')}, {'firm': 'A TO Z SECUTECH PRIVATE LIMITED', 'total_sales': Decimal('1590230')}, {'firm': 'ABN CONNECT', 'total_sales': Decimal('52110')}, {'firm': 'ACS DIGITAL INDIA (AKASH)', 'total_sales': Decimal('123755')}, {'firm': 'ADINATH SECURITY SOLUTIONS', 'total_sales': Decimal('11800')}, {'firm': 'ADVANCE COMMUNICATION SERVICE', 'total_sales': Decimal('850')}, {'firm': 'ADVANCE SECURITY & IT', 'total_sales': Decimal('337000')}, {'firm': 'AJEEM', 'total_sales': Decimal('1250')}, {'firm': 'ALCOFAB ALUMINIUM WORK & SERVICES', 'total_sales': Decimal('3400')}, {'firm': 'AMPLE TRAILS', 'total_sales': Decimal('12231')}, {'firm': 'AP INFOTECH', 'total_sales': Decimal('166388')}, {'firm': 'APEX IT SOLUTIONS', 'total_sales': Decimal('164851')}, {'firm': 'ARN

In [38]:
question = "In 2025 what is January month's total business"

sql = nl_to_sql(question)
print("Generated SQL:")
print(sql)

if not is_safe_sql(sql):
    raise ValueError("Unsafe SQL blocked")

data = execute_sql(conn, sql)
print("Query Result:")
print(data)


Generated SQL:
SELECT COALESCE(SUM(gross_total), 0) AS total_business
FROM table_second
WHERE EXTRACT(YEAR FROM date) = 2025
  AND EXTRACT(MONTH FROM date) = 1;
Query Result:
[{'total_business': Decimal('0')}]


In [73]:
question = "December 2025 ka total sales kitna hai"

sql = nl_to_sql(question)
print("Generated SQL:")
print(sql)

if not is_safe_sql(sql):
    raise ValueError("Unsafe SQL blocked")

data = execute_sql(conn, sql)
print("Query Result:")
print(data)


Generated SQL:
SELECT COALESCE(SUM(gross_total), 0) AS total_sales
FROM table_second
WHERE EXTRACT(YEAR FROM date) = 2025 AND EXTRACT(MONTH FROM date) = 12;
Query Result:
[{'total_sales': Decimal('14495011.14')}]


In [78]:
question = "Pichle saal ka total GST kitna tha"

sql = nl_to_sql(question)
print("Generated SQL:")
print(sql)

if not is_safe_sql(sql):
    raise ValueError("Unsafe SQL blocked")

data = execute_sql(conn, sql)
print("Query Result:")
print(data)

Generated SQL:
SELECT COALESCE(SUM(total_tax_amount), 0) 
FROM table_second 
WHERE EXTRACT(YEAR FROM date) = EXTRACT(YEAR FROM CURRENT_DATE) - 1;
Query Result:
[{'coalesce': Decimal('986824.02')}]


Table 1

In [76]:
question = "December 2025 me brand wise quantity batao"

sql = nl_to_sql(question)
print("Generated SQL:")
print(sql)

if not is_safe_sql(sql):
    raise ValueError("Unsafe SQL blocked")

data = execute_sql(conn, sql)
print("Query Result:")
print(data)

Generated SQL:
SELECT brand, COALESCE(SUM(quantity), 0) AS total_quantity FROM table_first WHERE EXTRACT(YEAR FROM date) = 2025 AND EXTRACT(MONTH FROM date) = 12 GROUP BY brand
Query Result:
[{'brand': 'Beetel', 'total_quantity': Decimal('31')}, {'brand': 'CP Plus', 'total_quantity': Decimal('525')}, {'brand': 'Daichi', 'total_quantity': Decimal('249')}, {'brand': 'Hikvision', 'total_quantity': Decimal('3880')}, {'brand': 'Irange', 'total_quantity': Decimal('687')}, {'brand': 'Other', 'total_quantity': Decimal('17639')}, {'brand': 'Prama', 'total_quantity': Decimal('545')}]


In [77]:
question = "Category wise sales dikhao"

sql = nl_to_sql(question)
print("Generated SQL:")
print(sql)

if not is_safe_sql(sql):
    raise ValueError("Unsafe SQL blocked")

data = execute_sql(conn, sql)
print("Query Result:")
print(data)


Generated SQL:
SELECT 
  COALESCE(SUM(t2.gross_total), 0) AS total_sales,
  t1.category
FROM 
  table_first t1
JOIN 
  table_second t2 ON t1.voucher_no = t2.voucher_no
GROUP BY 
  t1.category
ORDER BY 
  total_sales DESC;
Query Result:
[{'total_sales': Decimal('59312152.14'), 'category': 'Cameras'}, {'total_sales': Decimal('35344561.14'), 'category': 'Accessories'}]


In [81]:
question = "Best performing firm"

sql = nl_to_sql(question)
print("Generated SQL:")
print(sql)

if not is_safe_sql(sql):
    raise ValueError("Unsafe SQL blocked")

data = execute_sql(conn, sql)
print("Query Result:")
print(data)

Generated SQL:
SELECT firm, COALESCE(SUM(gross_total), 0) AS total_sales
FROM table_second
GROUP BY firm
ORDER BY total_sales DESC;
Query Result:
[{'firm': 'A TO Z SECUTECH PRIVATE LIMITED', 'total_sales': Decimal('1590230')}, {'firm': 'CONNET TECHNOLOGIES', 'total_sales': Decimal('1428948')}, {'firm': 'M/s A.B.S. System Solutions', 'total_sales': Decimal('1275314')}, {'firm': 'UNIQUE SYSTEM SOLUTION', 'total_sales': Decimal('604105')}, {'firm': 'QUADRA SECURITY LLP', 'total_sales': Decimal('466108.14')}, {'firm': 'M/S VISION INDIA TECHNOLOGIES', 'total_sales': Decimal('410635')}, {'firm': 'NET CLOUD SERVICES', 'total_sales': Decimal('354067')}, {'firm': 'PAMELON NETWORKS', 'total_sales': Decimal('351220')}, {'firm': 'ADVANCE SECURITY & IT', 'total_sales': Decimal('337000')}, {'firm': 'PXE TECHNOLOGIES', 'total_sales': Decimal('315110')}, {'firm': 'MARKS COMPUTERS', 'total_sales': Decimal('277180')}, {'firm': 'DEE & ESS SECURITY SYSTEMS', 'total_sales': Decimal('268000')}, {'firm': 'BA