In [1]:
!pip install faker

Collecting faker


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading Faker-36.1.1-py3-none-any.whl.metadata (15 kB)
Downloading Faker-36.1.1-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   --- ------------------------------------ 0.2/1.9 MB 10.2 MB/s eta 0:00:01
   ------------------------ --------------- 1.2/1.9 MB 15.0 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 15.3 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-36.1.1


In [7]:
import sqlite3
import random
from faker import Faker
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [2]:
# --- CONFIGURATION ---
SEED = 42                  # Fixed seed so repeated runs draw the same random sequence
NUM_EMPLOYEES = 1_000_000  # Adjust to increase data volume
BATCH_SIZE = 10_000        # Number of rows per batch
DEPARTMENTS = ['Sales', 'Engineering', 'HR', 'Marketing', 'Finance', 'Operations']

fake = Faker()

# Seed both random sources used by the data generator: the stdlib `random`
# module (departments, salaries) and Faker's shared generator (names, dates).
# NOTE(review): hire dates are drawn relative to 'today', so they can still
# shift when the notebook is run on a different day.
random.seed(SEED)
Faker.seed(SEED)

In [3]:
def create_big_database(db_name='big_database.db'):
    """Create (if needed) and populate the ``employees`` table with synthetic rows.

    Rows are produced with the notebook-level ``fake`` Faker instance and the
    ``random`` module, and are written in batches of ``BATCH_SIZE`` until ids
    ``1..NUM_EMPLOYEES`` have been inserted. Every batch is committed on its own.

    The function may be re-run against an existing database file: a row whose
    ``employee_id`` already exists is replaced instead of raising
    ``sqlite3.IntegrityError``. Rows with ids above ``NUM_EMPLOYEES`` left
    behind by an earlier, larger run are not removed.

    Args:
        db_name: Path of the SQLite database file to create or update.
    """
    conn = sqlite3.connect(db_name)
    try:
        # `with conn` only commits (or rolls back on error); it does not close
        # the connection, which is why the outer try/finally closes it.
        with conn:
            c = conn.cursor()
            # Create employees table
            c.execute('''
                CREATE TABLE IF NOT EXISTS employees (
                    employee_id INTEGER PRIMARY KEY,
                    name TEXT,
                    department TEXT,
                    salary REAL,
                    hire_date TEXT
                )
            ''')
            conn.commit()  # Explicit commit after table creation

            print("Generating employee data...")
            # Insert data in batches
            for batch_start in range(1, NUM_EMPLOYEES + 1, BATCH_SIZE):
                # Exclusive upper bound; the last batch may be shorter.
                batch_stop = min(batch_start + BATCH_SIZE, NUM_EMPLOYEES + 1)
                batch = [
                    (
                        i,
                        fake.name(),
                        random.choice(DEPARTMENTS),
                        round(random.uniform(30000, 200000), 2),
                        fake.date_between(start_date='-20y', end_date='today').isoformat(),
                    )
                    for i in range(batch_start, batch_stop)
                ]
                # OR REPLACE keeps the function re-runnable on an existing file.
                c.executemany('INSERT OR REPLACE INTO employees VALUES (?, ?, ?, ?, ?)', batch)
                # Commit per batch so completed batches survive an interruption.
                conn.commit()
                print(f"Inserted records {batch_start} to {batch_start + len(batch) - 1}")
    finally:
        conn.close()
    print("Big database creation complete and committed.")


In [4]:
def aggregate_data(big_db='big_database.db'):
    """Compute company-wide and per-department aggregates from ``employees``.

    Args:
        big_db: Path of the SQLite database that contains the ``employees`` table.

    Returns:
        dict with the keys ``total_employees``, ``min_salary``, ``max_salary``,
        ``total_expenditure`` and ``department_counts``. ``department_counts``
        is a list of ``(department, count)`` tuples ordered by department name.
        When the table is empty, SQLite returns NULL for MIN/MAX/SUM, so those
        three values are ``None`` and ``total_employees`` is ``0``.
    """
    conn = sqlite3.connect(big_db)
    try:
        c = conn.cursor()
        # Overall aggregates
        c.execute('SELECT COUNT(*), MIN(salary), MAX(salary), SUM(salary) FROM employees')
        total_employees, min_salary, max_salary, total_expenditure = c.fetchone()
        # Aggregates by department; ORDER BY makes the row order explicit
        # instead of relying on how GROUP BY happens to be executed.
        c.execute(
            'SELECT department, COUNT(*) FROM employees '
            'GROUP BY department ORDER BY department'
        )
        dept_counts = c.fetchall()
    finally:
        # The sqlite3 connection context manager does not close the
        # connection, so it is closed explicitly here.
        conn.close()
    return {
        'total_employees': total_employees,
        'min_salary': min_salary,
        'max_salary': max_salary,
        'total_expenditure': total_expenditure,
        'department_counts': dept_counts
    }


In [5]:
def create_aggregated_database(aggregates, agg_db='aggregated_database.db'):
    """Persist pre-computed aggregates into a small SQLite database.

    Two tables are created if missing and filled with ``INSERT OR REPLACE``,
    so calling the function again overwrites rows with the same key:

    * ``company_summary(metric, value)`` - one row per overall metric; the
      value is stored as text, monetary values with two decimals.
    * ``department_counts(department, employee_count)``.

    Args:
        aggregates: dict with the keys ``total_employees``, ``min_salary``,
            ``max_salary``, ``total_expenditure`` and ``department_counts``
            (an iterable of ``(department, count)`` pairs).
        agg_db: Path of the SQLite database file to create or update.
    """
    def _money(amount):
        # A missing aggregate (None, e.g. from an empty source table) is stored
        # as NULL instead of failing on the ':.2f' format.
        return None if amount is None else f"{amount:.2f}"

    summary_data = [
        ('total_employees', str(aggregates['total_employees'])),
        ('min_salary', _money(aggregates['min_salary'])),
        ('max_salary', _money(aggregates['max_salary'])),
        ('total_expenditure', _money(aggregates['total_expenditure']))
    ]

    conn = sqlite3.connect(agg_db)
    try:
        # `with conn` commits on success and rolls back on error; it does not
        # close the connection, hence the surrounding try/finally.
        with conn:
            c = conn.cursor()
            # Create tables for aggregates
            c.execute('''
                CREATE TABLE IF NOT EXISTS company_summary (
                    metric TEXT PRIMARY KEY,
                    value TEXT
                )
            ''')
            c.execute('''
                CREATE TABLE IF NOT EXISTS department_counts (
                    department TEXT PRIMARY KEY,
                    employee_count INTEGER
                )
            ''')
            # Insert overall metrics
            c.executemany('INSERT OR REPLACE INTO company_summary VALUES (?, ?)', summary_data)
            # Insert department counts
            c.executemany('INSERT OR REPLACE INTO department_counts VALUES (?, ?)', aggregates['department_counts'])
    finally:
        conn.close()
    print("Aggregated database created and committed.")


In [6]:
# End-to-end driver: build the raw employees database, aggregate it, store the
# aggregates in a second database, and report the elapsed wall-clock time.
if __name__ == '__main__':
    # Timestamp taken before any work so the final report covers the whole run.
    start_time = datetime.now()
    print("Starting database generation...")

    # Create and commit the big database (default file: big_database.db).
    create_big_database()

    # Compute aggregates from the big database.
    aggregates = aggregate_data()
    print("Aggregates computed:")
    print(f"  Total employees: {aggregates['total_employees']}")
    print(f"  Min salary: ${aggregates['min_salary']:.2f}")
    print(f"  Max salary: ${aggregates['max_salary']:.2f}")
    print(f"  Total expenditure: ${aggregates['total_expenditure']:.2f}")
    # department_counts is a list of (department, count) rows.
    for dept, count in aggregates['department_counts']:
        print(f"    {dept}: {count}")

    # Write the aggregates to the aggregated database (default file: aggregated_database.db).
    create_aggregated_database(aggregates)

    # Subtracting two datetimes yields a timedelta, printed as H:MM:SS.ffffff.
    end_time = datetime.now()
    print("Process completed in:", end_time - start_time)

Starting database generation...
Generating employee data...
Inserted records 1 to 10000
Inserted records 10001 to 20000
Inserted records 20001 to 30000
Inserted records 30001 to 40000
Inserted records 40001 to 50000
Inserted records 50001 to 60000
Inserted records 60001 to 70000
Inserted records 70001 to 80000
Inserted records 80001 to 90000
Inserted records 90001 to 100000
Inserted records 100001 to 110000
Inserted records 110001 to 120000
Inserted records 120001 to 130000
Inserted records 130001 to 140000
Inserted records 140001 to 150000
Inserted records 150001 to 160000
Inserted records 160001 to 170000
Inserted records 170001 to 180000
Inserted records 180001 to 190000
Inserted records 190001 to 200000
Inserted records 200001 to 210000
Inserted records 210001 to 220000
Inserted records 220001 to 230000
Inserted records 230001 to 240000
Inserted records 240001 to 250000
Inserted records 250001 to 260000
Inserted records 260001 to 270000
Inserted records 270001 to 280000
Inserted re

LLM setup: API token and local text-to-SQL model

In [32]:
import os
from getpass import getpass

# SECURITY: API tokens must never be written into the notebook. The token that
# was previously hardcoded in this cell has been stored in plain text and should
# be treated as compromised: revoke it and issue a new one.
# The token is taken from the environment when already set; otherwise it is
# requested interactively without echoing it or saving it in the notebook.
if not os.environ.get("REPLICATE_API_TOKEN"):
    os.environ["REPLICATE_API_TOKEN"] = getpass("Replicate API token: ")


In [34]:
import sqlite3
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [None]:
# One checkpoint id feeds both loaders so the tokenizer and the model weights
# always come from the same repository.
MODEL_ID = "NexaAIDev/DeepSeek-R1-Distill-Llama-8B-NexaQuant"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)


In [None]:
# Text-generation pipeline built from the already loaded model and tokenizer;
# calling it with a prompt string returns a list of generation dicts.
generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
)


In [None]:
def human_to_sql_local(human_query: str, max_new_tokens: int = 128) -> str:
    """
    Uses a locally loaded language model to convert a human language query into SQL.
    The database schema is assumed to be:
      - company_summary(metric TEXT, value TEXT)
      - department_counts(department TEXT, employee_count INTEGER)

    Args:
        human_query: The question in plain English.
        max_new_tokens: Upper bound on the number of tokens generated after the
            prompt. Unlike ``max_length``, this budget is not consumed by the
            prompt itself, so a long question cannot crowd out the answer.

    Returns:
        The generated text up to and including the first ``;`` (or the whole
        completion when no ``;`` is present), stripped of surrounding
        whitespace. The text is model output and is not validated as SQL.
    """
    # Build a prompt that gives context to the model.
    prompt = (
        "You are an expert SQL generator. Convert the following human language question into a SQL query for a SQLite database. \n"
        "The database has two tables:\n"
        "1. company_summary with columns: metric (TEXT), value (TEXT)\n"
        "2. department_counts with columns: department (TEXT), employee_count (INTEGER)\n\n"
        f"Question: \"{human_query}\"\n\nSQL Query:"
    )

    output = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        # Greedy decoding gives a deterministic answer; a temperature of 0.0 is
        # not a valid sampling temperature, so sampling is switched off instead.
        do_sample=False,
        # Ask the pipeline for the completion only instead of removing the
        # prompt afterwards with a string replace.
        return_full_text=False,
    )

    completion = output[0]['generated_text'].strip()

    # sqlite3's cursor.execute() accepts a single statement, so anything the
    # model wrote after the first statement terminator is dropped.
    statement, terminator, _ = completion.partition(';')
    return (statement + terminator).strip()

In [None]:
def run_sql_query(db_path: str, query: str):
    """
    Executes the provided SQL query on the given SQLite database and returns the results.

    Args:
        db_path: Path of the SQLite database file.
        query: A single SQL statement, executed verbatim.

    Returns:
        The list of row tuples from ``fetchall()`` on success. On any failure
        the exception is not raised; the string ``"Error executing query: ..."``
        is returned instead.
    """
    # SECURITY NOTE(review): `query` is run as-is with full write access. When
    # it comes from a language model or from user input, it can modify or drop
    # tables; consider restricting it to SELECT statements or opening the
    # database read-only before exposing this more widely.
    try:
        conn = sqlite3.connect(db_path)
        try:
            # `with conn` commits on success / rolls back on error but leaves
            # the connection open, so it is closed in the finally clause.
            with conn:
                cursor = conn.cursor()
                cursor.execute(query)
                results = cursor.fetchall()
        finally:
            conn.close()
        return results
    except Exception as e:
        return f"Error executing query: {e}"

In [None]:
def main():
    """Interactive flow: read a question, turn it into SQL, run it, print the rows.

    The question is read from standard input, converted with
    ``human_to_sql_local`` and executed against ``aggregated_database.db``
    through ``run_sql_query``. Both the generated SQL and whatever
    ``run_sql_query`` returns are printed.
    """
    print("Welcome! Please enter your question in plain English (no SQL required):")
    question = input("Question: ")

    # Natural language -> SQL via the local model.
    generated_sql = human_to_sql_local(question)
    print("\nGenerated SQL query:", generated_sql, sep="\n")

    # The query runs before the results header is printed.
    rows = run_sql_query("aggregated_database.db", generated_sql)
    print("\nQuery Results:", rows, sep="\n")

In [None]:
# Start the interactive question -> SQL -> results flow when this cell is run.
# main() waits for user input, so this cell does not finish unattended.
if __name__ == "__main__":
    main()