This notebook contains experiments on generating ETL jobs (SQL `INSERT` statements that populate a fact table from source tables) with an LLM.

In [89]:
# Source-table fixtures for the ETL-generation tests.
#
# Each list entry is one test case: a dict mapping a table name to the body of
# its table definition (column and constraint definitions, comma separated).
# Entry i is paired with generated_fact_schemas[i], so the order of the entries
# must not change. table_query_creation() splits every body on "," and strips
# each piece before building the creation query.
#
# Fixture corrections relative to the first draft:
#   * "primary_key" -> "primary key" (the typo silently declared no key),
#   * missing comma between two FOREIGN KEY clauses (case 4, calls),
#   * missing column type on drives_id / driver_id (case 5),
#   * decimal(3,4) -> decimal(7,4): scale may not exceed precision (case 13),
#   * exam_dim.exam_id is a foreign-key target, so it is a primary key (case 17).
test_schemas = [
    # 0
    {
        "battles": """
        battle_id INT PRIMARY KEY,
        player_id INT NOT NULL,
        tank_id INT NOT NULL,
        damage_dealt INT NOT NULL,
        damage_blocked INT NOT NULL,
        damage_assisted INT NOT NULL,
        battle_played_date DATETIME NOT NULL,
        FOREIGN KEY (player_id) REFERENCES players(player_id),
        FOREIGN KEY (tank_id) REFERENCES tanks(tank_id)
        """,
        "players": """
        player_id int primary key,
        username varchar(30),
        battles int,
        winrate decimal(3,2)
        """,
        "tanks": """
        tank_id int primary key,
        name varchar(100),
        tier int
        """
    },
    # 1
    {
        "orders": """
        order_id int primary key,
        order_date DATETIME,
        quantity decimal(4,2),
        price decimal(6,2),
        customer_id int, 
        product_id int,
        FOREIGN KEY (product_id) REFERENCES products(product_id),
        FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
        """,
        "products": """
        product_id int primary key,
        product_name varchar(100),
        product_price decimal(4,2)
        """,
        "customers": """
        customer_id int primary key, 
        customer_name varchar(100),
        country varchar(100)
        """
    },
    # 2
    {
        "orders": """
        order_id int primary key,
        order_date DATETIME,
        quantity decimal(4,2),
        price decimal(6,2),
        customer_id int, 
        product_id int,
        FOREIGN KEY (product_id) REFERENCES products(product_id),
        FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
        """,
        "products": """
        product_id int primary key,
        product_name varchar(100),
        product_price decimal(4,2)
        """,
        "customers": """
        customer_id int primary key, 
        customer_name varchar(100),
        country varchar(100)
        """
    },
    # 3
    {
        "calls": """
        call_id int primary key,
        caller_id int,
        receiver_id int,
        call_date DATETIME,
        duration decimal(4,2),
        FOREIGN KEY (caller_id) REFERENCES customers(customer_id),
        FOREIGN KEY (receiver_id) REFERENCES customers(customer_id)
        """,
        "customers": """
        customer_id int primary key,
        name varchar(100),
        plan_id int,
        FOREIGN KEY (plan_id) REFERENCES plan(plan_id)
        """,
        "plan": """
        plan_id int primary key,
        description varchar(255)
        """
    },
    # 4
    {
        "calls": """
        call_id int primary key,
        caller_id int,
        receiver_id int,
        date DATETIME,
        duration decimal(4,2),
        FOREIGN KEY (caller_id) REFERENCES customers(customer_id),
        FOREIGN KEY (receiver_id) REFERENCES customers(customer_id)
        """,
        "customers": """
        customer_id int primary key,
        name varchar(100),
        plan_id int,
        phone_id int,
        FOREIGN KEY (plan_id) REFERENCES plan(plan_id),
        FOREIGN KEY (phone_id) REFERENCES phone(phone_id)
        """,
        "phone": """
        phone_id int primary key,
        version varchar(50),
        company varchar(50)
        """,
        "plan": """
        plan_id int primary key,
        description varchar(255)
        """
    },
    # 5
    {
        "drives": """
        drives_id int primary key,
        driver_id int,
        car_id int,
        fuel_burnt decimal(4,2),
        days_out int,
        mileage decimal(5,2),
        FOREIGN KEY (driver_id) REFERENCES drivers(driver_id),
        FOREIGN KEY (car_id) REFERENCES cars(car_id)
        """,
        "drivers": """
        driver_id int primary key,
        driver_name varchar(100),
        driver_age int
        """,
        "cars": """
        car_id int primary key,
        car_model varchar(100),
        car_manufacturer int,
        FOREIGN KEY (car_manufacturer) REFERENCES car_manufacturer(manufacturer_id)
        """,
        "car_manufacturer": """
        manufacturer_id int primary key,
        name varchar(100), 
        country varchar(50)
        """
    },
    # 6
    {
        "battles": """
        battle_id int primary key,
        player_id int,
        tank_id int,
        damage_dealt int,
        damage_blocked int,
        damage_assisted int,
        battle_played DATETIME,
        FOREIGN KEY (player_id) REFERENCES players(player_id),
        FOREIGN KEY (tank_id) REFERENCES tanks(tank_id)
        """,
        "players": """
        player_id int primary key,
        username varchar(30),
        battles int,
        winrate decimal(3,2)
        """,
        "tanks": """
        tank_id int primary key,
        name varchar(100),
        tier int
        """
    },
    # 7
    {
        "orders": """
        order_id int primary key,
        product_id int,
        order_date DATETIME,
        quantity decimal(4,2),
        price decimal(6,2),
        customer_id int, 
        FOREIGN KEY (product_id) REFERENCES products(product_id),
        FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
        """,
        "customers": """
        customer_id int primary key, 
        customer_name varchar(100),
        country varchar(100)
        """,
        "products": """
        product_id int primary key,
        product_name varchar(100),
        product_price decimal(4,2)
        """
    },
    # 8
    {
        "orders": """
        order_id int primary key,
        product_id int,
        order_date DATETIME,
        quantity decimal(4,2),
        total_price decimal(6,2),
        customer_id int, 
        FOREIGN KEY (product_id) REFERENCES products(product_id),
        FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
        """,
        "products": """
        product_id int primary key,
        product_name varchar(100),
        product_price decimal(4,2)
        """,
        "customers": """
        customer_id int primary key, 
        customer_name varchar(100),
        country varchar(100)
        """
    },
    # 9
    {
        "calls": """
        call_id int primary key,
        caller_id int,
        receiver_id int,
        date DATETIME,
        duration decimal(4,2),
        FOREIGN KEY (caller_id) REFERENCES customers(customer_id),
        FOREIGN KEY (receiver_id) REFERENCES customers(customer_id)
        """,
        "customers": """
        customer_id int primary key,
        name varchar(100),
        plan_id int,
        phone_id int,
        FOREIGN KEY (plan_id) REFERENCES plan(plan_id),
        FOREIGN KEY (phone_id) REFERENCES phone(phone_id)
        """,
        "phone": """
        phone_id int primary key,
        version varchar(50),
        company varchar(50)
        """,
        "plan": """
        plan_id int primary key,
        description varchar(255)    
        """
    },
    # 10
    {
        "calls": """
        call_id int primary key,
        caller_id int,
        receiver_id int,
        call_date DATETIME,
        duration decimal(4,2),
        FOREIGN KEY (caller_id) REFERENCES customers(customer_id),
        FOREIGN KEY (receiver_id) REFERENCES customers(customer_id)
        """,
        "customers": """
        customer_id int primary key,
        name varchar(100),
        plan_id int,
        phone_id int,
        FOREIGN KEY (plan_id) REFERENCES plan(plan_id),
        FOREIGN KEY (phone_id) REFERENCES phone(phone_id)
        """,
        "phone": """
        phone_id int primary key,
        version varchar(50),
        company varchar(50)
        """,
        "plan": """
        plan_id int primary key,
        description varchar(255)    
        """
    },
    # 11
    {
        "battles": """
        battle_id int primary key,
        player_id int,
        tank_id int,
        damage_dealt int,
        damage_blocked int,
        damage_assisted int,
        battle_played DATE,
        FOREIGN KEY (player_id) REFERENCES players(player_id),
        FOREIGN KEY (tank_id) REFERENCES tanks(tank_id)  
        """,
        "players": """
        player_id int primary key,
        username varchar(30),
        battles int,
        winrate decimal(3,2) 
        """,
        "tanks": """
        tank_id int primary key,
        name varchar(100),
        tier int    
        """
    },
    # 12
    {
        "battles": """
        battle_id int primary key,
        player_id int,
        tank_id int,
        damage_dealt int,
        damage_blocked int,
        damage_assisted int,
        battle_played DATE,
        FOREIGN KEY (player_id) REFERENCES players(player_id),
        FOREIGN KEY (tank_id) REFERENCES tanks(tank_id)  
        """,
        "players": """
        player_id int primary key,
        username varchar(30),
        battles int,
        winrate decimal(3,2) 
        """,
        "tanks": """
        tank_id int primary key,
        name varchar(100),
        tier int    
        """,
    },
    # 13
    {
        "races": """
        race_id int primary key,
        location_id int,
        season_id int,
        race_winner_number int,
        fasted_lap decimal(7,4),
        FOREIGN KEY (location_id) REFERENCES locations(location_id),
        FOREIGN KEY (season_id) REFERENCES seasons(season_id),
        FOREIGN KEY (race_winner_number) REFERENCES racers(race_number)
        """,
        "locations": """
        location_id int primary key,
        country varchar(30),
        city varchar(25)
        """,
        "seasons": """
        season_id int primary key,
        season varchar(20)
        """,
        "racers": """
        race_number int primary key,
        racer_name varchar(100)
        """
    },
    # 14
    {
        "flights": """
        flight_number int primary key,
        plane_number int,
        departure_place int,
        destination_place int,
        departure_time DATETIME,
        arrival_time DATETIME,
        FOREIGN KEY (plane_number) REFERENCES planes(plane_number),
        FOREIGN KEY (departure_place) REFERENCES place(place_id),
        FOREIGN KEY (destination_place) REFERENCES place(place_id)
        """,
        "planes": """
        plane_number int primary key,
        plane_company int,
        plane_model varchar(50),
        flights_number int,
        flight_hours decimal(5, 2),
        FOREIGN KEY (plane_company) REFERENCES companies(company_id)
        """,
        "companies": """
        company_id int primary key,
        name varchar(50),
        number_planes int
        """,
        "place": """
        place_id int primary key,
        country varchar(30)
        """
    },
    # 15
    {
        "flights": """
        flight_number int primary key,
        plane_number int,
        departure_place int,
        destination_place int,
        departure_time DATE,
        arrival_time DATE,
        FOREIGN KEY (plane_number) REFERENCES planes(plane_number),
        FOREIGN KEY (departure_place) REFERENCES place(place_id),
        FOREIGN KEY (destination_place) REFERENCES place(place_id)
        """,
        "planes": """
        plane_number int primary key,
        plane_company int,
        plane_model varchar(50),
        flights_number int,
        flight_hours decimal(5, 2),
        FOREIGN KEY (plane_company) REFERENCES companies(company_id)
        """,
        "companies": """
        company_id int primary key,
        name varchar(50),
        number_planes int
        """,
        "place": """
        place_id int primary key,
        country varchar(30)
        """
    },
    # 16
    {
        "flights": """
        flight_number int primary key,
        plane_number int,
        departure_place int,
        destination_place int,
        flight_duration decimal(5,2),
        FOREIGN KEY (plane_number) REFERENCES planes(plane_number),
        FOREIGN KEY (departure_place) REFERENCES place(place_id),
        FOREIGN KEY (destination_place) REFERENCES place(place_id)
        """,
        "planes": """
        plane_number int primary key,
        plane_company int,
        plane_model varchar(50),
        flights_number int,
        flight_hours decimal(5, 2),
        FOREIGN KEY (plane_company) REFERENCES companies(company_id)
        """,
        "companies": """
        company_id int primary key,
        name varchar(50),
        number_planes int
        """,
        "place": """
        place_id int primary key,
        country varchar(30)
        """,
    },
    # 17
    {
        "exams": """
        exam_id int,
        student_id int,
        score decimal(3,2),
        primary key (exam_id, student_id),
        FOREIGN KEY (student_id) REFERENCES students(student_id),
        FOREIGN KEY (exam_id) REFERENCES exam_dim(exam_id)
        """,
        "students": """
        student_id int primary key,
        student_name varchar(100),
        class_id int,
        FOREIGN KEY (class_id) REFERENCES classes(class_id)
        """,
        "classes": """
        class_id int primary key,
        grade int,
        letter varchar(2),
        number_students int
        """,
        "exam_dim": """
        exam_id int primary key,
        class varchar(20),
        date DATE
        """,
    },
    # 18
    {
        "payments": """
        payment_id int primary key,
        date DATE,
        sender_id int,
        receiver_id int,
        amount decimal(6,2),
        FOREIGN KEY (sender_id) REFERENCES customers(customer_id),
        FOREIGN KEY (receiver_id) REFERENCES customers(customer_id)
        """,
        "customers": """
        customer_id int primary key,
        name varchar(100),
        balance decimal(7,2)
        """
    },
    # 19
    {
        "exams": """
        exam_id int,
        student_id int,
        score decimal(3,2),
        primary key (exam_id, student_id),
        FOREIGN KEY (student_id) REFERENCES students(student_id),
        FOREIGN KEY (exam_id) REFERENCES exam_dim(exam_id)
        """,
        "exam_dim": """
        exam_id int primary key,
        subject varchar(20),
        date DATE
        """,
        "classes": """
        class_id int primary key,
        grade int,
        letter varchar(2),
        number_students int
        """,
        "students": """
        student_id int primary key,
        student_name varchar(100),
        class_id int,
        FOREIGN KEY (class_id) REFERENCES classes(class_id)
        """
    },
    # 20
    {
        "flights": """
        flight_id int primary key,
        plane_id int,
        company_id int,
        tickets_sold int,
        ticket_price decimal(4,2),
        operation_day DATE,
        FOREIGN KEY (plane_id) REFERENCES planes(plane_id),
        FOREIGN KEY (company_id) REFERENCES companies(company_id)
        """,
        "planes": """
        plane_id int primary key,
        plane_model varchar(100),
        plane_manufacturer varchar(100)
        """,
        "companies": """
        company_id int primary key,
        name varchar(100)
        """
    },
    # 21
    {
        "flights": """
        flight_id int primary key,
        plane_id int,
        company_id int,
        tickets_sold int,
        operation_day DATE,
        FOREIGN KEY (plane_id) REFERENCES planes(plane_id),
        FOREIGN KEY (company_id) REFERENCES companies(company_id)
        """,
        "planes": """
        plane_id int primary key,
        plane_model varchar(100),
        plane_manufacturer varchar(100)
        """,
        "companies": """
        company_id int primary key,
        name varchar(100)
        """
    },
    # 22
    {
        "battles": """
        battle_id int primary key,
        player_id int,
        tank_id int,
        damage_dealt int,
        damage_blocked int,
        damage_assisted int,
        battle_played DATE,
        FOREIGN KEY (player_id) REFERENCES players(player_id),
        FOREIGN KEY (tank_id) REFERENCES tanks(tank_id)
        """,
        "players": """
        player_id int primary key,
        username varchar(30),
        battles int,
        winrate decimal(3,2)
        """,
        "tanks": """
        tank_id int primary key,
        name varchar(100),
        tier int
        """
    },
    # 23
    {
        "battles": """
        battle_id int primary key,
        player_id int,
        tank_id int,
        damage_dealt int,
        damage_blocked int,
        damage_assisted int,
        battle_played DATE,
        FOREIGN KEY (player_id) REFERENCES players(player_id),
        FOREIGN KEY (tank_id) REFERENCES tanks(tank_id)
        """,
        "players": """
        player_id int primary key,
        username varchar(30),
        battles int,
        winrate decimal(3,2)
        """,
        "tanks": """
        tank_id int primary key,
        name varchar(100),
        tier int
        """
    },
    # 24
    {
        "calls": """
        call_id int primary key,
        caller_id int,
        receiver_id int,
        call_date DATETIME,
        duration decimal(4,2),
        FOREIGN KEY (caller_id) REFERENCES customers(customer_id),
        FOREIGN KEY (receiver_id) REFERENCES customers(customer_id)
        """,
        "customers": """
        customer_id int primary key,
        name varchar(100),
        plan_id int,
        phone_id int,
        FOREIGN KEY (plan_id) REFERENCES plan(plan_id),
        FOREIGN KEY (phone_id) REFERENCES phone(phone_id)
        """,
        "phone": """
        phone_id int primary key,
        version varchar(50),
        company varchar(50)
        """,
        "plan": """
        plan_id int primary key,
        description varchar(255)
        """
    },
]

In [71]:
# Fact-table DDL statements, one per test case.
#
# Entry i is used together with test_schemas[i] (both are indexed with the
# same counter when the ETL prompt is built), so the order must stay aligned.
# table_name_extraction() reads the table name from the second line of each
# string (the "CREATE TABLE <name> (" line), so every entry has to start with
# a newline followed by that line.
generated_fact_schemas = [
    """
    CREATE TABLE fact_battles (
    user_id INT NOT NULL,
    tank_id INT NOT NULL,
    total_damage_dealt INT,
    total_damage_blocked INT,
    total_damage_assisted INT,
    PRIMARY KEY (user_id, tank_id),
    FOREIGN KEY (user_id) REFERENCES players(player_id),
    FOREIGN KEY (tank_id) REFERENCES tanks(tank_id)
    );
    """,
    """
    CREATE TABLE fact_orders (
    product_id int,
    customer_id int,
    total_spending decimal(10,2),
    PRIMARY KEY (product_id, customer_id),
    FOREIGN KEY (product_id) REFERENCES products(product_id),
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
    );
    """,
    """
    CREATE TABLE fact_orders (
    product_id int,
    total_spending decimal(10,2),
    quantity decimal(4,2),
    customer_id int,
    PRIMARY KEY (product_id), 
    FOREIGN KEY (product_id) REFERENCES products(product_id),
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
    );
    """,
    """
    CREATE TABLE call_duration_facts (
    caller_id int,
    receiver_id int,
    total_duration int,
    PRIMARY KEY (caller_id, receiver_id),
    FOREIGN KEY (caller_id) REFERENCES customers(customer_id),
    FOREIGN KEY (receiver_id) REFERENCES customers(customer_id)
    );
    """,
    """
    CREATE TABLE fact_calls (
    plan_id int,
    phone_id int,
    total_duration int,
    PRIMARY KEY (plan_id, phone_id),
    FOREIGN KEY (plan_id) REFERENCES plan(plan_id),
    FOREIGN KEY (phone_id) REFERENCES phone(phone_id)
    );
    """,
    """
    CREATE TABLE fact_drives (
    manufacturer_id INT,
    total_mileage INT,
    avg_days_out FLOAT,
    FOREIGN KEY (manufacturer_id) REFERENCES car_manufacturer(manufacturer_id)
    );
    """,
    """
    CREATE TABLE tank_battle_statistics (
    tank_id int,
    average_damage_dealt decimal(10,2),
    total_blocked int,
    total_assisted int,
    total_battles int,
    PRIMARY KEY (tank_id),
    FOREIGN KEY (tank_id) REFERENCES tanks(tank_id)
    );
    """,
    """
    CREATE TABLE sales_fact (
    product_id int,
    order_date_hourly int,
    total_order_price decimal(10,2),
    PRIMARY KEY (product_id, order_date_hourly),
    FOREIGN KEY (product_id) REFERENCES products(product_id)
    );
    """,
    """
    CREATE TABLE fact_sales (
    product_id int,
    order_date_monthly int,
    product_count int,
    FOREIGN KEY (product_id) REFERENCES products(product_id),
    PRIMARY KEY (product_id, order_date_monthly)
    );
    """,
    """
    CREATE TABLE fact_calls (
    phone_id int,
    plan_id int,
    call_count int,
    PRIMARY KEY (phone_id, plan_id),
    FOREIGN KEY (phone_id) REFERENCES phone(phone_id),
    FOREIGN KEY (plan_id) REFERENCES plan(plan_id)
    );
    """,
    """
    CREATE TABLE fact_calls (
    phone_id int,
    plan_id int,
    call_daily int,
    call_count int,
    PRIMARY KEY (phone_id, plan_id, call_daily),
    FOREIGN KEY (phone_id) REFERENCES phone(phone_id),
    FOREIGN KEY (plan_id) REFERENCES plan(plan_id)
    );
    """,
    """
    CREATE TABLE fact_battles (
    player_id int,
    tank_id int,
    battle_count int,
    total_damage_dealt int,
    average_damage_dealt decimal(10,2),
    PRIMARY KEY (player_id, tank_id),
    FOREIGN KEY (player_id) REFERENCES players(player_id),
    FOREIGN KEY (tank_id) REFERENCES tanks(tank_id)
    );
    """,
    """
    CREATE TABLE fact_battles (
    player_id int,
    tank_id int,
    battle_played_daily int,
    battle_count int,
    total_damage_dealt int,
    avg_damage_dealt decimal(10,2),
    PRIMARY KEY (player_id, tank_id, battle_played_daily),
    FOREIGN KEY (player_id) REFERENCES players(player_id),
    FOREIGN KEY (tank_id) REFERENCES tanks(tank_id)
    );
    """,
    """
    CREATE TABLE race_results (
    season_id int,
    race_count int,
    PRIMARY KEY (season_id),
    FOREIGN KEY (season_id) REFERENCES seasons(season_id)
    );
    """,
    """
    CREATE TABLE flight_fact (
    company_id int,
    departure_time_daily int,
    flight_count int,
    PRIMARY KEY (company_id, departure_time_daily),
    FOREIGN KEY (company_id) REFERENCES companies(company_id)
    );
    """,
    """
    CREATE TABLE flight_facts (
    company_id int,
    departure_place int,
    destination_place int,
    departure_time_daily_daily int,
    flight_count int,
    PRIMARY KEY (company_id, departure_place, destination_place, departure_time_daily_daily),
    FOREIGN KEY (company_id) REFERENCES companies(company_id),
    FOREIGN KEY (departure_place) REFERENCES place(place_id),
    FOREIGN KEY (destination_place) REFERENCES place(place_id)
    );
    """,
    """
    CREATE TABLE flight_facts (
    departure_place int,
    destination_place int,
    min_flight_duration decimal(5,2),
    max_flight_duration decimal(5,2),
    PRIMARY KEY (departure_place, destination_place),
    FOREIGN KEY (departure_place) REFERENCES place(place_id),
    FOREIGN KEY (destination_place) REFERENCES place(place_id)
    );
    """,
    """
    CREATE TABLE exam_scores_fact (
    class_id int,
    exam_id int,
    min_score decimal(3,2),
    max_score decimal(3,2),
    student_min_score int,
    student_max_score int,
    PRIMARY KEY (class_id, exam_id),
    FOREIGN KEY (class_id) REFERENCES classes(class_id),
    FOREIGN KEY (exam_id) REFERENCES exam_dim(exam_id)
    );
    """,
    """
    CREATE TABLE fact_transaction (
    customer_id int,
    date_monthly_monthly int,
    spendings_over_month_min decimal(6,2),
    spendings_over_month_max decimal(6,2),
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id),
    PRIMARY KEY (customer_id, date_monthly_monthly)
);
    """,
    """
    CREATE TABLE exam_scores_fact (
    class_id int,
    exam_id int,
    average_score decimal(5,2),
    primary key (class_id, exam_id),
    FOREIGN KEY (class_id) REFERENCES classes(class_id),
    FOREIGN KEY (exam_id) REFERENCES exam_dim(exam_id)
    );
    """,
    """
    CREATE TABLE flight_facts (
    company_id int,
    avg_tickets_sold decimal(10,2),
    total_ticket_price decimal(10,2),
    PRIMARY KEY (company_id),
    FOREIGN KEY (company_id) REFERENCES companies(company_id)
    );
    """,
    """
    CREATE TABLE flight_aggregates (
    company_id int,
    operation_day_monthly_monthly int,
    avg_tickets_sold float,
    PRIMARY KEY (company_id, operation_day_monthly_monthly),
    FOREIGN KEY (company_id) REFERENCES companies(company_id)
    );
    """,
    """
    CREATE TABLE fact_battle_stats (
    player_id int,
    tank_id int,
    avg_damage_dealt decimal(10,2),
    PRIMARY KEY (player_id, tank_id),
    FOREIGN KEY (player_id) REFERENCES players(player_id),
    FOREIGN KEY (tank_id) REFERENCES tanks(tank_id)
    );
    """,
    """
    CREATE TABLE fact_battles (
    player_id INT,
    battle_played_monthly INT,
    avg_damage_dealt DECIMAL(10, 2),
    PRIMARY KEY (player_id, battle_played_monthly),
    FOREIGN KEY (player_id) REFERENCES players(player_id)
    );
    """,
    """
    CREATE TABLE fact_calls (
    customer_id int,
    date_daily int,
    avg_call_id decimal(10,2),
    sum_duration decimal(10,2),
    PRIMARY KEY (customer_id, date_daily),
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
    );
    """,
    
]

In [72]:
# Inspect a single generated fact-table DDL; index 16 is the flight_facts
# table with min/max flight duration.
print(generated_fact_schemas[16])


    CREATE TABLE flight_facts (
    departure_place int,
    destination_place int,
    min_flight_duration decimal(5,2),
    max_flight_duration decimal(5,2),
    PRIMARY KEY (departure_place, destination_place),
    FOREIGN KEY (departure_place) REFERENCES place(place_id),
    FOREIGN KEY (destination_place) REFERENCES place(place_id)
    );
    


In [3]:
# testing state
from typing_extensions import TypedDict

class TestingState(TypedDict):
    """Typed dictionary describing the state of an ETL-generation test run.

    NOTE(review): in the visible notebook this type is only used as the
    parameter annotation of ``etl_generator``; the per-field notes below are
    inferred from the field names and should be confirmed.
    """
    test_count: int  # presumably the index/number of the current test case - TODO confirm
    user_input: str  # presumably the prompt text sent to the model - TODO confirm
    model_output: str  # presumably the raw text returned by the model - TODO confirm
    errors: str  # presumably error text collected while running the job - TODO confirm

In [18]:
import yaml

# Load the prompt and SQL templates used by every helper below. The encoding
# is given explicitly so the file is read the same way on every platform
# instead of depending on the locale's default encoding.
with open("config/configs_etl.yaml", "r", encoding="utf-8") as f:
    data = yaml.safe_load(f)

# Smoke test of the template: render the ETL prompt for the first test case.
a = data["etl"]["role"].format(tables_schema=test_schemas[0], generated_fact_table=generated_fact_schemas[0])

In [19]:
import getpass
import os

# Ask for the OpenAI key interactively when it is missing (or empty) in the
# environment, so the key never has to be written into the notebook.
api_key_available = bool(os.environ.get("OPENAI_API_KEY"))
if not api_key_available:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain.chat_models import init_chat_model

# Chat model shared by all helper functions in this notebook.
llm = init_chat_model("gpt-4o-mini", model_provider="openai")

Create the local SQLite database used to test the generated ETL jobs

In [90]:
import sqlite3

# One SQLite database file and one cursor are shared by every helper below.
DB_PATH = 'test_v1.db'
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

In [68]:
import re

def sql_query_extraction(query):
    """Return every ``INSERT ... ;`` statement contained in ``query``.

    A statement starts at the upper-case keyword ``INSERT`` and ends at the
    first following semicolon; it may span several lines. The match is case
    sensitive.

    Args:
        query: Text to search, e.g. the raw content of a model response.

    Returns:
        The matched statements as a list of strings, in order of appearance.

    Raises:
        ValueError: If the text contains no such statement.
    """
    insert_pattern = re.compile(r'(INSERT.*?;)', re.DOTALL)
    statements = insert_pattern.findall(query)
    if len(statements) == 0:
        raise ValueError("No INSERT queries found in the provided text.")
    return list(statements)

def table_query_creation(counter):
    """Build the table-creation queries for test case ``counter``.

    Every table body in ``test_schemas[counter]`` is split on commas, each
    piece has its newlines removed and is stripped, and the pieces are joined
    again with ", " before being placed into the ``table_creation`` template
    from the loaded config.

    Args:
        counter: Index of the test case in ``test_schemas``.

    Returns:
        One query string per table, in the dict's insertion order.
    """
    return [
        data["etl"]["utils"]["table_creation"].format(
            table_name=table_name,
            table_content=", ".join(
                piece.replace('\n', '').strip() for piece in table_body.split(",")
            ),
        )
        for table_name, table_body in test_schemas[counter].items()
    ]

def table_content_insertion(counter):
    """Fill the source tables of test case ``counter`` with model-written rows.

    The table-creation queries of the test case are joined with "," and put
    into the ``prompt`` template from the config; the model response is
    scanned for INSERT statements, and each one is executed and committed.

    Args:
        counter: Index of the test case in ``test_schemas``.

    Raises:
        ValueError: If the model response contains no INSERT statement
            (propagated unchanged from ``sql_query_extraction``).
    """
    schema_text = ",".join(table_query_creation(counter))
    response = llm.invoke(data["etl"]["prompt"].format(table_schemas=schema_text))
    for statement in sql_query_extraction(response.content):
        cursor.execute(statement)
        conn.commit()
    
def table_content_extraction(counter):
    """Print the current rows of every source table of test case ``counter``.

    For each table name in ``test_schemas[counter]`` the
    ``table_content_extraction`` template from the config is executed and the
    fetched rows are printed under a short header.

    Args:
        counter: Index of the test case in ``test_schemas``.
    """
    for table_name in test_schemas[counter]:
        select_sql = data["etl"]["utils"]["table_content_extraction"].format(table_name=table_name)
        result = cursor.execute(select_sql)
        print('From a table ' + table_name)
        print(result.fetchall())

def table_deletion(counter):
    """Drop all tables that belong to test case ``counter``.

    Runs the ``table_deletion`` template from the config for every source
    table in ``test_schemas[counter]`` and then for the fact table whose name
    is read from ``generated_fact_schemas[counter]``.

    Args:
        counter: Index of the test case.
    """
    drop_template = data["etl"]["utils"]["table_deletion"]
    table_names = list(test_schemas[counter].keys())
    table_names.append(table_name_extraction(counter))
    for table_name in table_names:
        cursor.execute(drop_template.format(table_name=table_name))
        # The original wrote `conn.commit` without parentheses, which only
        # references the method and never commits. Call it so the drops are
        # persisted even when an earlier statement left a transaction open.
        conn.commit()

def etl_job_generation(counter):
    """Ask the model for an ETL job, run it and print the resulting fact table.

    The ``role`` template from the config is rendered with the source schemas
    and the fact-table DDL of test case ``counter``. Only the first INSERT
    statement found in the model response is executed.

    Args:
        counter: Index of the test case.

    Raises:
        ValueError: If the response contains no INSERT statement, or if
            executing the statement / reading back the fact table fails. In
            the second case the original exception is chained as the cause
            and its message is included, so the reason is visible to callers
            that only print the error.
    """
    prompt = data["etl"]["role"].format(
        tables_schema=test_schemas[counter],
        generated_fact_table=generated_fact_schemas[counter],
    )
    response = llm.invoke(prompt)
    etl_sql = sql_query_extraction(response.content)[0]
    print(etl_sql)
    try:
        cursor.execute(etl_sql)
        conn.commit()
        table_name = table_name_extraction(counter)
        query = data["etl"]["utils"]["table_content_extraction"].format(table_name=table_name)
        result = cursor.execute(query)
        print('From a table ' + table_name)
        print(result.fetchall())
    except Exception as exc:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
        # SystemExit are no longer converted into a test failure.
        raise ValueError(f"Error while trying to execute the ETL job: {exc}") from exc

def table_name_extraction(counter):
    """Return the fact-table name declared in ``generated_fact_schemas[counter]``.

    The name is taken from the first ``CREATE TABLE <name>`` occurrence in the
    DDL (an optional ``IF NOT EXISTS`` is skipped). Unlike the earlier
    positional parsing, this does not depend on the statement being on the
    second line, on single spaces between the keywords, or on a space before
    the opening parenthesis.

    Args:
        counter: Index of the test case.

    Returns:
        The table name as a string.

    Raises:
        ValueError: If the DDL contains no CREATE TABLE statement.
    """
    ddl = generated_fact_schemas[counter]
    match = re.search(
        r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?([^\s(]+)",
        ddl,
        re.IGNORECASE,
    )
    if match is None:
        raise ValueError(f"No CREATE TABLE statement found in generated_fact_schemas[{counter}].")
    return match.group(1)


def testing_loop(counter):
    """Run one complete test case and clean up its tables afterwards.

    Steps: create the source tables and the fact table, let the model fill
    the source tables, print their content, generate and execute the ETL job,
    then drop every table of the test case.

    The fact table is created before the source data is inserted so that
    ``table_deletion`` (which also drops the fact table) can run on every exit
    path. Previously a failed insertion returned without dropping anything,
    leaving the tables behind for the following test cases.

    Args:
        counter: Index of the test case.

    Returns:
        1 if the ETL job was executed, -1 if a step failed with a
        ``ValueError`` or a ``sqlite3.Error`` (the error is printed).
    """
    for create_sql in table_query_creation(counter):
        cursor.execute(create_sql)
        conn.commit()
    print("Created tables")
    cursor.execute(generated_fact_schemas[counter])
    conn.commit()
    try:
        table_content_insertion(counter)
        print("inserted elements")
        table_content_extraction(counter)
        etl_job_generation(counter)
        return 1
    except (ValueError, sqlite3.Error) as e:
        # sqlite3.Error covers invalid SQL written by the model for the seed
        # data; it is counted as a failed case instead of aborting the run.
        print(e)
        return -1
    finally:
        table_deletion(counter)
    
def testing(j=0):
    """Run the test cases from index ``j`` to the end and report the rates.

    Args:
        j: Index of the first test case to run (default 0).

    Returns:
        A tuple ``(correct_rate, wrong_rate)`` of fractions of the executed
        cases. If no case was executed (``j`` is at or past the end of
        ``test_schemas``) the result is ``(0.0, 0.0)`` instead of a
        ZeroDivisionError.
    """
    correct = 0
    wrong = 0
    for i in range(j, len(test_schemas)):
        print(i)
        if testing_loop(i) < 0:
            wrong += 1
        else:
            correct += 1
    total = correct + wrong
    if total == 0:
        return (0.0, 0.0)
    return (correct / total, wrong / total)

In [91]:
# Run the automated loop from test case 19 to the end and show the
# fractions of successful and failed cases.
cor, wrong = testing(19)
print(cor, wrong)

19
Created tables
inserted elements
From a table exams
[(1, 1, 95.5)]
From a table exam_dim
[(1, 'Mathematics', '2023-10-01')]
From a table classes
[(1, 10, 'A', 25)]
From a table students
[(1, 'John Doe', 1)]
INSERT INTO exam_scores_fact (class_id, exam_id, average_score)
SELECT s.class_id, e.exam_id, AVG(ex.score) AS average_score
FROM exams ex
JOIN exam_dim e ON ex.exam_id = e.exam_id
JOIN students s ON ex.student_id = s.student_id
GROUP BY s.class_id, e.exam_id;
From a table exam_scores_fact
[(1, 1, 95.5)]
20
Created tables
inserted elements
From a table flights
[(1, 1, 1, 150, 299.99, '2023-10-15')]
From a table planes
[(1, 'Boeing 747', 'Boeing')]
From a table companies
[(1, 'Airways Inc.')]
INSERT INTO flight_facts (company_id, avg_tickets_sold, total_ticket_price)
SELECT 
    f.company_id,
    AVG(f.tickets_sold) AS avg_tickets_sold,
    SUM(f.tickets_sold * f.ticket_price) AS total_ticket_price
FROM 
    flights f
GROUP BY 
    f.company_id;
From a table flight_facts
[(1, 150,

In [None]:
# Manual review: for every test case show the schemas and the model's ETL
# answer, then ask the reviewer whether it is correct ("yes" counts as
# correct, anything else as wrong).
correct = 0
wrong = 0

for i in range(len(test_schemas)):
    a = data["etl"]["role"].format(tables_schema=test_schemas[i], generated_fact_table=generated_fact_schemas[i])
    con = llm.invoke(a)
    print(test_schemas[i])
    print(generated_fact_schemas[i])
    print(con.content)
    res = input("correct")
    if res.strip().lower() == "yes":
        correct += 1
    else:
        wrong += 1

# Divide by the number of reviewed cases instead of a hard-coded 25 so the
# rates stay right when test cases are added or removed.
reviewed = (correct + wrong) or 1
print(correct / reviewed, wrong / reviewed)

In [12]:
# Show the most recent model response; `con` is the variable assigned inside
# the manual review loop, so that cell has to be run first.
print(con.content)

```sql
INSERT INTO fact_battles (user_id, tank_id, total_damage_dealt, total_damage_blocked, total_damage_assisted)
SELECT player_id, tank_id, SUM(damage_dealt) AS total_damage_dealt, SUM(damage_blocked) AS total_damage_blocked, SUM(damage_assisted) AS total_damage_assisted
FROM battles
GROUP BY player_id, tank_id;
```


In [None]:
def etl_generator(state: TestingState):
    prompt = data["etl"]["role"].format()