In [39]:
import data as data
import pandas as pd
import json
import tiktoken

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
data.set_db_path('CBSdatabase.db')
ids = data.get_tables_database()

## Cleaning the column metadata so it doesn't include the topic groups

In [65]:
column_info = data.get_column_info(ids)
print(column_info.shape)
# Filter column_info to only include columns which key is not ''
column_info = column_info[column_info['Key'] != '']
print(column_info.shape)

(4997, 15)
(4119, 15)


## Generating the JSON with the column and table information

In [66]:
# column_info = data.get_column_info(ids)
column_info.rename(columns={'Key': 'TitleColumn'}, inplace=True)
column_info.drop(columns=['Title'], inplace=True)
table_info = data.get_all_tables_info()
# join both tables on the column 'Identifier'
all_info = pd.merge(column_info, table_info, on='Identifier')
grouped = all_info.groupby('Identifier')

# Create a new column with the column names for the table_with_columns json
table_info_with_columns = all_info[['TitleColumn', 'Title', 'Identifier', 'Summary', 'Period', 'ShortDescription']]
table_info_with_columns.drop_duplicates(inplace=True)
table_info_with_columns.reset_index(drop=True, inplace=True)
grouped_table_with_columns = table_info_with_columns.groupby('Identifier').agg({
    'Title': 'first',
    'Summary': 'first',
    'Period': 'first',
    'ShortDescription': 'first',
    'TitleColumn': ' | '.join
}).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table_info_with_columns.drop_duplicates(inplace=True)


In [43]:
# This json contains the table info and the column titles

grouped_table_with_columns.to_json('table_info.json', orient='records', indent=2)

In [44]:
output = []
for table_id, group in grouped:
    # Grab the first row to get the common table-level info
    table_level_info = group.iloc[0]
    
    # Create a list of column dictionaries
    columns_list = group[["TitleColumn", "Description"]].to_dict(orient="records")
    
    # Build the structure for this table
    table_dict = {
        "table_id": table_id,
        "title": table_level_info["Title"],
        "summary": table_level_info["Summary"],
        "period": table_level_info["Period"],
        "short_description": table_level_info["ShortDescription"],
        "columns": columns_list
    }
    
    # Add the structure to our output list
    output.append(table_dict)

json_output = json.dumps(output, indent=2)
# save the file
with open('all_info.json', 'w') as f:
    f.write(json_output)


In [67]:
output = {}
for table_id, group in grouped:
    # Grab the first row to get the common table-level info
    table_level_info = group.iloc[0]

    columns_list = []
    for _, row in group.iterrows():
        if row["Type"] == "Dimension":
            columns_list.append({
                "TitleColumn": row["TitleColumn"],
                "Values": ' | '.join(data.execute_query_pandas_df(f"SELECT DISTINCT {row['TitleColumn']} FROM \"{row['Identifier']}\";")[row['TitleColumn']])
            })
        else:
            columns_list.append({
                "TitleColumn": row["TitleColumn"],
                "Description": row["Description"]
            })

    table_dict = {
        # "table_id": table_id,
        "title": table_level_info["Title"],
        "summary": table_level_info["Summary"],
        "period": table_level_info["Period"],
        "short_description": table_level_info["ShortDescription"],
        "columns": columns_list
    }
    output[table_id] = table_dict

json_output = json.dumps(output, indent=2)
# save the file
with open('all_info.json', 'w') as f:
    f.write(json_output)


## Finding one table in the JSON

In [69]:

with open("all_info.json", "r") as file:
    all_info = json.load(file)  # data is a list of dicts

ids = data.get_tables_database()
table_to_find = ids[20]

table_info = all_info[table_to_find]

if table_info:
    print(f"Found table: {table_info}")
else:
    print("Not found.")

Found table: {'title': 'Caribbean Netherlands, Bonaire; import and export values, SITC', 'summary': 'Import, export value and trade balance of Bonaire \naccording to SITC classification.', 'period': '2011 3rd quarter - 2022 3rd quarter.', 'short_description': '\nThis table shows the values of imports and exports and the trade balance of Bonaire. The data are broken down into aggregated categories of goods according to the Standard International Trade Classification (SITC sections). On October 10, 2010, Bonaire became a public body of the Netherlands. \n\nData available from: the third quarter of 2011.\n\nStatus of the figures:\nThe figures on international trade of Bonaire are subject to adjustment when new or updated sources become available. The figures are therefore provisional when they are first published. \nThese provisional figures are adjusted approximately five months after the quarter under review. In the second quarter of the current year, the definite figure of the previous

## Token estimation for user question generation

In [58]:
def estimate_tokens(text, model="gpt-4o"):
    # Initialize tokenizer for the specific model
    tokenizer = tiktoken.encoding_for_model(model)
    # Tokenize the input text
    tokens = tokenizer.encode(text)
    return len(tokens)

In [28]:
# Token estimation for the user quesstion generation

prompt_start = 'Use the following table metadata to generate three specific questions that can be answered by running an SQL query on this table. Each question must be closely aligned to this table’s content and not applicable to a different dataset. Output the questions and the table_id for each in JSON format. Do not include any additional text in your answer.'
prompts = []
n_tokens = 0
for id in ids:
    table_info = all_info[id]
    # write result as string
    table_info = json.dumps(table_info)
    prompt = prompt_start + table_info
    prompts.append(prompt)
    n_tokens += estimate_tokens(prompt) + 150  # Added 150 tokens for the estimated number of tokens in the model's response (see calculation below)
print('Expected number of tokens:', n_tokens)

Expected number of tokens: 431564


In [None]:
# Token estimation for the user quesstion generation

prompt_start = 'Use the following table metadata to generate three specific questions that can be answered by running an SQL query on this table and the corresponding SQL query. Each question must be closely aligned to this table’s content and not applicable to a different dataset. Output the questions and the table_id for each in JSON format. Do not include any additional text in your answer.'
prompts = []
n_tokens = 0
for id in ids:
    table_info = all_info[id]
    # write result as string
    table_info = json.dumps(table_info)
    prompt = prompt_start + table_info
    prompts.append(prompt)
    n_tokens += estimate_tokens(prompt) + 150  # Added 150 tokens for the estimated number of tokens in the model's response (see calculation below)
print('Expected number of tokens:', n_tokens)

In [59]:
answer = """[
  {
    "table_id": "37823eng",
    "question": "What is the total input of natural gas used for electricity production in Combined Heat and Power (CHP) installations across all sectors in 2022?",
    "query": "SELECT SUM(\"NaturalGasInput_2\") AS TotalNaturalGasInput FROM \"37823eng\" WHERE \"CombinedHeatAndPowerCHP\" = 'Combined Heat and Power (CHP)' AND \"Periods\" = '2022';"
  },
  {
    "table_id": "37823eng",
    "question": "Which installation type had the highest electricity output in 2020?",
    "query": "SELECT \"InstallationTypes\", MAX(\"ElectricityOutput_12\") AS MaxElectricityOutput FROM \"37823eng\" WHERE \"Periods\" = '2020' GROUP BY \"InstallationTypes\" ORDER BY MaxElectricityOutput DESC LIMIT 1;"
  },
  {
    "table_id": "37823eng",
    "question": "How much heat was generated by decentral production in the health care and social work sector in 2023?",
    "query": "SELECT SUM(\"HeatOutput_13\") AS TotalHeatOutput FROM \"37823eng\" WHERE \"Sectors\" = 'Health care and social work' AND \"CombinedHeatAndPowerCHP\" = 'Decentral electricity production' AND \"Periods\" = '2023';"
  }
]"""

estimate_tokens(answer)

294

In [None]:
# Token estimation for the user quesstion generation

prompt_start = 'Use the following table metadata to generate three specific questions that can be answered by running an SQL query on this table and the corresponding SQLite query (the table names are the ids. The table name after the from should have ""). Each question must be closely aligned to this table’s content and not applicable to a different dataset. Output the questions and the table_id for each in JSON format. Do not include any additional text in your answer.'
prompts = []
n_tokens = 0
for id in ids:
    table_info = all_info[id]
    # write result as string
    table_info = json.dumps(table_info)
    prompt = prompt_start + table_info
    prompts.append(prompt)
    n_tokens += estimate_tokens(prompt) + 300  # Added 150 tokens for the estimated number of tokens in the model's response (see calculation below)
print('Expected number of tokens:', n_tokens)

Expected number of tokens: 537918


In [27]:
n_prompts = 10
# select n_prompts random prompts
import random
random_prompts = random.sample(prompts, n_prompts)
random_prompts

ValueError: Sample larger than population or is negative

In [3]:
# Here there are 13 user questions manually generated.
responses = []
response1 = [
  {
    "table_id": "82659ENG",
    "question": "What were the import and export values for specific categories of goods in Bonaire during a given period?"
  },
  {
    "table_id": "82659ENG",
    "question": "How has the trade balance of Bonaire changed year-on-year for specific categories of goods?"
  },
  {
    "table_id": "82659ENG",
    "question": "Which categories of goods experienced the highest percentage change in import or export values over the last year in Bonaire?"
  }
]
responses += response1

response2 = [
  {
    "table_id": "80783eng",
    "question": "What is the total number of farms producing specific types of crops or raising specific livestock in different regions?"
  },
  {
    "table_id": "80783eng",
    "question": "How has the utilisation of agricultural land, such as arable land and grassland, varied across different regions and farm types over time?"
  },
  {
    "table_id": "80783eng",
    "question": "What are the trends in the production of specific crops, such as potatoes or cereals, and their distribution by farm type and region?"
  }
]
responses += response2

response3 = [
  {
    "table_id": "80784eng",
    "question": "What is the total number of persons employed in agricultural holdings, categorized by region and gender?"
  },
  {
    "table_id": "80784eng",
    "question": "How many annual work units (AWU) are contributed by family and non-family labour in different regions?"
  },
  {
    "table_id": "80784eng",
    "question": "What is the distribution of family and non-family labour across regions, including the roles of holders, spouses, and other regularly employed persons?"
  }
]
responses += response3

response4 = [
  {
    "table_id": "80030eng",
    "question": "What is the total electricity and heat production in terajoules (TJ) for each energy commodity and production type (central/decentral) over a given period?"
  },
  {
    "table_id": "80030eng",
    "question": "How does the input of different energy commodities (in TJ) compare to the gross production of electricity and heat for central and decentral production?"
  },
  {
    "table_id": "80030eng",
    "question": "What is the percentage contribution of electricity production relative to the total electricity and heat produced for each year?"
  }
]
responses += response4

response5 = [
  {
    "table_id": "85799ENG",
    "question": "What is the total water supply, production, and losses/statistical difference for each period in the Caribbean Netherlands?"
  },
  {
    "table_id": "85799ENG",
    "question": "How does water consumption by households and companies compare to the total supply for different periods in the Caribbean Netherlands?"
  },
  {
    "table_id": "85799ENG",
    "question": "What is the number of mains water connections for households and companies in the Caribbean Netherlands over time?"
  }
]
responses += response5

response6 = [
  {
    "table_id": "84747ENG",
    "question": "What is the year-on-year export growth for different exporter characteristics during specific periods?"
  },
  {
    "table_id": "84747ENG",
    "question": "How much did each exporter characteristic contribute to the total export growth during specific periods?"
  },
  {
    "table_id": "84747ENG",
    "question": "What are the trends in export growth and their contributions to total export growth for specific periods across all exporter characteristics?"
  }
]
responses += response6

response7 = [
  {
    "table_id": "82823ENG",
    "question": "What is the total value of debt guarantees provided by the government for different sectors during specific periods?"
  },
  {
    "table_id": "82823ENG",
    "question": "What is the adjusted capital value of off-balance public-private partnerships (PPPs) for each sector over time?"
  },
  {
    "table_id": "82823ENG",
    "question": "How have non-performing loans varied across sectors and periods, and what percentage of GDP do they represent?"
  }
]
responses += response7

response8 = [
  {
    "table_id": "83396ENG",
    "question": "What is the price index of sold dwellings on Bonaire for each year, and how does it compare to the base year 2011?" # This one contains something we should be careful with. In this table, 2011 is the base year, so it appears in the user question.
  },
  {
    "table_id": "83396ENG",
    "question": "What is the percentage change in the price of sold dwellings compared to the previous year for different periods on Bonaire?"
  },
  {
    "table_id": "83396ENG",
    "question": "How has the number of sold dwellings on Bonaire changed over time, including year-on-year percentage changes?"
  }
]
responses += response8

response9 = [
  {
    "table_id": "84823ENG",
    "question": "What percentage of dwellings experienced rent increases, reductions, or remained unchanged for different types of rentals during specific periods?"
  },
  {
    "table_id": "84823ENG",
    "question": "How does the percentage of rent increases compare across different types of landlords over time?"
  },
  {
    "table_id": "84823ENG",
    "question": "What are the trends in rent reductions for various types of rentals across different periods?"
  }
]
responses += response9

response10 = [
  {
    "table_id": "84422ENG",
    "question": "What is the total value of assets, liabilities, and net worth for the government during specific periods?"
  },
  {
    "table_id": "84422ENG",
    "question": "How have non-financial assets, such as dwellings, transport equipment, and civil engineering works, contributed to the total assets of the government over time?"
  },
  {
    "table_id": "84422ENG",
    "question": "What are the changes in government net worth during different periods, and how do they relate to assets and liabilities?"
  }
]
responses += response10

response11 = [
  {
    "table_id": "84336ENG",
    "question": "What is the total and median gross received inheritance for different connections with the deceased person during specific periods?"
  },
  {
    "table_id": "84336ENG",
    "question": "How does the total amount of tax due on inheritances vary across different characteristics of receivers and periods?"
  },
  {
    "table_id": "84336ENG",
    "question": "What is the mean and median net received inheritance for various connections with the deceased person during specific periods?"
  }
]
responses += response11

response12 = [
  {
    "table_id": "85828ENG",
    "question": "What is the turnover index for different branches compared to the base year 2021 during specific periods?"
  },
  {
    "table_id": "85828ENG",
    "question": "How does the calendar-adjusted production or turnover differ across various branches over time?"
  },
  {
    "table_id": "85828ENG",
    "question": "What is the percentage change in turnover compared to the previous period for specific branches and periods?"
  }
]
responses += response12

response13 = [
  {
    "table_id": "83698ENG",
    "question": "What is the population of the Caribbean Netherlands on January 1st, broken down by sex and age for specific periods?"
  },
  {
    "table_id": "83698ENG",
    "question": "How does the population of the Caribbean Netherlands vary by marital status and age group during specific periods?"
  },
  {
    "table_id": "83698ENG",
    "question": "What are the trends in the population of the Caribbean Netherlands on January 1st, segmented by sex, age, and marital status over time?"
  }
]
responses += response13

responses = json.dumps(responses, indent=2)
with open('user_questions_small.json', 'w') as f:
    f.write(responses)


In [13]:
# Estimated size of the response
n_responses = 13
estimate_tokens(responses)/n_responses

122.23076923076923

# Executing queries on the database

In [56]:
query = "SELECT SUM(\"HeatOutput_13\") AS TotalHeatOutput FROM \"37823eng\" WHERE \"Sectors\" = 'Health care and social work' AND \"CombinedHeatAndPowerCHP\" = 'Decentral electricity production' AND \"Periods\" = '2023';"
# data.execute_query(query)
data.execute_query_pandas_df(query)

Unnamed: 0,TotalHeatOutput
0,


# Adding the dimension fields

In [None]:
# Load the JSON file into a Python list of dicts
with open("all_info.json", "r") as file:
    all_info = json.load(file)  # data is


# go through the stored tables in the json file
for table in all_info:
    table_id = table['table_id']
    columns = table['columns'] 
    column_info = data.get_column_info(table_id)
# Filter column_info to only include columns which key is not ''
    column_info = column_info[column_info['Key'] != '']
    for column in column_info.itertuples():
        if column.Type == 'Dimension':
            possible_values = data.execute_query_pandas_df(f"SELECT DISTINCT {column.Key} FROM \"{column.Identifier}\";")
            print(possible_values.to_dict())
            break

(4997, 15)
(4119, 15)
                          FarmTypes
0                    All farm types
1            Specialist field crops
2           Specialist horticulture
3        Specialist permanent crops
4      Specialist grazing livestock
5             Specialist granivores
6         Specialist mixed cropping
7        Specialist mixed livestock
8  Specialist mixed crops/livestock


In [None]:
for column in column_info.itertuples():
        if column.Type == 'Dimension':
            possible_values = data.execute_query_pandas_df(f"SELECT DISTINCT {column.Key} FROM \"{column.Identifier}\";")[column.Key]
            print(' | '.join(possible_values))

All farm types | Specialist field crops | Specialist horticulture | Specialist permanent crops | Specialist grazing livestock | Specialist granivores | Specialist mixed cropping | Specialist mixed livestock | Specialist mixed crops/livestock
Total | Male | Female
Wheat (total) | Wheat, winter | Wheat, spring | Barley (total) | Barley, winter | Barley, spring | Rye | Oats | Triticale | Grain maize | Green maize | Maize, corn cob mix | Kidney beans | Rape and turnip rape seed | Fibre flax | Chicory | Hemp | Potatoes (total) | Ware potatoes | Seed potatoes | Starch potatoes | Sugar beet | Seed onions (total) | Seed onions: yellow | Seed onion: red | Onion sets (2nd year)
Nederland | Noord-Nederland (LD) | Oost-Nederland (LD) | West-Nederland (LD) | Zuid-Nederland (LD) | Groningen (PV) | Fryslân (PV) | Drenthe (PV) | Overijssel (PV) | Flevoland (PV) | Gelderland (PV) | Utrecht (PV) | Noord-Holland (PV) | Zuid-Holland (PV) | Zeeland (PV) | Noord-Brabant (PV) | Limburg (PV)
Vegetables | Othe

KeyboardInterrupt: 

Prompt start: Use the following table metadata to generate three specific questions that can be answered by running an SQL query on this table and the corresponding SQLite query (the table names are the ids. The table name after the from should have ""). Each question must be closely aligned to this table’s content and not applicable to a different dataset. Output the questions and the table_id for each in JSON format. Do not include any additional text in your answer.

Table schema: on the all_info.json file

In [None]:
response1=[
    {
        "question": "What is the total indigenous production of natural gas for a given year?",
        "table_id": "00372eng",
        "sql_query": "SELECT Periods, IndigenousProduction_2 FROM \"00372eng\" WHERE Periods = '2024';"
    },
    {
        "question": "How much natural gas was imported from Norway as LNG in the latest available month?",
        "table_id": "00372eng",
        "sql_query": "SELECT Periods, ImportsViaNorway_12 FROM \"00372eng\" WHERE Periods = 'November 2024';"
    },
    {
        "question": "What is the total consumption of natural gas delivered via regional grids in the Netherlands for the year 2023?",
        "table_id": "00372eng",
        "sql_query": "SELECT Periods, ViaRegionalGrids_31 FROM \"00372eng\" WHERE Periods = '2023';"
    }
]

response2=[
    {
        "question": "What is the total number of schools and institutions for primary education in the school year 2023/24?",
        "table_id": "03753eng",
        "sql_query": "SELECT Periods, SchoolsInstitutions_1 FROM \"03753eng\" WHERE TypeOfEducation = 'Primary education' AND Periods = '2023/24';"
    },
    {
        "question": "How many students were enrolled in secondary education for schools with an ideological basis of 'Public' in the school year 2022/23?",
        "table_id": "03753eng",
        "sql_query": "SELECT Periods, EnrolledPupilsStudents_2 FROM \"03753eng\" WHERE TypeOfEducation = 'Secondary education' AND IdeologicalBasisOfTheSchool = 'Public' AND Periods = '2022/23';"
    },
    {
        "question": "What is the number of schools with more than 500 students in vocational education for the school year 2023/24?",
        "table_id": "03753eng",
        "sql_query": "SELECT Periods, SchoolsInstitutions_1 FROM \"03753eng\" WHERE TypeOfEducation = 'Vocational education' AND Schoolsize > 500 AND Periods = '2023/24';"
    }
]

response3=[
    {
        "question": "What was the gross yield of tomatoes in million kg for the year 2023?",
        "table_id": "37738ENG",
        "sql_query": "SELECT Periodes, GrossYield_1 FROM \"37738ENG\" WHERE Vegetables = 'Tomatoes' AND Periodes = '2023';"
    },
    {
        "question": "What was the total cropping area in hectares for cucumbers in the year 2020?",
        "table_id": "37738ENG",
        "sql_query": "SELECT Periodes, CroppingArea_2 FROM \"37738ENG\" WHERE Vegetables = 'Cucumbers' AND Periodes = '2020';"
    },
    {
        "question": "Which vegetables had a gross yield greater than 50 million kg in 2023?",
        "table_id": "37738ENG",
        "sql_query": "SELECT Vegetables, GrossYield_1 FROM \"37738ENG\" WHERE GrossYield_1 > 50 AND Periodes = '2023';"
    }
]

response4=[
    {
        "question": "What was the total natural gas input for electricity and heat production in central installations in 2023?",
        "table_id": "37823eng",
        "sql_query": "SELECT Periods, NaturalGasInput_2 FROM \"37823eng\" WHERE Sectors = 'Central installations' AND Periods = '2023';"
    },
    {
        "question": "What was the total electricity output in TeraJoules from decentral installations using Combined Heat and Power (CHP) in 2022?",
        "table_id": "37823eng",
        "sql_query": "SELECT Periods, ElectricityOutput_12 FROM \"37823eng\" WHERE Sectors = 'Decentral installations' AND CombinedHeatAndPowerCHP = 'Yes' AND Periods = '2022';"
    },
    {
        "question": "Which sectors had a total input of hard coal exceeding 100 TJ in 2023?",
        "table_id": "37823eng",
        "sql_query": "SELECT Sectors, HardCoalInput_4 FROM \"37823eng\" WHERE HardCoalInput_4 > 100 AND Periods = '2023';"
    }
]

response5=[
    {
        "question": "What was the total number of live-born children in the year 2023?",
        "table_id": "37852eng",
        "sql_query": "SELECT Periods, LiveBornChildren_2 FROM \"37852eng\" WHERE Periods = '2023';"
    },
    {
        "question": "How many deaths were caused by malignant neoplasms of respiratory and intrathoracic organs per 100,000 of the average population in 2020?",
        "table_id": "37852eng",
        "sql_query": "SELECT Periods, MalignantNeoplasmsRespiratory_130 FROM \"37852eng\" WHERE Periods = '2020';"
    },
    {
        "question": "What was the percentage of overweight persons (BMI >= 25) in the Netherlands in 2022?",
        "table_id": "37852eng",
        "sql_query": "SELECT Periods, OverweightPersons_30 FROM \"37852eng\" WHERE Periods = '2022';"
    }
]

response6=[
    {
        "question": "What was the total number of deaths for men in 2023?",
        "table_id": "37979eng",
        "sql_query": "SELECT Periods, TotalDeaths_1 FROM \"37979eng\" WHERE Gender = 'Men' AND Periods = '2023';"
    },
    {
        "question": "What was the infant mortality rate (per 1,000 live-born children) for women in 2020?",
        "table_id": "37979eng",
        "sql_query": "SELECT Periods, InfantMortalityRelative_5 FROM \"37979eng\" WHERE Gender = 'Women' AND Periods = '2020';"
    },
    {
        "question": "What was the life expectancy at birth for men in 1950?",
        "table_id": "37979eng",
        "sql_query": "SELECT Periods, LifeExpectancyAtBirth_12 FROM \"37979eng\" WHERE Gender = 'Men' AND Periods = '1950';"
    }
]

response7=[
    {
        "question": "What was the total number of theoretically available hours for construction in the first quarter of 2023?",
        "table_id": "60006eng",
        "sql_query": "SELECT Periods, TheoreticallyAvailableHours_1 FROM \"60006eng\" WHERE Periods = 'I-quart. 2023';"
    },
    {
        "question": "How many non-productive hours were due to frost and precipitation delays in the third quarter of 2022?",
        "table_id": "60006eng",
        "sql_query": "SELECT Periods, FrostAndPrecipitationDelays_3 FROM \"60006eng\" WHERE Periods = 'III-quart. 2022';"
    },
    {
        "question": "What was the total number of productive hours in construction during the second quarter of 2024?",
        "table_id": "60006eng",
        "sql_query": "SELECT Periods, ProductiveHours_5 FROM \"60006eng\" WHERE Periods = 'II-quart. 2024';"
    }
]

response8=[
    {
        "question": "What was the absolute value in euros of household consumption on food and non-alcoholic beverages in January 2019?",
        "table_id": "70076eng",
        "sql_query": "SELECT Periods, AbsoluteValueInEuroS_7 FROM \"70076eng\" WHERE GoodsAndServices = 'Food and non-alcoholic beverages' AND Periods = '2019 January';"
    },
    {
        "question": "What was the volume change in household consumption on durable goods compared to the same period in the previous year for June 2015?",
        "table_id": "70076eng",
        "sql_query": "SELECT Periods, VolumeChanges_1 FROM \"70076eng\" WHERE GoodsAndServices = 'Durable goods' AND Periods = '2015 June';"
    },
    {
        "question": "What was the share of domestic consumption of households for housing, water, electricity, gas, and other fuels in 2010?",
        "table_id": "70076eng",
        "sql_query": "SELECT Periods, ShareInDomesticConsOfHouseholds_6 FROM \"70076eng\" WHERE GoodsAndServices = 'Housing, water, electricity, gas and other fuels' AND Periods = '2010';"
    }
]

response9=[
    {
        "question": "What was the total number of suicides among males in 2022?",
        "table_id": "7022eng",
        "sql_query": "SELECT Periods, TotalSuicides_1 FROM \"7022eng\" WHERE Sex = 'Male' AND Periods = '2022';"
    },
    {
        "question": "How many suicides were caused by hanging or strangulation in the age group 30-39 in 2020?",
        "table_id": "7022eng",
        "sql_query": "SELECT Periods, HangingStrangulation_6 FROM \"7022eng\" WHERE Age = '30-39' AND Periods = '2020';"
    },
    {
        "question": "What was the rate of suicides per 100,000 population for unmarried individuals in 2019?",
        "table_id": "7022eng",
        "sql_query": "SELECT Periods, Unmarried_19 FROM \"7022eng\" WHERE Periods = '2019';"
    }
]

response10=[
    {
        "question": "What was the total number of deaths due to malignant neoplasms in males aged 60-69 in 2023?",
        "table_id": "7052eng",
        "sql_query": "SELECT Periods, TotalMalignantNeoplasms_9 FROM \"7052eng\" WHERE Sex = 'Male' AND Age = '60-69' AND Periods = '2023';"
    },
    {
        "question": "How many deaths were caused by COVID-19 (confirmed cases) in females aged 80+ in 2021?",
        "table_id": "7052eng",
        "sql_query": "SELECT Periods, k_181ConfirmedCOVID19_94 FROM \"7052eng\" WHERE Sex = 'Female' AND Age = '80+' AND Periods = '2021';"
    },
    {
        "question": "What was the total number of deaths caused by ischaemic heart diseases for all genders in the age group 50-59 in 2022?",
        "table_id": "7052eng",
        "sql_query": "SELECT Periods, k_71IschaemicHeartDiseases_44 FROM \"7052eng\" WHERE Age = '50-59' AND Periods = '2022';"
    }
]

response11=[
    {
        "question": "What was the average rent increase for dwellings in the Netherlands in 2020?",
        "table_id": "70675eng",
        "sql_query": "SELECT RentIncrease_1 FROM \"70675eng\" WHERE Periods = '2020';"
    },
    {
        "question": "What is the average rent increase for dwellings in the Netherlands for the year 2024?",
        "table_id": "70675eng",
        "sql_query": "SELECT RentIncrease_1 FROM \"70675eng\" WHERE Periods = '2024';"
    },
    {
        "question": "What was the average rent increase for dwellings in the Netherlands in 1980?",
        "table_id": "70675eng",
        "sql_query": "SELECT RentIncrease_1 FROM \"70675eng\" WHERE Periods = '1980';"
    }
]

response12=[
    {
        "question": "How many deaths were registered for males aged 65 and over in week 20 of 2023?",
        "table_id": "70895ENG",
        "sql_query": "SELECT Deaths_1 FROM \"70895ENG\" WHERE Sex = 'Male' AND Age31December = '65 and over' AND Periods = '2023 week 20';"
    },
    {
        "question": "What was the total number of deaths for females under 65 in week 50 of 2024?",
        "table_id": "70895ENG",
        "sql_query": "SELECT Deaths_1 FROM \"70895ENG\" WHERE Sex = 'Female' AND Age31December = 'Under 65' AND Periods = '2024 week 50';"
    },
    {
        "question": "How many deaths were recorded for all genders and age groups in week 1 of 1975?",
        "table_id": "70895ENG",
        "sql_query": "SELECT SUM(Deaths_1) FROM \"70895ENG\" WHERE Periods = '1975 week 1';"
    }
]

response13=[
    {
        "question": "What was the total value of balance sheet items at year-end (in millions of euros) for municipalities in the region 'North Holland' in 2023?",
        "table_id": "71231ENG",
        "sql_query": "SELECT BalanceSheetItemsYearEndInMlnEuro_1 FROM \"71231ENG\" WHERE Regions = 'North Holland' AND Periods = '2023';"
    },
    {
        "question": "What was the balance sheet value per inhabitant (in euros) for municipalities in the size class '100,000 to 250,000 inhabitants' in the region 'South Holland' for the year 2022?",
        "table_id": "71231ENG",
        "sql_query": "SELECT BalanceSheetYearEndInEuroInhabit_2 FROM \"71231ENG\" WHERE Regions = 'South Holland' AND BalanceSheetItemsMunicipalities = '100,000 to 250,000 inhabitants' AND Periods = '2022';"
    },
    {
        "question": "What was the total balance sheet value (in millions of euros) for all municipalities in the Netherlands in 2010?",
        "table_id": "71231ENG",
        "sql_query": "SELECT SUM(BalanceSheetItemsYearEndInMlnEuro_1) FROM \"71231ENG\" WHERE Regions = 'Netherlands' AND Periods = '2010';"
    }
]

response14=[
    {
        "question": "What was the total balance sheet value at year-end (in millions of euros) for the province of Utrecht in 2023?",
        "table_id": "71536ENG",
        "sql_query": "SELECT BalanceSheetItemsYearEndInMlnEuro_1 FROM \"71536ENG\" WHERE Regions = 'Utrecht' AND Periods = '2023';"
    },
    {
        "question": "What was the balance sheet value per inhabitant (in euros) for the province of Groningen in 2022?",
        "table_id": "71536ENG",
        "sql_query": "SELECT BalanceSheetYearEndInEuroInhabit_2 FROM \"71536ENG\" WHERE Regions = 'Groningen' AND Periods = '2022';"
    },
    {
        "question": "What was the sum of balance sheet values (in millions of euros) for all provinces in the Netherlands in 2015?",
        "table_id": "71536ENG",
        "sql_query": "SELECT SUM(BalanceSheetItemsYearEndInMlnEuro_1) FROM \"71536ENG\" WHERE Regions != 'Netherlands' AND Periods = '2015';"
    }
]

response15=[
    {
        "question": "What was the total indigenous production of hard coal in the Netherlands in 1950?",
        "table_id": "71554eng",
        "sql_query": "SELECT IndigenousProductionHardCoal_3 FROM \"71554eng\" WHERE Periods = '1950';"
    },
    {
        "question": "How much energy was primarily available for consumption (Total Energy Supply) in the Netherlands in 2023?",
        "table_id": "71554eng",
        "sql_query": "SELECT TotalEnergySupplyTPES_1 FROM \"71554eng\" WHERE Periods = '2023';"
    },
    {
        "question": "What was the amount of lignite extracted from nature (Indigenous Production of Lignite) in the Netherlands in 1980?",
        "table_id": "71554eng",
        "sql_query": "SELECT IndigenousProductionLignite_4 FROM \"71554eng\" WHERE Periods = '1980';"
    }
]

responses = []
for i in range(1, 16):
    responses += globals()[f'response{i}']

responses = json.dumps(responses, indent=2)

with open('eval_dataset_small.json', 'w') as f:
    f.write(responses)

# API generation
## Preparing the batch file

## Batch API

In [None]:
from openai import OpenAI
client = OpenAI()

In [None]:
# Create the batch input file

batch_input_file = client.files.create(
    file=open("batchinput.jsonl", "rb"),
    purpose="batch"
)

print(batch_input_file)

In [None]:
# Create the batch

batch_input_file_id = batch_input_file.id
batch_metadata = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "nightly eval job"
    }
)

batch_id = batch_metadata.id
print(batch_metadata)

In [None]:
# Check the status of the batch

batch = client.batches.retrieve(batch_id)
print(batch)

In [None]:
# Retrieve the results

file_response = client.files.content(batch_input_file_id)
print(file_response.text)

In [81]:
data.set_db_path("CBSdatabase.db")
query = "SELECT SUM(SchoolsInstitutions_1) FROM '03753eng' WHERE TypeOfEducation LIKE 'Total primary schools%'"
data.execute_query(query)

[(1388118.0,)]