In [29]:
import data as data
import pandas as pd
import json
import tiktoken

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
data.set_db_path('CBSdatabase.db')
ids = data.get_tables_database()

## Generating the JSON with the column and table information

In [17]:
column_info = data.get_column_info(ids)
column_info.rename(columns={'Title': 'TitleColumn'}, inplace=True)
table_info = data.get_all_tables_info()
table_info.rename(columns={'Title': 'TitleTable'}, inplace=True)
# join both tables on the column 'Identifier'
all_info = pd.merge(column_info, table_info, on='Identifier')
grouped = all_info.groupby('Identifier')
table_info_with_columns = all_info[['TitleColumn', 'TitleTable', 'Identifier', 'Summary', 'Period', 'ShortDescription']]
table_info_with_columns.drop_duplicates(inplace=True)
table_info_with_columns.reset_index(drop=True, inplace=True)
grouped_table_with_columns = table_info_with_columns.groupby('Identifier').agg({
    'TitleTable': 'first',
    'Summary': 'first',
    'Period': 'first',
    'ShortDescription': 'first',
    'TitleColumn': ' | '.join
}).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table_info_with_columns.drop_duplicates(inplace=True)


In [18]:
# This json contains the table info and the column titles

grouped_table_with_columns.to_json('table_info.json', orient='records', lines=True)

In [20]:
output = []
for table_id, group in grouped:
    # Grab the first row to get the common table-level info
    row = group.iloc[0]
    
    # Create a list of column dictionaries
    columns_list = group[["TitleColumn", "Description"]].to_dict(orient="records")
    
    # Build the structure for this table
    table_dict = {
        "table_id": table_id,
        "title": row["TitleTable"],
        "summary": row["Summary"],
        "period": row["Period"],
        "short_description": row["ShortDescription"],
        "columns": columns_list
    }
    
    # Add the structure to our output list
    output.append(table_dict)

json_output = json.dumps(output, indent=2)
# save the file
with open('all_info.json', 'w') as f:
    f.write(json_output)


## Finding one table in the JSON

In [32]:

# 1) Load the JSON file into a Python list of dicts
with open("all_info.json", "r") as file:
    all_info = json.load(file)  # data is a list of dicts

# 2) Suppose we want the entry where table_id == "table2"
ids = data.get_tables_database()
table_to_find = ids[20]

# 3) We can loop or use a generator expression to find it
table_info = next((item for item in all_info if item["table_id"] == table_to_find), None)

if table_info:
    print(f"Found table: {table_info}")
else:
    print("Not found.")

Found table: {'table_id': '82659ENG', 'title': 'Caribbean Netherlands, Bonaire; import and export values, SITC', 'summary': 'Import, export value and trade balance of Bonaire \naccording to SITC classification.', 'period': '2011 3rd quarter - 2022 3rd quarter.', 'short_description': '\nThis table shows the values of imports and exports and the trade balance of Bonaire. The data are broken down into aggregated categories of goods according to the Standard International Trade Classification (SITC sections). On October 10, 2010, Bonaire became a public body of the Netherlands. \n\nData available from: the third quarter of 2011.\n\nStatus of the figures:\nThe figures on international trade of Bonaire are subject to adjustment when new or updated sources become available. The figures are therefore provisional when they are first published. \nThese provisional figures are adjusted approximately five months after the quarter under review. In the second quarter of the current year, the definit

## Token estimation for user question generation

In [33]:
def estimate_tokens(text, model="gpt-4o"):
    # Initialize tokenizer for the specific model
    tokenizer = tiktoken.encoding_for_model(model)
    # Tokenize the input text
    tokens = tokenizer.encode(text)
    return len(tokens)

In [34]:
# Token estimation for the user quesstion generation

prompt_start = 'Use the following table metadata to generate three specific questions that can be answered by running an SQL query on this table. Each question must be closely aligned to this table’s content and not applicable to a different dataset. Output the questions and the table_id for each in JSON format. Do not include any additional text in your answer.'
prompts = []
n_tokens = 0
for id in ids:
    table_info = next((item for item in all_info if item["table_id"] == id), None)
    # write result as string
    table_info = json.dumps(table_info)
    prompt = prompt_start + table_info
    prompts.append(prompt)
    n_tokens += estimate_tokens(prompt) + 150  # Added 150 tokens for the estimated number of tokens in the model's response (see calculation below)
print('Expected number of tokens:', n_tokens)

Expected number of tokens: 428620


In [27]:
n_prompts = 10
# select n_prompts random prompts
import random
random_prompts = random.sample(prompts, n_prompts)
random_prompts

ValueError: Sample larger than population or is negative

In [3]:
# Here there are 13 user questions manually generated.
responses = []
response1 = [
  {
    "table_id": "82659ENG",
    "question": "What were the import and export values for specific categories of goods in Bonaire during a given period?"
  },
  {
    "table_id": "82659ENG",
    "question": "How has the trade balance of Bonaire changed year-on-year for specific categories of goods?"
  },
  {
    "table_id": "82659ENG",
    "question": "Which categories of goods experienced the highest percentage change in import or export values over the last year in Bonaire?"
  }
]
responses += response1

response2 = [
  {
    "table_id": "80783eng",
    "question": "What is the total number of farms producing specific types of crops or raising specific livestock in different regions?"
  },
  {
    "table_id": "80783eng",
    "question": "How has the utilisation of agricultural land, such as arable land and grassland, varied across different regions and farm types over time?"
  },
  {
    "table_id": "80783eng",
    "question": "What are the trends in the production of specific crops, such as potatoes or cereals, and their distribution by farm type and region?"
  }
]
responses += response2

response3 = [
  {
    "table_id": "80784eng",
    "question": "What is the total number of persons employed in agricultural holdings, categorized by region and gender?"
  },
  {
    "table_id": "80784eng",
    "question": "How many annual work units (AWU) are contributed by family and non-family labour in different regions?"
  },
  {
    "table_id": "80784eng",
    "question": "What is the distribution of family and non-family labour across regions, including the roles of holders, spouses, and other regularly employed persons?"
  }
]
responses += response3

response4 = [
  {
    "table_id": "80030eng",
    "question": "What is the total electricity and heat production in terajoules (TJ) for each energy commodity and production type (central/decentral) over a given period?"
  },
  {
    "table_id": "80030eng",
    "question": "How does the input of different energy commodities (in TJ) compare to the gross production of electricity and heat for central and decentral production?"
  },
  {
    "table_id": "80030eng",
    "question": "What is the percentage contribution of electricity production relative to the total electricity and heat produced for each year?"
  }
]
responses += response4

response5 = [
  {
    "table_id": "85799ENG",
    "question": "What is the total water supply, production, and losses/statistical difference for each period in the Caribbean Netherlands?"
  },
  {
    "table_id": "85799ENG",
    "question": "How does water consumption by households and companies compare to the total supply for different periods in the Caribbean Netherlands?"
  },
  {
    "table_id": "85799ENG",
    "question": "What is the number of mains water connections for households and companies in the Caribbean Netherlands over time?"
  }
]
responses += response5

response6 = [
  {
    "table_id": "84747ENG",
    "question": "What is the year-on-year export growth for different exporter characteristics during specific periods?"
  },
  {
    "table_id": "84747ENG",
    "question": "How much did each exporter characteristic contribute to the total export growth during specific periods?"
  },
  {
    "table_id": "84747ENG",
    "question": "What are the trends in export growth and their contributions to total export growth for specific periods across all exporter characteristics?"
  }
]
responses += response6

response7 = [
  {
    "table_id": "82823ENG",
    "question": "What is the total value of debt guarantees provided by the government for different sectors during specific periods?"
  },
  {
    "table_id": "82823ENG",
    "question": "What is the adjusted capital value of off-balance public-private partnerships (PPPs) for each sector over time?"
  },
  {
    "table_id": "82823ENG",
    "question": "How have non-performing loans varied across sectors and periods, and what percentage of GDP do they represent?"
  }
]
responses += response7

response8 = [
  {
    "table_id": "83396ENG",
    "question": "What is the price index of sold dwellings on Bonaire for each year, and how does it compare to the base year 2011?" # This one contains something we should be careful with. In this table, 2011 is the base year, so it appears in the user question.
  },
  {
    "table_id": "83396ENG",
    "question": "What is the percentage change in the price of sold dwellings compared to the previous year for different periods on Bonaire?"
  },
  {
    "table_id": "83396ENG",
    "question": "How has the number of sold dwellings on Bonaire changed over time, including year-on-year percentage changes?"
  }
]
responses += response8

response9 = [
  {
    "table_id": "84823ENG",
    "question": "What percentage of dwellings experienced rent increases, reductions, or remained unchanged for different types of rentals during specific periods?"
  },
  {
    "table_id": "84823ENG",
    "question": "How does the percentage of rent increases compare across different types of landlords over time?"
  },
  {
    "table_id": "84823ENG",
    "question": "What are the trends in rent reductions for various types of rentals across different periods?"
  }
]
responses += response9

response10 = [
  {
    "table_id": "84422ENG",
    "question": "What is the total value of assets, liabilities, and net worth for the government during specific periods?"
  },
  {
    "table_id": "84422ENG",
    "question": "How have non-financial assets, such as dwellings, transport equipment, and civil engineering works, contributed to the total assets of the government over time?"
  },
  {
    "table_id": "84422ENG",
    "question": "What are the changes in government net worth during different periods, and how do they relate to assets and liabilities?"
  }
]
responses += response10

response11 = [
  {
    "table_id": "84336ENG",
    "question": "What is the total and median gross received inheritance for different connections with the deceased person during specific periods?"
  },
  {
    "table_id": "84336ENG",
    "question": "How does the total amount of tax due on inheritances vary across different characteristics of receivers and periods?"
  },
  {
    "table_id": "84336ENG",
    "question": "What is the mean and median net received inheritance for various connections with the deceased person during specific periods?"
  }
]
responses += response11

response12 = [
  {
    "table_id": "85828ENG",
    "question": "What is the turnover index for different branches compared to the base year 2021 during specific periods?"
  },
  {
    "table_id": "85828ENG",
    "question": "How does the calendar-adjusted production or turnover differ across various branches over time?"
  },
  {
    "table_id": "85828ENG",
    "question": "What is the percentage change in turnover compared to the previous period for specific branches and periods?"
  }
]
responses += response12

response13 = [
  {
    "table_id": "83698ENG",
    "question": "What is the population of the Caribbean Netherlands on January 1st, broken down by sex and age for specific periods?"
  },
  {
    "table_id": "83698ENG",
    "question": "How does the population of the Caribbean Netherlands vary by marital status and age group during specific periods?"
  },
  {
    "table_id": "83698ENG",
    "question": "What are the trends in the population of the Caribbean Netherlands on January 1st, segmented by sex, age, and marital status over time?"
  }
]
responses += response13

responses = json.dumps(responses, indent=2)
with open('user_questions_small.json', 'w') as f:
    f.write(responses)


In [13]:
# Estimated size of the response
n_responses = 13
estimate_tokens(responses)/n_responses

122.23076923076923

In [6]:
data.set_db_path('CBSdatabase.db')

In [3]:
data.get_database_schema1()

{'80783eng':      cid                   name     type  notnull dflt_value  pk
 0      0                     ID  INTEGER        0       None   0
 1      1              FarmTypes     TEXT        0       None   0
 2      2                Regions     TEXT        0       None   0
 3      3                Periods     TEXT        0       None   0
 4      4   NumberOfFarmsTotal_1  INTEGER        0       None   0
 ..   ...                    ...      ...      ...        ...  ..
 148  148            Turkeys_145  INTEGER        0       None   0
 149  149  DucksForFattening_146  INTEGER        0       None   0
 150  150       OtherPoultry_147  INTEGER        0       None   0
 151  151            Rabbits_148  INTEGER        0       None   0
 152  152      FurredAnimals_149     REAL        0       None   0
 
 [153 rows x 6 columns],
 '80784eng':     cid                              name     type  notnull dflt_value  pk
 0     0                                ID  INTEGER        0       None   0
 1   

In [7]:
data.get_database_schema2()

DatabaseError: Execution failed on sql 'SELECT name, type, sql
            FROM sqlite_master
            WHERE type IN ('table', 'view')
            ORDER BY name;" ': You can only execute one statement at a time.