In [1]:
import cbsodata
from google.cloud import bigquery
import pandas as pd
import sqlite3
from tqdm import tqdm

In [2]:
def check_language_count(tables):
    eng_count = 0
    ned_count = 0
    # iterate over the list of dictionaries
    for table in tables:
        language = table.get('Language', '')
        if  language.endswith('en'):
            eng_count += 1
        elif language.endswith('nl'):
            ned_count += 1
        else:
            print(table.get('Identifier', ''))
    print(f"Number of 'ENG': {eng_count}")
    print(f"Number of 'NED': {ned_count}")

def return_english_tables(tables):
    eng_tables = []
    for table in tables:
        language = table.get('Language', '')
        if  language.endswith('en'):
            identifier = table.get('Identifier', '')
            eng_tables.append(identifier)
    return eng_tables    

def check_col_names(table, metadata):
    # extracting the column names from the 'table' dataset
    table_columns = table.columns.tolist()

    # extracting the 'Key' values from the 'metadata' dataset
    key_values = metadata['Key'].dropna().tolist()  # Remove NaN values from the 'Key' column

    # finding matches between table columns and key values
    matches = [col for col in table_columns if col in key_values]

    # finding non-matching columns
    non_matches = [col for col in table_columns if col not in key_values]

    # Output results
    print("Matching columns:", matches, len(matches))
    print("Non-matching columns:", non_matches, len(non_matches))


In [26]:
tables = cbsodata.get_table_list()
check_language_count(tables)
# retrieve specific table with identifier
table = pd.DataFrame(cbsodata.get_data('37105ENG'))
print(table.columns)
# retrieve metadata of specific table
metadata = pd.DataFrame(cbsodata.get_meta('37105ENG',name= 'DataProperties')) 
print(metadata)
# check if keys in metadata are corresponding to the columns in the actual dataset
check_col_names(table, metadata)

Number of 'ENG': 1026
Number of 'NED': 4729
Index(['ID', 'Regions', 'Periods', 'TotalSurface_1', 'TransportTotal_2',
       'Railroad_3', 'MainRoad_4', 'Airport_5', 'BuiltUpAreaTotal_6',
       'Residential_7', 'IndustryBusinessPublicInstitutions_8',
       'SocioCulturalFacilities_9', 'SemiBuiltUpAreaTotal_10', 'MiningArea_11',
       'BuildingSite_12', 'OtherSemiBuiltUpArea_13', 'RecreationTotal_14',
       'ParkAndPublicGarden_15', 'SportGrounds_16', 'OtherRecreationUsage_17',
       'AgricultureTotal_18', 'Greenhouses_19', 'OtherAgriculturalUsage_20',
       'WoodlandAndNatureTotal_21', 'Woodland_22', 'NaturalOpenArea_23',
       'WaterTotal_24', 'InlandWaterTotal_25', 'TidalWaterTotal_26'],
      dtype='object')
                 odata.type  ID  Position  ParentID           Type  \
0    Cbs.OData.GeoDimension   0       0.0       NaN   GeoDimension   
1   Cbs.OData.TimeDimension   1       1.0       NaN  TimeDimension   
2           Cbs.OData.Topic   2       2.0       NaN          To

In [14]:
print(metadata)

                 odata.type  ID  Position  ParentID           Type  \
0    Cbs.OData.GeoDimension   0       0.0       NaN   GeoDimension   
1   Cbs.OData.TimeDimension   1       1.0       NaN  TimeDimension   
2           Cbs.OData.Topic   2       2.0       NaN          Topic   
3      Cbs.OData.TopicGroup   3       NaN       NaN     TopicGroup   
4           Cbs.OData.Topic   4       3.0       3.0          Topic   
5           Cbs.OData.Topic   5       4.0       3.0          Topic   
6           Cbs.OData.Topic   6       5.0       3.0          Topic   
7           Cbs.OData.Topic   7       6.0       3.0          Topic   
8      Cbs.OData.TopicGroup   8       NaN       NaN     TopicGroup   
9           Cbs.OData.Topic   9       7.0       8.0          Topic   
10          Cbs.OData.Topic  10       8.0       8.0          Topic   
11          Cbs.OData.Topic  11       9.0       8.0          Topic   
12          Cbs.OData.Topic  12      10.0       8.0          Topic   
13     Cbs.OData.Top

In [7]:
table

Unnamed: 0,ID,Regions,Periods,TotalSurface_1,TransportTotal_2,Railroad_3,MainRoad_4,Airport_5,BuiltUpAreaTotal_6,Residential_7,...,OtherRecreationUsage_17,AgricultureTotal_18,Greenhouses_19,OtherAgriculturalUsage_20,WoodlandAndNatureTotal_21,Woodland_22,NaturalOpenArea_23,WaterTotal_24,InlandWaterTotal_25,TidalWaterTotal_26
0,0,The Netherlands,1900,32550.0,,,,,,,...,,,,21160.0,8760.0,2520.0,6240.0,910.0,,
1,1,The Netherlands,1905,32570.0,,,,,,,...,,,,21270.0,8410.0,2570.0,5840.0,900.0,,
2,2,The Netherlands,1906,32570.0,,,,,,,...,,,,,8360.0,2570.0,5790.0,890.0,,
3,3,The Netherlands,1907,32570.0,,,,,,,...,,,,,8320.0,2580.0,5740.0,890.0,,
4,4,The Netherlands,1908,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,1542,Limburg (PV),2018,2210.0,,,,,,,...,,,,,,,,,,
1543,1543,Limburg (PV),2019,2210.0,,,,,,,...,,,,,,,,,,
1544,1544,Limburg (PV),2020,2210.0,,,,,,,...,,,,,,,,,,
1545,1545,Limburg (PV),2021,2210.0,,,,,,,...,,,,,,,,,,


In [4]:
client = bigquery.Client()

def df_to_bq(df, table_id):
    # Upload DataFrame to BigQuery
    job = client.load_table_from_dataframe(df, table_id)
    job.result()  # Wait for the job to complete

    print(f"Loaded {job.output_rows} rows into {table_id}.")

In [9]:
# Define table ID and schema
table_id = "boreal-logic-446611-v5.cbs.firsttable"

df_to_bq(table, table_id)

Loaded 1547 rows into boreal-logic-446611-v5.cbs.firsttable.


# SQLITE

In [23]:
conn = sqlite3.connect('database.db')
cursor = conn.cursor()


In [22]:
conn.close()

In [37]:
def upload_tables(ids, size_limit = 2147483645, max_count = None): # Size limit is 200 MB
    if max_count is not None:
        ids = ids[:max_count]
    for id in tqdm(ids, desc="Uploading tables", unit="table"):
        table = pd.DataFrame(cbsodata.get_data(id))
        if table.memory_usage().sum() > size_limit:
            print(f"Table {id} is too large to upload.")
            continue
        table.to_sql(id, conn, if_exists='replace', index=False)

In [29]:
table.memory_usage(deep=True).sum()/1024**2

np.float64(0.5167198181152344)

In [36]:
1073741824/5

214748364.8

In [5]:
tables = cbsodata.get_table_list()

In [32]:
eng_ids = return_english_tables(tables)
eng_ids = eng_ids[262:] # eng_ids[257] has not been uploaded yet

upload_tables(eng_ids)


Uploading tables:   0%|          | 0/764 [00:44<?, ?table/s]


KeyboardInterrupt: 

In [None]:
metadata = pd.DataFrame(cbsodata.get_meta('37105ENG',name= 'DataProperties'))

Unnamed: 0,ID,FarmTypes,Regions,Periods,NumberOfFarmsTotal_1,LandUseTotal_2,UtilisedAgriculturalAreaUAA_3,OtherLand_4,LandUseTotal_5,UtilisedAgriculturalAreaUAA_6,...,ChickensTotal_140,LayingHens_141,ParentBirdsOfLayingHens_142,Broilers_143,ParentBirdsOfBroilers_144,Turkeys_145,DucksForFattening_146,OtherPoultry_147,Rabbits_148,FurredAnimals_149
0,0,All farm types,Nederland,2000,97389,212737529.0,197550427,15187102.0,97389.0,95944,...,3860,2292,,1094,520,121,114,124,201,192.0
1,1,All farm types,Nederland,2001,92647,207711643.0,194613178,13098465.0,92647.0,91240,...,3560,2145,,1027,430,120,100,113,183,187.0
2,2,All farm types,Nederland,2002,89479,209973386.0,196407461,13565925.0,89479.0,88075,...,3358,1880,,1096,418,112,97,103,166,183.0
3,3,All farm types,Nederland,2003,85408,209745968.0,194564993,15180975.0,85408.0,84141,...,2446,1360,,777,330,80,80,92,152,183.0
4,4,All farm types,Nederland,2004,83794,207968902.0,194940340,13028562.0,83794.0,82575,...,2768,1710,,771,314,92,74,78,147,180.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30820,30820,Specialist mixed crops/livestock,Zuid-Limburg (LB),2020,88,300849.0,293601,7248.0,88.0,88,...,0,0,0.0,0,0,0,0,0,0,0.0
30821,30821,Specialist mixed crops/livestock,Zuid-Limburg (LB),2021,89,,297779,,,89,...,0,0,0.0,0,0,0,0,0,0,
30822,30822,Specialist mixed crops/livestock,Zuid-Limburg (LB),2022,71,,278537,,,71,...,0,0,0.0,0,0,0,0,0,0,
30823,30823,Specialist mixed crops/livestock,Zuid-Limburg (LB),2023,75,281658.0,276587,5071.0,75.0,75,...,0,0,0.0,0,0,0,0,0,0,


## Metadata

In [60]:
# Exploration

names = ['TableInfos', 'UntypedDataSet', 'TypedDataSet', 'DataProperties', 'CategoryGroups', 'Geschlecht', 'Alter', 'Region', 'Perioden']

names_that_dont_work = ['Geschlecht', 'Alter', 'Region', 'Perioden'] # These supposedly valid names (from the API web) don't work

names = [name for name in names if name not in names_that_dont_work]

for name in names:
    metadata = pd.DataFrame(cbsodata.get_meta('37105ENG',name= name)) 
    print(name)
    print(metadata)
    print('-----------------')

TableInfos
   ID                                              Title  \
0   0  Land use; main categories; regional, 1900 onwards   

                         ShortTitle Identifier  \
0  Land use; regional, 1900 onwards   37105ENG   

                                             Summary             Modified  \
0  Traffic area, Built up area, Forest, Natural a...  2022-11-30T02:00:00   

  ReasonDelivery ExplanatoryText Language Catalog  ...     Period  \
0         Update                       en     CBS  ...  1900-2017   

                                    ShortDescription  \
0  \nThis table provides information about the la...   

                                         Description  \
0  CONTENTS\r\n\r\n1. General information\r\n2. D...   

                                 DefaultPresentation  \
0  ts=1668751650543&graphtype=Table&r=Topics&k=Re...   

                                    DefaultSelection      GraphTypes  \
0  $filter=((Periods eq '1900JJ00') or (Periods e...  Table,Ba

In [61]:
pd.DataFrame(cbsodata.get_data('37105ENG'))

Unnamed: 0,ID,Regions,Periods,TotalSurface_1,TransportTotal_2,Railroad_3,MainRoad_4,Airport_5,BuiltUpAreaTotal_6,Residential_7,...,OtherRecreationUsage_17,AgricultureTotal_18,Greenhouses_19,OtherAgriculturalUsage_20,WoodlandAndNatureTotal_21,Woodland_22,NaturalOpenArea_23,WaterTotal_24,InlandWaterTotal_25,TidalWaterTotal_26
0,0,The Netherlands,1900,32550.0,,,,,,,...,,,,21160.0,8760.0,2520.0,6240.0,910.0,,
1,1,The Netherlands,1905,32570.0,,,,,,,...,,,,21270.0,8410.0,2570.0,5840.0,900.0,,
2,2,The Netherlands,1906,32570.0,,,,,,,...,,,,,8360.0,2570.0,5790.0,890.0,,
3,3,The Netherlands,1907,32570.0,,,,,,,...,,,,,8320.0,2580.0,5740.0,890.0,,
4,4,The Netherlands,1908,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,1542,Limburg (PV),2018,2210.0,,,,,,,...,,,,,,,,,,
1543,1543,Limburg (PV),2019,2210.0,,,,,,,...,,,,,,,,,,
1544,1544,Limburg (PV),2020,2210.0,,,,,,,...,,,,,,,,,,
1545,1545,Limburg (PV),2021,2210.0,,,,,,,...,,,,,,,,,,


In [35]:
metadata_names_dont_work = pd.DataFrame(columns = ['ID', 'Name'])
count = 0
max_count = 5
for id in eng_ids:
    count += 1
    if count == max_count:
        break
    for name in names:
        try:
            metadata = pd.DataFrame(cbsodata.get_meta(id,name= name))
        except:
            new_entry = pd.DataFrame({'ID': [id], 'Name': [name]})
            metadata_names_dont_work = pd.concat([metadata_names_dont_work, new_entry], ignore_index=True)
            continue

metadata_names_dont_work.groupby('ID')['Name'].agg(', '.join).reset_index()

Unnamed: 0,ID,Name
0,37738ENG,"Geschlecht, Alter, Region, Perioden"
1,80783eng,"Geschlecht, Alter, Region, Perioden"
2,80784eng,"Geschlecht, Alter, Region, Perioden"
3,85636ENG,"Geschlecht, Alter, Region, Perioden"


Upon inspection of the different metadata names I decided that only TableInfos and DataProperties contain useful information

### Table Metadata

In [13]:
def get_tables_database(conn = conn, exclude_metadata_tables = True):
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    cursor = conn.cursor()
    cursor.execute(query)

    # Fetch all table names
    table_names = [row[0] for row in cursor.fetchall()]
    if exclude_metadata_tables:
        table_names = [name for name in table_names if 'metadata' not in name]
    return table_names

In [14]:
def generate_metadata_tables_df(ids, max_count=None):
    metadata_tables_list = []  # List to store DataFrames
    if max_count is not None:
        ids = ids[:max_count]
    for id in ids:
        # Fetch metadata and drop 'ID' column
        new_entry = pd.DataFrame(cbsodata.get_meta(id, name='TableInfos'))
        metadata_tables_list.append(new_entry)  # Add the DataFrame to the list

    # Concatenate all DataFrames at once
    metadata_tables = pd.concat(metadata_tables_list, ignore_index=True).drop(columns=['ID'])
    return metadata_tables

In [19]:
ids = get_tables_database()
metadata_tables = generate_metadata_tables_df(ids)
metadata_tables.to_sql('metadata_tables', conn, if_exists='replace', index=False)

262

### Column Metadata

In [50]:
pd.DataFrame(cbsodata.get_meta(id, name='DataProperties'))

Unnamed: 0,odata.type,ID,Position,ParentID,Type,Key,Title,Description,MapYear,ReleasePolicy,Datatype,Unit,Decimals,Default,PresentationType
0,Cbs.OData.Dimension,0,0.0,,Dimension,FarmTypes,Farm types,,,,,,,,
1,Cbs.OData.GeoDimension,1,1.0,,GeoDimension,Regions,Regions,,,,,,,,
2,Cbs.OData.TimeDimension,2,2.0,,TimeDimension,Periods,Periods,,,True,,,,,
3,Cbs.OData.Topic,3,3.0,,Topic,NumberOfFarmsTotal_1,"Number of farms, total",Holdings that produce agricultural goods comme...,,,Long,number,0.0,Zero,Absolute
4,Cbs.OData.TopicGroup,4,,,TopicGroup,,Land use,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,Cbs.OData.Topic,193,147.0,180.0,Topic,Turkeys_145,Turkeys,,,,Long,number,0.0,Zero,Absolute
194,Cbs.OData.Topic,194,148.0,180.0,Topic,DucksForFattening_146,Ducks for fattening,,,,Long,number,0.0,Zero,Absolute
195,Cbs.OData.Topic,195,149.0,180.0,Topic,OtherPoultry_147,Other poultry,"Laying ducks, geese, guinea fowl and similar.",,,Long,number,0.0,Zero,Absolute
196,Cbs.OData.Topic,196,150.0,180.0,Topic,Rabbits_148,Rabbits,Weanling rabbits for meat and does.,,,Long,number,0.0,Zero,Absolute


In [16]:
def generate_metadata_columns_df(ids, max_count=None):
    metadata_columns_list = []  # List to store DataFrames
    if max_count is not None:
        ids = ids[:max_count]
    for id in ids:
        # Fetch metadata and drop 'ID' column
        new_entry = pd.DataFrame(cbsodata.get_meta(id, name='DataProperties'))
        new_entry['Identifier'] = id # Add the 'Identifier' column
        metadata_columns_list.append(new_entry)  # Add the DataFrame to the list

    # Concatenate all DataFrames at once
    metadata_columns = pd.concat(metadata_columns_list, ignore_index=True).drop(columns=['ID'])
    return metadata_columns

In [20]:
ids = get_tables_database()
metadata_columns = generate_metadata_columns_df(ids)
metadata_columns.to_sql('metadata_columns', conn, if_exists='replace', index=False)

  metadata_columns = pd.concat(metadata_columns_list, ignore_index=True).drop(columns=['ID'])


5048

I erased the following tables from the database:
85682ENG
85680ENG
85683ENG
85428ENG
84668ENG

In [33]:
table = pd.DataFrame(cbsodata.get_data('84668ENG'))

table.memory_usage(deep=True).sum()/1024**3

JSONDecodeError: Unterminated string starting at: line 9248 column 508 (char 2795457)