In [1]:
import cbsodata
import pandas as pd
import sqlite3
from tqdm import tqdm
import data

%load_ext autoreload
%autoreload 2

# Tutorial

In [2]:
# Set the connection

path = 'CBSdatabase.db'
data.set_db_path(path)

In [4]:
# Get table schema

ids = data.get_tables_database() # This retrieves the ids of the tables in the database
id = ids[0]
data.get_table_schema(id)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,ID,INTEGER,0,,0
1,1,FarmTypes,TEXT,0,,0
2,2,Regions,TEXT,0,,0
3,3,Periods,TEXT,0,,0
4,4,NumberOfFarmsTotal_1,INTEGER,0,,0
...,...,...,...,...,...,...
148,148,Turkeys_145,INTEGER,0,,0
149,149,DucksForFattening_146,INTEGER,0,,0
150,150,OtherPoultry_147,INTEGER,0,,0
151,151,Rabbits_148,INTEGER,0,,0


In [None]:
# Get column info

data.get_column_info(ids) # Just input the ids you need

Unnamed: 0,odata.type,Position,ParentID,Type,Key,Title,Description,MapYear,ReleasePolicy,Datatype,Unit,Decimals,Default,PresentationType,Identifier
0,Cbs.OData.Dimension,0.0,,Dimension,FarmTypes,Farm types,,,,,,,,,80783eng
1,Cbs.OData.GeoDimension,1.0,,GeoDimension,Regions,Regions,,,,,,,,,80783eng
2,Cbs.OData.TimeDimension,2.0,,TimeDimension,Periods,Periods,,,1.0,,,,,,80783eng
3,Cbs.OData.Topic,3.0,,Topic,NumberOfFarmsTotal_1,"Number of farms, total",Holdings that produce agricultural goods comme...,,,Long,number,0.0,Zero,Absolute,80783eng
4,Cbs.OData.TopicGroup,,,TopicGroup,,Land use,,,,,,,,,80783eng
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4992,Cbs.OData.TopicGroup,,,TopicGroup,,Wage per job,,,,,,,,,84669ENG
4993,Cbs.OData.Topic,7.0,8.0,Topic,HourlyWage_3,Hourly wage,The hourly wage are calculated by the sum of g...,,,Double,euros,2.0,Missing,,84669ENG
4994,Cbs.OData.TopicGroup,,,TopicGroup,,Working hours,The number of hours someone works in a normal ...,,,,,,,,84669ENG
4995,Cbs.OData.Topic,8.0,10.0,Topic,PerJobPerWeekIncludingOvertime_4,Per job per week including overtime,Weekly working hours are calculated by the ave...,,,Double,hours,1.0,Missing,,84669ENG


In [None]:
# Get all tables info. This is in a json but it can be accessed like this too.

data.get_all_tables_info()

Unnamed: 0,Identifier,Title,Summary
0,80783eng,"Agriculture; crops, livestock and land use by ...","Agricultural census; crops, livestock, land us..."
1,80784eng,Agriculture; labour force by region,"Agricultural census; number of persons, annual..."
2,85636ENG,"Arable crops; production, region",area and yield per crop\nprovince
3,37738ENG,Vegetables; yield and cultivated area per kind...,"Area and yield per vegetable\nArea, yield, kin..."
4,83981ENG,"Livestock manure; production, transport and us...","Manure production, nitrogen and phosphate in m..."
...,...,...,...
257,85682ENG,"Trade in goods; border crossing, SITC (5 digit...",Import and export value of goods (border cross...
258,85683ENG,"Trade in goods; border crossing, SITC (5 digit...",Import and export value of goods (border cross...
259,85919ENG,"Compensation of employees, employment; economi...","Compensation of employees, wages, labour volum..."
260,85917ENG,"Compensation of employees, employment; quarter...","Compensation of employees, wages, jobs (by sex..."


# SQLITE

In [None]:
# API use example from Demetrio's script

tables = cbsodata.get_table_list()
data.check_language_count(tables)
# retrieve specific table with identifier
table = pd.DataFrame(cbsodata.get_data('37105ENG'))
print(table.columns)
# retrieve metadata of specific table
metadata = pd.DataFrame(cbsodata.get_meta('37105ENG',name= 'DataProperties')) 
print(metadata)
# check if keys in metadata are corresponding to the columns in the actual dataset
data.check_col_names(table, metadata)

Number of 'ENG': 1026
Number of 'NED': 4729
Index(['ID', 'Regions', 'Periods', 'TotalSurface_1', 'TransportTotal_2',
       'Railroad_3', 'MainRoad_4', 'Airport_5', 'BuiltUpAreaTotal_6',
       'Residential_7', 'IndustryBusinessPublicInstitutions_8',
       'SocioCulturalFacilities_9', 'SemiBuiltUpAreaTotal_10', 'MiningArea_11',
       'BuildingSite_12', 'OtherSemiBuiltUpArea_13', 'RecreationTotal_14',
       'ParkAndPublicGarden_15', 'SportGrounds_16', 'OtherRecreationUsage_17',
       'AgricultureTotal_18', 'Greenhouses_19', 'OtherAgriculturalUsage_20',
       'WoodlandAndNatureTotal_21', 'Woodland_22', 'NaturalOpenArea_23',
       'WaterTotal_24', 'InlandWaterTotal_25', 'TidalWaterTotal_26'],
      dtype='object')
                 odata.type  ID  Position  ParentID           Type  \
0    Cbs.OData.GeoDimension   0       0.0       NaN   GeoDimension   
1   Cbs.OData.TimeDimension   1       1.0       NaN  TimeDimension   
2           Cbs.OData.Topic   2       2.0       NaN          To

In [2]:
# Create connection to the database

path = 'CBSdatabase.db'
data.set_db_path(path)


In [4]:
table.memory_usage(deep=True).sum()/1024**2

np.float64(0.5167198181152344)

In [5]:
tables = cbsodata.get_table_list()

In [None]:
# Upload tables to the database

eng_ids = data.return_english_tables(tables)
eng_ids = eng_ids[262:] # Select which tables to upload

data.upload_tables(eng_ids)

Uploaded tables are ids from eng_ids[0] to eng_ids[261].

Then, I erased the following tables from the database:
85682ENG
85680ENG
85683ENG
85428ENG
84668ENG

## Metadata

### Exploration

In [60]:

names = ['TableInfos', 'UntypedDataSet', 'TypedDataSet', 'DataProperties', 'CategoryGroups', 'Geschlecht', 'Alter', 'Region', 'Perioden']

names_that_dont_work = ['Geschlecht', 'Alter', 'Region', 'Perioden'] # These supposedly valid names (from the API web) don't work

names = [name for name in names if name not in names_that_dont_work]

for name in names:
    metadata = pd.DataFrame(cbsodata.get_meta('37105ENG',name= name)) 
    print(name)
    print(metadata)
    print('-----------------')

TableInfos
   ID                                              Title  \
0   0  Land use; main categories; regional, 1900 onwards   

                         ShortTitle Identifier  \
0  Land use; regional, 1900 onwards   37105ENG   

                                             Summary             Modified  \
0  Traffic area, Built up area, Forest, Natural a...  2022-11-30T02:00:00   

  ReasonDelivery ExplanatoryText Language Catalog  ...     Period  \
0         Update                       en     CBS  ...  1900-2017   

                                    ShortDescription  \
0  \nThis table provides information about the la...   

                                         Description  \
0  CONTENTS\r\n\r\n1. General information\r\n2. D...   

                                 DefaultPresentation  \
0  ts=1668751650543&graphtype=Table&r=Topics&k=Re...   

                                    DefaultSelection      GraphTypes  \
0  $filter=((Periods eq '1900JJ00') or (Periods e...  Table,Ba

In [35]:
metadata_names_dont_work = pd.DataFrame(columns = ['ID', 'Name'])
count = 0
max_count = 5
for id in eng_ids:
    count += 1
    if count == max_count:
        break
    for name in names:
        try:
            metadata = pd.DataFrame(cbsodata.get_meta(id,name= name))
        except:
            new_entry = pd.DataFrame({'ID': [id], 'Name': [name]})
            metadata_names_dont_work = pd.concat([metadata_names_dont_work, new_entry], ignore_index=True)
            continue

metadata_names_dont_work.groupby('ID')['Name'].agg(', '.join).reset_index()

Unnamed: 0,ID,Name
0,37738ENG,"Geschlecht, Alter, Region, Perioden"
1,80783eng,"Geschlecht, Alter, Region, Perioden"
2,80784eng,"Geschlecht, Alter, Region, Perioden"
3,85636ENG,"Geschlecht, Alter, Region, Perioden"


Upon inspection of the different metadata names I decided that only TableInfos and DataProperties contain useful information

### Table Metadata

In [19]:
ids = data.get_tables_database()
metadata_tables = data.generate_metadata_tables_df(ids)
conn = sqlite3.connect(path)
metadata_tables.to_sql('metadata_tables', conn, if_exists='replace', index=False)
conn.close()

262

### Column Metadata

In [9]:
ids = data.get_tables_database()
metadata_columns = data.generate_metadata_columns_df(ids)
conn = sqlite3.connect(path)
metadata_columns.to_sql('metadata_columns', conn, if_exists='replace', index=False)
conn.close()

  metadata_columns = pd.concat(metadata_columns_list, ignore_index=True).drop(columns=['ID'])


4997

In [7]:
data.set_db_path('CBSdatabase.db')

In [9]:
info = data.get_all_tables_info()
info.to_json('tables_info.json', orient='records')

In [11]:
ids = data.get_tables_database()
ids = ids[:5]
data.get_column_info(ids)

Unnamed: 0,odata.type,Position,ParentID,Type,Key,Title,Description,MapYear,ReleasePolicy,Datatype,Unit,Decimals,Default,PresentationType,Identifier
0,Cbs.OData.Dimension,0.0,,Dimension,FarmTypes,Farm types,,,,,,,,,80783eng
1,Cbs.OData.GeoDimension,1.0,,GeoDimension,Regions,Regions,,,,,,,,,80783eng
2,Cbs.OData.TimeDimension,2.0,,TimeDimension,Periods,Periods,,,1.0,,,,,,80783eng
3,Cbs.OData.Topic,3.0,,Topic,NumberOfFarmsTotal_1,"Number of farms, total",Holdings that produce agricultural goods comme...,,,Long,number,0.0,Zero,Absolute,80783eng
4,Cbs.OData.TopicGroup,,,TopicGroup,,Land use,,,,,,,,,80783eng
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,Cbs.OData.Topic,11.0,,Topic,ManureSupplyToFarms_10,Manure supply to farms,Supply of manure takes place mainly on agricul...,,,Long,million kg,0.0,Impossible,,83981ENG
264,Cbs.OData.Topic,12.0,,Topic,ProcessedManureExcludingExports_11,Processed manure (excluding exports),,,,Double,million kg,0.0,Impossible,,83981ENG
265,Cbs.OData.Topic,13.0,,Topic,NetManureExports_12,Net manure exports,,,,Double,million kg,0.0,Impossible,,83981ENG
266,Cbs.OData.Topic,14.0,,Topic,SpreadingAreaForManure_13,Spreading area for manure,,,,Long,million kg,0.0,Impossible,,83981ENG


In [5]:
data.set_db_path('CBSdatabase.db')
data.get_all_tables_info(True)

Unnamed: 0,Identifier,Title,Summary,Description
0,80783eng,"Agriculture; crops, livestock and land use by ...","Agricultural census; crops, livestock, land us...",CONTENTS\r\n\r\n1. General information\r\n2. D...
1,80784eng,Agriculture; labour force by region,"Agricultural census; number of persons, annual...",CONTENTS\r\n\r\n1. General information\r\n2. D...
2,85636ENG,"Arable crops; production, region",area and yield per crop\nprovince,\r\nCONTENTS\r\n\r\n1. General information\r\n...
3,37738ENG,Vegetables; yield and cultivated area per kind...,"Area and yield per vegetable\nArea, yield, kin...",1. General information\r\n2. Definitions and e...
4,83981ENG,"Livestock manure; production, transport and us...","Manure production, nitrogen and phosphate in m...",CONTENTS\r\n\r\n1. General information\r\n2. D...
...,...,...,...,...
257,85682ENG,"Trade in goods; border crossing, SITC (5 digit...",Import and export value of goods (border cross...,CONTENTS \r\n\r\n1. General information\r\n2. ...
258,85683ENG,"Trade in goods; border crossing, SITC (5 digit...",Import and export value of goods (border cross...,CONTENTS \r\n\r\n1. General information\r\n2. ...
259,85919ENG,"Compensation of employees, employment; economi...","Compensation of employees, wages, labour volum...",CONTENTS\r\n\r\n1. General information\r\n2. D...
260,85917ENG,"Compensation of employees, employment; quarter...","Compensation of employees, wages, jobs (by sex...",CONTENTS\r\n\r\n1. General information\r\n2. D...


In [6]:
ids = data.get_tables_database()
id = ids[0]
data.get_table_schema(id)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,ID,INTEGER,0,,0
1,1,FarmTypes,TEXT,0,,0
2,2,Regions,TEXT,0,,0
3,3,Periods,TEXT,0,,0
4,4,NumberOfFarmsTotal_1,INTEGER,0,,0
...,...,...,...,...,...,...
148,148,Turkeys_145,INTEGER,0,,0
149,149,DucksForFattening_146,INTEGER,0,,0
150,150,OtherPoultry_147,INTEGER,0,,0
151,151,Rabbits_148,INTEGER,0,,0
