In [8]:
import sqlite3
import pandas as pd
import geopandas as gpd
import os
import matplotlib.pyplot as plt
import numpy as np
# Paths to the local databases; every file lives below `data_folder`.
data_folder = './../../../s3/data'
# NOTE(review): city_geom and fua_geom are not read in the cells visible here —
# presumably point geometries used further down; confirm.
city_geom  = data_folder + '/d000_lookuptables/city_pts_urban_audit2021.sqlite'
fua_geom   = data_folder + '/d000_lookuptables/fua_pts_urban_audit2021.sqlite'
# Lookup tables (read below for L_eurostat and L_core_city_urb_atl18).
lookuptable= data_folder + '/d000_lookuptables/lookuptables.gpkg'
# City cube database (read below for the c_urban_cube_eurostat table).
city_cube  = data_folder + '/c001_city_cube/C_urban_cube_sh.sqlite'

## Get the Eurostat table, attribute descriptions and city names

In [2]:
# Load the complete Eurostat indicator table from the city-cube database.
con = sqlite3.connect(city_cube)
try:
    eurostat_all = pd.read_sql_query("SELECT * FROM c_urban_cube_eurostat", con)
finally:
    # Always release the database handle, even when the query raises
    # (e.g. missing table), so the file is not left open in the kernel.
    con.close()
eurostat_all

Unnamed: 0,index,indic_code,urau_code,1991,1992,1993,1994,1995,1996,1997,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,0,EN1002V,AT001C,,,,,,,,...,,,,,,,,,,
1,1,EN1002V,AT002C,,,,,,,,...,,,,,,,,,,
2,2,EN1002V,AT003C,,,,,,,,...,,,,,,,,,,
3,3,EN1002V,AT004C,,,,,,,,...,,,,,,,,,,
4,4,EN1002V,AT005C,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47683,6132,TT1080V,UK024C,,,,,,,,...,,51.3,45.5,,,,,,,
47684,6133,TT1080V,UK027C,,,,,,,,...,,,,,,,,,,
47685,6134,TT1080V,UK029C,,,,,,,,...,,,,,102.61,,,,,
47686,6135,TT1080V,UK030C,,,,,,,,...,,,66.0,,,,,,,


In [3]:
# Load the lookup tables holding the attribute descriptions and the city names.
con_l = sqlite3.connect(lookuptable)
try:
    eurostat_attributes = pd.read_sql_query("SELECT * FROM L_eurostat", con_l)
    city_names = pd.read_sql_query("SELECT URAU_CODE, URAU_NAME FROM L_core_city_urb_atl18", con_l)
finally:
    # Close the connection even if one of the queries fails.
    con_l.close()
# The lookup city code carries one extra trailing character compared with the
# codes in the Eurostat table; drop that last character so the two can be joined.
city_names["URAU_CODE"] = city_names["URAU_CODE"].str[:-1]

In [4]:
# Enrich the Eurostat values with city names and indicator descriptions.
# Both joins are inner joins, so rows without a match in a lookup table are
# dropped; the helper/key columns duplicated by the joins are removed at the end.
eurostat_extended = (
    eurostat_all
    .merge(city_names, how="inner", left_on="urau_code", right_on="URAU_CODE")
    .merge(eurostat_attributes, how="inner", left_on="indic_code", right_on="variable_code")
    .drop(columns=["index_x", "index_y", "variable_code", "URAU_CODE"])
)

## Code significance  
indic_code (type of data): e.g., DE1001V = total population (see: https://www.espon.eu/sites/default/files/attachments/CB02_Inception_report_Annex_I.pdf) 

The city name for each `urau_code` is listed here: https://ec.europa.eu/eurostat/documents/3217494/5729233/RY_CH_ANNEXES_2011-EN.PDF.pdf/deb1dbbc-e8d3-4dd2-aa76-fae2f36a0bd9?t=1414776025000 

In [5]:
# Preview the first rows of the joined table.
eurostat_extended.head()

Unnamed: 0,indic_code,urau_code,1991,1992,1993,1994,1995,1996,1997,1998,...,2016,2017,2018,2019,2020,2021,URAU_NAME,eurostat_table,variable_description,uc1_priority
0,EN1002V,AT001C,,,,,,,,,...,,,,,,,Wien,urb_cenv,Total number of hours of sunshine per day,2
1,EN1002V,AT002C,,,,,,,,,...,,,,,,,Graz,urb_cenv,Total number of hours of sunshine per day,2
2,EN1002V,AT003C,,,,,,,,,...,,,,,,,Linz,urb_cenv,Total number of hours of sunshine per day,2
3,EN1002V,AT004C,,,,,,,,,...,,,,,,,Salzburg,urb_cenv,Total number of hours of sunshine per day,2
4,EN1002V,AT005C,,,,,,,,,...,,,,,,,Innsbruck,urb_cenv,Total number of hours of sunshine per day,2


In [6]:
# List the distinct indicator descriptions and count them (computed once, reused twice).
feature_names = eurostat_extended["variable_description"].unique()
print('Features: ', feature_names)
print('Number of Features: ', len(feature_names))

Features:  ['Total number of hours of sunshine per day'
 'Average temperature of warmest month - degrees'
 'Average temperature of coldest month - degrees' 'Rainfall - litre/m²'
 'Number of days ozone O3 concentrations exceed 120 µg/m³'
 'Number of hours nitrogen dioxide NO2 concentrations exceed 200 µg/m³'
 'Number of days particulate matter PM10 concentrations exceed 50 µg/m³'
 'Accumulated ozone concentration in excess 70 µg/m³'
 'Annual average concentration of NO2 (µg/m³)'
 'Annual average concentration of PM10 (µg/m³)' 'Total use of water - m³'
 'Price of a m³ of domestic water - Euro'
 'Share of the urban waste water load (in population equivalents) treated according to the applicable standard -%'
 'Municipal waste generated (domestic and commercial), total - 1000 t'
 'Number of deaths per year under 65 due to diseases of the circulatory or respiratory systems'
 'Total deaths under 65 per year' 'Total deaths per year'
 'Share of severely materially deprived persons -%'
 'Share o

## Study data availability across cities and years

In [5]:
# Build the availability matrix: True where a value exists, False where it is null.
# Only the value columns (one per year) are kept; descriptive columns are removed first.
descriptive_cols = [
    "indic_code",
    "urau_code",
    "URAU_NAME",
    "eurostat_table",
    "variable_description",
    "uc1_priority",
]
av_table = eurostat_extended.drop(columns=descriptive_cols)
av_matrix = av_table.notna()
# Row-wise total: number of years with a non-null value for that row.
cities_av = av_matrix.sum(axis="columns")
# Column-wise total: number of rows (one per indicator/city pair) with a
# non-null value in that year.
years_av = av_matrix.sum(axis="index")

In [28]:
# Show the non-null counts per row (cities_av) and per year column (years_av).
print(cities_av)
print(years_av)

0      15
1      10
2      10
3       9
4       9
       ..
648     4
649     6
650     6
651     6
652     3
Length: 653, dtype: int64
1991    169
1992    191
1993    125
1994    125
1995    191
1996    208
1997    178
1998    194
1999    215
2000    237
2001    326
2002    264
2003    206
2004    402
2005    313
2006    327
2007    327
2008    394
2009    420
2010    514
2011    580
2012    563
2013    578
2014    568
2015    512
2016    515
2017    571
2018    552
2019    426
2020    355
2021    260
dtype: int64


In [6]:
# Convert the boolean availability flags to integers (True -> 1, False -> 0).
av_matrix = av_matrix.astype(int)

In [7]:
# Display the integer availability matrix.
av_matrix

Unnamed: 0,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34928,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,1,1
34929,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,1,1
34930,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,1,1
34931,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,1,1


In [36]:
import math
def get_data_gaps(av_table, start_year=1991):
    """Summarise, for every row, when values are present and how far apart they are.

    Parameters
    ----------
    av_table : pandas.DataFrame
        Value table with one column per year. Null cells count as "no data".
        Years are derived from column *positions*, so the columns must be in
        chronological order with no year missing.
    start_year : int, default 1991
        Calendar year represented by the first column.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``av_table`` with integer columns:

        * ``first_year`` / ``last_year`` - first and last year holding a value
          (0 when the row has no value at all),
        * ``n_years`` - number of years holding a value,
        * ``gap_max`` - largest distance, in years, between consecutive values,
        * ``gap_median`` - median of those distances, rounded up.

        Both gap columns are 0 when the row holds fewer than two values.
    """
    columns = ["first_year", "last_year", "n_years", "gap_max", "gap_median"]
    has_value = av_table.notna().to_numpy()

    records = []
    # Single pass over the rows; each row is a boolean vector over the year columns.
    for row in has_value:
        positions = np.flatnonzero(row)
        if positions.size == 0:
            # No observation at all: every statistic defaults to 0.
            records.append((0, 0, 0, 0, 0))
            continue
        # Distances between consecutive populated columns (empty for one value).
        gaps = np.diff(positions)
        if gaps.size > 0:
            gap_max = int(gaps.max())
            gap_median = int(math.ceil(np.median(gaps)))
        else:
            gap_max = 0
            gap_median = 0
        records.append((
            start_year + int(positions[0]),
            start_year + int(positions[-1]),
            int(positions.size),
            gap_max,
            gap_median,
        ))

    # Passing columns/index/dtype explicitly keeps the result well-formed
    # even when ``av_table`` has no rows.
    return pd.DataFrame(records, columns=columns, index=av_table.index, dtype="int64")

In [37]:
%time
data_av = get_data_gaps(av_table)

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 3.34 µs


In [38]:
# Attach the per-row availability statistics to the descriptive table.
# Rows are matched on the index of both frames.
eurostat_extended2 = pd.merge(
    eurostat_extended,
    data_av,
    left_index=True,
    right_index=True,
)
eurostat_extended2

Unnamed: 0,indic_code,urau_code,1991,1992,1993,1994,1995,1996,1997,1998,...,2021,URAU_NAME,eurostat_table,variable_description,uc1_priority,first_year,last_year,n_years,gap_max,gap_median
0,EN1002V,AT001C,,,,,,,,,...,,Wien,urb_cenv,Total number of hours of sunshine per day,2,2004,2004,1,0,0
1,EN1002V,AT002C,,,,,,,,,...,,Graz,urb_cenv,Total number of hours of sunshine per day,2,2001,2004,2,3,3
2,EN1002V,AT003C,,,,,,,,,...,,Linz,urb_cenv,Total number of hours of sunshine per day,2,2001,2004,2,3,3
3,EN1002V,AT004C,,,,,,,,,...,,Salzburg,urb_cenv,Total number of hours of sunshine per day,2,2001,2004,2,3,3
4,EN1002V,AT005C,,,,,,,,,...,,Innsbruck,urb_cenv,Total number of hours of sunshine per day,2,2001,2004,2,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34928,SA1053V,PT011C,,,,,,,,,...,1226.0,Amadora,urb_clivcon,Average price for buying an apartment per m2 ...,1,2017,2021,5,1,1
34929,SA1053V,PT013C,,,,,,,,,...,1562.0,Odivelas,urb_clivcon,Average price for buying an apartment per m2 ...,1,2017,2021,5,1,1
34930,SA1053V,PT014C,,,,,,,,,...,1536.0,Viseu,urb_clivcon,Average price for buying an apartment per m2 ...,1,2017,2021,5,1,1
34931,SA1053V,PT002C,,,,,,,,,...,1810.0,Porto,urb_clivcon,Average price for buying an apartment per m2 ...,1,2017,2021,5,1,1


In [31]:
# For every (indicator, largest gap) combination: how many rows fall into it,
# the spread of their number of populated years, and the earliest first /
# latest last year. Groups containing a null statistic are discarded.
indicator_stats_max_gap = (
    eurostat_extended2
    .groupby(['indic_code', 'gap_max'])
    .agg({
        'urau_code': 'count',
        'n_years': ['min', 'max', 'mean'],
        'first_year': 'min',
        'last_year': 'max',
    })
    .dropna()
)
indicator_stats_max_gap

Unnamed: 0_level_0,Unnamed: 1_level_0,urau_code,n_years,n_years,n_years,first_year,last_year
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,max,mean,min,max
indic_code,gap_max,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
CR1015V,0,68,1,1,1.000000,2008,2021
CR1015V,1,122,2,16,9.213115,2005,2021
CR1015V,2,88,3,15,6.000000,2005,2021
CR1015V,3,42,2,10,3.404762,2008,2020
CR1015V,4,60,3,13,6.966667,2001,2021
...,...,...,...,...,...,...,...
TT1080V,5,47,2,14,7.531915,2001,2020
TT1080V,6,11,4,13,9.181818,2001,2021
TT1080V,7,1,6,6,6.000000,2010,2021
TT1080V,9,5,4,10,8.800000,2001,2021


In [42]:
# Same summary as for the largest gap, but grouped by the median gap:
# row count, min/max/mean number of populated years, earliest first year and
# latest last year per (indicator, median gap). Groups with a null statistic
# are discarded.
indicator_stats_median_gap = (
    eurostat_extended2
    .groupby(['indic_code', 'gap_median'])
    .agg({
        'urau_code': 'count',
        'n_years': ['min', 'max', 'mean'],
        'first_year': 'min',
        'last_year': 'max',
    })
    .dropna()
)
indicator_stats_median_gap

Unnamed: 0_level_0,Unnamed: 1_level_0,urau_code,n_years,n_years,n_years,first_year,last_year
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,max,mean,min,max
indic_code,gap_median,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
CR1015V,0,68,1,1,1.000000,2008,2021
CR1015V,1,198,2,16,9.570707,2001,2021
CR1015V,2,79,3,9,3.430380,2004,2021
CR1015V,3,41,2,4,2.390244,2001,2011
CR1015V,4,20,3,3,3.000000,2001,2016
...,...,...,...,...,...,...,...
TT1080V,1,267,2,19,10.786517,2001,2021
TT1080V,2,4,3,9,5.500000,2001,2017
TT1080V,3,28,2,4,2.928571,2001,2017
TT1080V,4,9,2,3,2.111111,2001,2008


In [52]:
# Keep only (indicator, median gap) groups covering at least this share of all cities.
MIN_CITY_SHARE = 0.5
n_cities = eurostat_extended2.urau_code.nunique()
min_n_cities = n_cities * MIN_CITY_SHARE

# indicator_stats_median_gap is built with a dict-of-lists aggregation, which
# yields two-level column labels such as ('urau_code', 'count'). Flatten them
# to 'urau_code_count' on a copy so that this cell works on a fresh kernel and
# can be re-run safely (labels that are already flat strings are left as-is).
indicator_stats_flat = indicator_stats_median_gap.copy()
indicator_stats_flat.columns = [
    '_'.join(col) if isinstance(col, tuple) else col
    for col in indicator_stats_flat.columns
]
print(indicator_stats_flat.columns)

# TODO: optionally also filter on the minimum number of years (n_years_min)
# and on a maximum allowed gap.
sel_indic = indicator_stats_flat[
    indicator_stats_flat.urau_code_count >= min_n_cities
].reset_index()
n_indic = sel_indic.indic_code.nunique()
sel_indic

Index(['urau_code_count', 'n_years_min', 'n_years_max', 'n_years_mean',
       'first_year_min', 'last_year_max'],
      dtype='object')


Unnamed: 0,indic_code,gap_median,urau_code_count,n_years_min,n_years_max,n_years_mean,first_year_min,last_year_max
0,DE1001V,1,622,2,31,17.249196,1991,2021
1,DE1025V,1,613,2,24,13.277325,1991,2021
2,DE1028V,1,612,2,24,13.062092,1991,2021
3,DE1040V,1,617,2,31,14.377634,1991,2021
4,DE1046V,1,616,2,31,14.563312,1991,2021
5,DE1049V,1,616,2,31,14.543831,1991,2021
6,DE1055V,1,613,2,24,13.055465,1991,2021
7,DE1058V,1,559,2,19,11.420394,2000,2021
8,DE1061V,1,567,2,19,11.477954,2000,2021
9,DE1064V,1,566,2,19,11.448763,2000,2021
