In [1]:
import sqlite3
import pandas as pd
import geopandas as gpd
import os
import matplotlib.pyplot as plt
import numpy 
# Relative path of the SQLite database file that is opened with sqlite3.connect below.
city_cube  = 'C_urban_cube_sh.sqlite'



In [2]:
# Open the city cube database; the connection is kept in `con`.
con = sqlite3.connect(city_cube)
# Load every row and column of the Eurostat table into a DataFrame.
df = pd.read_sql_query(sql="SELECT * FROM c_urban_cube_eurostat", con=con)

## Functions

### The first available year given a city/attribute

In [3]:
def first_year(df, city, attribute):
    """Return the label of the earliest column holding a value for a city/indicator pair.

    Rows are selected where ``urau_code == city`` and ``indic_code == attribute``;
    only the first matching row is inspected. Every column other than
    'index', 'indic_code' and 'urau_code' is treated as a year column and
    scanned in table order.

    Returns
    -------
    None if no row matches, the label of the first non-null year column
    otherwise, or the integer 1991 when every year column is null.
    """
    matches = df.loc[(df['urau_code'] == city) & (df['indic_code'] == attribute)]
    if matches.shape[0] == 0:
        return None

    record = matches.iloc[0]
    year_labels = matches.columns.drop(['index', 'indic_code', 'urau_code'])
    for label in year_labels:
        if not pd.isna(record[label]):
            return label
    # No value anywhere in the row: fall back to the fixed default.
    return 1991

### Last available year

In [18]:
def last_year(df, city, attribute):
    """Return the label of the latest column holding a value for a city/indicator pair.

    Rows are selected where ``urau_code == city`` and ``indic_code == attribute``;
    only the first matching row is inspected. Every column other than
    'index', 'indic_code' and 'urau_code' is treated as a year column and
    scanned in reverse table order.

    Returns
    -------
    None if no row matches, the label of the last non-null year column
    otherwise, or the integer 2021 when every year column is null.
    """
    matches = df.loc[(df['urau_code'] == city) & (df['indic_code'] == attribute)]
    if matches.shape[0] == 0:
        return None

    record = matches.iloc[0]
    year_labels = matches.columns.drop(['index', 'indic_code', 'urau_code'])
    for label in year_labels[::-1]:
        if not pd.isna(record[label]):
            return label
    # No value anywhere in the row: fall back to the fixed default.
    return 2021

### Check whether the data is available every `sequence` years

In [19]:
def isAvailable(df, city, attribute, sequence):
    """Tell whether a city/indicator pair has a value every ``sequence`` columns.

    Only the first row with ``urau_code == city`` and ``indic_code == attribute``
    is inspected. Every column other than 'index', 'indic_code' and
    'urau_code' is treated as a year column. Starting at the first non-null
    year column and stepping ``sequence`` columns at a time up to the last
    non-null one, every visited column must hold a value.

    NOTE(review): this assumes the year columns are in chronological order
    with one column per year, so that a step of ``sequence`` columns is a
    step of ``sequence`` years -- confirm against the table schema.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'index', 'urau_code', 'indic_code' and the year columns.
    city, attribute
        Values matched against 'urau_code' and 'indic_code'.
    sequence : int
        Step between the columns that must be filled. Values below 1 are
        treated as 1 (every column between first and last must be filled).

    Returns
    -------
    bool
        False when no row matches or the row holds no value at all;
        otherwise whether every checked column is non-null.
    """
    line = df[(df['urau_code'] == city) & (df['indic_code'] == attribute)]
    if len(line) == 0:
        return False

    # Year values of the first matching row, in table order.
    year_values = line.iloc[0].drop(['index', 'indic_code', 'urau_code'])
    present = year_values.notna().to_numpy()
    if not present.any():
        # A series without a single value is not available at any interval.
        return False

    filled_positions = present.nonzero()[0]
    start, stop = filled_positions[0], filled_positions[-1]
    step = max(int(sequence), 1)
    # Visit first, first + step, first + 2 * step, ... up to the last filled
    # column; a series observed every `step` columns passes, one with a gap
    # on that grid does not.
    return bool(present[start:stop + 1:step].all())

### Number of available years of a city/attribute

In [20]:
def number_years(df, city, attribute):
    """Count the year columns that hold a value for a city/indicator pair.

    Only the first row with ``urau_code == city`` and ``indic_code == attribute``
    is counted, matching what first_year, last_year and isAvailable inspect.
    Every column other than 'index', 'indic_code' and 'urau_code' is treated
    as a year column.

    Returns
    -------
    int
        0 when no row matches, otherwise the number of non-null year columns
        in the first matching row.
    """
    line = df[(df['urau_code'] == city) & (df['indic_code'] == attribute)]
    if len(line) == 0:
        return 0
    # Drop the identifier columns by name instead of subtracting a fixed 3,
    # and look at a single row so duplicate rows cannot skew the count.
    year_values = line.iloc[0].drop(['index', 'indic_code', 'urau_code'])
    return int(year_values.notna().sum())

## Create an output dataset (city | attribute | number of available years | first available year | last available year | booleans on sequences)

In [22]:
%%time
d = {}
lcities = []
lattributes = []
firstyears = []
lastyears = []
n_years = []
for c in df.urau_code.unique():
    for a in df.indic_code.unique():
        lcities.append(c)
        lattributes.append(a)
        firstyears.append(first_year(df, c, a))
        lastyears.append(last_year(df, c, a))
        n_years.append(number_years(df, c, a))
d['city'] = lcities
d['attribute'] = lattributes
d['n_years'] = n_years
d['first_year'] = firstyears
d['last_year'] = lastyears
seq_max = 3
for i in range(1, seq_max+1):
    lsequence = []
    for c in df.urau_code.unique():
        for a in df.indic_code.unique():
            lsequence.append(isAvailable(df, c, a, i))
    d['seq_'+ str(i)] = lsequence

df_result = pd.DataFrame(data=d)
df.to_csv('Available_seq.csv', index=False)
df_result.head()

CPU times: total: 1h 51min 38s
Wall time: 1h 53min 29s


Unnamed: 0,city,attribute,n_years,first_year,last_year,seq_1,seq_2,seq_3
0,AT001C,EN1002V,1,2004,2004,True,True,True
1,AT001C,EN1003V,2,2004,2008,False,False,False
2,AT001C,EN1004V,2,2004,2008,False,False,False
3,AT001C,EN1005V,2,2004,2008,False,False,False
4,AT001C,EN2002V,22,1992,2013,True,True,True


In [23]:
# Pairs with more than 10 available years whose seq_2 flag is set while seq_1 is not.
passes_seq_2_only = df_result['seq_2'] & ~df_result['seq_1']
has_long_history = df_result['n_years'] > 10
df_result[passes_seq_2_only & has_long_history]

Unnamed: 0,city,attribute,n_years,first_year,last_year,seq_1,seq_2,seq_3
580,BE002C,EN2005V,11,2002,2013,False,True,True
1482,BG002C,EN2005V,12,2001,2013,False,True,True
2548,CZ004C,EN2005V,17,1996,2013,False,True,True
2551,CZ004C,EN2027V,17,1996,2013,False,True,True
3614,DE004C,EN2005V,12,2001,2013,False,True,True
...,...,...,...,...,...,...,...,...
57373,LT005C,DE1077V,13,2008,2021,False,True,True
57442,LT006C,EC1177V,13,2008,2021,False,True,True
57454,LT006C,DE1074V,13,2008,2021,False,True,True
57455,LT006C,DE1077V,13,2008,2021,False,True,True
