In [1]:
from datenguidepy.query_builder import Query
from datenguidepy.query_execution import QueryExecutioner

import pandas as pd
from functools import partial

In [2]:
# from unittest.mock import Mock

In [3]:
%%time

def get_all_regions():
    """Download all regions and derive each region's level and parent.

    Runs one unfiltered ``allRegions`` query plus one query per NUTS level
    (1, 2, 3) and one for LAU level 1, then combines the results.

    The parent of a region is the longest proper prefix of its id that is
    itself a known region id. Regions labelled ``'nuts1'`` get the parent
    ``'DG'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by region ``id``. Holds the columns delivered by the
        unfiltered query plus ``level`` (``'nuts1'``, ``'nuts2'``,
        ``'nuts3'``, ``'lau'`` or missing when the region appears in none of
        the level queries) and ``parent`` (a region id, or ``None`` when no
        ancestor exists).
    """
    def regions_frame(results):
        # run_query returns a sequence; the first element carries the pages.
        return pd.concat(
            [
                pd.DataFrame(page['data']['allRegions']['regions'])
                for page in results[0].query_results
            ]
        )

    qe = QueryExecutioner()
    all_regions = qe.run_query(Query.all_regions())

    # Only LAU level 1 is fetched; a 'lau2' level is not part of the result.
    levels = {
        'nuts1': qe.run_query(Query.all_regions(nuts=1)),
        'nuts2': qe.run_query(Query.all_regions(nuts=2)),
        'nuts3': qe.run_query(Query.all_regions(nuts=3)),
        'lau': qe.run_query(Query.all_regions(lau=1)),
    }

    all_regions_df = regions_frame(all_regions).set_index('id')
    level_df = pd.concat(
        [regions_frame(levels[k]).assign(level=k) for k in levels]
    )

    # Set membership makes the parent lookup proportional to the id length
    # instead of scanning every region for every region.
    known_ids = set(all_regions_df.index)

    def closest_ancestor(region_id):
        # Try successively shorter proper prefixes; the first hit is the
        # longest one and therefore the direct parent.
        for cut in range(len(region_id) - 1, 0, -1):
            prefix = region_id[:cut]
            if prefix in known_ids:
                return prefix
        return None

    all_rg_parents = (
        all_regions_df
        .join(level_df.set_index('id').loc[:, 'level'])
        .assign(parent=lambda df: df.index.map(closest_ancestor))
    )
    # NUTS-1 ids have no shorter id as prefix; attach them to the 'DG' entry.
    all_rg_parents.loc[all_rg_parents.level == 'nuts1', 'parent'] = 'DG'

    return all_rg_parents

CPU times: user 2min 59s, sys: 3.24 s, total: 3min 2s
Wall time: 3min 9s


In [13]:
# `qb_all` only exists as a local inside get_all_regions(); build the same
# unfiltered query here so the cell runs on a fresh kernel.
Query.all_regions().get_graphql_query()

'query ($page : Int, $itemsPerPage : Int) {allRegions (page: $page, itemsPerPage: $itemsPerPage){regions {id name }page itemsPerPage total }}'

In [4]:
# Preview of the unfiltered region list (index: id).
# NOTE(review): `all_regions_df` is only assigned inside get_all_regions();
# it is not bound at notebook scope in the cells shown.
all_regions_df.head()

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
10,Saarland
11,Berlin
12,Brandenburg
13,Mecklenburg-Vorpommern
14,Sachsen


In [5]:
# NOTE(review): `ar_detail` is not assigned in any cell shown. Judging by the
# recorded output it presumably is the region list with `level` set to the
# length of the id -- confirm and define it explicitly.
ar_detail.head()

Unnamed: 0_level_0,name,level
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10,Saarland,2
11,Berlin,2
12,Brandenburg,2
13,Mecklenburg-Vorpommern,2
14,Sachsen,2


In [6]:
# get_all_regions() is only defined above, never called; bind its result so
# this cell and the later ones that use `all_rg_parents` run on a fresh kernel.
all_rg_parents = get_all_regions()
all_rg_parents.head()

Unnamed: 0_level_0,name,level,parent
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,Saarland,nuts1,DG
11,Berlin,nuts1,DG
12,Brandenburg,nuts1,DG
13,Mecklenburg-Vorpommern,nuts1,DG
14,Sachsen,nuts1,DG


In [7]:
# Preview of the per-level region rows (columns: id, name, level).
# NOTE(review): `level_df` is only assigned inside get_all_regions(); it is
# not bound at notebook scope in the cells shown.
level_df.head()

Unnamed: 0,id,name,level
0,10,Saarland,nuts1
1,11,Berlin,nuts1
2,12,Brandenburg,nuts1
3,13,Mecklenburg-Vorpommern,nuts1
4,14,Sachsen,nuts1


In [18]:
    def isDescendent(region_id,candidate):
        """Tell whether ``candidate`` starts with ``region_id`` without being equal to it."""
        has_prefix = candidate.startswith(region_id)
        if not has_prefix:
            return False
        # An id is not its own descendent.
        return candidate != region_id

    def children(region_id,region_details):
        """Return the descendents of ``region_id`` that have the smallest ``level``.

        ``region_details`` must be indexed by region id and carry a ``level``
        column. Rows whose index is a descendent of ``region_id`` (see
        ``isDescendent``) are kept, and of those only the rows with the
        minimal ``level``. The returned frame keeps the helper column
        ``desc``.

        Note: a later cell defines another ``children`` taking a single
        argument; whichever cell runs last wins.
        """
        def flag_descendents(df):
            return df.index.map(lambda idx: isDescendent(region_id, idx))

        desc = region_details.assign(desc=flag_descendents).query('desc')
        min_lev = desc.level.min()
        return desc.query('level == @min_lev')


In [23]:
# Export every region with its level (no parent column) to regions.csv.
# NOTE(review): a later cell writes `all_rg_parents` to the same
# 'regions.csv' path, so whichever cell runs last determines the file.
all_regions_df.join(level_df.set_index('id').loc[:,'level']).to_csv('regions.csv')

In [15]:
# Number of regions per level label, smallest group first.
level_df.level.value_counts(ascending=True)

nuts1       16
nuts2       35
nuts3      483
lau1     13963
lau2     13963
Name: level, dtype: int64

This does not correspond to the current German NUTS classification. The NUTS regions have changed in the past, however. One should try to run a query with statistics from 2018 or later and repeat the same analysis, to see whether it yields an up-to-date view of the NUTS regions.

In [16]:
# Distribution of id string lengths across all per-level rows.
level_df.id.str.len().value_counts(ascending=True)

2        16
3        35
5       483
11     1734
10     5476
8     20716
Name: id, dtype: int64

In [17]:
# For each level: (shortest id length, longest id length).
level_df.groupby('level').apply(lambda df: (df.id.str.len().min(),df.id.str.len().max()))

level
lau1     (8, 11)
lau2     (8, 11)
nuts1     (2, 2)
nuts2     (3, 3)
nuts3     (5, 5)
dtype: object

In [18]:
# Regions from the unfiltered query that appear in none of the level queries:
# left-merge on id and keep the rows where `level` is missing.
no_level = all_regions_df.merge(level_df,on='id',how='left').query('level.isna()')
print(no_level.shape)
no_level.head()

(180, 4)


Unnamed: 0,id,name_x,name_y,level
9561,DG,Deutschland,,
16316,0713101,Adenau,,
16391,0713102,Altenahr,,
16416,0713103,Bad Breisig,,
16425,0713104,Brohltal,,


CPU times: user 2min 46s, sys: 2.44 s, total: 2min 49s
Wall time: 2min 49s


In [167]:
# Export the region table including level and parent.
# NOTE(review): writes to the same 'regions.csv' path as the earlier
# level-only export cell, overwriting that file.
all_rg_parents.to_csv('regions.csv')

In [151]:
def hirachy_up(lowestids,hirachy_frame = None):
    """Collect the given regions together with all of their ancestors.

    Parameters
    ----------
    lowestids : list-like of str
        Region ids to start from.
    hirachy_frame : pandas.DataFrame, optional
        Frame indexed by region id with a ``parent`` column. Defaults to the
        notebook-level ``all_rg_parents``, looked up when the function is
        called (not when it is defined).

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``hirachy_frame``, each region once, sorted by
        index. Empty (same columns) when nothing matches or ``lowestids`` is
        empty.
    """
    if hirachy_frame is None:
        hirachy_frame = all_rg_parents

    anscestors = []
    seen = set()  # ids already collected: avoids duplicate rows and cycles
    current_ids = list(lowestids)
    while len(current_ids) > 0:
        wanted = hirachy_frame.index.isin(current_ids)
        fresh = ~hirachy_frame.index.isin(list(seen))
        current_regions = hirachy_frame[wanted & fresh]
        if current_regions.empty:
            break
        anscestors.append(current_regions)
        seen.update(current_regions.index)
        # Only a missing parent ends a chain; a missing value in another
        # column (e.g. ``level``) must not cut the walk short.
        current_ids = current_regions['parent'].dropna().unique()

    if not anscestors:
        # pd.concat([]) raises, so return an empty frame with the same columns.
        return hirachy_frame.iloc[0:0]
    return pd.concat(anscestors).sort_index()

def hirachy_down(highest_ids,lowest_level='lau',hirachy_frame = None):
    """Collect the given regions together with their descendents.

    The frame is walked one generation at a time (rows whose ``parent`` is in
    the previous generation). The walk stops after the first generation that
    contains a row with ``level == lowest_level``, or when no further
    children exist.

    Parameters
    ----------
    highest_ids : list-like of str
        Region ids to start from.
    lowest_level : str or None, optional
        Level label at which to stop descending. ``None`` disables the early
        stop.
    hirachy_frame : pandas.DataFrame, optional
        Frame indexed by region id with ``level`` and ``parent`` columns.
        Defaults to the notebook-level ``all_rg_parents``, looked up when the
        function is called (not when it is defined).

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``hirachy_frame``, each region once, sorted by
        index.
    """
    if hirachy_frame is None:
        hirachy_frame = all_rg_parents

    start = hirachy_frame[hirachy_frame.index.isin(highest_ids)]
    descendents = [start]
    seen = set(start.index)  # avoids duplicate rows when start ids overlap
    current_ids = list(highest_ids)
    while len(current_ids) > 0:
        is_child = hirachy_frame['parent'].isin(current_ids).values
        is_new = ~hirachy_frame.index.isin(list(seen))
        current_regions = hirachy_frame[is_child & is_new]
        descendents.append(current_regions)
        seen.update(current_regions.index)
        # Every region of this generation is expanded further, including
        # those without a level label.
        current_ids = current_regions.index.unique()
        if lowest_level is not None and current_regions['level'].eq(lowest_level).any():
            break
    return pd.concat(descendents).sort_index()

def siblings(region_id,hirachy_frame = None):
    """Return all regions that share the parent of ``region_id``.

    The result includes ``region_id`` itself. A region without a parent
    yields an empty frame.

    Parameters
    ----------
    region_id : str
        Id of the region whose siblings are wanted.
    hirachy_frame : pandas.DataFrame, optional
        Frame indexed by region id with a ``parent`` column. Defaults to the
        notebook-level ``all_rg_parents``, looked up when the function is
        called (not when it is defined).

    Raises
    ------
    IndexError
        If ``region_id`` is not in the index of ``hirachy_frame``.
    """
    if hirachy_frame is None:
        hirachy_frame = all_rg_parents

    own_rows = hirachy_frame.loc[hirachy_frame.index == region_id, 'parent']
    parent_id = own_rows.iloc[0]
    if pd.isna(parent_id):
        # A missing parent never compares equal to anything: no siblings.
        return hirachy_frame.iloc[0:0]
    return hirachy_frame[hirachy_frame['parent'] == parent_id]

In [153]:
# All regions sharing the parent of '051'; the result includes '051' itself.
siblings('051')

Unnamed: 0_level_0,name,level,parent
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
51,Düsseldorf,nuts2,5
53,Köln,nuts2,5
55,Münster,nuts2,5
57,Detmold,nuts2,5
59,Arnsberg,nuts2,5


In [145]:
def dg_map(region_id):
    """Sort key for region ids: map 'DG' to '' so it orders before every other id."""
    return '' if region_id == 'DG' else region_id

# Ancestor chain of '051', displayed top-down with 'DG' first.
hu = hirachy_up(['051'])
hu.assign(sort_col = lambda df: df.index.map(dg_map)).sort_values('sort_col').drop('sort_col',axis=1)

Unnamed: 0_level_0,name,level,parent
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DG,Deutschland,,
05,Nordrhein-Westfalen,nuts1,DG
051,Düsseldorf,nuts2,05


In [149]:
# Descend from '05' with lowest_level=None -- presumably to avoid stopping at
# a particular level; verify against hirachy_down's stop condition.
hirachy_down(['05'],None)

Unnamed: 0_level_0,name,level,parent
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
05,Nordrhein-Westfalen,nuts1,DG
051,Düsseldorf,nuts2,05
05111,Düsseldorf,nuts3,051
05111000,Düsseldorf,lau,05111
05112,Duisburg,nuts3,051
05112000,Duisburg,lau,05112
05113,Essen,nuts3,051
05113000,Essen,lau,05113
05114,Krefeld,nuts3,051
05114000,Krefeld,lau,05114


In [47]:
# Look up the single region with id "05911".
# NOTE(review): `ar_detail` is not assigned in any cell shown.
ar_detail.query('id == "05911"')

Unnamed: 0_level_0,name,level
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5911,Bochum,5


In [54]:
# Attach a `parent` column by mapping each id through `parent`.
# NOTE(review): neither `ar_detail` nor a one-argument `parent` is defined at
# notebook scope in the cells shown; the `parent` nested inside
# get_all_regions() takes (region_id, region_details). Confirm which helper
# this cell expects.
parent_rel = ar_detail.reset_index().assign(parent = lambda df: df.id.map(parent))

In [73]:
# Set the parent of every 'nuts1' region to 'DG' (in-place on all_rg_parents).
# The same assignment is also the last step inside get_all_regions().
all_rg_parents.loc[all_rg_parents.level == 'nuts1','parent'] = 'DG'

In [60]:
def children(region_id):
    """Return the rows of the notebook-level ``parent_rel`` whose ``parent`` equals ``region_id``.

    Note: an earlier cell defines a two-argument ``children``; this
    definition replaces it once this cell has run.
    """
    direct_children = parent_rel.query('parent == @region_id')
    return direct_children

In [61]:
# Direct children of region '16', via the one-argument children() that looks
# up `parent_rel`.
children('16')

Unnamed: 0,id,name,level,parent
156,16051,Erfurt,5,16
157,16052,Gera,5,16
158,16053,Jena,5,16
159,16054,Suhl,5,16
160,16055,Weimar,5,16
161,16056,Eisenach,5,16
162,16061,Eichsfeld,5,16
163,16062,Nordhausen,5,16
164,16063,Wartburgkreis,5,16
165,16064,Unstrut-Hainich-Kreis,5,16
