In [1]:
import geopandas as gpd
import pandas as pd
from shapely import wkt
import matplotlib.pyplot as plt
from shapely.geometry import Point
import os

#### 1. Get the NUTS geometries

In [4]:
# Load the NUTS geometries.
# The path is assembled with os.path.join so the notebook is not tied to
# Windows path separators.
geometries_path = os.path.join('test_data', 'espon', 'geometry.csv')

# Read the raw table; at this point the 'geometry' column still holds WKT text
geometry_table = pd.read_csv(geometries_path)

# Parse the WKT strings into shapely geometry objects
geometry_table['geometry'] = geometry_table['geometry'].apply(wkt.loads)

# Wrap the table in a GeoDataFrame using the parsed column as active geometry
gdf = gpd.GeoDataFrame(geometry_table, geometry='geometry')

# Declare the coordinate reference system (EPSG:4326). set_crs only labels the
# data, it does not reproject. Reassigning instead of mutating in place keeps
# the cell safe to re-run.
gdf = gdf.set_crs(epsg=4326)

# Display the result as the cell output
gdf

Unnamed: 0,geometry,tunit_code
0,"POLYGON ((16.02442 47.38532, 16.04583 47.39272...",AT11
1,"POLYGON ((13.57837 47.41070, 13.57786 47.41138...",AT22
2,"POLYGON ((14.46053 48.06805, 14.46171 48.06858...",AT12
3,"POLYGON ((16.19393 48.22388, 16.25562 48.23942...",AT13
4,"POLYGON ((12.73732 47.11885, 12.84519 47.11302...",AT21
...,...,...
3288,"POLYGON ((-0.38119 52.41218, -0.38176 52.44140...",UKH1
3289,"MULTIPOLYGON (((-0.35391 42.90804, -0.30195 42...",FRJ2
3290,"POLYGON ((-0.01695 51.69028, -0.01228 51.69725...",UKH3
3291,"POLYGON ((-0.93636 53.70488, -0.95523 53.75601...",UKE1


#### 2. Get the available CSV files with socioeconomic data

In [7]:
def create_csv_dict(folder_path):
    """Map CSV file names to their paths for every CSV below ``folder_path``.

    The folder is walked recursively with ``os.walk``. Each file whose name
    ends in ``.csv`` becomes one entry: the key is the bare file name, the
    value is the directory joined with that name. If the same file name occurs
    in more than one directory, the path visited last is the one kept.

    Parameters
    ----------
    folder_path : str
        Root directory to search.

    Returns
    -------
    dict
        ``{file_name: file_path}`` for all matching files; empty if none.
    """
    return {
        filename: os.path.join(current_dir, filename)
        for current_dir, _subdirs, filenames in os.walk(folder_path)
        for filename in filenames
        if filename.endswith('.csv')
    }

# Collect every CSV below the ESPON data folder, keyed by file name.
# os.path.join keeps the path valid on non-Windows systems as well.
folder_path = os.path.join('test_data', 'espon')
csv_files_dict = create_csv_dict(folder_path)

# geometry.csv is the geometry table, not one of the data files to scan, so it
# is removed. The default argument avoids a KeyError when the file is absent.
csv_files_dict.pop('geometry.csv', None)

# List what was found: 'file name':path
for k, v in csv_files_dict.items():
    print(f"'{k}':{v}")

'Population_-_age_group_25-64_-_with_educational_attainment_level_0-2_.csv':test_data\espon\Educational_attainment_level_by_age_group\Population_-_age_group_25-64_-_with_educational_attainment_level_0-2_.csv
'Population_-_age_group_25-64_-_with_educational_attainment_level_3-4_.csv':test_data\espon\Educational_attainment_level_by_age_group\Population_-_age_group_25-64_-_with_educational_attainment_level_3-4_.csv
'Population_-_age_group_25-64_-_with_educational_attainment_level_5-8_.csv':test_data\espon\Educational_attainment_level_by_age_group\Population_-_age_group_25-64_-_with_educational_attainment_level_5-8_.csv
'Population_-_age_group_30-34_-_with_educational_attainment_level_0-2_.csv':test_data\espon\Educational_attainment_level_by_age_group\Population_-_age_group_30-34_-_with_educational_attainment_level_0-2_.csv
'Population_-_age_group_30-34_-_with_educational_attainment_level_3-4_.csv':test_data\espon\Educational_attainment_level_by_age_group\Population_-_age_group_30-34_-_wit

##### Get available years for each feature with respect to NUTS level

In [12]:
# NUTS level of interest
# https://en.wikipedia.org/wiki/Nomenclature_of_Territorial_Units_for_Statistics
nuts_level_oi = 2

# One record per file: the file name plus, for every year column, the number
# of rows at the chosen NUTS level that hold a value. Records are collected in
# a list and turned into a DataFrame once, instead of concatenating per file.
records = []

for k, v in csv_files_dict.items():
    try:
        # NOTE: `df` deliberately stays bound after the loop; the following
        # cells inspect the last file that was read.
        df = pd.read_csv(v)
        # Year columns are the ones prefixed with 'y_' (e.g. 'y_2017');
        # a prefix test avoids picking up names that merely contain 'y_'.
        year_cols = [col for col in df.columns if col.startswith('y_')]
        # The level filter is the same for every year, so build it once
        at_level = df['level'] == nuts_level_oi
        counts = df.loc[at_level, year_cols].notna().sum()
    except Exception as e:
        # Best effort: report the file that could not be processed and go on
        print(f"Skipping '{k}': {type(e).__name__}: {e}")
        continue

    # Keep only files with at least one value at this NUTS level
    if counts.any():
        records.append({'name': k, **counts.to_dict()})

res_df = pd.DataFrame(records)

print(res_df)

                                                 name  y_2000  y_2001  y_2002  \
0   Population_-_age_group_25-64_-_with_educationa...     312     312     312   
1   Population_-_age_group_25-64_-_with_educationa...     315     315     315   
2   Population_-_age_group_25-64_-_with_educationa...     312     312     312   
3   Population_-_age_group_30-34_-_with_educationa...     313     313     313   
4   Population_-_age_group_30-34_-_with_educationa...     311     311     311   
5   Population_-_age_group_30-34__-_with_education...     307     307     307   
6   Female_Population_-_age_group_25-64__0-2_educa...     307     307     307   
7   Female_Population_-_age_group_25-64__3-4_educa...     313     313     313   
8   Female_Population_-_age_group_25-64__5-8_educa...     314     314     312   
9   Female_Population_-_age_group_30-34__0-2_educa...     304     303     303   
10  Female_Population_-_age_group_30-34__3-4_educa...     308     308     308   
11  Female_Population_-_age_

In [13]:
# `df` is the frame left bound by the loop in the previous cell
# (the last CSV it read).
column_names = df.columns.tolist()
print(column_names)

# Number of NUTS level 2 rows that carry a value for 2017
has_2017_value = df['y_2017'].notna()
is_level_2 = df['level'] == 2
count = len(df[has_2017_value & is_level_2])
print(count)

['geom_id', 'id', 'name', 'code', 'nomenclatu', 'level', 'version', 'tunit_code', 'tunit_name', 'processes', 'sources', 'y_1999', 'y_2000', 'y_2001', 'y_2002', 'y_2003', 'y_2004', 'y_2005', 'y_2006', 'y_2007', 'y_2008', 'y_2009', 'y_2010', 'y_2011', 'y_2012', 'y_2013', 'y_2014', 'y_2015', 'y_2016', 'y_2017', 'y_2018']
267


In [14]:
# Plain-text dump of `df`, the frame left bound by the loop in the
# "available years" cell (the last CSV it read). pandas truncates long frames
# when printing, so only the first and last rows appear in the output.
print(df)

      geom_id     id                                        name  \
0    152000.0  248.0  Employment rate (%) - male - age group 65+   
1     11521.0  248.0  Employment rate (%) - male - age group 65+   
2    152017.0  248.0  Employment rate (%) - male - age group 65+   
3     11578.0  248.0  Employment rate (%) - male - age group 65+   
4     11612.0  248.0  Employment rate (%) - male - age group 65+   
..        ...    ...                                         ...   
449   13441.0  248.0  Employment rate (%) - male - age group 65+   
450   13450.0  248.0  Employment rate (%) - male - age group 65+   
451  152018.0  248.0  Employment rate (%) - male - age group 65+   
452  152041.0  248.0  Employment rate (%) - male - age group 65+   
453  152131.0  248.0  Employment rate (%) - male - age group 65+   

               code nomenclatu  level  version tunit_code  \
0    empl_males_65+       NUTS      0   2013.0         AL   
1    empl_males_65+       NUTS      0   2013.0         AT   
