# Regionalisation of the 2014-2020 payments dataset

The first operation is to import all the relevant libraries one will be working with

In [None]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#### Import all the relevant data files

_annual_ contains all the yearly payments to be regionalised, _mapping_ is the look-up table for the attribution of the payments at different
granularity through the correspondence between the payment _CCI-code_ and the _NUTS_ area; _population_ includes the population figures at the _NUTS2_ granularity level; _wealth_ maps the _NUTS2_ regions onto the development categories _More developed, Transitional, Less developed_ or the geographical category _Outermost or Northern Sparsely Populated_.

In [None]:
df_annual = pd.read_csv('ESIF_2014-2020_EU_payments__daily_update_.csv')
df_mapping = pd.read_csv('ESIF_2014-2020_-_LOOKUP_TABLE_-_ERDF_ESF_CF_Programme_Coverage_VS_NUTS_V2010.csv').dropna()
df_population = pd.read_excel('2019.07.11_Population_1January.xlsx',sheet_name='Sheet 1',header=10).iloc[:,::2]
df_wealth = pd.read_excel('NUTS2list_with_objectives.xls',sheet_name='MASTER NUTS-2 LIST 2007-2020',header=0,
                         usecols=[0,1,2,3,5,8])

#### Harmonising the NUTS codes across datasets

The blocks of code presented below are aimed at harmonising the _NUTS_ nomenclatures across datasets. The standard is the _NUTS-2010_ scheme. The population figures are presented according to the _NUTS-2016_ nomenclature. This requires a double mapping of this latter onto the corresponding _NUTS-2013_ scheme, which is in turn mapped onto the _NUTS-2010_. The development categories are listed as per the _NUTS-2006_ format, which needs to be updated to _NUTS-2010_.

In [None]:
df_wealth.loc[df_wealth.MS=='GR','MS']='EL'
df_wealth.loc[df_wealth['NUTS-2'].str[:2]=='GR','NUTS-2']='EL'+df_wealth['NUTS-2'].str[2:]
df_wealth.loc[df_wealth['NUTS-2'].str[:3]=='ITD','NUTS-2']='ITH'+df_wealth['NUTS-2'].str[3:]
df_wealth.loc[df_wealth['NUTS-2'].str[:3]=='ITE','NUTS-2']='ITI'+df_wealth['NUTS-2'].str[3:]
df_wealth.loc[(df_wealth['NUTS-2']=='FI13')|(df_wealth['NUTS-2']=='FI1A'),'NUTS-2']='FI1D'
df_wealth.loc[(df_wealth['NUTS-2']=='DE41')|(df_wealth['NUTS-2']=='DE42'),'NUTS-2']='DE40'
df_wealth.loc[df_wealth['NUTS-2']=='DED1','NUTS-2']='DED4'
df_wealth.loc[df_wealth['NUTS-2']=='DED3','NUTS-2']='DED5'
df_wealth.loc[df_wealth['NUTS-2']=='FI18','NUTS-2']='FI1B'
df_wealth.loc[df_wealth['NUTS-2']=='UKD2','NUTS-2']='UKD6'
df_wealth.loc[df_wealth['NUTS-2']=='UKD5','NUTS-2']='UKD7'

In [None]:
df_population.columns = ['NUTS2',2015,2016,2017,2018]
df_population=df_population[df_population[2018]!=':']

In [None]:
df_population.loc[(df_population.NUTS2=='UKI5')|(df_population.NUTS2=='UKI6')|(df_population.NUTS2=='UKI7'),'NUTS2']='UKI2'
df_population.loc[(df_population.NUTS2=='UKI3')|(df_population.NUTS2=='UKI4'),'NUTS2']='UKI1'
df_population.loc[df_population.NUTS2=='SI04','NUTS2']='SI02'
df_population.loc[df_population.NUTS2=='SI03','NUTS2']='SI01'
df_population.loc[(df_population.NUTS2=='FRA4')|(df_population.NUTS2=='FRA5'),'NUTS2']='FR94'
df_population.loc[df_population.NUTS2=='FRA3','NUTS2']='FR93'
df_population.loc[df_population.NUTS2=='FRA2','NUTS2']='FR92'
df_population.loc[df_population.NUTS2=='FRA1','NUTS2']='FR91'
df_population.loc[(df_population.NUTS2=='FRY4')|(df_population.NUTS2=='FRY5'),'NUTS2']='FR94'
df_population.loc[df_population.NUTS2=='FRY3','NUTS2']='FR93'
df_population.loc[df_population.NUTS2=='FRY2','NUTS2']='FR92'
df_population.loc[df_population.NUTS2=='FRY1','NUTS2']='FR91'
df_population.loc[df_population.NUTS2=='EL51','NUTS2']='EL11'
df_population.loc[df_population.NUTS2=='EL52','NUTS2']='EL12'
df_population.loc[df_population.NUTS2=='EL53','NUTS2']='EL13'
df_population.loc[df_population.NUTS2=='EL54','NUTS2']='EL21'
df_population.loc[df_population.NUTS2=='EL61','NUTS2']='EL14'
df_population.loc[df_population.NUTS2=='EL62','NUTS2']='EL22'
df_population.loc[df_population.NUTS2=='EL63','NUTS2']='EL23'
df_population.loc[df_population.NUTS2=='EL64','NUTS2']='EL24'
df_population.loc[df_population.NUTS2=='EL65','NUTS2']='EL25'

# NUTS 2013 - 2016 below
df_population.loc[df_population.NUTS2=='IE04','NUTS2']='IE01'
df_population.loc[(df_population.NUTS2=='IE05')|(df_population.NUTS2=='IE06'),'NUTS2']='IE02'
df_population.loc[df_population.NUTS2=='FRB0','NUTS2']='FR24'
df_population.loc[df_population.NUTS2=='FRC1','NUTS2']='FR26'
df_population.loc[df_population.NUTS2=='FRC2','NUTS2']='FR43'
df_population.loc[df_population.NUTS2=='FRD1','NUTS2']='FR25'
df_population.loc[df_population.NUTS2=='FRD2','NUTS2']='FR23'
df_population.loc[df_population.NUTS2=='FRE1','NUTS2']='FR30'
df_population.loc[df_population.NUTS2=='FRE2','NUTS2']='FR22'
df_population.loc[df_population.NUTS2=='FRF1','NUTS2']='FR42'
df_population.loc[df_population.NUTS2=='FRF2','NUTS2']='FR21'
df_population.loc[df_population.NUTS2=='FRF3','NUTS2']='FR41'
df_population.loc[df_population.NUTS2=='FRG0','NUTS2']='FR51'
df_population.loc[df_population.NUTS2=='FRH0','NUTS2']='FR52'
df_population.loc[df_population.NUTS2=='FRI1','NUTS2']='FR61'
df_population.loc[df_population.NUTS2=='FRI2','NUTS2']='FR63'
df_population.loc[df_population.NUTS2=='FRI3','NUTS2']='FR53'
df_population.loc[df_population.NUTS2=='FRJ1','NUTS2']='FR81'
df_population.loc[df_population.NUTS2=='FRJ2','NUTS2']='FR62'
df_population.loc[df_population.NUTS2=='FRK1','NUTS2']='FR72'
df_population.loc[df_population.NUTS2=='FRK2','NUTS2']='FR71'
df_population.loc[df_population.NUTS2=='FRL0','NUTS2']='FR82'
df_population.loc[df_population.NUTS2=='FRM0','NUTS2']='FR83'
df_population.loc[(df_population.NUTS2=='LT01')|(df_population.NUTS2=='LT02'),'NUTS2']='LT00'
df_population.loc[(df_population.NUTS2=='HU11')|(df_population.NUTS2=='HU12'),'NUTS2']='HU10'
df_population.loc[df_population.NUTS2=='PL71','NUTS2']='PL11'
df_population.loc[df_population.NUTS2=='PL72','NUTS2']='PL33'
df_population.loc[df_population.NUTS2=='PL81','NUTS2']='PL31'
df_population.loc[df_population.NUTS2=='PL82','NUTS2']='PL32'
df_population.loc[df_population.NUTS2=='PL84','NUTS2']='PL34'
df_population.loc[(df_population.NUTS2=='PL91')|(df_population.NUTS2=='PL92'),'NUTS2']='PL12'
df_population.loc[(df_population.NUTS2=='UKM8')|(df_population.NUTS2=='UKM9'),'NUTS2']='UKM3'
df_population.loc[df_population.NUTS2=='UKM7','NUTS2']='UKM2'

In [None]:
df_population['NUTS1']=df_population['NUTS2'].str[:-1]
df_population['MS']=df_population['NUTS2'].str[:-2]
df_population = pd.melt(df_population, id_vars=['MS','NUTS1','NUTS2'], value_vars=[2015,2016,2017,
2018]).rename(columns={'variable':'Year','value':'Population'}).sort_values(by=['MS','NUTS1',
                                                                                'NUTS2']).dropna().reset_index(drop=True)
df_population['Population']=pd.to_numeric(df_population['Population'])

In [None]:
df_population['Population'] = df_population.groupby(['NUTS2','Year'])['Population'].transform(sum)
df_population=df_population.drop_duplicates(subset=['NUTS2', 'Year'])

df_wealth=df_wealth[df_wealth.Period=='2014-2020'].rename(columns={'NUTS-2':'NUTS2'})
df_wealth['NUTS1']=df_wealth['NUTS2'].str[:-1]

df_wealth.loc[(df_wealth.NUTS1=='FRY')|(df_wealth.NUTS2=='PT20')|(df_wealth.NUTS2=='PT30')|
                  (df_wealth.NUTS2=='SE07')|(df_wealth.NUTS2=='SE08'),'Category of region']=\
'Outermost or Northern Sparsely Populated'

#### Mapping across datasets

The development categories are mapped onto the datasets: the _NUTS2_ codes are the key for the mapping across datasets. Discrepancies due to different _NUTS_ nomenclatures are harmonised by hand with targeted commands.

In [None]:
mapping_cat = dict(df_wealth[['NUTS2', 'Category of region']].values)
df_population['Category of region'] = df_population['NUTS2'].map(mapping_cat)
df_population['Population_MS'] = df_population.groupby(['MS','Year'])['Population'].transform(sum)
df_population['Population_NUTS1'] = df_population.groupby(['NUTS1','Year'])['Population'].transform(sum)
df_population['Population_MS_category'] = df_population.groupby(['MS','Category of region',
                                                                 'Year'])['Population'].transform(sum)
df_population['Population_NUTS1_category'] = df_population.groupby(['NUTS1','Category of region',
                                                                    'Year'])['Population'].transform(sum)
df_population['MS_share']=df_population['Population']/df_population['Population_MS']
df_population['NUTS1_share']=df_population['Population']/df_population['Population_NUTS1']
df_population['MS_share']=df_population['Population']/df_population['Population_MS']
df_population['NUTS1_share']=df_population['Population']/df_population['Population_NUTS1']
df_population['MS_share_category']=df_population['Population']/df_population['Population_MS_category']
df_population['NUTS1_share_category']=df_population['Population']/df_population['Population_NUTS1_category']
df_population.loc[df_population.NUTS2=='FI1C','Category of region']='More developed'
df_population.loc[df_population.NUTS2.isin(['FRXX','HUXX']),'Category of region']='Less developed'
df_population.loc[df_population.NUTS1=='FR9','Category of region']=\
'Outermost or Northern Sparsely Populated'

In [None]:
df_mapping=df_mapping[df_mapping.CCI_code.str[4:6]!='TC']
df_mapping=df_mapping.assign(NUTS3=df_mapping['NUTS_(2010)_code'].where(df_mapping['NUTS_(2010)_code'].str.len()>4))
df_mapping=df_mapping.assign(NUTS2=df_mapping['NUTS_(2010)_code'].where(df_mapping['NUTS_(2010)_code'].str.len()==4))
df_mapping['NUTS2'] = df_mapping['NUTS2'].fillna(df_mapping.NUTS3.str[:-1])
df_mapping=df_mapping.assign(NUTS1=df_mapping['NUTS_(2010)_code'].where(df_mapping['NUTS_(2010)_code'].str.len()==3))
df_mapping[df_mapping.Programme_Short_Title.str.contains('Mayotte')]['NUTS2']='FR94'
df_mapping.loc[df_mapping.CCI_code=='2014FR16M2OP012','NUTS_(2010)_code']='FR94'
df_mapping[(df_mapping.NUTS2=='UKZZ')&(df_mapping.Programme_Short_Title.str.contains('Wales'))]['NUTS2']='UKL1'
mapping_wealth = dict(df_wealth[['NUTS2', 'Category of region']].values)
df_mapping['Category of region'] = df_mapping['NUTS2'].map(mapping_wealth).fillna('void')
df_mapping[df_mapping['NUTS2']=='FI1C']['Category of region']='More developed'
df_mapping[(df_mapping.NUTS2=='UKZZ')&(df_mapping.Programme_Short_Title.str.contains('Gibraltar'))]['Category of region']=\
'More developed'
df_mapping.loc[df_mapping['NUTS_(2010)_code'].str.contains('FR9'),'Category of region']=\
'Outermost or Northern Sparsely Populated'

### Preparation of the EC payment datasets for the regionalisation

The dataframe is initially sliced so as to include only the relevant columns. The _Interregio_ programme is then dropped along potential sources of double counting (i.e. _IPAE-contribution from ERDF_ and _YEI Specific Allocation_). Annual figures are obtained from the cumulative ones through row-wise subtractions.

In [None]:
df_annual_r = df_annual.loc[:,['MS','CCI','Category of region','Year','Fund','Net interim payments']].copy()
df_annual_r.loc[df_annual_r['Category of region']=='Transition','Category of region']='Transitional'
df_annual_r = df_annual_r[(df_annual_r.Year!=2019)&(df_annual_r.Fund!='IPAE-contribution from ERDF')
                         &(df_annual_r.Fund!='YEI ESF Matching Component')&(df_annual_r.Fund!='YEI Specific Allocation')&
                         (df_annual_r.MS!='TC')]
df_annual_r = df_annual_r.sort_values(by=['CCI','Fund','Category of region','Year']).reset_index(drop=True)
df_annual_r['Category of region']=df_annual_r['Category of region'].fillna('void')
df_annual_r['Net interim payments']=pd.to_numeric(df_annual_r['Net interim payments'].str.replace(',', ''))

df_annual_r['Payments']=df_annual_r.groupby(['CCI','Fund','Category of region'])['Net interim payments'].diff().fillna(0)

### Mapping of the NUTS codes from the look-up table onto the payment dataset

The _NUTS_ code reported for each _CCI_ payment code is mapped onto the payments dataset only for those payments that have an unambiguous correspondence between the _CCI_ code and the _NUTS_ code. These are eventually mapped onto _NUTS3_ and _NUTS2_ categories by using the number of characters in the NUTS code provided.

In [None]:
lookup_dict = df_mapping.groupby(['CCI_code','Category of region'])['NUTS_(2010)_code'].apply(list).to_dict()
ld = {k: list(set(v))[0] for k, v in lookup_dict.items() if len(set(v)) == 1}

df_annual_r['NUTS_code'] = df_annual_r.set_index(['CCI','Category of region']).index.map(ld.get).fillna(value=pd.np.nan)

In [None]:
df_annual_r=df_annual_r.assign(NUTS3=df_annual_r.NUTS_code.where(df_annual_r.NUTS_code.str.len()>4))
df_annual_r=df_annual_r.assign(NUTS2=df_annual_r.NUTS_code.where(df_annual_r.NUTS_code.str.len()==4))

df_annual_r['NUTS2'] = df_annual_r['NUTS2'].fillna(df_annual_r.NUTS3.str[:-1])
df_annual_r=df_annual_r.assign(NUTS1=df_annual_r.NUTS_code.where(df_annual_r.NUTS_code.str.len()==3))

#### The unique member-state Outmost and Nothern sparsed territories are acknowledged in the annualised database

When a payment is remitted to this typology of areas and these is only within a country, one can easily attribute the corresponding NUTS2 code to these typology of regions. All with the aim to ease the following regionalisation of these payments.

In [None]:
df_annual_r.loc[(df_annual_r['Category of region']=='Outermost or Northern Sparsely Populated')&(df_annual_r.MS=='ES'),
                'NUTS2']='ES70'

df_annual_r.loc[(df_annual_r['Category of region']=='Outermost or Northern Sparsely Populated')&(df_annual_r.MS=='FI'),
               'NUTS2']='FI1D'

df_annual_r.loc[(df_annual_r.CCI=='2014PT16M2OP006')|(df_annual_r.CCI=='2014PT16M2OP004'),'Category of region']=\
'Outermost or Northern Sparsely Populated'

df_annual_r.loc[df_annual_r.CCI.isin(list(set(df_mapping[df_mapping['NUTS_(2010)_code'].str.contains('FR9')]['CCI_code']))),
                'Category of region']='Outermost or Northern Sparsely Populated'

In [None]:
df_annual_r.loc[(df_annual_r.NUTS_code.isna())&(df_annual_r.NUTS1.isna())&(df_annual_r.NUTS2.isna())&(df_annual_r.NUTS3.isna()),
               'NUTS0']=df_annual_r.MS

Payments are then clustered per fund and per country at granularity level at which they have been reported

In [None]:
df_fund = pd.concat([df_annual_r[~df_annual_r.iloc[:,-k].isna()].groupby('Fund').Payments.sum() for k in 
                        range(1,5)],axis=1)
df_fund.columns = ['NUTS0','NUTS1','NUTS2','NUTS3']

In [None]:
df_country = pd.concat([df_annual_r[~df_annual_r.iloc[:,-k].isna()].groupby('MS').Payments.sum() for k in 
                        range(1,5)],axis=1)
df_country.columns = ['NUTS0','NUTS1','NUTS2','NUTS3']

Their shares can be finally visualised as per the following tables

In [46]:
(df_fund.T/df_fund.T.sum()).round(2)

Unnamed: 0,CF,EAFRD,EMFF,ERDF,ESF,FEAD,YEI
NUTS0,0.92,1.0,1.0,0.68,0.72,1.0,0.67
NUTS1,0.08,,,,,,0.14
NUTS2,,,,0.32,0.28,,0.19
NUTS3,,,,0.01,0.0,,


In [49]:
pd.set_option('display.max_columns', 30)
(df_country.T/df_country.T.sum()).round(2)

Unnamed: 0,AT,BE,BG,CY,CZ,DE,DK,EE,ES,FI,FR,GR,HR,HU,IE,IT,LT,LU,LV,MT,NL,PL,PT,RO,SE,SI,SK,UK
NUTS0,1.0,0.97,1.0,1.0,0.97,0.84,0.96,1.0,0.78,0.95,0.83,0.98,0.91,0.99,0.9,0.73,0.57,0.69,1.0,0.75,1.0,0.54,0.79,0.99,0.81,0.38,1.0,0.82
NUTS1,,0.02,,,0.01,,,,,,0.0,,0.09,0.01,0.06,,,,,0.25,,,,,,0.29,,0.0
NUTS2,,0.01,,,0.02,0.14,0.04,,0.22,0.05,0.17,0.02,,,0.04,0.27,0.43,0.31,,,,0.46,0.21,0.01,0.18,0.33,0.0,0.18
NUTS3,,0.01,,,0.01,0.02,,,0.0,,0.0,,,,,0.0,,,,,,,,,0.01,,0.0,
