### Importing all the relevant libraries

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import pkg_resources
import pandas as pd
import numpy as np
import sobol_seq
from tabulate import tabulate
import time
import types

In [None]:
def get_imports():
    """Yield the top-level package name behind each imported global.

    Inspects the module-level namespace and, for every global that is a
    module, a class, or a (built-in) function, yields the root package it
    comes from. Globals of any other kind (data frames, numbers, strings,
    ...) are skipped instead of being reported under their variable name.

    Yields:
        str: root package name, translated to its pip distribution name
        when the two are known to differ (e.g. ``PIL`` -> ``Pillow``).
        Names may repeat; callers wanting unique names should use ``set``.
    """
    # Import names whose pip distribution name differs; extend by hand.
    poorly_named_packages = {
        "PIL": "Pillow",
        "sklearn": "scikit-learn"
    }
    # Iterate over a snapshot: the caller may bind new globals while this
    # generator is being consumed, which would otherwise raise
    # "dictionary changed size during iteration".
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            # Keep only the root package, e.g. matplotlib.pyplot -> matplotlib.
            name = val.__name__.split(".")[0]
        elif isinstance(val, (type, types.FunctionType,
                              types.BuiltinFunctionType)):
            # Classes and functions brought in with ``from pkg import obj``
            # are traced back to the package that defines them.
            module_name = getattr(val, "__module__", None)
            if not isinstance(module_name, str):
                continue
            name = module_name.split(".")[0]
        else:
            # Plain data objects say nothing about which packages are imported.
            continue

        yield poorly_named_packages.get(name, name)
# Unique package names detected among the notebook globals.
imports = list(set(get_imports()))

# Installed distributions (pip itself excluded) whose project name matches
# one of the detected imports, as (name, version) pairs.
requirements = [
    (dist.project_name, dist.version)
    for dist in pkg_resources.working_set
    if dist.project_name != "pip" and dist.project_name in imports
]

# Print them in requirements.txt syntax.
for project, version in requirements:
    print("{}=={}".format(project, version))

### Define a function that makes the formatting of the databases uniform, so as to ease their cross-comparison

In [None]:
def formatting(x):
    """Reshape a wide funding table into a year-by-column pivot table.

    Every column of ``x`` other than the identifier columns is treated as a
    funding scheme: the table is first melted to long form (one row per
    identifier combination and scheme), then pivoted so that years become
    columns. Duplicate entries are aggregated with ``pivot_table``'s default
    aggregation.

    Parameters:
        x: DataFrame with the columns 'ProgrammingPeriod', 'Country',
           'NUTS1Code', 'NUTS2Code' and 'Year', plus one column per scheme.

    Returns:
        DataFrame indexed by (ProgrammingPeriod, FundingScheme, Country,
        NUTS1Code, NUTS2Code) with one column per year.
    """
    id_columns = ['ProgrammingPeriod', 'Country', 'NUTS1Code', 'NUTS2Code', 'Year']
    row_levels = ['ProgrammingPeriod', 'FundingScheme', 'Country', 'NUTS1Code', 'NUTS2Code']

    # Long form: scheme names go to 'FundingScheme', amounts to 'value'.
    long_form = x.melt(id_vars=id_columns, var_name='FundingScheme')

    return long_form.pivot_table(index=row_levels, columns='Year', values='value')

### Open the datasets and isolate the rows relating to DG REGIO programmes

In [None]:
# Programme-level dataset, restricted to the eight columns used below.
df = pd.read_excel(
    'nuts_prog_kat_Cohesion_codesonly_v3.xlsx',
    usecols=[0, 1, 5, 6, 9, 10, 11, 12],
)
# The two tables that are later reshaped with formatting().
df_EU = pd.read_excel('Database_Final_UPD(3).xlsx')
df_expenditures = pd.read_excel('PivotedData.xlsx', sheet_name='Mean', index_col=0)

# Keep the rows whose CCI code contains "161" or "162".
df_REGIO = df.loc[
    df['CCI'].str.contains("161") | df['CCI'].str.contains("162")
]

df_REGIO.head()

### Let us normalise the database so as to make the figures consistent between the Czech and the EU datasets

In [None]:
# Discard every row dated 2017 or later.
df_REGIO_capped = df_REGIO.loc[df_REGIO['year'] < 2017]

### Assumption on the exchange rate - retrieved from http://sdw.ecb.europa.eu

In [None]:
ER = pd.read_csv('CZK_EURO_historical_exchange_rate.csv')

# The year is taken from the first four characters of the 'date' column.
ExchangeRates = ER.assign(year=ER['date'].astype(str).str[:4].astype(int))
# Restrict to 2007-2016 (both bounds included).
ExchangeRates = ExchangeRates.loc[ExchangeRates['year'].between(2007, 2016)]
# Drop the 'date' and 'conf' columns and index what remains by year.
ExchangeRates = ExchangeRates.drop(columns=['date', 'conf']).set_index('year')

### Let us adjust the database formatting for the sake of comparability across figures

In [None]:
# Datasets to reshape, and the key under which each reshaped table is stored.
ds_names = ['EU_Payments', 'Expenditures']
DS = [df_EU, df_expenditures]

# Apply formatting() to each dataset and map its name to the pivoted result.
ds_pivoted = {label: formatting(table) for label, table in zip(ds_names, DS)}

### Let us isolate the CZ figures for the programming period 2007-2013 and the funding schemes ERDF and CF

In [None]:
# Select the 2007-2013 / ERDF_TOTAL / CZ rows of the pivoted expenditure table,
# keeping every value of the two remaining index levels.
ds_ERDF = ds_pivoted['Expenditures'].loc['2007-2013','ERDF_TOTAL','CZ',:,:]
# Drop the index level at position 1 - presumably FundingScheme, so that the
# ERDF and CF frames share the same index and can be added; TODO confirm.
ds_ERDF.index = ds_ERDF.index.droplevel(1)
# Same selection and level drop for the CF_TOTAL rows.
ds_CF = ds_pivoted['Expenditures'].loc['2007-2013','CF_TOTAL','CZ',:,:]
ds_CF.index = ds_CF.index.droplevel(1)
# Add the two frames and keep only the columns labelled 2007 onwards
# (transpose, slice the rows by label, transpose back).
df_20072013_CFERDF_CZ=(ds_ERDF+ds_CF).T.loc[2007:].T

In [None]:
# Group the capped DG REGIO rows by year.
df_REGIO_yearly = df_REGIO_capped.groupby('year')

# One row per year holding the total of the 'EU (czk)' column, stored in a
# single column named 'sum'.
df_REGIO_su = pd.DataFrame(
    [group['EU (czk)'].sum() for _, group in df_REGIO_yearly],
    index=[year for year, _ in df_REGIO_yearly],
    columns=['sum'],
)

In [None]:
# Scenario labels and, in the same order, the divisor used for each:
# the mean of the exchange-rate table, then the table itself.
er_n = ['Constant', 'Yearly average']
exchange_rate = [ExchangeRates.mean(), ExchangeRates]

### Let us test how the assumption on the number of years over which the excess payments are cut out, for the sake of normalisation, affects the results

In [26]:
# Yearly totals of the estimated expenditure table and their running sum;
# both are the same for every scenario, so they are computed once.
estimated_by_year = df_20072013_CFERDF_CZ.T.loc[2007:].T.sum()
estimated_cumulative = estimated_by_year.expanding(1).sum()

# Every 9th index entry of the yearly sums, visited last to first; each one
# is a start year i from which the excess is deducted up to 2016.
for i in reversed(df_REGIO_su.index[::9]):
    n_years = 2016 - i + 1
    # Shared by the plot title and the output file name.
    caption = 'Normalised over '+str(int(n_years))+' year(s), '+str(i)+'-2016'

    plt.step(df_REGIO_su.index, estimated_cumulative, label='Estimated Expenditure')

    for ie, er in enumerate(exchange_rate):
        # Divide the yearly CZK sums by the exchange rate of this scenario.
        df_REGIO_sum = df_REGIO_su/er.values
        # Amount by which the converted total exceeds the estimated total.
        Excess = df_REGIO_sum.sum() - df_20072013_CFERDF_CZ.sum().sum()
        # Deduct an equal share of the excess from each year in i..2016.
        df_REGIO_norm = df_REGIO_sum.copy()
        df_REGIO_norm.loc[i:2016] = df_REGIO_sum.loc[i:2016] - Excess/n_years

        # Sum of absolute yearly gaps relative to the normalised total.
        normalised_values = df_REGIO_norm.values[:, 0]
        distance = (abs(normalised_values - estimated_by_year.values).sum()/
                    normalised_values.sum())
        print('The cumulative distance is ' + str(distance.round(2)))

        plt.step(df_REGIO_sum.index, df_REGIO_norm.expanding(1).sum(),
                 label='CZ database '+er_n[ie]+' exchange rate')

    plt.xlabel('Year')
    plt.ylabel('Amount, €')
    plt.title(caption)
    plt.legend()
    plt.savefig(time.strftime("%Y.%m.%d") + '_2007-2013_CF-ERDF_CZ expenditure database ' +
                caption+'.png')
    plt.close()

The cumulative distance is 0.4
The cumulative distance is 0.4
The cumulative distance is 0.37
The cumulative distance is 0.37
