In [None]:
### Importing all the relevant libraries

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import pkg_resources
import pandas as pd
import numpy as np
import sobol_seq
from tabulate import tabulate
import types

def get_imports():
    """Yield the top-level package name behind each import bound in ``globals()``.

    Modules contribute the first component of their ``__name__``; classes and
    functions (e.g. ``from pkg import func``) contribute the first component
    of their ``__module__``. Every other global (plain data, DataFrames, ...)
    is skipped, so a variable's own name is never reported as a package.

    Import names listed in ``poorly_named_packages`` are translated to the
    name given there. Names may be yielded more than once; callers that need
    unique values should de-duplicate.
    """
    # Import name -> name to report instead.
    poorly_named_packages = {
        "PIL": "Pillow",
        "sklearn": "scikit-learn"
    }
    # Iterate over a snapshot so the namespace may change while the generator
    # is suspended without invalidating the iteration.
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            name = val.__name__.split(".")[0]
        elif isinstance(val, (type, types.FunctionType, types.BuiltinFunctionType)):
            module_name = getattr(val, "__module__", None)
            if not module_name:
                # Some built-in callables carry no module information.
                continue
            name = module_name.split(".")[0]
        else:
            # Not an import: skip instead of yielding the variable name.
            continue

        yield poorly_named_packages.get(name, name)
# Unique package names discovered in the notebook namespace.
imports = list(set(get_imports()))

# (project name, version) for every installed distribution whose project name
# matches one of the discovered packages; pip itself is excluded.
requirements = [
    (dist.project_name, dist.version)
    for dist in pkg_resources.working_set
    if dist.project_name != "pip" and dist.project_name in imports
]

# Print the pins in requirements.txt syntax.
for project, version in requirements:
    print("{}=={}".format(project, version))

### Define a function to standardise the formatting of the databases so as to ease their cross-comparison

In [None]:
def formatting(x):
    """Reshape a funding table into one row per scheme/region and one column per year.

    ``x`` must contain the identifier columns 'ProgrammingPeriod', 'Country',
    'NUTS1Code', 'NUTS2Code' and 'Year'; every other column is treated as a
    funding scheme. The table is first melted to long form (scheme names go to
    'FundingScheme', figures to 'value') and then pivoted so that the years
    become columns. Duplicate index/year combinations are aggregated with
    ``pivot_table``'s default aggregation.
    """
    id_columns = ['ProgrammingPeriod', 'Country', 'NUTS1Code', 'NUTS2Code', 'Year']
    row_levels = ['ProgrammingPeriod', 'FundingScheme', 'Country', 'NUTS1Code', 'NUTS2Code']

    long_form = pd.melt(x, id_vars=id_columns, var_name='FundingScheme')
    return pd.pivot_table(long_form, index=row_levels, values='value', columns='Year')

### Open the datasets and isolate the rows relating to DG REGIO programmes

In [None]:
# Payments workbook; only the columns at positions 0-5 and 7 are read.
df = pd.read_excel('20181231 Pagamenti ammessi PO 2007-2013.xls',usecols=[0,1,2,3,4,5,7])
# Second workbook, later pivoted under the label 'EU_Payments'.
df_EU = pd.read_excel('Database_Final_UPD(3).xlsx')
# Keep the rows whose CCI code contains "161" or "162". na=False turns a
# missing CCI into "no match", so the mask is always boolean and the filter
# cannot fail on NaN entries.
df_REGIO = df[df['CCI'].str.contains("161|162", na=False)]
# 'Mean' sheet of the pivoted data, later handled under the label 'Expenditures'.
df_expenditures = pd.read_excel('PivotedData.xlsx',sheet_name='Mean',index_col=0)

### Let us harmonise the nomenclature across databases

In [None]:
# Region label (as matched against the REGIONE column) -> NUTS2 code.
# NOTE(review): 'TRENTINO' is mapped to 'ITH0', the same code under which the
# ITH1 and ITH2 rows are merged further down - confirm this is intended.
NUTS2_dic = {
    'ABRUZZO': 'ITF1',
    'BASILICATA': 'ITF5',
    'CALABRIA': 'ITF6',
    'CAMPANIA': 'ITF3',
    'EMILIA': 'ITH5',
    'FRIULI': 'ITH4',
    'LAZIO': 'ITI4',
    'LIGURIA': 'ITC3',
    'LOMBARDIA': 'ITC4',
    'MARCHE': 'ITI3',
    'MOLISE': 'ITF2',
    'PIEMONTE': 'ITC1',
    'PUGLIA': 'ITF4',
    'SARDEGNA': 'ITG2',
    'SICILIA': 'ITG1',
    'TOSCANA': 'ITI1',
    'TRENTINO': 'ITH0',
    'UMBRIA': 'ITI2',
    "VALLE D'AOSTA": 'ITC2',
    'VENETO': 'ITH3',
}

### Let us normalise the database so as to make the figures consistent between the Italian and the EU datasets

In [None]:
# Keep payment years up to and including 2016. The explicit copy makes the
# result an independent frame, so adding a column below does not operate on a
# slice of df_REGIO (which triggers pandas' SettingWithCopyWarning).
df_REGIO_capped = df_REGIO[df_REGIO.ANNO<2017].copy()

# Translate region labels into NUTS2 codes; labels absent from NUTS2_dic
# become NaN.
df_REGIO_capped['NUTS2'] = df_REGIO_capped.REGIONE.map(NUTS2_dic)

### Let us adjust the database formatting for the sake of comparability across figures

In [None]:
# Source tables and the labels under which their pivoted versions are stored.
DS = [df_EU, df_expenditures]
ds_names = ['EU_Payments', 'Expenditures']

# label -> table reshaped by formatting() (years as columns).
ds_pivoted = {label: formatting(table) for label, table in zip(ds_names, DS)}

### Let us re-attribute the items unclearly allocated to the different regions

In [None]:
# Rows whose REGIONE is one of the three non-regional labels.
df_REGIO_nonAttributed = df_REGIO_capped[
    df_REGIO_capped.REGIONE.isin(['STATO ESTERO', 'PLURI', 'AMBITO NAZIONALE'])
]
# Totals of those rows per year (ANNO).
df_REGIO_nonAttributed_yearly = df_REGIO_nonAttributed.groupby('ANNO').sum()
df_REGIO_nonAttributed_yearly

In [None]:
# Rows attributed to an actual region, i.e. everything except the three
# non-regional labels. The explicit copy lets columns be added without
# writing into a slice of df_REGIO_capped (SettingWithCopyWarning).
df_REGIO_REGIO = df_REGIO_capped[
    ~df_REGIO_capped.REGIONE.isin(['STATO ESTERO', 'PLURI', 'AMBITO NAZIONALE'])
].copy()
df_REGIO_REGIO['NUTS2'] = df_REGIO_REGIO.REGIONE.map(NUTS2_dic)

# Each region's share of the total regional PAGAMENTO_AMMESSO_UE. Only the
# payment column is aggregated: summing and dividing the whole frame also
# touches the text columns, which fails on pandas versions that no longer
# silently drop them.
regional_payments = df_REGIO_REGIO.groupby('NUTS2').PAGAMENTO_AMMESSO_UE.sum()
df_REGIO_REGIO['multiplier'] = df_REGIO_REGIO.NUTS2.map(regional_payments / regional_payments.sum())

In [None]:
# Yearly totals per region. 'multiplier' is constant within a region, so
# grouping by it only carries the share along as index level 2.
df_REGIO_REGIO_yearly = df_REGIO_REGIO.groupby(['NUTS2','ANNO','multiplier']).sum()

# Non-attributed payment of each row's year (index level 1 = ANNO). A year
# with no non-attributed payments maps to NaN; treat it as 0 so the region's
# own payment for that year is kept instead of becoming NaN.
unattributed_for_year = (
    df_REGIO_REGIO_yearly.index.get_level_values(1)
    .map(df_REGIO_nonAttributed_yearly.PAGAMENTO_AMMESSO_UE)
    .fillna(0)
    .to_numpy()
)
regional_share = df_REGIO_REGIO_yearly.index.get_level_values(2).to_numpy()

# Add each region's share of the year's non-attributed payments (positional,
# row by row).
df_REGIO_REGIO_yearly['PAGAMENTO_AMMESSO_UE'] = (
    df_REGIO_REGIO_yearly['PAGAMENTO_AMMESSO_UE'] + regional_share * unattributed_for_year
)
# The share has been applied; drop it from the index.
df_REGIO_REGIO_yearly = df_REGIO_REGIO_yearly.droplevel(2)

### Let us isolate the IT figures for the 2007-2013 programming period and the ERDF funding scheme

In [None]:
# Expenditure rows for programming period 2007-2013, scheme ERDF_TOTAL and
# country IT, across all NUTS1/NUTS2 codes. Taken as an explicit copy because
# the frame is modified afterwards and must not alias ds_pivoted['Expenditures'].
# NOTE(review): the variable name says "2017" although the period is 2007-2013.
df_ERDF_20172013_IT=ds_pivoted['Expenditures'].loc['2007-2013','ERDF_TOTAL','IT',:,:].copy()

The following lines need to be run twice

In [None]:
# Merge the ITH1 and ITH2 rows into a single row coded ITH0 (their
# element-wise sum), then order the table by NUTS2 code.
#
# The merged row is built explicitly and concatenated instead of being
# created through ``.loc`` enlargement, so the cell works on its first
# execution. The guard makes it idempotent: once ITH1/ITH2 are gone,
# re-running only re-sorts the table.
nuts2_codes = df_ERDF_20172013_IT.index.get_level_values('NUTS2Code')
if (nuts2_codes == 'ITH1').any() and (nuts2_codes == 'ITH2').any():
    # Assumes exactly one ITH1 row and one ITH2 row, as the original
    # assignment to a single ITH0 row did.
    ith0_row = pd.DataFrame(
        df_ERDF_20172013_IT[nuts2_codes == 'ITH1'].values
        + df_ERDF_20172013_IT[nuts2_codes == 'ITH2'].values,
        index=pd.MultiIndex.from_tuples(
            [('2007-2013','ERDF_TOTAL','IT','ITH','ITH0')],
            names=df_ERDF_20172013_IT.index.names,
        ),
        columns=df_ERDF_20172013_IT.columns,
    )
    # Drop the two source rows (and any pre-existing ITH0 row, which the
    # merged row replaces) before appending the merged one.
    df_ERDF_20172013_IT = pd.concat(
        [df_ERDF_20172013_IT[~nuts2_codes.isin(['ITH0','ITH1','ITH2'])], ith0_row]
    )

# sort_index(by=...) was removed in pandas 1.0; sort_values(by=...) is its
# documented replacement and accepts index level names.
df_ERDF_20172013_IT = df_ERDF_20172013_IT.sort_values(by='NUTS2Code')

# Per region: total PAGAMENTO_AMMESSO_UE (over all years) minus the total of
# the ERDF table (summed over all of its year columns).
Excess = (
    df_REGIO_REGIO_yearly.groupby('NUTS2')['PAGAMENTO_AMMESSO_UE'].sum()
    - df_ERDF_20172013_IT.groupby('NUTS2Code').sum().sum(axis=1)
)

Excess

### Let us test how the assumption on the number of years over which the excess payments are subtracted affects the normalisation

In [None]:
# Move the year from index level 1 into a regular 'Year' column, leaving
# NUTS2 as the only index level.
df_REGIO_REGIO_yearly = (
    df_REGIO_REGIO_yearly
    .assign(Year=df_REGIO_REGIO_yearly.index.get_level_values(1))
    .droplevel(1)
)
# Year x region table of payments; missing combinations are filled with 0.
df_REGIO_pv = pd.pivot_table(
    df_REGIO_REGIO_yearly,
    index='Year',
    columns='NUTS2',
    values='PAGAMENTO_AMMESSO_UE',
).fillna(0)

In [None]:
# For every candidate starting year (latest first), spread Excess evenly over
# the years from that year to last_year, subtract it from the regional
# payments, and measure how far the cumulative normalised payments are from
# the cumulative ERDF figures.
distance_l = []   # one Series per scenario: summed relative distance per region
distance_y = []   # one frame per scenario: absolute cumulative distance per year
norm = []         # one frame per scenario: normalised payments
idx = []          # unused in this cell; kept so the name stays defined
last_year = 2016  # last payment year kept in the data (ANNO < 2017)

# Cumulative ERDF figures per region from 2007 onwards; identical for every
# scenario, so computed once.
erdf_cumulative = df_ERDF_20172013_IT.droplevel([0,1,2,3]).T.loc[2007:].cumsum()

for ei,i in enumerate(reversed(df_REGIO_pv.index)):
    df_REGIO_norm = df_REGIO_pv.copy()
    # Remove an equal slice of the excess from each year in i..last_year.
    df_REGIO_norm.loc[i:last_year] = df_REGIO_pv.loc[i:last_year] - Excess/(last_year-i+1)

    gap = df_REGIO_norm.cumsum() - erdf_cumulative
    # Gap relative to each region's normalised total, summed over the years.
    distance_l.append(np.abs(gap/df_REGIO_norm.sum()).sum().round(1))

    yearly_gap = np.abs(gap)
    # ei+1 = number of years over which the excess was spread.
    yearly_gap['years'] = ei+1
    distance_y.append(yearly_gap)

    df_REGIO_norm['years'] = ei+1
    norm.append(df_REGIO_norm)

distance = pd.concat(distance_l,axis=1).T
# Index the scenarios by number of years (1-based) rather than by position.
distance.index = distance.index+1
distance_yearly = pd.concat(distance_y).sort_index()
norm_df = pd.concat(norm).sort_index()

# Index both tables by (years, Year). sort_index(by=...) was removed in
# pandas 1.0; sort_values(by=...) is its documented replacement and accepts
# index level names.
distance_yearly = (
    distance_yearly.set_index('years',append=True)
    .swaplevel()
    .sort_values(by=['years','Year'])
)
norm_df = (
    norm_df.set_index('years',append=True)
    .swaplevel()
    .sort_values(by=['years','Year'])
)

### Get the min-max range of the normalised figures for the sake of comparison against the individual NUTS2 regions

In [48]:
# Per-year minimum of the normalised payments across all scenarios, indexed
# by ('min', Year). sort_index(by=...) was removed in pandas 1.0;
# sort_values(by=...) is its documented replacement.
norm_data = norm_df.groupby('Year').min()
norm_data['val']='min'
norm_data = norm_data.set_index('val',append=True).swaplevel().sort_values(by=['val','Year'])

# Same for the per-year maximum, indexed by ('max', Year).
norm_data2 = norm_df.groupby('Year').max()
norm_data2['val']='max'
norm_data2 = norm_data2.set_index('val',append=True).swaplevel().sort_values(by=['val','Year'])

# Keep the combined table in norm_dataset and write it out separately:
# to_csv returns None when given a path, so chaining it into the assignment
# left norm_dataset bound to None.
norm_dataset = pd.concat([norm_data,norm_data2])
norm_dataset.to_csv('norm_IT_NUTS2.csv',header=True)

  after removing the cwd from sys.path.
  


In [None]:
# For each year 2007-2016: column-wise minimum and maximum of the normalised
# payments over all scenarios, stacked year after year and shown side by side
# (column 0 = minima, column 1 = maxima).
minima_by_year = pd.concat(
    [norm_df.loc[pd.IndexSlice[:, yr], :].min() for yr in range(2007, 2017)]
)
maxima_by_year = pd.concat(
    [norm_df.loc[pd.IndexSlice[:, yr], :].max() for yr in range(2007, 2017)]
)
pd.concat([minima_by_year, maxima_by_year], axis=1)

In [None]:
# Table read from mu_IT.csv, with the first column used as index; the plots
# below label it the "index of regional specificity".
mu = pd.read_csv(
    'mu_IT.csv',
    index_col=0,
)

In [None]:
# Reverse rank of mu against the rank of each region's maximum distance over
# the scenarios.
# NOTE(review): the points are paired positionally - confirm that the rows of
# mu follow the same region order as the columns of distance.
max_distance = distance.max()
plt.scatter(
    mu.rank(ascending=False),
    max_distance.rank(),
    label=max_distance.index,
)
plt.xlabel('index of regional specificity reverse rank')
plt.ylabel('maximum distance rank')
plt.show()

In [None]:
# Reverse rank of mu against the rank of each region's minimum distance over
# the scenarios.
# NOTE(review): the points are paired positionally - confirm that the rows of
# mu follow the same region order as the columns of distance.
min_distance = distance.min()
plt.scatter(
    mu.rank(ascending=False),
    min_distance.rank(),
    label=min_distance.index,
)
plt.xlabel('index of regional specificity reverse rank')
plt.ylabel('minimum distance rank')
plt.show()