In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
import warnings

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas FutureWarning/DeprecationWarning
# messages; consider narrowing the filter once the noisy source is known.
warnings.simplefilter("ignore")

In [2]:
# Workbook with the expenditure figures
EXPENDITURE_FILE = 'gov_10a_exp__custom_4037524_spreadsheet.xlsx'

# Sheets that are excluded from processing
NON_DATA_SHEETS = ('Summary', 'Structure')


def _melt_by_unit(frame, unit, year_columns, value_name):
    """Return the rows of ``frame`` whose UNIT equals ``unit`` in long format.

    Parameters
    ----------
    frame : pd.DataFrame
        Cleaned sheet with a 'Country' column, a 'UNIT' column and the
        columns listed in ``year_columns``.
    unit : str
        Value of the 'UNIT' column to keep.
    year_columns : list
        Columns that are unpivoted into the 'Year' column.
    value_name : str
        Name given to the resulting value column.

    Returns
    -------
    pd.DataFrame
        Columns 'Country', 'Year' and ``value_name``, sorted by country and
        year, with a fresh 0..n-1 index.
    """
    selected = frame.loc[frame['UNIT'] == unit]
    long_frame = pd.melt(selected, id_vars=['Country'], value_vars=year_columns,
                         var_name='Year', value_name=value_name)
    return long_frame.sort_values(by=['Country', 'Year']).reset_index(drop=True)


# Open the workbook once and parse every sheet twice with different windows,
# instead of loading the same file from disk two times.
with pd.ExcelFile(EXPENDITURE_FILE) as workbook:
    # Part of each sheet where the sheet information is located
    information = pd.read_excel(workbook, sheet_name=None, skiprows=3,
                                nrows=4, usecols='A, B, C')
    # Data table of each sheet
    data = pd.read_excel(workbook, sheet_name=None, skiprows=9)

# Names of the sheets to process (filtering does not fail when an excluded
# sheet is absent, unlike list.remove)
keys = [name for name in information if name not in NON_DATA_SHEETS]

information_sheets = {}
frames = []

for key in keys:

    # Bringing the sheet information into an appropriate format
    # (first transposed row becomes the header, the first two rows are
    # discarded) and storing it in a dictionary
    info_sheet = information[key].T
    info_sheet.columns = info_sheet.iloc[0]
    info_sheet = info_sheet.iloc[2:].reset_index(drop=True)
    information_sheets[key] = info_sheet

    # The category is the first value of the third information field
    category = info_sheet.iloc[0, 2]

    # Cleaning the data table without mutating the loaded sheet:
    #   - flag columns show up as 'Unnamed: n' columns and are removed
    #   - the first row contains no data, the last 5 rows are not needed
    #   - the broken header names are replaced with their actual name
    raw_sheet = data[key]
    flag_columns = raw_sheet.columns[raw_sheet.columns.str.contains('unnamed', case=False)]
    df = (
        raw_sheet
        .drop(columns=flag_columns)
        .iloc[1:-5]
        .rename(columns={'TIME': 'Country', 'TIME.1': 'UNIT'})
    )

    # Every remaining column is a year column
    year_list = [column for column in df.columns if column not in ('Country', 'UNIT')]

    # Splitting the data based on the measurement unit
    gdp = _melt_by_unit(df, 'Percentage of gross domestic product (GDP)', year_list, '% GDP')
    millions = _melt_by_unit(df, 'Million euro', year_list, 'Million euro')

    # Merging on the keys rather than copying columns by row position, so a
    # country missing from one unit can no longer shift the other unit's
    # values; the category is assigned directly so it is never left empty.
    sheet_frame = millions.merge(gdp, on=['Country', 'Year'], how='left')
    sheet_frame['Category'] = category

    frames.append(sheet_frame)

# Single concatenation instead of growing the frame inside the loop
if frames:
    dataset = pd.concat(frames, ignore_index=True)
else:
    dataset = pd.DataFrame()

# ':' marks a missing value in the spreadsheet
dataset = dataset.replace(':', np.nan)

# Shortening the long country / area labels
country_aliases = {
    'European Union - 27 countries (from 2020)': 'European Union',
    'Euro area - 19 countries  (from 2015)': 'Eurozone',
    'Germany (until 1990 former territory of the FRG)': 'Germany',
}
for long_name, short_name in country_aliases.items():
    dataset = dataset.replace(long_name, short_name)


####################################################
# Handling the GDP dataset

# Loading the data
data = pd.read_excel('nama_10_gdp__custom_4142114_spreadsheet.xlsx',
                     sheet_name = None, skiprows = 8)

# Names of the sheets that hold data
keys = [name for name in data if name not in ('Summary', 'Structure')]
if not keys:
    # Without this guard the merge below would silently reuse whatever
    # `gdp` frame an earlier part of the notebook left behind.
    raise ValueError('GDP workbook contains no data sheet besides Summary/Structure')

# NOTE(review): the previous loop overwrote `gdp` on every pass, so only the
# last data sheet ever reached the merge. That behaviour is kept, but made
# explicit - confirm the workbook is meant to hold a single data sheet.
key = keys[-1]
raw_sheet = data[key]

# Cleaning the data table without mutating the loaded sheet:
#   - flag columns show up as 'Unnamed: n' columns and are removed
#   - the first row contains no data, the last 5 rows are not needed
#   - the broken header names are replaced with their actual name
flag_columns = raw_sheet.columns[raw_sheet.columns.str.contains('unnamed', case = False)]
df = (
    raw_sheet
    .drop(columns = flag_columns)
    .iloc[1:-5]
    .rename(columns = {'TIME': 'Country', 'TIME.1': 'UNIT'})
)

# Extracting the Year columns ('UNIT' is skipped as well when it exists, so
# it cannot end up as a bogus year after melting)
year_list = [column for column in df.columns if column not in ('Country', 'UNIT')]

gdp = pd.melt(df, id_vars = ['Country'], value_vars = year_list,
              var_name = 'Year', value_name = 'Million GDP')

# ':' marks a missing value; long country / area labels are shortened so
# they match the labels used in `dataset`
gdp = (
    gdp
    .replace(':', np.nan)
    .replace('European Union - 27 countries (from 2020)', 'European Union')
    .replace('Euro area - 19 countries  (from 2015)', 'Eurozone')
    .replace('Germany (until 1990 former territory of the FRG)', 'Germany')
)

# Explicit join keys instead of relying on whichever columns happen to be
# shared by the two frames
dataset = dataset.merge(gdp, on = ['Country', 'Year'], how = 'left')

# Columns that contained ':' may still be object dtype; make them numeric
# before doing arithmetic (raises on any unexpected non-numeric value)
for column in ('Million euro', 'Million GDP'):
    dataset[column] = pd.to_numeric(dataset[column])

# Updating the %GDP with the new values
dataset['% GDP'] = (dataset['Million euro'] / dataset['Million GDP']) * 100


dataset

Unnamed: 0,Country,Year,Million euro,% GDP,Category,Million GDP
0,Austria,2012,163191.9,51.213044,Total,318653.0
1,Austria,2013,167292.1,51.647679,Total,323910.2
2,Austria,2014,174671.6,52.430930,Total,333146.1
3,Austria,2015,176030.0,51.131498,Total,344269.2
4,Austria,2016,179059.0,50.071307,Total,357608.0
...,...,...,...,...,...,...
25595,Switzerland,2017,55.1,0.008948,Social protection n.e.c.,615776.3
25596,Switzerland,2018,53.7,0.008742,Social protection n.e.c.,614304.4
25597,Switzerland,2019,56.1,0.008705,Social protection n.e.c.,644443.2
25598,Switzerland,2020,59.3,0.009138,Social protection n.e.c.,648913.3


In [3]:
# dataset.to_excel('dataset.xlsx')

In [3]:
# Keep only the rows for Greece and preview the first 30 of them
is_greece = dataset['Country'].eq('Greece')
result = dataset[is_greece]
result.head(30)

Unnamed: 0,Country,Year,Million euro,% GDP,Category,Million GDP
130,Greece,2012,106844.0,56.717093,Total,188380.6
131,Greece,2013,112926.0,62.776983,Total,179884.4
132,Greece,2014,89913.0,50.730664,Total,177236.0
133,Greece,2015,95336.0,54.054882,Total,176368.9
134,Greece,2016,87154.0,49.946646,Total,174494.2
135,Greece,2017,85871.0,48.541181,Total,176903.4
136,Greece,2018,87137.0,48.52869,Total,179557.7
137,Greece,2019,87758.0,47.863336,Total,183351.2
138,Greece,2020,98871.0,59.774772,Total,165405.9
139,Greece,2021,,,Total,181674.6
