# Imports - Run first

In [140]:
import pandas as pd
from IPython.display import display
import warnings

# Lift pandas' display limits so every row and every column of a DataFrame is rendered
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Ignore all warnings raised from this point on
warnings.simplefilter("ignore")

# Step-by-step execution

In [141]:
# Skip the first five spreadsheet rows and read one data row of column 'C':
# row 6 becomes the header of the resulting one-column frame
sheet_information = pd.read_excel(
    'gov_10a_exp__custom_4037524_spreadsheet.xlsx',
    sheet_name='Sheet 1',
    usecols='C',
    skiprows=5,
    nrows=1,
)

# The header of that single column is the sector; the value beneath it should be the category name
sector = sheet_information.columns[0]
category = sheet_information.iloc[0, 0]

category

'Total'

In [142]:
# Read the data table, skipping the first 9 spreadsheet rows (not needed)
df = pd.read_excel('gov_10a_exp__custom_4037524_spreadsheet.xlsx',
                   sheet_name='Sheet 1', skiprows=9)

# The dataset contains some flags, which show up as columns whose label contains 'unnamed'
flag_columns = df.columns[df.columns.str.contains('unnamed', case=False)]
df = df.drop(columns=flag_columns)

# The last 6 rows are not needed
df = df.drop(index=df.index[-6:])

# The first row (label 0) contains no data
df = df.drop(index=0)

# Give the two broken header names their actual meaning
df = df.rename(columns={'TIME': 'Country', 'TIME.1': 'UNIT'})

# Snapshot of the wide-format rows for Greece, kept for a before/after comparison
df_before = df[df['Country'] == 'Greece']

df

Unnamed: 0,Country,UNIT,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
1,European Union - 27 countries (from 2020),Million euro,5660949.1,5709529.5,5770198.1,5874466.9,5942333.4,6105736.5,6295433.4,6520557.3,7118015.8,:
2,European Union - 27 countries (from 2020),Percentage of gross domestic product (GDP),49.7,49.6,49.0,48.1,47.4,46.7,46.5,46.5,52.9,:
3,Euro area - 19 countries (from 2015),Million euro,4918492.7,4960347.2,5012826.4,5095520.2,5163761.1,5289227.9,5436975.8,5622101.3,6129243.0,:
4,Euro area - 19 countries (from 2015),Percentage of gross domestic product (GDP),50.0,49.9,49.3,48.4,47.7,47.1,46.9,46.9,53.5,:
5,Belgium,Million euro,218102.2,220470.1,224069.3,223850.3,228451.6,231561.0,240313.6,247898.6,270409.9,:
6,Belgium,Percentage of gross domestic product (GDP),56.5,56.1,55.6,53.7,53.1,52.0,52.2,51.8,59.2,:
7,Bulgaria,Million euro,14475.6,15879.2,18575.1,18495.3,16952.4,18266.1,20770.5,21851.6,25636.0,:
8,Bulgaria,Percentage of gross domestic product (GDP),34.3,37.8,43.2,40.4,34.8,34.8,36.9,35.5,41.8,:
9,Czechia,Million euro,72636.1,68010.6,67266.7,71102.8,70533.6,75673.2,85642.7,92623.1,101648.8,:
10,Czechia,Percentage of gross domestic product (GDP),44.7,42.7,42.6,41.9,39.8,39.0,40.6,41.1,47.2,:


In [143]:
# Extracting the Year columns: every column of `df` except the two identifier columns.
# They must be read from `df`; the name `test` used here before is not defined anywhere
# in this notebook, so the cell failed with a NameError on a fresh kernel.
year_list = [column for column in df.columns if column not in ('Country', 'UNIT')]

# Splitting the data based on the measurement unit and also adding the category name.
# Each subset is melted to one row per (Country, Year) and sorted on that key.
gdp = pd.melt(df.loc[df['UNIT'] == 'Percentage of gross domestic product (GDP)'],
              id_vars=['Country'], value_vars=year_list,
              var_name='Year', value_name='% GDP', ignore_index=False)
gdp = gdp.sort_values(by=['Country', 'Year']).reset_index(drop=True)
gdp['Category'] = category

millions = pd.melt(df.loc[df['UNIT'] == 'Million euro'],
                   id_vars=['Country'], value_vars=year_list,
                   var_name='Year', value_name='Million euro', ignore_index=False)
millions = millions.sort_values(by=['Country', 'Year']).reset_index(drop=True)

# Merging the necessary data on the (Country, Year) key rather than by row position:
# a positional assignment silently pairs values with the wrong country as soon as one
# country lacks one of the two unit rows. A left join keeps every 'Million euro' row and
# leaves '% GDP' empty where no matching percentage exists.
# NOTE: `df` is rebound to the long table here, so this cell cannot be re-run without
# first re-running the loading cell (the long table has no 'UNIT' column).
df = millions.merge(gdp[['Country', 'Year', '% GDP']], on=['Country', 'Year'], how='left')
df['Category'] = category

df

Unnamed: 0,Country,Year,Million euro,% GDP,Category
0,Austria,2012,163191.9,51.2,Total
1,Austria,2013,167292.1,51.6,Total
2,Austria,2014,174671.6,52.4,Total
3,Austria,2015,176030.0,51.1,Total
4,Austria,2016,179059.0,50.1,Total
5,Austria,2017,182091.3,49.3,Total
6,Austria,2018,187850.6,48.8,Total
7,Austria,2019,193136.6,48.6,Total
8,Austria,2020,216207.5,56.7,Total
9,Austria,2021,227261.6,56,Total


In [144]:
# Select the rows of the reshaped table whose 'Country' column is 'Greece' and show them
# right after the wide-format Greece rows captured earlier
df_after = df[df['Country'] == 'Greece']

display(df_before, df_after)

Unnamed: 0,Country,UNIT,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
19,Greece,Million euro,106844.0,112926.0,89913.0,95336.0,87154.0,85871.0,87137.0,87758.0,98871.0,:
20,Greece,Percentage of gross domestic product (GDP),56.7,62.8,50.7,54.1,49.9,48.5,48.5,47.9,59.8,:


Unnamed: 0,Country,Year,Million euro,% GDP,Category
130,Greece,2012,106844.0,56.7,Total
131,Greece,2013,112926.0,62.8,Total
132,Greece,2014,89913.0,50.7,Total
133,Greece,2015,95336.0,54.1,Total
134,Greece,2016,87154.0,49.9,Total
135,Greece,2017,85871.0,48.5,Total
136,Greece,2018,87137.0,48.5,Total
137,Greece,2019,87758.0,47.9,Total
138,Greece,2020,98871.0,59.8,Total
139,Greece,2021,:,:,Total


# Row binding of multiple sheets DEMO

In [171]:
# All of the above code as a function

# Module-level accumulator: every call to `cleaner` row-binds one more tidy sheet onto it.
dataset = pd.DataFrame()


def _melt_unit(wide, unit, value_name, year_columns):
    """Return the rows of `wide` whose 'UNIT' equals `unit`, reshaped to one row per
    (Country, Year), sorted on that key, with the values in a column named `value_name`."""
    long = pd.melt(wide.loc[wide['UNIT'] == unit], id_vars=['Country'],
                   value_vars=year_columns, var_name='Year', value_name=value_name)
    return long.sort_values(by=['Country', 'Year']).reset_index(drop=True)


def cleaner(sheet, path='gov_10a_exp__custom_4037524_spreadsheet.xlsx'):
    """Tidy one sheet of the workbook and append the result to the global `dataset`.

    Parameters
    ----------
    sheet : value passed to `pd.read_excel` as `sheet_name` (e.g. 'Sheet 1').
    path : workbook to read; defaults to the spreadsheet used throughout this notebook.

    Returns
    -------
    None. The tidy rows (columns 'Country', 'Year', 'Million euro', '% GDP', 'Category')
    are appended to the module-level `dataset`. Nothing is returned on purpose, so a call
    placed on the last line of a cell does not render a table.

    Notes
    -----
    Row labels restart at 0 for every sheet, so labels repeat inside `dataset`.
    Calling the function twice for the same sheet appends its rows twice.
    """
    global dataset

    # Skip five rows and read one data row of column 'C': row 6 becomes the header (the
    # sector) and the single value beneath it should be the category name
    sheet_information = pd.read_excel(path, sheet_name=sheet, skiprows=5, nrows=1,
                                      usecols='C')
    category = sheet_information.iloc[0, 0]

    # Loads the dataset and skips the first 9 rows (not needed)
    raw = pd.read_excel(path, sheet_name=sheet, skiprows=9)

    # Dataset contains some flags, visualized as columns labelled 'Unnamed: N', which get removed
    flag_columns = raw.columns[raw.columns.str.contains('unnamed', case=False)]
    wide = (
        raw.drop(columns=flag_columns)
           .drop(index=raw.index[-6:])   # the last 6 rows are not needed
           .drop(index=0)                # the first row contains no data
           .rename(columns={'TIME': 'Country', 'TIME.1': 'UNIT'})
    )

    # Extracting the Year columns: everything except the two identifier columns
    year_columns = [column for column in wide.columns if column not in ('Country', 'UNIT')]

    # Splitting the data based on the measurement unit
    gdp = _melt_unit(wide, 'Percentage of gross domestic product (GDP)', '% GDP', year_columns)
    millions = _melt_unit(wide, 'Million euro', 'Million euro', year_columns)

    # Merging on the (Country, Year) key instead of by row position: a positional
    # assignment silently pairs values with the wrong country when one country lacks one
    # of the two unit rows. The left join keeps every 'Million euro' row.
    tidy = millions.merge(gdp, on=['Country', 'Year'], how='left')
    tidy['Category'] = category

    dataset = pd.concat([dataset, tidy])

In [172]:
# Clean the three sheets in order; each call appends its rows to `dataset`
for sheet_name in ('Sheet 1', 'Sheet 2', 'Sheet 3'):
    cleaner(sheet_name)

In [173]:
# Rows of the combined dataset whose 'Country' column is 'Greece'
dataset[dataset['Country'] == 'Greece']

Unnamed: 0,Country,Year,Million euro,% GDP,Category
130,Greece,2012,106844.0,56.7,Total
131,Greece,2013,112926.0,62.8,Total
132,Greece,2014,89913.0,50.7,Total
133,Greece,2015,95336.0,54.1,Total
134,Greece,2016,87154.0,49.9,Total
135,Greece,2017,85871.0,48.5,Total
136,Greece,2018,87137.0,48.5,Total
137,Greece,2019,87758.0,47.9,Total
138,Greece,2020,98871.0,59.8,Total
139,Greece,2021,:,:,Total
