In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
folder = './Data'

# Names of all entries found directly inside the data folder.
files = os.listdir(folder)

# Every file name starts with a source key followed by '_';
# one key corresponds to one data format.
df_keys = {name.split('_')[0] for name in files}

df_keys

{'bloom', 'bp', 'edata', 'qu', 'wbank', 'web'}

In [3]:
def process_bloom_df(df):
    """Aggregate a quarterly table to one row per year and country.

    The 'quarter' column is dropped, only the rows of the four selected
    countries are kept, and the remaining columns are summed per
    ('year', 'country') pair. 'U.S.' is rewritten to 'United States', the
    columns are renamed, and the result is sorted by year and country with
    a fresh 0..n-1 index.
    """
    selected = ['China', 'Germany', 'India', 'U.S.']
    new_names = {
        'year': 'Year',
        'country': 'Country',
        'value': 'Asset investment in renewables (Billions USD)',
    }

    yearly = (
        df.drop('quarter', axis=1)
        .loc[lambda frame: frame['country'].isin(selected)]
        .groupby(['year', 'country'])
        .sum()
        .reset_index(level=[0, 1])
        .replace('U.S.', 'United States')
        .sort_values(by=['year', 'country'])
        .rename(new_names, axis=1)
        .reset_index(drop=True)
    )

    return yearly

In [4]:
def process_bp_edata_qu_df(df, indicator, unit, years, countries, countries_new):
    """Reshape a wide per-country table (one column per year) into long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Country' column and one column per requested year,
        labelled with the year as a string (e.g. '2000').
    indicator, unit : str
        Combined into the name of the value column as '<indicator> (<unit>)'.
    years : iterable
        Years to keep. Any iterable is accepted (numpy array, list, range);
        each item is converted with str() to match the column labels.
    countries : list of str
        Country spellings as they appear in `df`; other rows are discarded.
    countries_new : list of str
        Replacement spellings, aligned position by position with `countries`.

    Returns
    -------
    pandas.DataFrame
        Columns 'Country', 'Year' (as strings) and '<indicator> (<unit>)',
        sorted by country and year, with a fresh 0..n-1 index.
    """
    value_name = f'{indicator} ({unit})'
    # str() per item instead of ndarray.astype so lists and ranges work too.
    year_columns = [str(year) for year in years]

    df = df[df['Country'].isin(countries)]
    df = df.reset_index(drop=True)
    df = pd.melt(df, id_vars=['Country'], value_vars=year_columns,
                 var_name='Year', value_name=value_name)
    # Rename inside the 'Country' column only: a frame-wide replace would also
    # rewrite data cells that happen to equal one of the country spellings.
    df = df.assign(
        Country=df['Country'].replace(list(countries), list(countries_new))
    )
    df = df.sort_values(by=['Country', 'Year'])
    df = df.reset_index(drop=True)

    return df

In [5]:
def process_wbank_df(df):
    """Filter and tidy a table with 'Country Name' and 'Time' columns.

    Keeps the rows of the four selected countries, drops the 'Country Code'
    and 'Time Code' columns, renames 'Country Name' to 'Country' and 'Time'
    to 'Year', and cuts every string column label at its first '['. The text
    before the bracket is kept verbatim, including any trailing space.
    'Year' is converted to an integer and then to a string, and the rows are
    sorted by country and year. The original row index is preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Country Name', 'Country Code', 'Time' and
        'Time Code'; 'Time' must be convertible to int for the kept rows.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    countries = ['China', 'Germany', 'India', 'United States']

    def rename_column(column):
        # Only string labels can carry a '[...]' suffix; any other label is
        # passed through unchanged. An explicit type check replaces the
        # former bare `except:`, which would also have hidden unrelated
        # errors such as KeyboardInterrupt.
        if isinstance(column, str):
            return column.split('[')[0]
        return column

    df = df[df['Country Name'].isin(countries)]
    df = df.drop(['Country Code', 'Time Code'], axis=1)
    df = df.rename({'Country Name': 'Country', 'Time': 'Year'}, axis=1)
    df = df.rename(rename_column, axis=1)
    # Going through int first turns values such as 2001.0 into '2001'.
    df = df.assign(Year=df['Year'].astype('int').astype('str'))
    df = df.sort_values(by=['Country', 'Year'])

    return df

In [6]:
target_folder = './Data Formated'

# Per-source settings for the wide-format files handled by
# process_bp_edata_qu_df: the country spellings used by the source and the
# year columns to keep.
wide_source_settings = {
    'bp': (['China', 'Germany', 'India', 'US'], np.arange(2000, 2020)),
    'edata': (['China', 'Germany', 'India', 'United States'], np.arange(2000, 2018)),
    'qu': (['China', 'Deutschland', 'Indien', 'USA'], np.arange(2000, 2020)),
}


def get_df_files_by_key(key):
    """Return the entries of `files` whose name starts with the source key.

    The key is compared with the text before the first '_', the same rule
    used to derive the keys. A substring test would let a key such as 'bp'
    also select a file of another source that merely contains 'bp'.
    """
    return [file for file in files if file.split('_')[0] == key]


def is_xlsx(file):
    """Return True when the file name ends with the '.xlsx' extension."""
    return file.lower().endswith('.xlsx')


metadata = pd.read_csv('./metadata.csv', index_col = 0)

# The output folder may not exist yet on a fresh checkout; to_csv would fail.
os.makedirs(target_folder, exist_ok=True)

countries_new = ['China', 'Germany', 'India', 'United States']

# sorted() only makes the processing order reproducible; df_keys is a set.
for df_key in sorted(df_keys):
    for file in get_df_files_by_key(df_key):
        # Excel files and the 'web' source are not converted.
        if is_xlsx(file) or df_key == 'web':
            continue

        df = pd.read_csv(f'{folder}/{file}')

        if df_key == 'bloom':
            df = process_bloom_df(df)

        elif df_key in wide_source_settings:
            # Metadata rows are looked up by the file name up to its first '.'.
            source_name = file.split('.')[0].strip()
            file_metadata = metadata[metadata['source file'] == source_name]
            if file_metadata.empty:
                raise LookupError(
                    f"no row in metadata.csv with 'source file' == {source_name!r}"
                )

            indicator = file_metadata['indicator'].values[0]
            unit = file_metadata['unit'].values[0]

            countries, years = wide_source_settings[df_key]
            if df_key == 'qu':
                # This source labels its country column 'Land'.
                df = df.rename({'Land': 'Country'}, axis=1)

            df = process_bp_edata_qu_df(df, indicator, unit, years, countries, countries_new)

        elif df_key == 'wbank':
            df = process_wbank_df(df)

        # Files with any other key are copied through unprocessed.
        df.to_csv(f'{target_folder}/{file}')