# Generate a DataFrame for each indicator, with a homogeneous format
This **pipeline** shows how to generate a consistent dataframe, and its .csv file, from the *source* folder. It is a generic version, which means that minor changes will be needed for each file.

## Preliminaries

In [1]:
# Imports
import glob
import os

import numpy as np
import pandas as pd

In [2]:
# Read the indicator metadata table, and keep a second copy of it indexed by
# its 'KEY' column so that a single indicator can be looked up by key.
df_meta = pd.read_csv(filepath_or_buffer='Indicators_metadata.csv')
df_meta_temp = df_meta.set_index(keys='KEY')

In [3]:
# Identify files to be preprocessed
# glob returns paths using the separator of the host OS, so the bare file names
# are extracted with os.path instead of stripping a hard-coded
# 'source_data/Energy\\' prefix, which only matched on Windows.
# `recursive=True` was dropped: it has no effect on a pattern without '**'.
csv_path_list = glob.glob('source_data/Energy/*.csv')
csv_list = [os.path.splitext(os.path.basename(csv_path))[0] for csv_path in csv_path_list]

In [4]:
# Build the Dict
# One entry per indicator KEY whose 'SOURCE FILE' is among the discovered csv
# files; each entry is a small dict with the metadata fields used later on.
df_metadict = {}
for key in df_meta['KEY']:
    source_name = df_meta_temp['SOURCE FILE'][key]
    if source_name not in csv_list:
        continue
    df_metadict[key] = {
        'Units_ind': df_meta_temp['UNIT'][key],
        'Origin_ind': df_meta_temp['SITE'][key],
        'Name_ind': df_meta_temp['INDICATOR'][key],
        'Desc_ind': df_meta_temp['DESCRIPTION'][key],
        # File name as it exists on disk (the metadata stores it without extension)
        'source_file': source_name + '.csv',
        'excep_format': df_meta_temp['SPECIAL FORMAT'][key],
    }

In [5]:
# Indicator keys collected in df_metadict, in insertion order
indicators = list(df_metadict)

## Iteratively generate the Data Frames and print to *.csv*

In [6]:
# Load the source table of every indicator into a dict keyed by indicator.
# The path is assembled with os.path.join so that it resolves on any OS; the
# previous hard-coded 'source_data/Energy\\' prefix only worked on Windows.
df_dict = {
    indic: pd.read_csv(os.path.join('source_data', 'Energy', df_metadict[indic]['source_file']))
    for indic in indicators
}
print(df_dict.keys())

dict_keys(['ELECTP_A', 'ENECON_A', 'EINT_A', 'RELECTP_A', 'RENECON_A', 'RELECTP_B', 'WPELECTP_B', 'ELECTP_B', 'ELECTC_B', 'IGEO_C', 'ISOL_C', 'IWIN_C', 'PENERC_C', 'RPENEC_C', 'ELECTP_C', 'ELECTP2_C', 'ASST_D', 'SECSER_E', 'SECRES_E', 'SECTRA_E', 'SECIND_E', 'EBALAN_E', 'ISOL_F', 'IWIN_F'])


In [8]:
# Display summary statistics (pandas `describe`) of the table loaded for 'ELECTP2_C'
df_dict['ELECTP2_C'].describe()

Unnamed: 0,2008-18,2019.2
count,47.0,47.0
mean,0.023751,0.07984
std,0.190674,0.216492
min,-1.0,-0.535
25%,0.0,0.0
50%,0.002835,0.008042
75%,0.052017,0.075996
max,0.498208,1.0


In [9]:
# Adjust the DF to homogeneity
SELECTED_COUNTRIES = ['China', 'Germany', 'India', 'United States']
YEARS_INCLUDED = [2000,2017]  # inclusive bounds: [first year, last year]


def _to_long_format(df_wide, value_name, countries, year_range):
    """Return `df_wide` reshaped to long format and filtered.

    df_wide    -- table with a 'Country' column; every other column is melted
                  and its header becomes the 'Years' value.
    value_name -- name given to the column that holds the melted values.
    countries  -- rows whose 'Country' is not in this collection are dropped.
    year_range -- (first, last) years to keep, both inclusive.

    The result has the columns 'Country', 'Years' and `value_name`, and a
    fresh 0..n-1 index.
    """
    # Melt to a long format and name the former column headers 'Years'
    df_long = df_wide.melt(id_vars='Country').rename(columns={'variable': 'Years'})
    # Convert the values to numbers only when all of them parse; otherwise the
    # column is left untouched. This is what errors='ignore' used to do, spelled
    # out explicitly because that option is deprecated in recent pandas releases.
    try:
        df_long['value'] = pd.to_numeric(df_long['value'])
    except (ValueError, TypeError):
        pass
    # Headers that are not numbers become NaN and are removed by the range filter
    df_long['Years'] = pd.to_numeric(df_long['Years'], errors='coerce')
    df_long = df_long.rename(columns={'value': value_name})
    # Select rows based on the time range and the selected countries
    first_year, last_year = year_range
    keep = (
        (df_long['Years'] >= first_year)
        & (df_long['Years'] <= last_year)
        & df_long['Country'].isin(countries)
    )
    # Drop the gaps left in the index by the filtering
    return df_long[keep].reset_index(drop=True)


# to_csv does not create missing folders, so make sure the output one exists
os.makedirs('prov_result', exist_ok=True)
for indic in indicators:
    # NOTE(review): a blank 'SPECIAL FORMAT' cell is read by pandas as NaN, which
    # is truthy, so such indicators are skipped here - confirm this is intended.
    if df_metadict[indic]['excep_format']:
        continue
    df_p = _to_long_format(
        df_dict[indic],
        df_metadict[indic]['Name_ind'],
        SELECTED_COUNTRIES,
        YEARS_INCLUDED,
    )
    df_p.to_csv(os.path.join('prov_result', indic + '.csv'))