# Generate a DataFrame for each indicator, in a homogeneous format
This **pipeline** shows how to generate a consistent DataFrame, and its .csv file, from the *source* folder. It is a generic version, which means that minor changes will be needed for each file.

## Preliminaries

In [None]:
# Imports
import glob
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Identify files to be preprocessed: every .csv directly inside
# source_data/Energy. The pattern has no '**', so a recursive search would not
# apply here. glob returns matches in arbitrary order, so the list is sorted to
# make every later loop over csv_list reproducible between runs.
csv_list = sorted(glob.glob('source_data/Energy/*.csv'))

In [None]:
# Read the indicator metadata table and re-index it by the 'SOURCE FILE'
# column, so that the row describing a given source file can be looked up by
# that file's name.
df_meta = pd.read_csv('Indicators_metadata.csv')
df_meta_temp = df_meta.set_index('SOURCE FILE')
# Compound dict (dict of dicts): one metadata sub-dict per source file,
# filled in by the next cell.
df_metadict = dict()

In [None]:
# Build the Dict
# Key used in each per-file sub-dict -> metadata column it is read from.
META_FIELDS = {
    'Units_ind': 'UNIT',
    'Origin_ind': 'SITE',
    'Name_ind': 'INDICATOR',
    'Key_ind': 'KEY',
    'Desc_ind': 'DESCRIPTION',
    'excep_format': 'SPECIAL FORMAT',
}
for csv_file in csv_list:
    # Reduce the path to the bare file name without directory or extension.
    # Path.stem works with whatever separator the platform's glob returned,
    # unlike stripping a hard-coded 'source_data/Energy\\' prefix.
    csv_file = Path(csv_file).stem
    # One .loc lookup per field keeps each column's own dtype. A KeyError here
    # means the file has no row in the 'SOURCE FILE' index of the metadata.
    df_metadict_sub = {
        key: df_meta_temp.loc[csv_file, column]
        for key, column in META_FIELDS.items()
    }
    df_metadict[csv_file] = df_metadict_sub

## Iteratively generate the Data Frames

In [None]:
# Load every source CSV into a dict of DataFrames, keyed by the file name
# without directory or extension. Path.stem is used so the key is the same
# whichever path separator the platform's glob returned.
# NOTE: read_csv is called without index_col on purpose; the original author
# flagged that index_col=0 could cause problems.
df_dict = {Path(csv_file).stem: pd.read_csv(csv_file) for csv_file in csv_list}

In [None]:
# Adjust the DF to homogeneity
# Each flagged frame is reshaped from wide (one column per year) to long
# (Country, Years, value), restricted to the selected countries and years, and
# stored back in df_dict under the same key.
# NOTE: because df_dict is updated in place, re-run the loading cell before
# re-running this one; melting an already-melted frame gives wrong results.
SELECTED_COUNTRIES = ['China', 'Germany', 'India', 'United States']
YEARS_INCLUDED = [2000, 2017]  # [first, last] year kept, both bounds inclusive
for csv_file in csv_list:
    # Bare file name (no directory, no extension), independent of the
    # platform's path separator.
    csv_file = Path(csv_file).stem
    # Read the flag of THIS file from the compound metadata dict.
    special_format = df_metadict[csv_file]['excep_format']
    # An empty metadata cell is read as NaN, which is truthy in Python, so it
    # is rejected explicitly.
    # NOTE(review): assumes the 'SPECIAL FORMAT' column holds booleans or
    # blanks - confirm against Indicators_metadata.csv.
    if pd.isna(special_format) or not special_format:
        continue
    # Melt the current file's frame to a long format.
    df_p = df_dict[csv_file].melt(id_vars='Country')
    # Rename Years column
    df_p = df_p.rename(columns={'variable': 'Years'})
    # Adjust column types: astype(float) fails loudly on non-numeric values,
    # while non-numeric year labels become NaN and are dropped by the range
    # filter below.
    df_p['value'] = df_p['value'].astype(float)
    df_p['Years'] = pd.to_numeric(df_p['Years'], errors='coerce')
    # Selecting rows based on time range and selected countries.
    first_year, last_year = YEARS_INCLUDED
    df_p = df_p[df_p['Years'].between(first_year, last_year)]
    df_p = df_p[df_p['Country'].isin(SELECTED_COUNTRIES)]
    # Clean the indexes: renumber rows from 0 after filtering.
    df_p = df_p.reset_index(drop=True)
    # Keep the homogenised frame for this indicator instead of discarding it.
    df_dict[csv_file] = df_p

## Print to csv

In [None]:
# Write one CSV per indicator into prov_result/, named after its source file.
# Each file gets the frame stored under its own key in df_dict, rather than a
# single leftover df_p being written under every name.
# NOTE(review): this exports whatever df_dict currently holds; confirm that the
# homogenising step stores its results back into df_dict before relying on
# these files.
for csv_file in csv_list:
    # Bare file name (no directory, no extension), independent of the
    # platform's path separator.
    csv_file = Path(csv_file).stem
    df_dict[csv_file].to_csv('prov_result/' + csv_file + '.csv')