# Generate a DataFrame for each indicator, with a homogeneous format
This **pipeline** shows how to generate a consistent DataFrame, and its .csv file, from the *source* folder. It is a generic version, which means that minor changes will be needed for each file.

## Preliminaries

In [165]:
# Imports
import pandas as pd
import numpy as np

In [166]:
# Load the indicator metadata table and the raw source file of one indicator
df_meta = pd.read_csv('Indicators_metadata.csv')
source_name = 'bp_solar_instcap'
source_path = f'source_data/Energy/{source_name}.csv'
# index_col=0 uses the first CSV column as the index, so no 'Unnamed: 0' column appears
df = pd.read_csv(source_path, index_col=0)

In [167]:
# Look up the metadata fields of the current indicator, keyed by its source file name
df_meta_temp = df_meta.set_index('SOURCE FILE')
Units_ind = df_meta_temp.loc[source_name, 'UNIT']
Origin_ind = df_meta_temp.loc[source_name, 'SITE']
Name_ind = df_meta_temp.loc[source_name, 'INDICATOR']
Key_ind = df_meta_temp.loc[source_name, 'KEY']
Desc_ind = df_meta_temp.loc[source_name, 'DESCRIPTION']

### Preview of the DataFrame

In [168]:
# List the column labels of the loaded frame to spot columns that are not plain years
df.columns

Index(['1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019', '2019.1', '2008-18',
       '2019.2'],
      dtype='object')

In [169]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0_level_0,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,,,,,,,,,,,...,,,,,,,,,,
Canada,3.0,3.0,5.0,6.0,7.0,9.0,10.0,12.0,14.0,17.0,...,1210.0,1843.0,2517.0,2661.0,2913.0,3100.0,3310.0,0.067742,0.575015294,0.005644
Mexico,10.0,11.0,12.0,13.0,14.0,15.0,16.0,16.0,16.0,16.0,...,82.0,116.0,173.0,388.6,673.74,2555.0,4440.208,0.73785,0.632539622,0.007572
US,13.9,14.6,15.2,16.6,18.5,21.9,28.0,73.0,111.0,190.0,...,13045.0,17651.0,23442.0,34716.0,43115.0,53183.5,62297.9,0.171376,0.53072859,0.106234
Total North America,26.9,28.6,32.2,35.6,39.5,45.9,54.0,101.0,141.0,223.0,...,14337.0,19610.0,26132.0,37765.6,46701.74,58838.5,70048.108,0.190515,0.535983574,0.11945


In [143]:
# Move the index back into a regular column (only if needed), then
# standardise the label of the country column to 'Country'
df = df.reset_index().rename(columns={'Country Name': 'Country'})

In [144]:
# List the distinct values of the Country column to find rows that must be filtered out
df['Country'].unique()

array(['China', 'Germany', 'India', 'United States', nan,
       'Data from database: Sustainable Energy for All',
       'Last Updated: 06/30/2018'], dtype=object)

In [145]:
# Keep only the rows that belong to the selected countries.
# .copy() returns an independent frame, so the column assignments made in
# later cells (e.g. df['Country'] = ...) do not raise SettingWithCopyWarning.
df = df[df['Country'].isin(['China', 'Germany', 'India', 'United States'])].copy()

In [146]:
# Display the frame after the country filter
df

Unnamed: 0,Country,Country Code,Time,Time Code,Energy intensity level of primary energy (MJ/2011 USD PPP) [6.1_PRIMARY.ENERGY.INTENSITY],Renewable electricity share of total electricity output (%) [4.1_SHARE.RE.IN.ELECTRICITY],Renewable energy share of TFEC (%) [2.1_SHARE.TOTAL.RE.IN.TFEC],Total electricity output (GWh) [4.1.1_TOTAL.ELECTRICITY.OUTPUT],Total final energy consumption (TFEC) (TJ) [1.1_TOTAL.FINAL.ENERGY.CONSUM]
0,China,CHN,2000.0,YR2000,10.23305051,16.63949819,29.73071484,1355738,30187188.73
1,China,CHN,2001.0,YR2001,9.833583306,18.95851917,28.45581538,1480949,30940867.04
2,China,CHN,2002.0,YR2002,9.621842057,17.6186279,27.09820152,1654164,32272543.97
3,China,CHN,2003.0,YR2003,9.994209983,15.03704033,23.94657959,1910755,35505254.39
4,China,CHN,2004.0,YR2004,10.35599738,16.22326642,20.24947926,2203502,41469975.08
...,...,...,...,...,...,...,...,...,...
63,United States,USA,2012.0,YR2012,5.692995246,12.00739706,8.481450408,4270884,56628804.85
64,United States,USA,2013.0,YR2013,5.676035631,12.63791912,8.713261573,4287114,57984073.56
65,United States,USA,2014.0,YR2014,5.621145713,12.95382709,8.754308954,4319156,59116385.16
66,United States,USA,2015.0,YR2015,5.40839254,13.22859321,8.716935867,4297048,58483061.91


In [147]:
# Check which Time (year) values are available
df['Time'].unique()

array([2000., 2001., 2002., 2003., 2004., 2005., 2006., 2007., 2008.,
       2009., 2010., 2011., 2012., 2013., 2014., 2015., 2016.])

In [148]:
# Check column characteristics: summary statistics of the numeric columns
df.describe()

Unnamed: 0,Time
count,68.0
mean,2008.0
std,4.935404
min,2000.0
25%,2004.0
50%,2008.0
75%,2012.0
max,2016.0


In [149]:
# Map alternative country spellings to the names used in the other DataFrames
country_aliases = {
    'USA': 'United States',
    'Deutschland': 'Germany',
    'Indien': 'India',
}
df['Country'] = df['Country'].replace(country_aliases)

In [150]:
# Display the frame after harmonising the country names
df

Unnamed: 0,Country,Country Code,Time,Time Code,Energy intensity level of primary energy (MJ/2011 USD PPP) [6.1_PRIMARY.ENERGY.INTENSITY],Renewable electricity share of total electricity output (%) [4.1_SHARE.RE.IN.ELECTRICITY],Renewable energy share of TFEC (%) [2.1_SHARE.TOTAL.RE.IN.TFEC],Total electricity output (GWh) [4.1.1_TOTAL.ELECTRICITY.OUTPUT],Total final energy consumption (TFEC) (TJ) [1.1_TOTAL.FINAL.ENERGY.CONSUM]
0,China,CHN,2000.0,YR2000,10.23305051,16.63949819,29.73071484,1355738,30187188.73
1,China,CHN,2001.0,YR2001,9.833583306,18.95851917,28.45581538,1480949,30940867.04
2,China,CHN,2002.0,YR2002,9.621842057,17.6186279,27.09820152,1654164,32272543.97
3,China,CHN,2003.0,YR2003,9.994209983,15.03704033,23.94657959,1910755,35505254.39
4,China,CHN,2004.0,YR2004,10.35599738,16.22326642,20.24947926,2203502,41469975.08
...,...,...,...,...,...,...,...,...,...
63,United States,USA,2012.0,YR2012,5.692995246,12.00739706,8.481450408,4270884,56628804.85
64,United States,USA,2013.0,YR2013,5.676035631,12.63791912,8.713261573,4287114,57984073.56
65,United States,USA,2014.0,YR2014,5.621145713,12.95382709,8.754308954,4319156,59116385.16
66,United States,USA,2015.0,YR2015,5.40839254,13.22859321,8.716935867,4297048,58483061.91


### Set Index temporarily to ease manipulation and guarantee final homogeneity

In [151]:
# Use the country name as the row index for the selection steps that follow
index = ['Country']
df=df.set_index(index)

In [152]:
# Preview the frame without rows containing missing values.
# NOTE(review): the result is only displayed, not assigned, so df itself is unchanged —
# confirm whether `df = df.dropna()` was intended.
df.dropna()

Unnamed: 0_level_0,Country Code,Time,Time Code,Energy intensity level of primary energy (MJ/2011 USD PPP) [6.1_PRIMARY.ENERGY.INTENSITY],Renewable electricity share of total electricity output (%) [4.1_SHARE.RE.IN.ELECTRICITY],Renewable energy share of TFEC (%) [2.1_SHARE.TOTAL.RE.IN.TFEC],Total electricity output (GWh) [4.1.1_TOTAL.ELECTRICITY.OUTPUT],Total final energy consumption (TFEC) (TJ) [1.1_TOTAL.FINAL.ENERGY.CONSUM]
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
China,CHN,2000.0,YR2000,10.23305051,16.63949819,29.73071484,1355738,30187188.73
China,CHN,2001.0,YR2001,9.833583306,18.95851917,28.45581538,1480949,30940867.04
China,CHN,2002.0,YR2002,9.621842057,17.6186279,27.09820152,1654164,32272543.97
China,CHN,2003.0,YR2003,9.994209983,15.03704033,23.94657959,1910755,35505254.39
China,CHN,2004.0,YR2004,10.35599738,16.22326642,20.24947926,2203502,41469975.08
...,...,...,...,...,...,...,...,...
United States,USA,2012.0,YR2012,5.692995246,12.00739706,8.481450408,4270884,56628804.85
United States,USA,2013.0,YR2013,5.676035631,12.63791912,8.713261573,4287114,57984073.56
United States,USA,2014.0,YR2014,5.621145713,12.95382709,8.754308954,4319156,59116385.16
United States,USA,2015.0,YR2015,5.40839254,13.22859321,8.716935867,4297048,58483061.91


### Select countries and time range

In [153]:
# Countries to keep (careful with the names: matching is case sensitive)
sel_c = ['United States', 'India', 'Germany','China']
# Year interval to keep, given as [first year, last year + 1]
sel_y = [2000,2017]
# Expand the interval into the individual years, as integers and as strings
sel_years = list(range(*sel_y))
sel_years_txt = list(map(str, sel_years))

In [154]:
# Check which Time values are available (stored as floats at this point)
df['Time'].unique()

array([2000., 2001., 2002., 2003., 2004., 2005., 2006., 2007., 2008.,
       2009., 2010., 2011., 2012., 2013., 2014., 2015., 2016.])

In [155]:
# Cast the Time column from float to int; all other columns keep their dtype
df = df.astype({'Time': int})

In [156]:
# Relabel the Time column as 'Years' and show it
df = df.rename(columns={'Time': 'Years'})
df['Years']

Country
China            2000
China            2001
China            2002
China            2003
China            2004
                 ... 
United States    2012
United States    2013
United States    2014
United States    2015
United States    2016
Name: Years, Length: 68, dtype: int64

In [157]:
# Apply selection criteria
# Alternative kept for reference; presumably for files whose columns are the years — confirm before enabling:
# df_p = df[sel_years]
# df_p = df_p[df_p.index.isin(sel_c)]

In [158]:
# Apply the configured selection instead of hard-coded year bounds:
# keep the rows whose year is in sel_years and whose country (the index) is in sel_c.
year_mask = df['Years'].isin(sel_years)
country_mask = df.index.isin(sel_c)
df_p = df[year_mask & country_mask]

In [159]:
# Display the selected subset
df_p

Unnamed: 0_level_0,Country Code,Years,Time Code,Energy intensity level of primary energy (MJ/2011 USD PPP) [6.1_PRIMARY.ENERGY.INTENSITY],Renewable electricity share of total electricity output (%) [4.1_SHARE.RE.IN.ELECTRICITY],Renewable energy share of TFEC (%) [2.1_SHARE.TOTAL.RE.IN.TFEC],Total electricity output (GWh) [4.1.1_TOTAL.ELECTRICITY.OUTPUT],Total final energy consumption (TFEC) (TJ) [1.1_TOTAL.FINAL.ENERGY.CONSUM]
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
China,CHN,2000,YR2000,10.23305051,16.63949819,29.73071484,1355738,30187188.73
China,CHN,2001,YR2001,9.833583306,18.95851917,28.45581538,1480949,30940867.04
China,CHN,2002,YR2002,9.621842057,17.6186279,27.09820152,1654164,32272543.97
China,CHN,2003,YR2003,9.994209983,15.03704033,23.94657959,1910755,35505254.39
China,CHN,2004,YR2004,10.35599738,16.22326642,20.24947926,2203502,41469975.08
...,...,...,...,...,...,...,...,...
United States,USA,2012,YR2012,5.692995246,12.00739706,8.481450408,4270884,56628804.85
United States,USA,2013,YR2013,5.676035631,12.63791912,8.713261573,4287114,57984073.56
United States,USA,2014,YR2014,5.621145713,12.95382709,8.754308954,4319156,59116385.16
United States,USA,2015,YR2015,5.40839254,13.22859321,8.716935867,4297048,58483061.91


### Reform the DF to a suitable form

In [160]:
# Move the index back into a regular column so the frame is tidy again
df_p=df_p.reset_index()

In [161]:
# List the current columns to decide which ones to drop
df_p.columns

Index(['Country', 'Country Code', 'Years', 'Time Code',
       'Energy intensity level of primary energy (MJ/2011 USD PPP) [6.1_PRIMARY.ENERGY.INTENSITY]',
       'Renewable electricity share of total electricity output (%) [4.1_SHARE.RE.IN.ELECTRICITY]',
       'Renewable energy share of TFEC (%) [2.1_SHARE.TOTAL.RE.IN.TFEC]',
       'Total electricity output (GWh) [4.1.1_TOTAL.ELECTRICITY.OUTPUT]',
       'Total final energy consumption (TFEC) (TJ) [1.1_TOTAL.FINAL.ENERGY.CONSUM]'],
      dtype='object')

In [162]:
# Columns that are not part of the exported indicator table
unused_columns = [
    'Country Code',
    'Time Code',
    'Renewable electricity share of total electricity output (%) [4.1_SHARE.RE.IN.ELECTRICITY]',
    'Renewable energy share of TFEC (%) [2.1_SHARE.TOTAL.RE.IN.TFEC]',
    'Total electricity output (GWh) [4.1.1_TOTAL.ELECTRICITY.OUTPUT]',
    'Total final energy consumption (TFEC) (TJ) [1.1_TOTAL.FINAL.ENERGY.CONSUM]',
]
df_p = df_p.drop(columns=unused_columns)

In [163]:
# Display the frame after dropping the unused columns
df_p

Unnamed: 0,Country,Years,Energy intensity level of primary energy (MJ/2011 USD PPP) [6.1_PRIMARY.ENERGY.INTENSITY]
0,China,2000,10.23305051
1,China,2001,9.833583306
2,China,2002,9.621842057
3,China,2003,9.994209983
4,China,2004,10.35599738
...,...,...,...
63,United States,2012,5.692995246
64,United States,2013,5.676035631
65,United States,2014,5.621145713
66,United States,2015,5.40839254


### 2.4 Do further necessary adjustments

In [None]:
# Show column types before exporting.
# NOTE(review): df.describe() earlier in the notebook listed only 'Time', so the indicator
# column may not be numeric — confirm here and convert it if needed.
display(df_p.dtypes)

In [164]:
# Write the processed frame to disk. With the default settings the row index is written
# as the first CSV column, so read the file back with index_col=0.
# NOTE(review): the output name is hard-coded and does not match source_name — confirm
# it is updated for each indicator.
df_p.to_csv('result_df/preva_inten.csv')