# Generate a DataFrame for each indicator in a homogeneous format
This **pipeline** shows how to generate a consistent DataFrame, and its .csv file, from the *source* folder. It is a generic version, which means that minor changes will be needed for each source file.

## Preliminaries

In [18]:
# Imports
import pandas as pd
import numpy as np

In [19]:
# Load the indicator metadata table and the raw data of the selected source
source_name = 'bp_solar_instcap'
df_meta = pd.read_csv('Indicators_metadata.csv')
# index_col=0 takes the first CSV column as the index, so no 'Unnamed: 0' column appears
df = pd.read_csv(
    'source_data/Energy/' + source_name + '.csv',
    index_col=0,
)

In [20]:
# Look up the metadata of the selected indicator, keyed by its source file name
df_meta_temp = df_meta.set_index('SOURCE FILE')
Units_ind = df_meta_temp.loc[source_name, 'UNIT']
Origin_ind = df_meta_temp.loc[source_name, 'SITE']
Name_ind = df_meta_temp.loc[source_name, 'INDICATOR']
Key_ind = df_meta_temp.loc[source_name, 'KEY']
Desc_ind = df_meta_temp.loc[source_name, 'DESCRIPTION']

### Preview of the DataFrame

In [21]:
# Check column names (the labels are strings; the last three are not plain years)
df.columns

Index(['1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019', '2019.1', '2008-18',
       '2019.2'],
      dtype='object')

In [22]:
# Preview the first rows of the raw table
df.head()

Unnamed: 0_level_0,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,,,,,,,,,,,...,,,,,,,,,,
Canada,3.0,3.0,5.0,6.0,7.0,9.0,10.0,12.0,14.0,17.0,...,1210.0,1843.0,2517.0,2661.0,2913.0,3100.0,3310.0,0.067742,0.575015294,0.005644
Mexico,10.0,11.0,12.0,13.0,14.0,15.0,16.0,16.0,16.0,16.0,...,82.0,116.0,173.0,388.6,673.74,2555.0,4440.208,0.73785,0.632539622,0.007572
US,13.9,14.6,15.2,16.6,18.5,21.9,28.0,73.0,111.0,190.0,...,13045.0,17651.0,23442.0,34716.0,43115.0,53183.5,62297.9,0.171376,0.53072859,0.106234
Total North America,26.9,28.6,32.2,35.6,39.5,45.9,54.0,101.0,141.0,223.0,...,14337.0,19610.0,26132.0,37765.6,46701.74,58838.5,70048.108,0.190515,0.535983574,0.11945


In [23]:
# Reset index to return DF to a tidy state IF NEEDED (the index becomes a regular column)
df=df.reset_index()
# Rename the country column to 'Country' when the source uses another label
# df=df.rename(columns={'Country Name':'Country'})

In [24]:
# List the distinct labels found in the 'Country' column
df['Country'].unique()

array([nan, 'Canada', 'Mexico', 'US', 'Total North America', 'Brazil',
       'Chile', 'Honduras', 'Other S. & Cent. America',
       'Total S. & Cent. America', 'Austria', 'Belgium', 'Bulgaria',
       'Czech Republic', 'Denmark', 'France', 'Germany', 'Greece',
       'Hungary', 'Italy', 'Netherlands', 'Portugal', 'Romania',
       'Slovakia', 'Spain', 'Sweden', 'Switzerland', 'Turkey', 'Ukraine',
       'United Kingdom', 'Other Europe', 'Total Europe',
       'Russian Federation', 'Other CIS', 'Total CIS', 'Israel', 'Jordan',
       'United Arab Emirates', 'Other Middle East', 'Total Middle East',
       'Algeria', 'Egypt', 'Morocco', 'South Africa', 'Other Africa',
       'Total Africa', 'Australia', 'China', 'India', 'Japan', 'Malaysia',
       'Pakistan', 'Philippines', 'South Korea', 'Taiwan', 'Thailand',
       'Other Asia Pacific', 'Total Asia Pacific', 'Total World'],
      dtype=object)

In [25]:
# Keep only the rows of the countries of interest.
# .copy() makes the result an independent DataFrame, so the column assignment
# and in-place drops in the following cells do not act on a slice of the
# original frame (which triggers pandas' SettingWithCopyWarning).
df = df[df['Country'].isin(['China', 'Germany', 'India', 'US'])].copy()

In [26]:
# Show the filtered table
df

Unnamed: 0,Country,1996,1997,1998,1999,2000,2001,2002,2003,2004,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
3,US,13.9,14.6,15.2,16.6,18.5,21.9,28.0,73.0,111.0,...,13045.0,17651.0,23442.0,34716.0,43115.0,53183.5,62297.9,0.171376,0.53072859,0.106234
18,Germany,28.0,42.0,54.0,70.0,114.0,195.0,260.0,435.0,1105.0,...,36711.0,37900.0,39224.0,40679.0,42293.0,45181.0,48962.0,0.083686,0.221294614,0.083493
53,China,1.0,2.5,5.0,10.0,33.515,38.02,56.53,66.6,76.6,...,17758.8,28398.8,43548.8,77808.8,130822.29,175236.864,205493.165,0.172659,0.923315339,0.350419
54,India,0.0,0.0,0.0,0.0,0.0,1.1,5.5,6.7,7.9,...,926.0,3672.508,5593.484,9879.021,18151.756,27355.324,35059.884,0.281648,1.012504287,0.059786


In [27]:
# Check which Time values are available (only for sources that have a 'Time' column)
# df['Time'].unique()

In [28]:
# Check column characteristics: summary statistics of the numeric columns
df.describe()

Unnamed: 0,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,...,2012,2013,2014,2015,2016,2017,2018,2019,2019.1,2019.2
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,10.725,14.775,18.55,24.15,41.50375,64.005,87.5075,145.325,325.125,598.75,...,12172.4,17110.2,21905.577,27952.071,40770.70525,58595.5115,75239.172,87953.23725,0.177342,0.149983
std,13.141632,19.237182,24.465145,31.319163,50.236972,88.628025,116.87549,195.411555,521.680268,974.547421,...,14919.055254,14866.811172,14702.944633,17229.446615,28064.11994,49523.809119,67533.566163,79145.129025,0.081053,0.134963
min,0.0,0.0,0.0,0.0,0.0,1.1,5.5,6.7,7.9,7.8,...,565.8,926.0,3672.508,5593.484,9879.021,18151.756,27355.324,35059.884,0.083686,0.059786
25%,0.75,1.875,3.75,7.5,13.875,16.7,22.375,51.625,59.425,107.85,...,5180.55,10015.25,14156.377,18979.871,28506.75525,36257.689,40724.581,45486.471,0.149454,0.077566
50%,7.45,8.55,10.1,13.3,26.0075,29.96,42.265,69.8,93.8,165.6,...,7023.4,15401.9,23024.9,31333.0,37697.5,42704.0,49182.25,55629.95,0.172018,0.094863
75%,17.425,21.45,24.9,29.95,53.63625,77.265,107.3975,163.5,359.5,656.5,...,14015.25,22496.85,30774.1,40305.2,49961.45,65041.8225,83696.841,98096.71625,0.199906,0.16728
max,28.0,42.0,54.0,70.0,114.0,195.0,260.0,435.0,1105.0,2056.0,...,34077.0,36711.0,37900.0,43548.8,77808.8,130822.29,175236.864,205493.165,0.281648,0.350419


In [29]:
# Harmonise country names with the other DataFrames.
# Extend the mapping when a source uses other spellings, e.g.
#   'Deutschland': 'Germany', 'Indien': 'India'
country_aliases = {'US': 'United States'}
df['Country'] = df['Country'].replace(country_aliases)

In [30]:
# Show the table after the country names were replaced
df

Unnamed: 0,Country,1996,1997,1998,1999,2000,2001,2002,2003,2004,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
3,United States,13.9,14.6,15.2,16.6,18.5,21.9,28.0,73.0,111.0,...,13045.0,17651.0,23442.0,34716.0,43115.0,53183.5,62297.9,0.171376,0.53072859,0.106234
18,Germany,28.0,42.0,54.0,70.0,114.0,195.0,260.0,435.0,1105.0,...,36711.0,37900.0,39224.0,40679.0,42293.0,45181.0,48962.0,0.083686,0.221294614,0.083493
53,China,1.0,2.5,5.0,10.0,33.515,38.02,56.53,66.6,76.6,...,17758.8,28398.8,43548.8,77808.8,130822.29,175236.864,205493.165,0.172659,0.923315339,0.350419
54,India,0.0,0.0,0.0,0.0,0.0,1.1,5.5,6.7,7.9,...,926.0,3672.508,5593.484,9879.021,18151.756,27355.324,35059.884,0.281648,1.012504287,0.059786


In [31]:
# Drop the four earliest year columns.
# For this source file, positions 1-4 (right after 'Country') hold '1996'..'1999';
# adapt the labels when another source file is processed.
# Dropping by label and rebinding df (no inplace=True) keeps the cell safe to
# re-run: a second execution drops nothing instead of removing four more columns.
df = df.drop(columns=['1996', '1997', '1998', '1999'], errors='ignore')

In [17]:
# Drop the three trailing columns whose labels are not plain years.
# For this source file they are '2019.1', '2008-18' and '2019.2'; adapt the
# labels when another source file is processed.
# Dropping by label and rebinding df (no inplace=True) keeps the cell safe to
# re-run: a second execution no longer removes three genuine year columns.
df = df.drop(columns=['2019.1', '2008-18', '2019.2'], errors='ignore')
df

Unnamed: 0,2019.1,2008-18,2019.2
3,0.171376,0.53072859,0.106234
18,0.083686,0.221294614,0.083493
53,0.172659,0.923315339,0.350419
54,0.281648,1.012504287,0.059786


### Set Index temporarily to ease manipulation and guarantee final homogeneity

In [63]:
# Use 'Country' as the index for the selection steps that follow
index = ['Country']
df=df.set_index(index)

In [64]:
# Display the rows that contain no missing values.
# NOTE(review): the result is not assigned, so df itself is unchanged — confirm this is intended
df.dropna()

Unnamed: 0_level_0,Country Code,Time,Time Code,Energy intensity level of primary energy (MJ/2011 USD PPP) [6.1_PRIMARY.ENERGY.INTENSITY],Renewable electricity share of total electricity output (%) [4.1_SHARE.RE.IN.ELECTRICITY],Renewable energy share of TFEC (%) [2.1_SHARE.TOTAL.RE.IN.TFEC],Total electricity output (GWh) [4.1.1_TOTAL.ELECTRICITY.OUTPUT],Total final energy consumption (TFEC) (TJ) [1.1_TOTAL.FINAL.ENERGY.CONSUM]
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
China,CHN,2000.0,YR2000,10.23305051,16.63949819,29.73071484,1355738,30187188.73
China,CHN,2001.0,YR2001,9.833583306,18.95851917,28.45581538,1480949,30940867.04
China,CHN,2002.0,YR2002,9.621842057,17.6186279,27.09820152,1654164,32272543.97
China,CHN,2003.0,YR2003,9.994209983,15.03704033,23.94657959,1910755,35505254.39
China,CHN,2004.0,YR2004,10.35599738,16.22326642,20.24947926,2203502,41469975.08
...,...,...,...,...,...,...,...,...
United States,USA,2012.0,YR2012,5.692995246,12.00739706,8.481450408,4270884,56628804.85
United States,USA,2013.0,YR2013,5.676035631,12.63791912,8.713261573,4287114,57984073.56
United States,USA,2014.0,YR2014,5.621145713,12.95382709,8.754308954,4319156,59116385.16
United States,USA,2015.0,YR2015,5.40839254,13.22859321,8.716935867,4297048,58483061.91


### Select countries and time range

In [65]:
# Countries to keep (names are case sensitive and must match the data exactly)
sel_c = ['United States', 'India', 'Germany', 'China']
# Year interval as [first year, stop year]; the stop year itself is excluded
sel_y = [2000, 2017]
sel_years = list(range(*sel_y))
# The same years as text
sel_years_txt = list(map(str, sel_years))

In [66]:
# Check which values the 'Time' column contains
df['Time'].unique()

array([2000., 2001., 2002., 2003., 2004., 2005., 2006., 2007., 2008.,
       2009., 2010., 2011., 2012., 2013., 2014., 2015., 2016.])

In [68]:
# Turn 'Time' from float into int so the years read 2000 instead of 2000.0
df['Time'] = df['Time'].astype(int)

In [69]:
# Rename the 'Time' column to 'Years' (in place), then display the renamed column
df.rename(columns = {'Time':'Years'}, inplace = True)
df['Years']

Country
China            2000
China            2001
China            2002
China            2003
China            2004
                 ... 
United States    2012
United States    2013
United States    2014
United States    2015
United States    2016
Name: Years, Length: 68, dtype: int64

In [None]:
# Apply selection criteria (wide layout: one column per year).
# Year columns read from CSV are text labels ('2000', '2001', ...), so indexing
# with the integer list sel_years raises KeyError. Matching on the string form
# of each label works for both text and integer year columns.
year_columns = [col for col in df.columns if str(col) in sel_years_txt]
df_p = df[year_columns]
df_p = df_p[df_p.index.isin(sel_c)]

In [None]:
# Selection for the long layout (one row per country and year); this rebinds df_p.
# Keep the rows whose year lies strictly between 0 and 2020 ...
year_in_window = df['Years'].gt(0) & df['Years'].lt(2020)
df_p = df.loc[year_in_window]
# ... and whose index label is one of the selected countries
df_p = df_p.loc[df_p.index.isin(sel_c)]

In [None]:
# Inspect the selected subset
df_p

### Reform the DF to a suitable form

In [None]:
# Reset index to return DF to a tidy state (the index becomes a regular column again)
df_p=df_p.reset_index()

In [None]:
# Inspect the table after the index reset
df_p

In [None]:
# List the columns that will take part in the melt
df_p.columns

In [None]:
# Melt to a long format: one row per country and former column label.
# var_name='Years' names the label column directly, because the type-coercion
# cell further down reads df_p.Years while the separate rename step is
# commented out (without it the column would be called 'variable').
df_p = df_p.melt(id_vars='Country', var_name='Years')

In [None]:
# Inspect the melted table
df_p

In [None]:
# Preview the first rows of the melted table
df_p.head()

In [None]:
# Rename the melted columns: 'variable' to 'Years', 'value' to the indicator name plus its unit
#df_p=df_p.rename(columns={'variable':'Years'})
#df_p=df_p.rename(columns={'value':Name_ind+" "+Units_ind})

In [None]:
# Inspect the table again
df_p

### 2.4 Do further necessary adjustments

In [None]:
# Show column types to spot columns that need coercion
display(df_p.dtypes)

In [None]:
# Coerce column types when needed.
# Bracket indexing is the unambiguous way to assign a DataFrame column;
# copy=False is dropped because the converted Series is assigned back anyway.
df_p['Years'] = df_p['Years'].astype(int)

In [None]:
# Preview the first rows after the type coercion
df_p.head()

In [None]:
# Summary statistics of the numeric columns of the final table
df_p.describe()

In [None]:
# Show the indicator key read from the metadata table (used in the output file name below)
Key_ind

In [None]:
#df_p.to_csv('result_df/'+'prev'+Key_ind+'.csv')