# Generate a DataFrame for each indicator, with a homogeneous format
This **pipeline** shows how to generate a consistent DataFrame, and its .csv file, from the *source* folder. It is a generic version, which means that minor changes will be needed for each file.

## Preliminaries

In [77]:
# Imports
import pandas as pd
import numpy as np

In [78]:
# Indicator to process: base name of its CSV file inside source_data/Energy/
source_name = 'bp_wind_instcap'
# Metadata table for the indicators
df_meta = pd.read_csv('Indicators_metadata.csv')
# Raw indicator data; index_col=0 uses the first CSV column as the index
# instead of loading it as an 'Unnamed: 0' column
df = pd.read_csv(
    'source_data/Energy/' + source_name + '.csv',
    index_col=0,
)

In [79]:
# Look up the metadata of the current indicator, keyed by its source file name
df_meta_temp = df_meta.set_index('SOURCE FILE')
Units_ind = df_meta_temp.loc[source_name, 'UNIT']
Origin_ind = df_meta_temp.loc[source_name, 'SITE']
Name_ind = df_meta_temp.loc[source_name, 'INDICATOR']
Key_ind = df_meta_temp.loc[source_name, 'KEY']
Desc_ind = df_meta_temp.loc[source_name, 'DESCRIPTION']

### Preview of the DataFrame

In [80]:
# Check column names: list the column labels present in the raw file
df.columns

Index(['1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003',
       '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012',
       '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2019.1',
       '2008-2018', '2019.2'],
      dtype='object')

In [81]:
# Preview the first rows of the raw table
df.head()

Unnamed: 0_level_0,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-2018,2019.2
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,,,,,,,,,,,...,,,,,,,,,,
Canada,,,26.0,83.0,126.0,139.0,214.0,270.0,351.0,444.0,...,7801.0,9694.0,11214.0,11973.0,12403.0,12816.0,13413.0,0.046582,0.185572,0.02154
Mexico,,,2.0,2.0,2.0,17.0,17.0,18.0,18.0,18.0,...,2122.0,2569.0,3271.0,4051.0,4198.98,4875.375,6590.875,0.35187,0.473563,0.010584
US,,,1611.0,2141.0,2445.0,2377.0,3864.0,4417.0,5995.0,6456.0,...,59973.0,64232.0,72573.02,81286.0,87597.0,94417.4,103584.2,0.097088,0.143725,0.166346
Total North America,,,1639.0,2226.0,2573.0,2533.0,4095.0,4705.0,6364.0,6918.0,...,69896.0,76495.0,87058.02,97310.0,104198.98,112108.775,123588.075,0.102394,0.15262,0.19847


In [82]:
# Reset index to return DF to a tidy state IF NEEDED
# (moves the index read via index_col=0 back into a regular column)
df=df.reset_index()
# Rename the country column to 'Country' if the source uses another name, e.g.:
# df=df.rename(columns={'Country Name':'Country'})

In [83]:
# List the distinct values of the Country column to choose the ones to keep
df['Country'].unique()

array([nan, 'Canada', 'Mexico', 'US', 'Total North America', 'Argentina',
       'Brazil', 'Chile', 'Costa Rica', 'Uruguay',
       'Other S. & Cent. America', 'Total S. & Cent. America', 'Austria',
       'Belgium', 'Bulgaria', 'Denmark', 'Finland', 'France', 'Germany',
       'Greece', 'Ireland', 'Italy', 'Netherlands', 'Norway', 'Poland',
       'Portugal', 'Romania', 'Spain', 'Sweden', 'Turkey', 'Ukraine',
       'United Kingdom', 'Other Europe', 'Total Europe',
       'Russian Federation', 'Other CIS', 'Total CIS', 'Iran', 'Jordan',
       'Other Middle East', 'Total Middle East', 'Egypt', 'Morocco',
       'South Africa', 'Tunisia', 'Other Africa', 'Total Africa',
       'Australia', 'China', 'India', 'Japan', 'New Zealand', 'Pakistan',
       'Philippines', 'South Korea', 'Taiwan', 'Thailand',
       'Other Asia Pacific', 'Total Asia Pacific', 'Total World'],
      dtype=object)

In [84]:
# Keep only the countries of interest (labels as spelled in the raw file).
# .copy() makes the result an independent DataFrame: later cells assign to it
# and drop columns from it, which on a bare filtered slice triggers pandas'
# SettingWithCopyWarning.
df = df[df.Country.isin(['China', 'Germany', 'India', 'US'])].copy()

In [85]:
# Show the rows kept by the country filter
df

Unnamed: 0,Country,1995,1996,1997,1998,1999,2000,2001,2002,2003,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-2018,2019.2
3,US,,,1611.0,2141.0,2445.0,2377.0,3864.0,4417.0,5995.0,...,59973.0,64232.0,72573.02,81286.0,87597.0,94417.4,103584.2,0.097088,0.143725,0.166346
20,Germany,,,2089.0,2877.0,4435.0,6095.0,8754.0,12001.0,14381.0,...,33477.0,38614.0,44580.0,49435.0,55580.0,58843.0,60822.0,0.033632,0.09948,0.097674
54,China,,,146.0,200.0,262.0,341.0,383.0,449.0,547.0,...,76730.53,96819.13,131047.73,148516.89,164374.477,184664.877,210478.38,0.139786,0.362304,0.338007
55,India,,,940.0,992.0,1035.0,1267.0,1456.0,1702.0,2125.0,...,18420.4,22465.3,25088.19,28700.44,32848.46,35288.1,37505.18,0.062828,0.131676,0.06023


In [86]:
# Check which Time values are available (long-format sources only)
# df['Time'].unique()

In [87]:
# Check column characteristics: summary statistics for every numeric column
df.describe()

Unnamed: 0,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-2018,2019.2
count,0.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,,,1196.5,1552.5,2044.25,2520.0,3614.25,4642.25,5762.0,6659.5,...,47150.2325,55532.6075,68322.235,76984.5825,85099.98425,93303.34425,103097.44,0.083333,0.184296,0.165564
std,,,844.138417,1189.404473,1832.237862,2524.488331,3722.831572,5177.848805,6185.074616,6914.957339,...,26151.97844,32454.801776,46135.428418,52363.190384,57423.373711,65578.08241,76638.677376,0.045705,0.120132,0.123074
min,,,146.0,200.0,262.0,341.0,383.0,449.0,547.0,763.0,...,18420.4,22465.3,25088.19,28700.44,32848.46,35288.1,37505.18,0.033632,0.09948,0.06023
25%,,,741.5,794.0,841.75,1035.5,1187.75,1388.75,1730.5,2440.75,...,29712.85,34576.825,39707.0475,44251.36,49897.115,52954.275,54992.795,0.055529,0.123627,0.088313
50%,,,1275.5,1566.5,1740.0,1822.0,2660.0,3059.5,4060.0,4728.0,...,46725.0,51423.0,58576.51,65360.5,71588.5,76630.2,82203.1,0.079958,0.137701,0.13201
75%,,,1730.5,2325.0,2942.5,3306.5,5086.5,6313.0,8091.5,8946.75,...,64162.3825,72378.7825,87191.6975,98093.7225,106791.36925,116979.26925,130307.745,0.107762,0.19837,0.209261
max,,,2089.0,2877.0,4435.0,6095.0,8754.0,12001.0,14381.0,16419.0,...,76730.53,96819.13,131047.73,148516.89,164374.477,184664.877,210478.38,0.139786,0.362304,0.338007


In [88]:
# Harmonise country labels with the spelling used in the other DataFrames.
# Extend the mapping as needed, e.g. 'Deutschland': 'Germany', 'Indien': 'India'.
df['Country'] = df['Country'].replace({'US': 'United States'})

In [89]:
# Verify the country labels after the replacement
df

Unnamed: 0,Country,1995,1996,1997,1998,1999,2000,2001,2002,2003,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-2018,2019.2
3,United States,,,1611.0,2141.0,2445.0,2377.0,3864.0,4417.0,5995.0,...,59973.0,64232.0,72573.02,81286.0,87597.0,94417.4,103584.2,0.097088,0.143725,0.166346
20,Germany,,,2089.0,2877.0,4435.0,6095.0,8754.0,12001.0,14381.0,...,33477.0,38614.0,44580.0,49435.0,55580.0,58843.0,60822.0,0.033632,0.09948,0.097674
54,China,,,146.0,200.0,262.0,341.0,383.0,449.0,547.0,...,76730.53,96819.13,131047.73,148516.89,164374.477,184664.877,210478.38,0.139786,0.362304,0.338007
55,India,,,940.0,992.0,1035.0,1267.0,1456.0,1702.0,2125.0,...,18420.4,22465.3,25088.19,28700.44,32848.46,35288.1,37505.18,0.062828,0.131676,0.06023


In [90]:
# Drop the year columns before 2000.
# Columns are selected by label rather than by position, so the cell does not
# depend on the file starting in 1995 and re-running it drops nothing further
# (the positional, in-place version removed five more columns on every run).
first_year = 2000
early_years = [col for col in df.columns
               if str(col).isdigit() and int(col) < first_year]
df = df.drop(columns=early_years)

In [91]:
# Drop the trailing non-year columns.
# '2019.1' and '2019.2' are the labels pandas gives to repeated '2019' headers;
# together with '2008-2018' they hold small fractional values rather than
# capacities (presumably growth rates / shares - TODO confirm against the source).
# Dropping by label with errors='ignore' keeps the cell safe to re-run; the
# positional version removed three more year columns on every execution.
df = df.drop(columns=['2019.1', '2008-2018', '2019.2'], errors='ignore')
df

Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
3,United States,2377.0,3864.0,4417.0,5995.0,6456.0,8706.0,11329.0,16515.0,24651.0,...,39135.0,45676.0,59075.0,59973.0,64232.0,72573.02,81286.0,87597.0,94417.4,103584.2
20,Germany,6095.0,8754.0,12001.0,14381.0,16419.0,18248.0,20474.0,22116.0,22794.0,...,26903.0,28712.0,30979.0,33477.0,38614.0,44580.0,49435.0,55580.0,58843.0,60822.0
54,China,341.0,383.0,449.0,547.0,763.0,1060.0,2070.0,4200.0,8387.728,...,29633.478,46354.6,61596.873,76730.53,96819.13,131047.73,148516.89,164374.477,184664.877,210478.38
55,India,1267.0,1456.0,1702.0,2125.0,3000.0,4433.9,6315.0,7844.5,10242.5,...,13184.0,16179.0,17299.7,18420.4,22465.3,25088.19,28700.44,32848.46,35288.1,37505.18


### Set Index temporarily to ease manipulation and guarantee final homogeneity

In [92]:
# Move 'Country' into the index; the remaining columns hold the values
index = ['Country']
df=df.set_index(index)

In [93]:
# Inspect the re-indexed frame
df

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
United States,2377.0,3864.0,4417.0,5995.0,6456.0,8706.0,11329.0,16515.0,24651.0,34296.0,39135.0,45676.0,59075.0,59973.0,64232.0,72573.02,81286.0,87597.0,94417.4,103584.2
Germany,6095.0,8754.0,12001.0,14381.0,16419.0,18248.0,20474.0,22116.0,22794.0,25732.0,26903.0,28712.0,30979.0,33477.0,38614.0,44580.0,49435.0,55580.0,58843.0,60822.0
China,341.0,383.0,449.0,547.0,763.0,1060.0,2070.0,4200.0,8387.728,17599.408,29633.478,46354.6,61596.873,76730.53,96819.13,131047.73,148516.89,164374.477,184664.877,210478.38
India,1267.0,1456.0,1702.0,2125.0,3000.0,4433.9,6315.0,7844.5,10242.5,10925.0,13184.0,16179.0,17299.7,18420.4,22465.3,25088.19,28700.44,32848.46,35288.1,37505.18


In [94]:
# Drop every row (country) that contains at least one NaN.
# dropna() returns a new DataFrame, so the result has to be assigned back;
# calling it without assignment only displays a cleaned copy and leaves df as is.
df = df.dropna()
df

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
United States,2377.0,3864.0,4417.0,5995.0,6456.0,8706.0,11329.0,16515.0,24651.0,34296.0,39135.0,45676.0,59075.0,59973.0,64232.0,72573.02,81286.0,87597.0,94417.4,103584.2
Germany,6095.0,8754.0,12001.0,14381.0,16419.0,18248.0,20474.0,22116.0,22794.0,25732.0,26903.0,28712.0,30979.0,33477.0,38614.0,44580.0,49435.0,55580.0,58843.0,60822.0
China,341.0,383.0,449.0,547.0,763.0,1060.0,2070.0,4200.0,8387.728,17599.408,29633.478,46354.6,61596.873,76730.53,96819.13,131047.73,148516.89,164374.477,184664.877,210478.38
India,1267.0,1456.0,1702.0,2125.0,3000.0,4433.9,6315.0,7844.5,10242.5,10925.0,13184.0,16179.0,17299.7,18420.4,22465.3,25088.19,28700.44,32848.46,35288.1,37505.18


### Select countries and time range

In [74]:
# Countries to keep (careful with the names: they are case sensitive and must
# match the labels in the data exactly)
sel_c = ['United States', 'India', 'Germany', 'China']
# Year interval as [start, stop): the stop year itself is NOT included,
# so it has to be one more than the last wanted year
sel_y = [2000, 2017]
sel_years = list(range(*sel_y))
# Same years as strings, for frames whose column labels are text
sel_years_txt = list(map(str, sel_years))

In [76]:
# Check which Time values are available (long-format sources only)
# df['Time'].unique()

In [None]:
# Long-format sources only: turn the 'Time' column from float into int.
# Wide-format sources (one column per year) have no 'Time' column, so the
# conversion is skipped instead of raising a KeyError.
if 'Time' in df.columns:
    df['Time'] = df['Time'].astype(int)

In [None]:
# Long-format sources only: rename the 'Time' column to 'Years'.
# rename() ignores labels that are not present, so this is a no-op for
# wide-format sources; the result is reassigned instead of using inplace.
df = df.rename(columns={'Time': 'Years'})
# Show the renamed column when it exists (nothing is displayed otherwise,
# instead of raising a KeyError)
df['Years'] if 'Years' in df.columns else None

In [None]:
# Apply selection criteria (wide format: one column per year, countries in the index).
# The year column labels are strings (e.g. '2000'), so they must be selected
# with sel_years_txt; indexing with the integer list sel_years raises a KeyError.
df_p = df.loc[df.index.isin(sel_c), sel_years_txt]

In [None]:
# Long-format sources only: keep rows with 0 < Years < 2020 whose index label
# is one of the selected countries.
# Skipped when there is no 'Years' column (wide-format sources), where the
# comparison would otherwise raise a KeyError.
if 'Years' in df.columns:
    in_range = (df['Years'] > 0) & (df['Years'] < 2020)
    df_p = df[in_range]
    df_p = df_p[df_p.index.isin(sel_c)]

In [None]:
# Inspect the selection
df_p

### Reform the DF to a suitable form

In [95]:
# Reset index to return DF to a tidy state ('Country' becomes a regular column).
# NOTE(review): this starts from `df`, not `df_p`, so any df_p built by the
# selection cells above is overwritten here - confirm this is intended.
df_p=df.reset_index()

In [96]:
# Inspect the frame after the index reset
df_p

Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,United States,2377.0,3864.0,4417.0,5995.0,6456.0,8706.0,11329.0,16515.0,24651.0,...,39135.0,45676.0,59075.0,59973.0,64232.0,72573.02,81286.0,87597.0,94417.4,103584.2
1,Germany,6095.0,8754.0,12001.0,14381.0,16419.0,18248.0,20474.0,22116.0,22794.0,...,26903.0,28712.0,30979.0,33477.0,38614.0,44580.0,49435.0,55580.0,58843.0,60822.0
2,China,341.0,383.0,449.0,547.0,763.0,1060.0,2070.0,4200.0,8387.728,...,29633.478,46354.6,61596.873,76730.53,96819.13,131047.73,148516.89,164374.477,184664.877,210478.38
3,India,1267.0,1456.0,1702.0,2125.0,3000.0,4433.9,6315.0,7844.5,10242.5,...,13184.0,16179.0,17299.7,18420.4,22465.3,25088.19,28700.44,32848.46,35288.1,37505.18


In [None]:
# List the column labels before reshaping
df_p.columns

In [97]:
# Reshape from wide to long: 'Country' is kept as the identifier, every other
# column becomes one (variable, value) row per country
df_p = df_p.melt(id_vars=['Country'])

In [98]:
# Inspect the long-format result
df_p

Unnamed: 0,Country,variable,value
0,United States,2000,2377.00
1,Germany,2000,6095.00
2,China,2000,341.00
3,India,2000,1267.00
4,United States,2001,3864.00
...,...,...,...
75,India,2018,35288.10
76,United States,2019,103584.20
77,Germany,2019,60822.00
78,China,2019,210478.38


In [99]:
# Indicator name read from the metadata; used below to label the value column
Name_ind

'Renewable Energy Wind  (Installed capacity)'

In [100]:
# Give the melted columns their final labels: the former column headers become
# 'Years', the values are labelled "<indicator name> <unit>"
value_label = Name_ind + " " + Units_ind
df_p = df_p.rename(columns={'variable': 'Years', 'value': value_label})

In [101]:
df_p

Unnamed: 0,Country,Years,Renewable Energy Wind (Installed capacity) MW
0,United States,2000,2377.00
1,Germany,2000,6095.00
2,China,2000,341.00
3,India,2000,1267.00
4,United States,2001,3864.00
...,...,...,...
75,India,2018,35288.10
76,United States,2019,103584.20
77,Germany,2019,60822.00
78,China,2019,210478.38


### 2.4 Do further necessary adjustments

In [None]:
# Show column types to spot columns that need coercion
display(df_p.dtypes)

In [None]:
# Coerce column types when needed: store the years as integers.
# copy=False asks pandas not to copy the data when no conversion is necessary.
df_p['Years'] = df_p['Years'].astype(int, copy=False)

In [None]:
# Preview the first rows after the adjustments
df_p.head()

In [None]:
# Summary statistics of the numeric columns
df_p.describe()

In [102]:
# Indicator key read from the metadata; used below in the output file name
Key_ind

'c_icapw'

In [54]:
# Final look at the frame before saving.
# NOTE(review): the stored output of this cell (execution count 54) shows the
# Solar indicator, so it is stale from an earlier run - re-run before sharing.
df_p

Unnamed: 0,Country,Years,Renewable Energy Solar (Installed capacity) MW
0,United States,2000,18.500
1,Germany,2000,114.000
2,China,2000,33.515
3,India,2000,0.000
4,United States,2001,21.900
...,...,...,...
75,India,2018,27355.324
76,United States,2019,62297.900
77,Germany,2019,48962.000
78,China,2019,205493.165


In [103]:
# Save the long-format frame as result_df/prev<KEY>.csv.
# to_csv writes the row index as the first, unnamed column by default.
df_p.to_csv('result_df/'+'prev'+Key_ind+'.csv')

In [51]:
# Show the current working directory (IPython automagic for %pwd).
# NOTE(review): the stored output exposes an absolute local path; consider
# clearing it before sharing the notebook.
pwd

'/Users/paul/Desktop/TU/1datascience/groupwork/DataAnalysis/Big_DF'