# Generate a DataFrame for each indicator, in a homogeneous format
This **pipeline** shows how to generate a consistent DataFrame, and its .csv file, from a file in the *source* folder. It is a generic version, which means that minor changes will be needed for each file.

## Preliminaries

In [294]:
# Imports
import pandas as pd
import numpy as np

In [295]:
# Pick the indicator to process, then load the shared metadata table and the raw source table
source_name = 'bp_elec_gen_other'
df_meta = pd.read_csv('Indicators_metadata.csv')
# index_col=0 turns the first CSV column into the index, so no 'Unnamed: 0' column is created
df = pd.read_csv(f'source_data/Energy/{source_name}.csv', index_col=0)

In [296]:
# Fetch the metadata of this indicator: index the table by source file name,
# then read each field from the row matching `source_name`
df_meta_temp = df_meta.set_index('SOURCE FILE')
Units_ind = df_meta_temp.loc[source_name, 'UNIT']
Origin_ind = df_meta_temp.loc[source_name, 'SITE']
Name_ind = df_meta_temp.loc[source_name, 'INDICATOR']
Key_ind = df_meta_temp.loc[source_name, 'KEY']
Desc_ind = df_meta_temp.loc[source_name, 'DESCRIPTION']

### Preview of the DataFrame

In [297]:
# Check column names: besides the yearly columns, look for labels that are not
# plain years and will have to be dropped further down
df.columns

Index(['1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
       '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002',
       '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011',
       '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2019.1', '2008-18', '2019.2'],
      dtype='object')

In [298]:
# Preview the first rows of the raw table
df.head()

Unnamed: 0_level_0,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,,,,,,,,,,,...,,,,,,,,,,
Canada,0.1424,0.137033,0.1597,0.195765,0.102,0.152,0.152,0.158,0.165,0.165,...,3.793996,0.409049,0.457237,0.57229,0.520677,0.737882,0.670086,-0.091879,-0.150815,0.002868
Mexico,0.0,0.0,0.0,0.0,-0.00077,0.001901,0.000624,0.001008,0.00105,0.003624,...,0.767183,0.655485,0.284077,0.876164,3.506569,20.555161,21.550522,0.048424,0.120144,0.09224
US,0.343859,0.368405,0.373087,0.396913,4.926283,10.672409,13.009215,13.797283,13.932804,13.696135,...,14.305666,14.071837,14.751862,14.353595,13.688822,13.619731,13.960467,0.025018,0.010392,0.059753
Total North America,0.486259,0.505438,0.532786,0.592678,5.027513,10.82631,13.161839,13.956291,14.098854,13.864759,...,18.866844,15.136371,15.493176,15.802049,17.716068,34.912774,36.181075,0.036328,0.0441,0.154861


In [299]:
# Move 'Country' from the index back into a regular column, but only if needed.
# The guard keeps the cell idempotent: an unconditional reset_index() adds a
# spurious 'index' column every time the cell is re-run.
if 'Country' not in df.columns:
    df = df.reset_index()
# For sources that name the country column differently, rename it to 'Country', e.g.:
# df=df.rename(columns={'Country Name':'Country'})

In [300]:
# List the distinct labels in 'Country' to decide which rows to keep
df['Country'].unique()

array([nan, 'Canada', 'Mexico', 'US', 'Total North America', 'Argentina',
       'Brazil', 'Other S. & Cent. America', 'Total S. & Cent. America',
       'Germany', 'Italy', 'Netherlands', 'Poland', 'Spain', 'Turkey',
       'Ukraine', 'United Kingdom', 'Other Europe', 'Total Europe',
       'Kazakhstan', 'Russian Federation', 'Other CIS', 'Total CIS',
       'Iran', 'Saudi Arabia', 'United Arab Emirates',
       'Other Middle East', 'Total Middle East', 'Egypt', 'South Africa',
       'Other Africa', 'Total Africa', 'Australia', 'China', 'India',
       'Indonesia', 'Japan', 'Malaysia', 'South Korea', 'Taiwan',
       'Thailand', 'Vietnam', 'Other Asia Pacific', 'Total Asia Pacific',
       'Total World', 'of which: OECD', '                 Non-OECD',
       '                 European Union '], dtype=object)

In [301]:
# Keep only the rows of the countries under study.
# .copy() makes the result an independent frame, so that the column assignment
# and the column drops performed on `df` further down do not operate on a slice
# of the original table (which triggers SettingWithCopyWarning).
countries_to_keep = ['China', 'Germany', 'India', 'US']
df = df[df['Country'].isin(countries_to_keep)].copy()

In [302]:
# Inspect the frame after the country filter
df

Unnamed: 0,Country,1985,1986,1987,1988,1989,1990,1991,1992,1993,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
3,US,0.343859,0.368405,0.373087,0.396913,4.926283,10.672409,13.009215,13.797283,13.932804,...,14.305666,14.071837,14.751862,14.353595,13.688822,13.619731,13.960467,0.025018,0.010392,0.059753
11,Germany,14.388321,14.133449,13.680181,15.626727,16.067004,20.158,16.2,16.2937,15.82,...,26.18,26.913,27.135209,27.543,26.7438,26.848,25.731343,-0.041592,0.008323,0.110134
39,China,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000242,...,22.982727,26.14633,26.78359,42.1935,46.185796,49.791633,56.526133,0.135254,0.0,0.241941
40,India,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2068,0.137685,0.104105,0.174565,0.20623,0.191505,-0.071401,0.0,0.00082


In [303]:
# Optional check (disabled): list the distinct values of a 'Time' column.
# Only relevant for sources that have such a column; enable it when needed.
# df['Time'].unique()

In [304]:
# Check column characteristics: summary statistics of every numeric column
df.describe()

Unnamed: 0,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,3.683045,3.625464,3.513317,4.00591,5.248322,7.707602,7.302304,7.522746,7.438261,7.771033,...,15.867098,16.834492,17.202087,21.04855,21.698246,22.616399,24.102362,0.01182,0.004679,0.103162
std,7.138691,7.007476,6.780191,7.749471,7.577101,9.705956,8.532001,8.746101,8.623311,9.09586,...,11.707419,12.54856,12.749765,18.007211,19.600318,21.130985,24.003761,0.091631,0.005468,0.10274
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2068,0.137685,0.104105,0.174565,0.20623,0.191505,-0.071401,0.0,0.00082
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000182,0.003597,...,10.729249,10.605578,11.098318,10.791222,10.310257,10.266356,10.518226,-0.049044,0.0,0.04502
50%,0.17193,0.184203,0.186543,0.198456,2.463142,5.336204,6.504607,6.898642,6.966523,6.850466,...,18.644196,20.109084,20.767726,20.948297,20.216311,20.233866,19.845905,-0.008287,0.004162,0.084944
75%,3.854975,3.809666,3.69986,4.204367,7.711464,13.043807,13.806911,14.421387,14.404603,14.617901,...,23.782045,26.337997,26.871495,31.205625,31.604299,32.583908,33.430041,0.052577,0.00884,0.143086
max,14.388321,14.133449,13.680181,15.626727,16.067004,20.158,16.2,16.2937,15.82,17.3832,...,26.18,26.913,27.135209,42.1935,46.185796,49.791633,56.526133,0.135254,0.010392,0.241941


In [305]:
# Replace country names for consistency with other DataFrames.
# Add further source-specific spellings to the mapping as needed.
country_renames = {
    'US': 'United States',
    # 'Deutschland': 'Germany',
    # 'Indien': 'India',
}
# assign() returns a new frame instead of writing into `df`, which at this point
# is a row-filtered slice; writing into it raises SettingWithCopyWarning.
df = df.assign(Country=df['Country'].replace(country_renames))

In [306]:
# Inspect the frame after harmonizing the country names
df

Unnamed: 0,Country,1985,1986,1987,1988,1989,1990,1991,1992,1993,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
3,United States,0.343859,0.368405,0.373087,0.396913,4.926283,10.672409,13.009215,13.797283,13.932804,...,14.305666,14.071837,14.751862,14.353595,13.688822,13.619731,13.960467,0.025018,0.010392,0.059753
11,Germany,14.388321,14.133449,13.680181,15.626727,16.067004,20.158,16.2,16.2937,15.82,...,26.18,26.913,27.135209,27.543,26.7438,26.848,25.731343,-0.041592,0.008323,0.110134
39,China,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000242,...,22.982727,26.14633,26.78359,42.1935,46.185796,49.791633,56.526133,0.135254,0.0,0.241941
40,India,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2068,0.137685,0.104105,0.174565,0.20623,0.191505,-0.071401,0.0,0.00082


In [307]:
# Drop the yearly columns before 2000.
# Columns are chosen by label instead of by position so the cell is idempotent:
# the former positional, in-place drop (iloc[:, 1:16]) removed 15 more columns
# every time the cell was re-run.
FIRST_YEAR = 2000
cols_before_first_year = [
    col for col in df.columns
    if str(col).isdigit() and int(col) < FIRST_YEAR
]
df = df.drop(columns=cols_before_first_year)

In [308]:
# Drop everything after the 2019 column: later years and labels that are not a
# plain year (for this source: '2019.1', '2008-18', '2019.2').
# Selecting by label keeps the cell idempotent; the former positional, in-place
# drop (iloc[:, -3:]) removed three more year columns on every re-run.
LAST_YEAR = 2019
cols_after_last_year = [
    col for col in df.columns
    if col != 'Country' and not (str(col).isdigit() and int(col) <= LAST_YEAR)
]
df = df.drop(columns=cols_after_last_year)
df

Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
3,United States,18.030998,12.227131,13.975489,14.545181,14.750847,13.359156,13.524136,12.70203,12.281998,...,13.464264,14.801048,14.502507,14.305666,14.071837,14.751862,14.353595,13.688822,13.619731,13.960467
11,Germany,21.27425,21.2795,18.62625,20.306,21.2268,24.1468,25.4006,26.5586,24.7124,...,26.4293,25.3062,26.1096,26.18,26.913,27.135209,27.543,26.7438,26.848,25.731343
39,China,0.003687,0.002308,0.006128,0.007895,0.0,0.0,0.0,0.0,0.0,...,19.85186,21.669,20.27989,22.982727,26.14633,26.78359,42.1935,46.185796,49.791633,56.526133
40,India,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2068,0.137685,0.104105,0.174565,0.20623,0.191505


### Set Index temporarily to ease manipulation and guarantee final homogeneity

In [309]:
# Key the frame by country for the cleaning steps; the index is reset again afterwards
index = ['Country']
df = df.set_index(keys=index)

In [310]:
# Inspect the country-indexed frame
df

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
United States,18.030998,12.227131,13.975489,14.545181,14.750847,13.359156,13.524136,12.70203,12.281998,12.524095,13.464264,14.801048,14.502507,14.305666,14.071837,14.751862,14.353595,13.688822,13.619731,13.960467
Germany,21.27425,21.2795,18.62625,20.306,21.2268,24.1468,25.4006,26.5586,24.7124,21.1252,26.4293,25.3062,26.1096,26.18,26.913,27.135209,27.543,26.7438,26.848,25.731343
China,0.003687,0.002308,0.006128,0.007895,0.0,0.0,0.0,0.0,0.0,0.0,19.85186,21.669,20.27989,22.982727,26.14633,26.78359,42.1935,46.185796,49.791633,56.526133
India,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2068,0.137685,0.104105,0.174565,0.20623,0.191505


In [311]:
# Drop all rows containing NaNs, if any.
# DataFrame.dropna() returns a new frame and leaves `df` untouched, so the result
# has to be assigned back; the bare call only displayed a cleaned copy.
df = df.dropna()
df

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
United States,18.030998,12.227131,13.975489,14.545181,14.750847,13.359156,13.524136,12.70203,12.281998,12.524095,13.464264,14.801048,14.502507,14.305666,14.071837,14.751862,14.353595,13.688822,13.619731,13.960467
Germany,21.27425,21.2795,18.62625,20.306,21.2268,24.1468,25.4006,26.5586,24.7124,21.1252,26.4293,25.3062,26.1096,26.18,26.913,27.135209,27.543,26.7438,26.848,25.731343
China,0.003687,0.002308,0.006128,0.007895,0.0,0.0,0.0,0.0,0.0,0.0,19.85186,21.669,20.27989,22.982727,26.14633,26.78359,42.1935,46.185796,49.791633,56.526133
India,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2068,0.137685,0.104105,0.174565,0.20623,0.191505


### Reshape the DF into a suitable (long) form

In [312]:
# Turn the 'Country' index back into a regular column; the result is kept in a
# separate frame (df_p) that is reshaped below
df_p = df.reset_index(drop=False)

In [313]:
# Inspect the wide frame before reshaping
df_p

Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,United States,18.030998,12.227131,13.975489,14.545181,14.750847,13.359156,13.524136,12.70203,12.281998,...,13.464264,14.801048,14.502507,14.305666,14.071837,14.751862,14.353595,13.688822,13.619731,13.960467
1,Germany,21.27425,21.2795,18.62625,20.306,21.2268,24.1468,25.4006,26.5586,24.7124,...,26.4293,25.3062,26.1096,26.18,26.913,27.135209,27.543,26.7438,26.848,25.731343
2,China,0.003687,0.002308,0.006128,0.007895,0.0,0.0,0.0,0.0,0.0,...,19.85186,21.669,20.27989,22.982727,26.14633,26.78359,42.1935,46.185796,49.791633,56.526133
3,India,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2068,0.137685,0.104105,0.174565,0.20623,0.191505


In [314]:
# Confirm that only 'Country' and the yearly columns are left
df_p.columns

Index(['Country', '2000', '2001', '2002', '2003', '2004', '2005', '2006',
       '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019'],
      dtype='object')

In [315]:
# Reshape from wide (one column per year) to long: one row per country and year.
# The year labels end up in 'variable', the numbers in 'value'.
# NOTE: df_p is overwritten with its own melt, so re-running this cell on its own
# melts the already-long frame again; re-run from the reset_index cell instead.
df_p = df_p.melt(id_vars=['Country'])

In [316]:
# Inspect the long-format frame
df_p

Unnamed: 0,Country,variable,value
0,United States,2000,18.030998
1,Germany,2000,21.274250
2,China,2000,0.003687
3,India,2000,0.000000
4,United States,2001,12.227131
...,...,...,...
75,India,2018,0.206230
76,United States,2019,13.960467
77,Germany,2019,25.731343
78,China,2019,56.526133


In [317]:
# Show the indicator name read from the metadata; it is used to name the value column below
Name_ind

'Electricity generation from other'

In [318]:
# Give the melted columns their final names: the year labels become 'Years' and
# the values are labelled "<indicator name> <unit>"
final_column_names = {
    'variable': 'Years',
    'value': Name_ind + " " + Units_ind,
}
df_p = df_p.rename(columns=final_column_names)

In [319]:
# Inspect the frame after renaming
df_p

Unnamed: 0,Country,Years,Electricity generation from other TWh
0,United States,2000,18.030998
1,Germany,2000,21.274250
2,China,2000,0.003687
3,India,2000,0.000000
4,United States,2001,12.227131
...,...,...,...
75,India,2018,0.206230
76,United States,2019,13.960467
77,Germany,2019,25.731343
78,China,2019,56.526133


### Do further necessary adjustments

In [None]:
# Show column types to see which columns need coercing
display(df_p.dtypes)

In [None]:
# Coerce column types when needed.
# 'Years' holds the former column labels (strings) after the melt; convert them
# to integers. Bracket indexing is the explicit way to assign a column, and the
# former copy=False argument is dropped: a cast to a different dtype has to
# create new data anyway, and the pandas documentation announces the removal of
# the `copy` keyword.
df_p['Years'] = df_p['Years'].astype(int)

In [None]:
# Preview the first rows after the type coercion
df_p.head()

In [None]:
# Summary statistics of the numeric columns as a final sanity check
df_p.describe()

In [249]:
# Show the indicator key read from the metadata; it is part of the output file name below
Key_ind

'c_egen'

In [250]:
# Final look at the frame that is about to be exported.
# NOTE(review): the saved output of this cell has the header
# 'Electricity generation TWh' and out-of-sequence execution counts, i.e. it was
# produced by a different run than the cells above. Restart the kernel and run
# all cells before trusting the exported file.
df_p

Unnamed: 0,Country,Years,Electricity generation TWh
0,United States,2000,4052.253106
1,Germany,2000,576.556000
2,China,2000,1355.600000
3,India,2000,571.393936
4,United States,2001,3984.467890
...,...,...,...
75,India,2018,1551.441500
76,United States,2019,4401.299402
77,Germany,2019,612.397325
78,China,2019,7503.428000


In [251]:
# Write the long-format frame to result_df/prev<KEY>.csv
# (to_csv keeps the row index as the first, unnamed column by default)
output_path = 'result_df/' + 'prev' + Key_ind + '.csv'
df_p.to_csv(output_path)

In [51]:
# Print the current working directory (relies on IPython automagic for %pwd);
# the relative paths used in this notebook resolve against it.
# NOTE(review): the saved output exposes a local absolute path; clear it before sharing.
pwd

'/Users/paul/Desktop/TU/1datascience/groupwork/DataAnalysis/Big_DF'