# Generate a DataFrame for each indicator in a homogeneous format
This **pipeline** shows how to generate a consistent DataFrame, and its .csv file, from the *source* folder. It is a generic version, which means that minor changes will be needed for each file.

## Preliminaries

In [150]:
# Imports
import pandas as pd
import numpy as np

In [151]:
# Load the indicator catalogue and the raw table of the chosen source file
source_name = 'bp_renew_consum'
df_meta = pd.read_csv('Indicators_metadata.csv')
# index_col=0 takes the first CSV column as the index, so no 'Unnamed: 0' column shows up
raw_csv_path = 'source_data/Energy/' + source_name + '.csv'
df = pd.read_csv(raw_csv_path, index_col=0)

In [152]:
# Look up the metadata of this indicator, keyed by its source file name
df_meta_temp = df_meta.set_index('SOURCE FILE')
Units_ind = df_meta_temp.loc[source_name, 'UNIT']
Origin_ind = df_meta_temp.loc[source_name, 'SITE']
Name_ind = df_meta_temp.loc[source_name, 'INDICATOR']
Key_ind = df_meta_temp.loc[source_name, 'KEY']
Desc_ind = df_meta_temp.loc[source_name, 'DESCRIPTION']

### Preview of the DataFrame

In [153]:
# Check column names to see which labels are year columns and which are extras
df.columns

Index(['1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973',
       '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982',
       '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991',
       '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000',
       '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
       '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018',
       '2019', '2019.1', '2008-18', '2019.2'],
      dtype='object')

In [154]:
df.head()

Unnamed: 0_level_0,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,,,,,,,,,,,...,,,,,,,,,,
Canada,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.290927,0.3033,0.437742,0.488314,0.492008,0.503491,0.523399,0.039541,0.134528,0.018063
Mexico,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00161,0.00463,...,0.111551,0.133273,0.157067,0.170452,0.184147,0.216463,0.345292,0.595155,0.102784,0.011916
US,0.133322,0.14062,0.140736,0.15546,0.162271,0.162359,0.165844,0.18673,0.199897,0.204043,...,3.756286,4.03929,4.248825,4.807667,5.239283,5.503936,5.828555,0.058979,0.100751,0.201148
Total North America,0.133322,0.14062,0.140736,0.15546,0.162271,0.162359,0.165844,0.18673,0.201507,0.208673,...,4.158765,4.475862,4.843633,5.466434,5.915439,6.223889,6.697246,0.076055,0.103173,0.231127


In [155]:
# Move the index back into a regular column so the frame is tidy (only IF NEEDED)
df = df.reset_index()
# If the country column has a different name in the source, rename it, e.g.:
# df = df.rename(columns={'Country Name': 'Country'})

In [156]:
df['Country'].unique()

array([nan, 'Canada', 'Mexico', 'US', 'Total North America', 'Argentina',
       'Brazil', 'Chile', 'Colombia', 'Ecuador', 'Peru',
       'Trinidad & Tobago', 'Venezuela', 'Central America',
       'Other Caribbean', 'Other South America',
       'Total S. & Cent. America', 'Austria', 'Belgium', 'Bulgaria',
       'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia',
       'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland',
       'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg',
       'Netherlands', 'North Macedonia', 'Norway', 'Poland', 'Portugal',
       'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden',
       'Switzerland', 'Turkey', 'Ukraine', 'United Kingdom',
       'Other Europe', 'Total Europe', 'Azerbaijan', 'Belarus',
       'Kazakhstan', 'Russian Federation', 'Turkmenistan', 'USSR',
       'Uzbekistan', 'Other CIS', 'Total CIS', 'Iran', 'Iraq', 'Israel',
       'Kuwait', 'Oman', 'Qatar', 'Saudi Arabia', 'United Arab Emirates',
       'Other M

In [157]:
# Keep only the rows of the countries of interest.
# .copy() makes the result an independent frame: later cells assign to and drop
# columns of `df`, which on a bare boolean-filtered slice triggers pandas'
# SettingWithCopyWarning.
countries_in_source = ['China', 'Germany', 'India', 'US']  # spelled as in this source
df = df.loc[df['Country'].isin(countries_in_source)].copy()

In [158]:
df

Unnamed: 0,Country,1965,1966,1967,1968,1969,1970,1971,1972,1973,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
3,US,0.133322,0.14062,0.140736,0.15546,0.162271,0.162359,0.165844,0.18673,0.199897,...,3.756286,4.03929,4.248825,4.807667,5.239283,5.503936,5.828555,0.058979,0.100751,0.201148
29,Germany,0.0,0.0,0.0,0.0,0.0,0.0092,0.01001,0.010187,0.010531,...,1.306625,1.429576,1.659036,1.643934,1.880427,1.969729,2.117454,0.074997,0.093341,0.073075
89,China,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.795909,2.238405,2.623552,3.439668,4.614891,5.805106,6.627562,0.141678,0.333643,0.228722
91,India,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.518092,0.579871,0.597314,0.729589,0.896848,1.103764,1.207505,0.093989,0.170766,0.041672


In [159]:
# Check which time values are available (only for sources that have a 'Time' column)
# df['Time'].unique()

In [160]:
# Check column characteristics: summary statistics (count, mean, std, quartiles) per numeric column
df.describe()

Unnamed: 0,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,0.033331,0.035155,0.035184,0.038865,0.040568,0.04289,0.043964,0.049229,0.052607,0.053701,...,1.844228,2.071786,2.282182,2.655215,3.157863,3.595634,3.945269,0.092411,0.174625,0.136154
std,0.066661,0.07031,0.070368,0.07773,0.081135,0.079764,0.081391,0.091793,0.098319,0.100356,...,1.379125,1.476152,1.550409,1.823837,2.09756,2.406683,2.681897,0.035827,0.111604,0.092554
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.518092,0.579871,0.597314,0.729589,0.896848,1.103764,1.207505,0.058979,0.093341,0.041672
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.109492,1.21715,1.393605,1.415348,1.634532,1.753238,1.889967,0.070993,0.098898,0.065224
50%,0.0,0.0,0.0,0.0,0.0,0.0046,0.005005,0.005094,0.005266,0.005381,...,1.551267,1.833991,2.141294,2.541801,3.247659,3.736832,3.973004,0.084493,0.135758,0.137111
75%,0.033331,0.035155,0.035184,0.038865,0.040568,0.04749,0.048969,0.054323,0.057873,0.059083,...,2.286003,2.688627,3.02987,3.781668,4.770989,5.579228,6.028307,0.105911,0.211485,0.208041
max,0.133322,0.14062,0.140736,0.15546,0.162271,0.162359,0.165844,0.18673,0.199897,0.204043,...,3.756286,4.03929,4.248825,4.807667,5.239283,5.805106,6.627562,0.141678,0.333643,0.228722


In [161]:
# Harmonise country spellings so they match the other DataFrames
country_aliases = {'US': 'United States'}
# Further examples for other sources: 'Deutschland' -> 'Germany', 'Indien' -> 'India'
df['Country'] = df['Country'].replace(country_aliases)

In [162]:
df

Unnamed: 0,Country,1965,1966,1967,1968,1969,1970,1971,1972,1973,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
3,United States,0.133322,0.14062,0.140736,0.15546,0.162271,0.162359,0.165844,0.18673,0.199897,...,3.756286,4.03929,4.248825,4.807667,5.239283,5.503936,5.828555,0.058979,0.100751,0.201148
29,Germany,0.0,0.0,0.0,0.0,0.0,0.0092,0.01001,0.010187,0.010531,...,1.306625,1.429576,1.659036,1.643934,1.880427,1.969729,2.117454,0.074997,0.093341,0.073075
89,China,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.795909,2.238405,2.623552,3.439668,4.614891,5.805106,6.627562,0.141678,0.333643,0.228722
91,India,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.518092,0.579871,0.597314,0.729589,0.896848,1.103764,1.207505,0.093989,0.170766,0.041672


In [163]:
# Drop the year columns before 2000.
# Columns are picked by label instead of by position, so the result does not
# depend on a hard-coded offset and re-running the cell is harmless (the
# positional, in-place version dropped another 35 columns on every re-run).
first_year_kept = 2000
early_year_cols = [
    col for col in df.columns
    if str(col).isdigit() and int(col) < first_year_kept  # plain year labels only
]
df = df.drop(columns=early_year_cols)

In [164]:
# Drop the three trailing non-year columns of this source.
# NOTE(review): these look like growth-rate / share columns of the BP sheet - confirm.
# They are dropped by label with errors='ignore', so re-running the cell cannot
# remove real year columns (the positional version cut 2017-2019 on a re-run).
summary_cols = ['2019.1', '2008-18', '2019.2']
df = df.drop(columns=summary_cols, errors='ignore')
df

Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
3,United States,0.868178,0.886655,0.996221,1.058536,1.152707,1.241083,1.468457,1.679495,2.107577,...,2.756646,3.090966,3.3251,3.756286,4.03929,4.248825,4.807667,5.239283,5.503936,5.828555
29,Germany,0.152289,0.169913,0.23747,0.302467,0.395024,0.487227,0.612064,0.772951,0.806957,...,0.905881,1.112392,1.252521,1.306625,1.429576,1.659036,1.643934,1.880427,1.969729,2.117454
89,China,0.031489,0.033018,0.035471,0.041101,0.05783,0.101235,0.143089,0.19672,0.326148,...,0.768617,1.049599,1.356394,1.795909,2.238405,2.623552,3.439668,4.614891,5.805106,6.627562
91,India,0.033788,0.042583,0.042954,0.053512,0.084781,0.098864,0.14551,0.184962,0.228126,...,0.318274,0.393996,0.460994,0.518092,0.579871,0.597314,0.729589,0.896848,1.103764,1.207505


### Set Index temporarily to ease manipulation and guarantee final homogeneity

In [165]:
# Use the country name as the (temporary) row index
index = ['Country']
df = df.set_index(keys=index)

In [166]:
df

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
United States,0.868178,0.886655,0.996221,1.058536,1.152707,1.241083,1.468457,1.679495,2.107577,2.394484,2.756646,3.090966,3.3251,3.756286,4.03929,4.248825,4.807667,5.239283,5.503936,5.828555
Germany,0.152289,0.169913,0.23747,0.302467,0.395024,0.487227,0.612064,0.772951,0.806957,0.833847,0.905881,1.112392,1.252521,1.306625,1.429576,1.659036,1.643934,1.880427,1.969729,2.117454
China,0.031489,0.033018,0.035471,0.041101,0.05783,0.101235,0.143089,0.19672,0.326148,0.524563,0.768617,1.049599,1.356394,1.795909,2.238405,2.623552,3.439668,4.614891,5.805106,6.627562
India,0.033788,0.042583,0.042954,0.053512,0.084781,0.098864,0.14551,0.184962,0.228126,0.265121,0.318274,0.393996,0.460994,0.518092,0.579871,0.597314,0.729589,0.896848,1.103764,1.207505


In [167]:
# Drop all rows containing NaNs, if any.
# dropna() returns a new frame; it must be assigned back, otherwise `df` keeps its NaN rows.
df = df.dropna()
df

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
United States,0.868178,0.886655,0.996221,1.058536,1.152707,1.241083,1.468457,1.679495,2.107577,2.394484,2.756646,3.090966,3.3251,3.756286,4.03929,4.248825,4.807667,5.239283,5.503936,5.828555
Germany,0.152289,0.169913,0.23747,0.302467,0.395024,0.487227,0.612064,0.772951,0.806957,0.833847,0.905881,1.112392,1.252521,1.306625,1.429576,1.659036,1.643934,1.880427,1.969729,2.117454
China,0.031489,0.033018,0.035471,0.041101,0.05783,0.101235,0.143089,0.19672,0.326148,0.524563,0.768617,1.049599,1.356394,1.795909,2.238405,2.623552,3.439668,4.614891,5.805106,6.627562
India,0.033788,0.042583,0.042954,0.053512,0.084781,0.098864,0.14551,0.184962,0.228126,0.265121,0.318274,0.393996,0.460994,0.518092,0.579871,0.597314,0.729589,0.896848,1.103764,1.207505


### Select countries and time range

In [74]:
# Countries to keep (names must match exactly, including upper/lower case)
sel_c = ['United States', 'India', 'Germany','China']
# Year interval as [first, stop]; range() excludes the stop year itself
sel_y = [2000,2017]
first_year, stop_year = sel_y
sel_years = list(range(first_year, stop_year))
# Same years as text, for frames whose year columns are labelled with strings
sel_years_txt = list(map(str, sel_years))

In [76]:
# Check which time values are available (only for sources that have a 'Time' column)
# df['Time'].unique()

In [None]:
# Turn the 'Time' column from float into int.
# Guarded: a frame with one column per year has no 'Time' column, and indexing
# it unconditionally raises KeyError and stops a Run All.
if 'Time' in df.columns:
    df['Time'] = df['Time'].astype(int)

In [None]:
# Rename the time column to 'Years'; rename() leaves the frame unchanged when
# there is no 'Time' column.
df = df.rename(columns={'Time': 'Years'})
# .get() shows the column when present and returns None (instead of raising
# KeyError) when the frame has no 'Years' column.
df.get('Years')

In [None]:
# Apply selection criteria (wide layout: one column per year).
# The year columns of this frame are labelled with strings ('2000', ...), so
# indexing with the integer list sel_years raises KeyError. Match on the text
# form instead; comparing labels as str also covers integer-labelled sources.
year_cols = df.columns[df.columns.astype(str).isin(sel_years_txt)]
df_p = df.loc[df.index.isin(sel_c), year_cols]

In [None]:
# Long-layout alternative: select rows through the 'Years' column.
# Guarded so that a frame without a 'Years' column skips this step instead of
# raising KeyError.
if 'Years' in df.columns:
    in_time_range = (df['Years'] > 0) & (df['Years'] < 2020)
    df_p = df.loc[in_time_range]
    df_p = df_p.loc[df_p.index.isin(sel_c)]

In [None]:
df_p

### Reform the DF to a suitable form

In [168]:
# Bring the index back into an ordinary column so the frame is tidy again
df_p = df.reset_index(drop=False)

In [169]:
df_p

Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,United States,0.868178,0.886655,0.996221,1.058536,1.152707,1.241083,1.468457,1.679495,2.107577,...,2.756646,3.090966,3.3251,3.756286,4.03929,4.248825,4.807667,5.239283,5.503936,5.828555
1,Germany,0.152289,0.169913,0.23747,0.302467,0.395024,0.487227,0.612064,0.772951,0.806957,...,0.905881,1.112392,1.252521,1.306625,1.429576,1.659036,1.643934,1.880427,1.969729,2.117454
2,China,0.031489,0.033018,0.035471,0.041101,0.05783,0.101235,0.143089,0.19672,0.326148,...,0.768617,1.049599,1.356394,1.795909,2.238405,2.623552,3.439668,4.614891,5.805106,6.627562
3,India,0.033788,0.042583,0.042954,0.053512,0.084781,0.098864,0.14551,0.184962,0.228126,...,0.318274,0.393996,0.460994,0.518092,0.579871,0.597314,0.729589,0.896848,1.103764,1.207505


In [None]:
df_p.columns

In [170]:
# Reshape from wide (one column per year) to long: one row per country and year
df_p = pd.melt(df_p, id_vars='Country')

In [171]:
df_p

Unnamed: 0,Country,variable,value
0,United States,2000,0.868178
1,Germany,2000,0.152289
2,China,2000,0.031489
3,India,2000,0.033788
4,United States,2001,0.886655
...,...,...,...
75,India,2018,1.103764
76,United States,2019,5.828555
77,Germany,2019,2.117454
78,China,2019,6.627562


In [172]:
Name_ind

'Renewables:Consumption'

In [173]:
# Give the melted columns their final names: the year column and "<indicator> <unit>"
value_label = Name_ind + " " + Units_ind
df_p = df_p.rename(columns={'variable': 'Years', 'value': value_label})

In [174]:
df_p

Unnamed: 0,Country,Years,Renewables:Consumption EJ
0,United States,2000,0.868178
1,Germany,2000,0.152289
2,China,2000,0.031489
3,India,2000,0.033788
4,United States,2001,0.886655
...,...,...,...
75,India,2018,1.103764
76,United States,2019,5.828555
77,Germany,2019,2.117454
78,China,2019,6.627562


### Make further necessary adjustments

In [None]:
# Show column types
display(df_p.dtypes)

In [None]:
# Coerce column types when needed: make 'Years' an integer column.
# Bracket assignment is used because attribute-style assignment only works for a
# column that already exists; copy=False is dropped because converting the year
# labels to int has to build a new array anyway.
df_p['Years'] = df_p['Years'].astype(int)

In [None]:
df_p.head()

In [None]:
df_p.describe()

In [175]:
Key_ind

'c_r-con'

In [144]:
df_p

Unnamed: 0,Country,Years,Primary Energy: Consumption EJ
0,United States,2000,95.138792
1,Germany,2000,14.307412
2,China,2000,42.453161
3,India,2000,13.341558
4,United States,2001,92.894525
...,...,...,...
75,India,2018,33.301794
76,United States,2019,94.648804
77,Germany,2019,13.139900
78,China,2019,141.699218


In [176]:
# Write the long-format frame to result_df/, named 'prev' + the indicator key
output_csv = 'result_df/prev' + Key_ind + '.csv'
df_p.to_csv(output_csv)

In [51]:
pwd

'/Users/paul/Desktop/TU/1datascience/groupwork/DataAnalysis/Big_DF'