# Generate a DataFrame for each indicator, in a homogeneous format
This **pipeline** shows how to generate a consistent dataframe, and its .csv file, from the *source* folder. It is a generic version, which means that minor changes will be needed for each file.

## Preliminaries

In [1]:
# Imports
import pandas as pd
import numpy as np

In [2]:
# Show the current working directory; the relative paths used below resolve against it
pwd

'/Users/paul/Desktop/TU/1datascience/groupwork/DataAnalysis/Big_DF'

In [3]:
# Load the indicator metadata table and the source file to be processed
source_name = 'prevf_icaps'
df_meta = pd.read_csv('Indicators_metadata.csv')
# The first CSV column is used as the row index, so no 'Unnamed: 0' column appears
source_path = 'result_df/' + source_name + '.csv'
df = pd.read_csv(source_path, index_col=0)

In [4]:
df

Unnamed: 0,Country,Years,Renewable installed PV Power GW
0,United States,2000,0.139
1,Germany,2000,0.089
2,China,2000,0.000
3,India,2000,0.000
4,United States,2001,0.168
...,...,...,...
75,India,2018,0.000
76,United States,2019,75.900
77,Germany,2019,49.200
78,China,2019,204.700


In [224]:
# Fetch the metadata of the current indicator; rows are keyed by source file name
df_meta_temp = df_meta.set_index('SOURCE FILE')
Units_ind = df_meta_temp.loc[source_name, 'UNIT']
Origin_ind = df_meta_temp.loc[source_name, 'SITE']
Name_ind = df_meta_temp.loc[source_name, 'INDICATOR']
Key_ind = df_meta_temp.loc[source_name, 'KEY']
Desc_ind = df_meta_temp.loc[source_name, 'DESCRIPTION']

### Preview of the DataFrame

In [225]:
# Check column names
df.columns

Index(['1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
       '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002',
       '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011',
       '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2019.1', '2008-18', '2019.2'],
      dtype='object')

In [226]:
df.head()

Unnamed: 0_level_0,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,,,,,,,,,,,...,,,,,,,,,,
Canada,459.0164,468.584033,496.3977,506.015765,498.092587,480.593242,506.994859,518.626812,530.232161,553.054844,...,655.681025,647.552346,659.34936,662.985032,664.525146,652.2709,660.421858,0.012496,0.002671,0.024456
Mexico,96.207532,99.589895,106.165401,112.230824,120.047161,117.591543,128.571171,132.275665,134.642519,146.127562,...,297.117849,303.315833,310.349642,320.344722,329.085355,349.298889,363.970083,0.042002,0.026352,0.013478
US,2657.150071,2676.113155,2772.205266,2914.445815,3155.449679,3232.779649,3270.724402,3284.389935,3404.677873,3458.526081,...,4330.292888,4363.326407,4348.66944,4347.910814,4302.539017,4457.415464,4401.299402,-0.012589,0.001523,0.162983
Total North America,3212.374003,3244.287083,3374.768367,3532.692403,3773.589427,3830.964434,3906.290432,3935.292412,4069.552553,4157.708487,...,5283.091761,5314.194586,5318.368443,5331.240569,5296.149518,5458.985253,5425.691344,-0.006099,0.003064,0.200917


In [227]:
# If needed: move the index back into a regular column so the DF is tidy again
df = df.reset_index(drop=False)
# If needed: rename the country column to 'Country'
# df=df.rename(columns={'Country Name':'Country'})

In [228]:
df['Country'].unique()

array([nan, 'Canada', 'Mexico', 'US', 'Total North America', 'Argentina',
       'Brazil', 'Chile', 'Colombia', 'Ecuador', 'Peru',
       'Trinidad & Tobago', 'Venezuela', 'Central America',
       'Other Caribbean', 'Other South America',
       'Total S. & Cent. America', 'Austria', 'Belgium', 'Bulgaria',
       'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia',
       'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland',
       'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg',
       'Netherlands', 'North Macedonia', 'Norway', 'Poland', 'Portugal',
       'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden',
       'Switzerland', 'Turkey', 'Ukraine', 'United Kingdom',
       'Other Europe', 'Total Europe', 'Azerbaijan', 'Belarus',
       'Kazakhstan', 'Russian Federation', 'Turkmenistan', 'Uzbekistan',
       'Other CIS', 'Total CIS', 'Iran', 'Iraq', 'Israel', 'Kuwait',
       'Oman', 'Qatar', 'Saudi Arabia', 'United Arab Emirates',
       'Other Middle Ea

In [229]:
# Keep only the rows of the countries of interest.
# The explicit .copy() makes the result an independent DataFrame, so the later
# column assignment and in-place drops do not act on a slice of the full table
# (which triggers pandas' SettingWithCopyWarning).
countries_of_interest = ['China', 'Germany', 'India', 'US']
df = df[df['Country'].isin(countries_of_interest)].copy()

In [230]:
df

Unnamed: 0,Country,1985,1986,1987,1988,1989,1990,1991,1992,1993,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
3,US,2657.150071,2676.113155,2772.205266,2914.445815,3155.449679,3232.779649,3270.724402,3284.389935,3404.677873,...,4330.292888,4363.326407,4348.66944,4347.910814,4302.539017,4457.415464,4401.299402,-0.012589,0.001523,0.162983
29,Germany,522.534,523.557,532.442,549.492,559.873,549.9,540.222,538.1657,527.118,...,638.73,627.765102,648.149281,650.707344,653.6547,643.45,612.397325,-0.04826,0.000431,0.022677
88,China,410.69,449.53,497.267,545.21,584.81,621.2,677.55,753.94,811.59,...,5431.635,5794.457,5814.573,6133.16,6604.447,7166.133,7503.428,0.047068,0.074421,0.277857
90,India,186.385834,202.677201,223.66949,241.30462,272.446431,287.762827,320.457631,337.153321,362.176251,...,1146.139133,1262.219968,1317.29813,1401.741765,1473.759284,1551.4415,1558.708662,0.004684,0.064753,0.05772


In [231]:
# Check the available time values (enable when the source has a 'Time' column)
# df['Time'].unique()

In [232]:
# Check column characteristics
df.describe()

Unnamed: 0,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,944.189976,962.969339,1006.395939,1062.613109,1143.144777,1172.910619,1202.238508,1228.412239,1276.390531,1325.690566,...,2886.699255,3011.942119,3032.172463,3133.379981,3258.6,3454.609991,3518.958347,-0.002274,0.035282,0.130309
std,1150.494878,1150.305616,1185.269011,1242.957663,1349.003375,1380.709223,1386.811126,1381.177147,1430.948825,1440.161063,...,2355.38927,2470.793139,2456.235436,2558.384884,2723.716653,2961.176155,3106.187371,0.039599,0.039811,0.115024
min,186.385834,202.677201,223.66949,241.30462,272.446431,287.762827,320.457631,337.153321,362.176251,387.966984,...,638.73,627.765102,648.149281,650.707344,653.6547,643.45,612.397325,-0.04826,0.000431,0.022677
25%,354.613958,387.8168,428.867623,469.233655,488.016358,484.365707,485.280908,487.912605,485.882563,493.343646,...,1019.28685,1103.606251,1150.010918,1213.98316,1268.733138,1324.443625,1322.130828,-0.021507,0.00125,0.048959
50%,466.612,486.5435,514.8545,547.351,572.3415,585.55,608.886,646.05285,669.354,728.1346,...,2738.21601,2812.773187,2832.983785,2874.826289,2888.14915,3004.428482,2980.004032,-0.003953,0.033138,0.110351
75%,1056.188018,1061.696039,1092.382816,1140.730454,1227.46992,1274.094912,1325.8436,1386.552484,1459.861968,1560.48152,...,4605.628416,4721.109055,4715.14533,4794.223111,4878.016013,5134.594848,5176.831552,0.01528,0.06717,0.191701
max,2657.150071,2676.113155,2772.205266,2914.445815,3155.449679,3232.779649,3270.724402,3284.389935,3404.677873,3458.526081,...,5431.635,5794.457,5814.573,6133.16,6604.447,7166.133,7503.428,0.047068,0.074421,0.277857


In [233]:
# Replace country names for consistency with other DataFrames.
# assign() builds a new DataFrame instead of writing into the filtered frame,
# which avoids pandas' SettingWithCopyWarning.
country_aliases = {
    'US': 'United States',
    # 'Deutschland': 'Germany',
    # 'Indien': 'India',
}
df = df.assign(Country=df['Country'].replace(country_aliases))

In [234]:
df

Unnamed: 0,Country,1985,1986,1987,1988,1989,1990,1991,1992,1993,...,2013,2014,2015,2016,2017,2018,2019,2019.1,2008-18,2019.2
3,United States,2657.150071,2676.113155,2772.205266,2914.445815,3155.449679,3232.779649,3270.724402,3284.389935,3404.677873,...,4330.292888,4363.326407,4348.66944,4347.910814,4302.539017,4457.415464,4401.299402,-0.012589,0.001523,0.162983
29,Germany,522.534,523.557,532.442,549.492,559.873,549.9,540.222,538.1657,527.118,...,638.73,627.765102,648.149281,650.707344,653.6547,643.45,612.397325,-0.04826,0.000431,0.022677
88,China,410.69,449.53,497.267,545.21,584.81,621.2,677.55,753.94,811.59,...,5431.635,5794.457,5814.573,6133.16,6604.447,7166.133,7503.428,0.047068,0.074421,0.277857
90,India,186.385834,202.677201,223.66949,241.30462,272.446431,287.762827,320.457631,337.153321,362.176251,...,1146.139133,1262.219968,1317.29813,1401.741765,1473.759284,1551.4415,1558.708662,0.004684,0.064753,0.05772


In [235]:
# Drop the year columns before 2000.
# Columns are selected by label instead of by position, so the cell does not
# depend on the column order and can be re-run without dropping further years.
FIRST_YEAR = 2000
cols_before_first_year = [
    col for col in df.columns
    if str(col).isdigit() and int(col) < FIRST_YEAR
]
df = df.drop(columns=cols_before_first_year)

In [236]:
# Drop everything after 2019: any later year columns and the trailing columns
# whose labels are not plain years ('2019.1', '2008-18', '2019.2').
# Columns are selected by label, so re-running the cell does not remove
# 2017-2019 the way a positional "last three columns" drop would.
LAST_YEAR = 2019
cols_after_last_year = [
    col for col in df.columns
    if col != 'Country' and not (str(col).isdigit() and int(col) <= LAST_YEAR)
]
df = df.drop(columns=cols_after_last_year)
df

Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
3,United States,4052.253106,3984.46789,4111.775538,4137.978842,4231.682747,4322.786632,4330.951641,4431.837021,4390.106117,...,4394.250985,4363.377155,4310.569202,4330.292888,4363.326407,4348.66944,4347.910814,4302.539017,4457.415464,4401.299402
29,Germany,576.556,586.4115,586.688,608.78,617.4652,622.575,639.5665,640.578,640.686,...,633.093,613.070326,630.149,638.73,627.765102,648.149281,650.707344,653.6547,643.45,612.397325
88,China,1355.6,1480.802,1654.0,1910.575,2203.31,2500.26,2865.726,3281.553,3495.76,...,4207.16,4713.019,4987.553,5431.635,5794.457,5814.573,6133.16,6604.447,7166.133,7503.428
90,India,571.393936,586.092738,609.650479,638.13101,698.300975,704.509318,744.426252,796.265175,828.413553,...,937.468038,1034.011325,1091.837883,1146.139133,1262.219968,1317.29813,1401.741765,1473.759284,1551.4415,1558.708662


### Set Index temporarily to ease manipulation and guarantee final homogeneity

In [237]:
# Use the country as a temporary index
index = ['Country']
df = df.set_index(keys=index)

In [238]:
df

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
United States,4052.253106,3984.46789,4111.775538,4137.978842,4231.682747,4322.786632,4330.951641,4431.837021,4390.106117,4206.493361,4394.250985,4363.377155,4310.569202,4330.292888,4363.326407,4348.66944,4347.910814,4302.539017,4457.415464,4401.299402
Germany,576.556,586.4115,586.688,608.78,617.4652,622.575,639.5665,640.578,640.686,595.624,633.093,613.070326,630.149,638.73,627.765102,648.149281,650.707344,653.6547,643.45,612.397325
China,1355.6,1480.802,1654.0,1910.575,2203.31,2500.26,2865.726,3281.553,3495.76,3714.651,4207.16,4713.019,4987.553,5431.635,5794.457,5814.573,6133.16,6604.447,7166.133,7503.428
India,571.393936,586.092738,609.650479,638.13101,698.300975,704.509318,744.426252,796.265175,828.413553,879.70725,937.468038,1034.011325,1091.837883,1146.139133,1262.219968,1317.29813,1401.741765,1473.759284,1551.4415,1558.708662


In [239]:
# Drop every row that contains a NaN, if any.
# dropna() returns a new DataFrame, so the result must be assigned back for
# the rows to actually be removed from df.
df = df.dropna()
df

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
United States,4052.253106,3984.46789,4111.775538,4137.978842,4231.682747,4322.786632,4330.951641,4431.837021,4390.106117,4206.493361,4394.250985,4363.377155,4310.569202,4330.292888,4363.326407,4348.66944,4347.910814,4302.539017,4457.415464,4401.299402
Germany,576.556,586.4115,586.688,608.78,617.4652,622.575,639.5665,640.578,640.686,595.624,633.093,613.070326,630.149,638.73,627.765102,648.149281,650.707344,653.6547,643.45,612.397325
China,1355.6,1480.802,1654.0,1910.575,2203.31,2500.26,2865.726,3281.553,3495.76,3714.651,4207.16,4713.019,4987.553,5431.635,5794.457,5814.573,6133.16,6604.447,7166.133,7503.428
India,571.393936,586.092738,609.650479,638.13101,698.300975,704.509318,744.426252,796.265175,828.413553,879.70725,937.468038,1034.011325,1091.837883,1146.139133,1262.219968,1317.29813,1401.741765,1473.759284,1551.4415,1558.708662


### Reform the DF to a suitable form

In [241]:
# Move 'Country' back from the index into a regular column (tidy state)
df_p = df.reset_index(drop=False)

In [242]:
df_p

Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,United States,4052.253106,3984.46789,4111.775538,4137.978842,4231.682747,4322.786632,4330.951641,4431.837021,4390.106117,...,4394.250985,4363.377155,4310.569202,4330.292888,4363.326407,4348.66944,4347.910814,4302.539017,4457.415464,4401.299402
1,Germany,576.556,586.4115,586.688,608.78,617.4652,622.575,639.5665,640.578,640.686,...,633.093,613.070326,630.149,638.73,627.765102,648.149281,650.707344,653.6547,643.45,612.397325
2,China,1355.6,1480.802,1654.0,1910.575,2203.31,2500.26,2865.726,3281.553,3495.76,...,4207.16,4713.019,4987.553,5431.635,5794.457,5814.573,6133.16,6604.447,7166.133,7503.428
3,India,571.393936,586.092738,609.650479,638.13101,698.300975,704.509318,744.426252,796.265175,828.413553,...,937.468038,1034.011325,1091.837883,1146.139133,1262.219968,1317.29813,1401.741765,1473.759284,1551.4415,1558.708662


In [None]:
df_p.columns

In [244]:
# Reshape from wide to long format: every column except 'Country' is unpivoted
# into a ('variable', 'value') pair of columns
df_p = df_p.melt(id_vars=['Country'])

In [245]:
df_p

Unnamed: 0,Country,variable,value
0,United States,2000,4052.253106
1,Germany,2000,576.556000
2,China,2000,1355.600000
3,India,2000,571.393936
4,United States,2001,3984.467890
...,...,...,...
75,India,2018,1551.441500
76,United States,2019,4401.299402
77,Germany,2019,612.397325
78,China,2019,7503.428000


In [246]:
Name_ind

'Electricity generation'

In [247]:
# Give the melted columns their final names: 'Years' and "<indicator name> <unit>"
value_label = Name_ind + " " + Units_ind
df_p = df_p.rename(columns={'variable': 'Years', 'value': value_label})

In [248]:
df_p

Unnamed: 0,Country,Years,Electricity generation TWh
0,United States,2000,4052.253106
1,Germany,2000,576.556000
2,China,2000,1355.600000
3,India,2000,571.393936
4,United States,2001,3984.467890
...,...,...,...
75,India,2018,1551.441500
76,United States,2019,4401.299402
77,Germany,2019,612.397325
78,China,2019,7503.428000


### 2.4 Do further necessary adjustments

In [None]:
# Show column types
display(df_p.dtypes)

In [None]:
# Coerce column types when needed.
# The year labels come from the column headers and are therefore strings;
# convert them to integers. Bracket access is used for the assignment because
# attribute-style assignment (df_p.Years = ...) only works for existing columns.
df_p['Years'] = df_p['Years'].astype(int)

In [None]:
df_p.head()

In [None]:
df_p.describe()

In [249]:
Key_ind

'c_egen'

In [250]:
df_p

Unnamed: 0,Country,Years,Electricity generation TWh
0,United States,2000,4052.253106
1,Germany,2000,576.556000
2,China,2000,1355.600000
3,India,2000,571.393936
4,United States,2001,3984.467890
...,...,...,...
75,India,2018,1551.441500
76,United States,2019,4401.299402
77,Germany,2019,612.397325
78,China,2019,7503.428000


In [251]:
# Save the result under result_df/ as prev<KEY>.csv (the row index is written too)
output_path = 'result_df/' + 'prev' + Key_ind + '.csv'
df_p.to_csv(output_path)

In [51]:
# Show the current working directory, i.e. where the relative 'result_df/' output path resolves
pwd

'/Users/paul/Desktop/TU/1datascience/groupwork/DataAnalysis/Big_DF'