# Generate a DataFrame for each indicator in a homogeneous format
This **pipeline** shows how to generate a consistent DataFrame, and its .csv file, from the *source* folder. It is a generic version, which means that minor changes will be needed for each file.

## Preliminaries

In [1]:
# Imports
import os

import numpy as np
import pandas as pd

In [2]:
# Load the indicator catalogue and the raw table of the chosen source file
source_name = 'edata_elec_prod'
df_meta = pd.read_csv('Indicators_metadata.csv')

# index_col=0 uses the first CSV column as the row index, which avoids an 'Unnamed: 0' column
source_path = f'source_data/Energy/{source_name}.csv'
df = pd.read_csv(source_path, index_col=0)

In [3]:
# Look up the metadata of this indicator, keyed by the name of its source file
df_meta_temp = df_meta.set_index('SOURCE FILE')
Units_ind, Origin_ind, Name_ind, Key_ind, Desc_ind = (
    df_meta_temp.loc[source_name, field]
    for field in ('UNIT', 'SITE', 'INDICATOR', 'KEY', 'DESCRIPTION')
)

### Preview of the DataFrame

In [4]:
# Check column names: lists the column labels of the raw table so the year columns
# and any extra (non-year) columns can be identified before reshaping
df.columns

Index(['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998',
       '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
       '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', '2016 - 2017 (%) ', '2000 - 2017 (%/year) '],
      dtype='object')

In [5]:
# Preview the first rows of the raw table as loaded (the first CSV column is the row index)
df.head()

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2010,2011,2012,2013,2014,2015,2016,2017,2016 - 2017 (%),2000 - 2017 (%/year)
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
World,11894,12173,12284,12574,12881,13327,13754,14045,14414,14823,...,21557,22253,22750,23424,23910,24317,24918,25592,2.7,3.0
OECD,7712,7900,7975,8160,8389,8646,8889,9014,9244,9451,...,10966,10912,10897,10902,10875,10914,10967,11069,0.9,0.7
G7,6089,6236,6277,6414,6578,6760,6944,6985,7142,7272,...,8017,7913,7849,7897,7876,7867,7848,7898,0.6,0.3
BRICS,2386,2468,2510,2583,2637,2754,2863,2961,3026,3170,...,7001,7639,7998,8524,8880,9142,9550,10000,4.7,6.6
Europe,2900,2937,2926,2931,2976,3071,3164,3199,3280,3327,...,3865,3809,3841,3813,3744,3802,3839,3886,1.2,0.7


In [6]:
# Move the 'Country' index into a regular column so the frame is tidy.
# Guarded so that re-running the cell does not insert a spurious 'index' column.
if 'Country' not in df.columns:
    df = df.reset_index()

In [7]:
# List the distinct labels in the 'Country' column (used to pick the rows to keep)
df['Country'].unique()

array(['World', 'OECD', 'G7', 'BRICS', 'Europe', 'European Union',
       'Belgium', 'Czech Rep.', 'France', 'Germany', 'Italy',
       'Netherlands', 'Poland', 'Portugal', 'Romania', 'Spain', 'Sweden',
       'United Kingdom', 'Norway', 'Turkey', 'CIS', 'Kazakhstan',
       'Russia', 'Ukraine', 'Uzbekistan', 'America', 'North America',
       'Canada', 'United States', 'Latin America', 'Argentina', 'Brazil',
       'Chile', 'Colombia', 'Mexico', 'Venezuela', 'Asia', 'China',
       'India', 'Indonesia', 'Japan', 'Malaysia', 'South Korea', 'Taiwan',
       'Thailand', 'Pacific', 'Australia', 'New Zealand', 'Africa',
       'Algeria', 'Egypt', 'Nigeria', 'South Africa', 'Middle-East',
       'Iran', 'Kuwait', 'Saudi Arabia', 'United Arab Emirates'],
      dtype=object)

In [8]:
# Harmonise country labels BEFORE filtering, so that rows labelled 'US' are kept
# as 'United States' instead of being discarded by the filter.
df['Country'] = df['Country'].replace('US', 'United States')

# Keep only the countries of interest. .copy() yields an independent frame, so
# later column assignments do not raise SettingWithCopyWarning.
selected_countries = ['China', 'Germany', 'India', 'United States']
df = df[df['Country'].isin(selected_countries)].copy()

In [9]:
# Show the frame after the country filter (one row per selected country)
df

Unnamed: 0,Country,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2016 - 2017 (%),2000 - 2017 (%/year)
9,Germany,550,540,537,526,529,537,555,552,556,...,633,613,630,639,628,647,648,653,0.6,0.7
28,United States,3219,3276,3291,3411,3473,3582,3677,3698,3830,...,4378,4349,4291,4306,4339,4317,4316,4251,-1.5,0.3
37,China,621,678,754,838,928,1008,1080,1136,1168,...,4208,4716,4994,5447,5679,5860,6165,6529,5.9,9.7
38,India,293,319,337,361,391,424,443,473,504,...,979,1075,1123,1191,1294,1383,1463,1541,5.3,6.0


In [10]:
# Check column characteristics: per-column summary statistics (count, mean, std, quartiles)
# of the numeric columns, useful for spotting missing or implausible values
df.describe()

Unnamed: 0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2010,2011,2012,2013,2014,2015,2016,2017,2016 - 2017 (%),2000 - 2017 (%/year)
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,1170.75,1203.25,1229.75,1284.0,1330.25,1387.75,1438.75,1464.75,1514.5,1560.0,...,2549.5,2688.25,2759.5,2895.75,2985.0,3051.75,3148.0,3243.5,2.575,4.175
std,1372.750129,1389.721645,1384.677429,1431.728326,1446.533183,1484.535028,1517.783115,1517.911147,1572.83555,1592.153259,...,2019.362688,2143.136234,2202.376368,2344.97866,2415.63532,2453.207068,2553.150864,2672.000686,3.604973,4.507309
min,293.0,319.0,337.0,361.0,391.0,424.0,443.0,473.0,504.0,546.0,...,633.0,613.0,630.0,639.0,628.0,647.0,648.0,653.0,-1.5,0.3
25%,485.75,484.75,487.0,484.75,494.5,508.75,527.0,532.25,543.0,553.5,...,892.5,959.5,999.75,1053.0,1127.5,1199.0,1259.25,1319.0,0.075,0.6
50%,585.5,609.0,645.5,682.0,728.5,772.5,817.5,844.0,862.0,898.0,...,2593.5,2712.0,2707.0,2748.5,2816.5,2850.0,2889.5,2896.0,2.95,3.35
75%,1270.5,1327.5,1388.25,1481.25,1564.25,1651.5,1729.25,1776.5,1833.5,1904.5,...,4250.5,4440.75,4466.75,4591.25,4674.0,4702.75,4778.25,4820.5,5.45,6.925
max,3219.0,3276.0,3291.0,3411.0,3473.0,3582.0,3677.0,3698.0,3830.0,3898.0,...,4378.0,4716.0,4994.0,5447.0,5679.0,5860.0,6165.0,6529.0,5.9,9.7


In [11]:
# Drop every row that contains a missing value and keep the result
# (dropna returns a new frame, so it has to be assigned back to take effect).
# NOTE(review): this also looks at columns that are removed in a later step
# (years before 2000, percentage columns) - confirm that a gap there should
# exclude the whole country.
df = df.dropna()
df

Unnamed: 0,Country,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2016 - 2017 (%),2000 - 2017 (%/year)
9,Germany,550,540,537,526,529,537,555,552,556,...,633,613,630,639,628,647,648,653,0.6,0.7
28,United States,3219,3276,3291,3411,3473,3582,3677,3698,3830,...,4378,4349,4291,4306,4339,4317,4316,4251,-1.5,0.3
37,China,621,678,754,838,928,1008,1080,1136,1168,...,4208,4716,4994,5447,5679,5860,6165,6529,5.9,9.7
38,India,293,319,337,361,391,424,443,473,504,...,979,1075,1123,1191,1294,1383,1463,1541,5.3,6.0


### Change the column order and names

In [12]:
# Keep the country label plus the yearly columns from FIRST_YEAR onwards.
# Selecting by label instead of by position removes both the earlier years and
# the trailing non-year summary columns, and leaves the frame unchanged when the
# cell is run again.
FIRST_YEAR = 2000
year_cols = [
    col for col in df.columns
    if str(col).strip().isdigit() and int(col) >= FIRST_YEAR
]
df = df[['Country'] + year_cols].copy()

In [13]:
# Confirm the remaining columns: 'Country' followed by the yearly values
df

Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
9,Germany,577,586,587,609,617,623,639,641,640,596,633,613,630,639,628,647,648,653
28,United States,4053,3865,4051,4082,4175,4294,4301,4350,4368,4188,4378,4349,4291,4306,4339,4317,4316,4251
37,China,1356,1481,1654,1911,2204,2500,2866,3282,3467,3715,4208,4716,4994,5447,5679,5860,6165,6529
38,India,570,588,611,651,684,716,774,824,848,917,979,1075,1123,1191,1294,1383,1463,1541


In [14]:
# Reshape from wide to long: one row per (Country, year column) pair.
# The former column labels end up in 'variable' and the cell values in 'value'.
df = df.melt(id_vars=['Country'])

In [15]:
# Give the melted columns their final names: the year labels become 'Years' and
# the values are labelled with the indicator name and unit taken from the metadata.
df = df.rename(columns={
    'variable': 'Years',
    'value': Name_ind + " " + Units_ind,
})
df

Unnamed: 0,Country,Years,Electricity Production TWh
0,Germany,2000,577
1,United States,2000,4053
2,China,2000,1356
3,India,2000,570
4,Germany,2001,586
...,...,...,...
67,India,2016,1463
68,Germany,2017,653
69,United States,2017,4251
70,China,2017,6529


### Do further necessary adjustments

In [16]:
# Show the indicator key from the metadata; it is used to build the output file name
Key_ind

'ELECTP_B'

In [17]:
# Write the long-format frame to result_df/, naming the file after the indicator key.
# The row index is written as well (to_csv default), as the first, unnamed column.
output_path = 'result_df/prev' + Key_ind + '.csv'
df.to_csv(output_path)

In [18]:
# Show the working directory; the relative paths used in this notebook resolve against it.
# Plain Python instead of the bare `pwd` automagic, which only works as a single-line cell
# and stops working if a variable named `pwd` exists.
os.getcwd()

'/home/annalena/Documents/TU/WiSe_2020/DataSciencePraktikumTUBerlin/DataAnalysis/Big_DF'