# Generate a DataFrame for each indicator, in a homogeneous format
This **pipeline** shows how to generate a consistent DataFrame, and its .csv file, from the *source* folder. It is a generic version, which means that minor changes will be needed for each file.

## Preliminaries

In [39]:
# Imports
import pandas as pd
import numpy as np

In [40]:
# Load the indicator metadata table and the raw source file of this indicator
source_name = 'bloom_renew_invest'
df_meta = pd.read_csv('Indicators_metadata.csv')
# index_col=0 uses the first CSV column as the index, so no 'Unnamed: 0' column shows up
source_path = 'source_data/Energy/' + source_name + '.csv'
df = pd.read_csv(source_path, index_col=0)

In [41]:
# Look up the metadata of this indicator; rows are keyed by the source file name
df_meta_temp = df_meta.set_index('SOURCE FILE')
Units_ind = df_meta_temp.loc[source_name, 'UNIT']
Origin_ind = df_meta_temp.loc[source_name, 'SITE']
Name_ind = df_meta_temp.loc[source_name, 'INDICATOR']
Key_ind = df_meta_temp.loc[source_name, 'KEY']
Desc_ind = df_meta_temp.loc[source_name, 'DESCRIPTION']

### Preview of the DataFrame

In [42]:
# Check the column names to see which fields the source file provides
df.columns

Index(['quarter', 'Country', 'value'], dtype='object')

In [43]:
# Preview the first rows of the raw frame
df.head()

Unnamed: 0_level_0,quarter,Country,value
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2004,1,China,0.6
2004,2,China,0.3
2004,3,China,0.6
2004,4,China,0.9
2005,1,China,1.4


In [44]:
# The file was read with index_col=0, so its first column became the index.
# Turn it back into a regular column to get a tidy frame (only IF NEEDED).
df = df.reset_index(drop=False)
# If the country column has a different name in the source, rename it to 'Country':
# df=df.rename(columns={'Country Name':'Country'})

In [45]:
# List the distinct country labels present in the data
df['Country'].unique()

array(['China', 'United States', 'Other', 'Germany', 'Japan', 'U.K.',
       'India', 'Spain', 'Italy', 'Brazil', 'France', 'Australia',
       'Netherlands', 'Canada', 'Belgium', 'Mexico', 'Sweden',
       'South Korea', 'Chile', 'Taiwan', 'Thailand', 'Denmark', 'Poland',
       'Greece', 'Vietnam', 'Portugal', 'Finland', 'Indonesia', 'Austria',
       'Ireland', 'Philippines', 'Hungary', 'Malaysia', 'Peru',
       'New Zealand', 'Singapore', 'Hong Kong', 'Grand Total'],
      dtype=object)

In [46]:
# Keep only the rows that belong to the countries of interest
countries_of_interest = ['China', 'Germany', 'India', 'United States']
df = df[df['Country'].isin(countries_of_interest)]

In [47]:
# Show the frame after restricting it to the selected countries
df

Unnamed: 0,year,quarter,Country,value
0,2004,1,China,0.6
1,2004,2,China,0.3
2,2004,3,China,0.6
3,2004,4,China,0.9
4,2005,1,China,1.4
...,...,...,...,...
457,2019,2,India,2.2
458,2019,3,India,2.4
459,2019,4,India,1.6
460,2020,1,India,1.2


In [48]:
# Summary statistics of the numeric columns (count, mean, std, min, quartiles, max)
df.describe()

Unnamed: 0,year,quarter,value
count,264.0,264.0,264.0
mean,2011.757576,2.469697,7.059091
std,4.776561,1.119746,7.747439
min,2004.0,1.0,0.3
25%,2008.0,1.0,1.8
50%,2012.0,2.0,4.05
75%,2016.0,3.0,9.0
max,2020.0,4.0,40.3


In [49]:
# Drop all rows containing NaNs, if any. dropna() returns a new frame instead
# of modifying df in place, so the result has to be assigned back to take effect.
df = df.dropna()
df

Unnamed: 0,year,quarter,Country,value
0,2004,1,China,0.6
1,2004,2,China,0.3
2,2004,3,China,0.6
3,2004,4,China,0.9
4,2005,1,China,1.4
...,...,...,...,...
457,2019,2,India,2.2
458,2019,3,India,2.4
459,2019,4,India,1.6
460,2020,1,India,1.2


### Change the column order and names

In [50]:
# Put the columns in the common order, then map the source names to the
# homogeneous names used for the output
df = df[['Country', 'year', 'quarter', 'value']].rename(
    columns={'year': 'Years', 'quarter': 'Quarter', 'value': 'B.USD'}
)

In [51]:
# Show the frame with the reordered and renamed columns
df

Unnamed: 0,Country,Years,Quarter,B.USD
0,China,2004,1,0.6
1,China,2004,2,0.3
2,China,2004,3,0.6
3,China,2004,4,0.9
4,China,2005,1,1.4
...,...,...,...,...
457,India,2019,2,2.2
458,India,2019,3,2.4
459,India,2019,4,1.6
460,India,2020,1,1.2


### Average the quarterly values into a yearly mean

In [52]:
# Average the quarterly values of each (year, country) pair
yearly_mean = df.groupby(['Years', 'Country'])['B.USD'].mean()
# Turn the grouped Series back into a flat frame with a 'B.USD' value column
df_p = yearly_mean.reset_index(name='B.USD')
# The Quarter column is gone after aggregation; keep the common column order
df = df_p.loc[:, ['Country', 'Years', 'B.USD']]

In [53]:
# Show the yearly averages per country
df

Unnamed: 0,Country,Years,B.USD
0,China,2004,0.600
1,Germany,2004,2.225
2,India,2004,0.675
3,United States,2004,1.200
4,China,2005,1.925
...,...,...,...
63,United States,2019,15.325
64,China,2020,20.800
65,Germany,2020,1.800
66,India,2020,1.350


### Do further necessary adjustments

In [54]:
# Show the indicator key read from the metadata; it is used in the output file name
Key_ind

'ASST_D'

In [56]:
# Write the preliminary result to result_df/prev_<KEY>.csv.
# The row index is written as well (to_csv default), so the file gets an
# unnamed first column; read it back with index_col=0.
output_path = 'result_df/' + 'prev_' + Key_ind + '.csv'
df.to_csv(output_path)

In [57]:
# Show the current working directory (IPython automagic for %pwd); the relative
# paths used for reading and writing in this notebook resolve against it
pwd

'/home/annalena/Documents/TU/WiSe_2020/DataSciencePraktikumTUBerlin/DataAnalysis/Big_DF'