# Generate a DataFrame for each indicator in a homogeneous format
This **pipeline** shows how to generate a consistent DataFrame, and its .csv file, from a file in the *source* folder. It is a generic version, which means that minor changes will be needed for each source file.

## Preliminaries

In [1]:
# Imports
import pandas as pd
import numpy as np

In [2]:
# Load the indicator metadata table and the raw source file of one indicator.
# `source_name` is the source file's base name (no extension); it is also the
# key used to look the indicator up in the metadata table.
source_name = 'qu_solar_PV_power00-19'
source_path = 'source_data/Energy/' + source_name + '.csv'
df_meta = pd.read_csv('Indicators_metadata.csv')
df = pd.read_csv(source_path)

In [3]:
# Fetch the metadata of this indicator: index the metadata table by source
# file name, then read each field from the row matching `source_name`.
df_meta_temp = df_meta.set_index('SOURCE FILE')
Units_ind = df_meta_temp.loc[source_name, 'UNIT']
Origin_ind = df_meta_temp.loc[source_name, 'SITE']
Name_ind = df_meta_temp.loc[source_name, 'INDICATOR']
Key_ind = df_meta_temp.loc[source_name, 'KEY']
Desc_ind = df_meta_temp.loc[source_name, 'DESCRIPTION']

### Preview of the DataFrame

In [4]:
# Check column names: shows which column identifies the country and which
# columns hold the yearly values
df.columns

Index(['Unnamed: 0', 'Land', '2000', '2001', '2002', '2003', '2004', '2005',
       '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
       '2015', '2016', '2017', '2018', '2019'],
      dtype='object')

In [None]:
# Preview the first rows of the raw data
df.head()

In [7]:
# Move the current index into the columns (only needed when the index holds
# data) and give the country column, called 'Land' in the source file, its
# standard name.
# NOTE: reset_index() adds a column each time it runs, so re-running this cell
# without reloading `df` does not give the same result.
df = df.reset_index().rename(columns={'Land': 'Country'})

In [8]:
# List the distinct country names available, to spot the spellings that have
# to be harmonised before selecting countries
df['Country'].unique()

array(['Japan', 'USA', 'Deutschland', 'Australien', 'Italien', 'Schweiz',
       'Mexiko', 'Frankreich', 'Niederlande', 'Kanada', 'Norwegen',
       'Österreich', 'Südkorea', 'Schweden', 'Finnland', 'Spanien',
       'Großbritannien', 'Dänemark', 'Portugal', 'Israel', 'Andere',
       'China', 'Thailand', 'Türkei', 'Belgien', 'Malaysia', 'Südafrika',
       'Chile', 'Indien', 'Korea'], dtype=object)

In [None]:
# Check column characteristics (summary statistics of the numeric columns)
df.describe()

In [9]:
# Replace the source's country names with the names used by the other
# DataFrames. Only the names listed here are changed; every other entry of the
# column is left as it is.
country_names_en = {
    'USA': 'United States',
    'Deutschland': 'Germany',
    'Indien': 'India',
}
df['Country'] = df['Country'].replace(country_names_en)

### Set Index temporarily to ease manipulation and guarantee final homogeneity

In [10]:
# Columns used as the temporary index; with the country as index, rows can be
# filtered through `df.index` and only the year columns remain as data
index = ['Country']
df=df.set_index(index)

### Select countries and time range

In [11]:
# Countries to keep (careful with the names: they must match the 'Country'
# values exactly, including case)
sel_c = ['United States', 'India', 'Germany', 'China']
# Year interval as [first, stop]; the stop year is exclusive, so it has to be
# one more than the last year wanted ([2000, 2018] keeps 2000..2017)
sel_y = [2000, 2018]
first_year, stop_year = sel_y
sel_years = list(range(first_year, stop_year))
# The year columns of the source are labelled with strings, so keep a text
# version of the selected years for column selection
sel_years_txt = list(map(str, sel_years))

In [12]:
# Apply selection criteria
df_p = df[sel_years_txt]
df_p=df_p[df_p.index.isin(sel_c)]

In [13]:
# Preview the selection: one row per selected country, one column per year
df_p.head()

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
United States,0.139,0.168,0.212,0.275,0.376,0.479,0.624,0.831,1.169,1.616,2.04,3.96,7.33,12.08,18.32,25.67,41.0,51.8
Germany,0.089,0.206,0.324,0.473,1.139,2.072,2.918,4.195,6.153,9.959,17.37,24.86,32.46,35.77,38.25,39.71,40.7,42.5
China,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.1,0.14,0.3,0.8,3.3,7.0,19.72,30.38,43.53,78.1,131.1
India,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Reform the DF to a suitable form

In [14]:
# Reset index to return DF to a tidy state
df_p=df_p.reset_index()

In [15]:
# Confirm that 'Country' is a column again, followed by the selected years
df_p.columns

Index(['Country', '2000', '2001', '2002', '2003', '2004', '2005', '2006',
       '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017'],
      dtype='object')

In [16]:
# Melt to a Long format
df_p=df_p.melt(id_vars='Country')
#df_p2=df_p.melt()

In [None]:
# Preview the long-format frame produced by melt
df_p.head()

In [17]:
# Give the melted columns their final names: 'variable' becomes 'Years' and
# 'value' is labelled with the indicator name immediately followed by its unit
# (the two are joined without any separator).
value_label = Name_ind + Units_ind
df_p = df_p.rename(columns={'variable': 'Years', 'value': value_label})

### Do further necessary adjustments

In [18]:
# Show column types
display(df_p.dtypes) 

Country                            object
Years                              object
Renewable installed PV PowerGW    float64
dtype: object

In [19]:
# Coerce column types when needed
df_p.Years=df_p.Years.astype(int, copy=False)#avoiding a warning (carefull)

In [20]:
# Preview the final long-format frame: country, year and indicator value
df_p.head()

Unnamed: 0,Country,Years,Renewable installed PV PowerGW
0,United States,2000,0.139
1,Germany,2000,0.089
2,China,2000,0.0
3,India,2000,0.0
4,United States,2001,0.168


In [21]:
# Summary statistics of the final frame, as a sanity check of the year range
# and of the indicator values
df_p.describe()

Unnamed: 0,Years,Renewable installed PV PowerGW
count,72.0,72.0
mean,2008.5,10.858153
std,5.224536,21.745647
min,2000.0,0.0
25%,2004.0,0.0
50%,2008.5,0.4245
75%,2013.0,10.48925
max,2017.0,131.1


In [23]:
# Write the result to the 'result_df' folder, in a file named 'prev' followed
# by the indicator key. The row index is written too (to_csv default), so it
# appears as an unnamed first column in the file.
output_path = 'result_df/' + 'prev' + Key_ind + '.csv'
df_p.to_csv(output_path)