# Generate a DataFrame for each indicator, with a homogeneous format
This **pipeline** shows how to generate a consistent DataFrame, and its .csv file, from the *source* folder. It is a generic version, which means that minor changes will be needed for each file.

## Preliminaries

In [517]:
# Imports
import pandas as pd
import numpy as np

In [518]:
# Load the indicator metadata table and the raw source file for this indicator
df_meta = pd.read_csv('Indicators_metadata.csv')
source_name = 'web_resid_energinten'
source_path = 'source_data/Energy/' + source_name + '.csv'
# index_col=0 uses the first CSV column as the index, so no 'Unnamed: 0' column is created
df = pd.read_csv(source_path, index_col=0)

In [519]:
# Look up this indicator's metadata, keyed by its source file name
df_meta_temp = df_meta.set_index('SOURCE FILE')
Units_ind = df_meta_temp.loc[source_name, 'UNIT']
Origin_ind = df_meta_temp.loc[source_name, 'SITE']
Name_ind = df_meta_temp.loc[source_name, 'INDICATOR']
Key_ind = df_meta_temp.loc[source_name, 'KEY']
Desc_ind = df_meta_temp.loc[source_name, 'DESCRIPTION']

### Preview of the DataFrame

In [520]:
# Check column names ('Country' is the index at this point, so it is not listed)
df.columns

Index(['End use', 'Indicator', '2000', '2005', '2010', '2015', '2016', '2017',
       '2018'],
      dtype='object')

In [521]:
# Preview the first rows of the raw frame
df.head()

Unnamed: 0_level_0,End use,Indicator,2000,2005,2010,2015,2016,2017,2018
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Australia,Total Residential,Per capita energy intensity (index 2000) ...,100,99,98,93,92,90,89
Australia,Total Residential,Per dwelling energy intensity (index 2000) ...,100,102,107,103,103,101,100
Australia,Total Residential,Per dwelling TC energy intensity (index 2000) ...,100,103,109,105,106,102,102
Australia,Residential space heating,Per capita energy intensity (index 2000) ...,100,91,82,78,77,75,73
Australia,Residential space heating,Per dwelling energy intensity (index 2000) ...,100,94,90,87,86,84,82


In [522]:
# Move the index back into a regular column so the frame is tidy (only IF NEEDED)
df = df.reset_index(drop=False)
# If the source uses a different header for the country column, rename it, e.g.:
# df = df.rename(columns={'Country Name': 'Country'})

In [511]:
# List the distinct countries present in the source data
df['Country'].unique()

array(['Australia', 'Austria', 'Belgium', 'Canada', 'Czech Republic',
       'Denmark', 'Finland', 'France', 'Germany', 'Greece', 'Hungary',
       'Ireland', 'Italy', 'Japan', 'Korea', 'Luxembourg', 'Mexico',
       'Netherlands', 'New Zealand', 'Poland', 'Portugal',
       'Slovak Republic', 'Spain', 'Sweden', 'Switzerland', 'Turkey',
       'United Kingdom', 'United States', 'Brazil', 'Chile', 'Lithuania',
       'Morocco', 'Azerbaijan', 'Belarus', 'Kazakhstan', 'Kyrgyzstan'],
      dtype=object)

In [512]:
# Keep only the countries of interest. Names missing from the source are
# silently ignored by isin() (the country list above has no 'China' or 'India').
# .copy() makes the result an independent frame, so later column assignments
# do not raise SettingWithCopyWarning.
countries_of_interest = ['China', 'Germany', 'India', 'United States']
df = df[df['Country'].isin(countries_of_interest)].copy()

In [481]:
# Inspect the rows kept by the country filter
df

Unnamed: 0,Country,End use,Indicator,2000,2005,2010,2015,2016,2017,2018
80,Germany,Total Residential,Per capita energy intensity (index 2000) ...,100,101,107,89,91,88,86
81,Germany,Total Residential,Per dwelling energy intensity (index 2000) ...,100,98,100,82,84,82,79
82,Germany,Total Residential,Per dwelling TC energy intensity (index 2000) ...,100,89,82,78,78,77,78
83,Germany,Residential space heating,Per capita energy intensity (index 2000) ...,100,98,102,79,82,78,74
84,Germany,Residential space heating,Per dwelling energy intensity (index 2000) ...,100,95,95,73,76,73,69
85,Germany,Residential space heating,Per dwelling TC energy intensity (index 2000) ...,100,84,72,69,69,67,68
86,Germany,Residential lighting,Per capita energy intensity (index 2000) ...,100,106,105,93,88,88,90
87,Germany,Residential lighting,Per dwelling energy intensity (index 2000) ...,100,103,98,87,82,82,83
88,Germany,Residential appliances,Per capita energy intensity (index 2000) ...,100,106,123,115,106,106,103
89,Germany,Residential appliances,Per dwelling energy intensity (index 2000) ...,100,102,115,106,99,98,96


In [482]:
# Check column characteristics
# NOTE(review): the output below reports count/unique/top/freq for the year
# columns, which describe() does for non-numeric columns - the year values
# appear to be stored as strings; confirm, and coerce them if numbers are needed.
df.describe()

Unnamed: 0,Country,End use,Indicator,2000,2005,2010,2015,2016,2017,2018
count,20,20,20,20,20,20,20,20,20,20
unique,2,4,20,1,14,17,16,17,15,18
top,United States,Residential space heating,Per dwelling TC energy intensity (index 2000),100,98,115,63,64,62,90
freq,10,6,1,20,3,2,2,2,2,2


In [483]:
# Replace country names for consistency with other DataFrames.
# Add or uncomment aliases as required by the source file.
country_aliases = {
    'US': 'United States',
    # 'Deutschland': 'Germany',
    # 'Indien': 'India',
}
# assign() builds a new frame instead of writing into the filtered slice,
# which avoids SettingWithCopyWarning.
df = df.assign(Country=df['Country'].replace(country_aliases))

In [484]:
# Inspect the frame after normalising country names
df

Unnamed: 0,Country,End use,Indicator,2000,2005,2010,2015,2016,2017,2018
80,Germany,Total Residential,Per capita energy intensity (index 2000) ...,100,101,107,89,91,88,86
81,Germany,Total Residential,Per dwelling energy intensity (index 2000) ...,100,98,100,82,84,82,79
82,Germany,Total Residential,Per dwelling TC energy intensity (index 2000) ...,100,89,82,78,78,77,78
83,Germany,Residential space heating,Per capita energy intensity (index 2000) ...,100,98,102,79,82,78,74
84,Germany,Residential space heating,Per dwelling energy intensity (index 2000) ...,100,95,95,73,76,73,69
85,Germany,Residential space heating,Per dwelling TC energy intensity (index 2000) ...,100,84,72,69,69,67,68
86,Germany,Residential lighting,Per capita energy intensity (index 2000) ...,100,106,105,93,88,88,90
87,Germany,Residential lighting,Per dwelling energy intensity (index 2000) ...,100,103,98,87,82,82,83
88,Germany,Residential appliances,Per capita energy intensity (index 2000) ...,100,106,123,115,106,106,103
89,Germany,Residential appliances,Per dwelling energy intensity (index 2000) ...,100,102,115,106,99,98,96


In [485]:
# Drop the columns at positions 1-2 ('End use' and 'Indicator' for this
# source), keeping 'Country' and the year columns. Adjust the slice per file.
# Column labels are passed explicitly and the result is re-assigned, rather
# than dropping in place on a filtered frame.
df = df.drop(columns=df.columns[1:3])

In [486]:
# Optional: drop the last three columns when the source contains years beyond
# the range of interest (left disabled here)
#df.drop(df.iloc[:, -3:], inplace = True, axis = 1) 
df

Unnamed: 0,Country,2000,2005,2010,2015,2016,2017,2018
80,Germany,100,101,107,89,91,88,86
81,Germany,100,98,100,82,84,82,79
82,Germany,100,89,82,78,78,77,78
83,Germany,100,98,102,79,82,78,74
84,Germany,100,95,95,73,76,73,69
85,Germany,100,84,72,69,69,67,68
86,Germany,100,106,105,93,88,88,90
87,Germany,100,103,98,87,82,82,83
88,Germany,100,106,123,115,106,106,103
89,Germany,100,102,115,106,99,98,96


### Set Index temporarily to ease manipulation and guarantee final homogeneity

In [487]:
# Use 'Country' as a temporary index while cleaning the value columns
index = ['Country']
df = df.set_index(keys=index)

In [488]:
# Inspect the frame indexed by 'Country'
df

Unnamed: 0_level_0,2000,2005,2010,2015,2016,2017,2018
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Germany,100,101,107,89,91,88,86
Germany,100,98,100,82,84,82,79
Germany,100,89,82,78,78,77,78
Germany,100,98,102,79,82,78,74
Germany,100,95,95,73,76,73,69
Germany,100,84,72,69,69,67,68
Germany,100,106,105,93,88,88,90
Germany,100,103,98,87,82,82,83
Germany,100,106,123,115,106,106,103
Germany,100,102,115,106,99,98,96


In [489]:
# Drop rows containing NaN values, if any. dropna() returns a new frame,
# so the result must be assigned back; otherwise df is left unchanged.
df = df.dropna()
df

Unnamed: 0_level_0,2000,2005,2010,2015,2016,2017,2018
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Germany,100,101,107,89,91,88,86
Germany,100,98,100,82,84,82,79
Germany,100,89,82,78,78,77,78
Germany,100,98,102,79,82,78,74
Germany,100,95,95,73,76,73,69
Germany,100,84,72,69,69,67,68
Germany,100,106,105,93,88,88,90
Germany,100,103,98,87,82,82,83
Germany,100,106,123,115,106,106,103
Germany,100,102,115,106,99,98,96


### Reform the DF to a suitable form

In [490]:
# Turn 'Country' back into a regular column; df_p is the frame that gets reshaped
df_p = df.reset_index(drop=False)

In [491]:
# Inspect the wide frame before melting
df_p

Unnamed: 0,Country,2000,2005,2010,2015,2016,2017,2018
0,Germany,100,101,107,89,91,88,86
1,Germany,100,98,100,82,84,82,79
2,Germany,100,89,82,78,78,77,78
3,Germany,100,98,102,79,82,78,74
4,Germany,100,95,95,73,76,73,69
5,Germany,100,84,72,69,69,67,68
6,Germany,100,106,105,93,88,88,90
7,Germany,100,103,98,87,82,82,83
8,Germany,100,106,123,115,106,106,103
9,Germany,100,102,115,106,99,98,96


In [492]:
# Confirm the columns are 'Country' followed by the year labels
df_p.columns

Index(['Country', '2000', '2005', '2010', '2015', '2016', '2017', '2018'], dtype='object')

In [493]:
# Reshape from wide (one column per year) to long: one row per country/year pair,
# with the former column labels in 'variable' and the cell contents in 'value'
df_p = pd.melt(df_p, id_vars=['Country'])
#df_p2=df_p.melt()

In [494]:
df_p

Unnamed: 0,Country,variable,value
0,Germany,2000,100
1,Germany,2000,100
2,Germany,2000,100
3,Germany,2000,100
4,Germany,2000,100
...,...,...,...
135,United States,2018,79
136,United States,2018,59
137,United States,2018,58
138,United States,2018,116


In [495]:
# Show the indicator name read from the metadata
Name_ind

'WEB Residential Energy Indicators'

In [496]:
# Give the melted columns their final names: 'Years' for the year labels and
# "<indicator name> <unit>" for the values
value_label = Name_ind + " " + Units_ind
df_p = df_p.rename(columns={'variable': 'Years', 'value': value_label})

In [497]:
# Inspect the long frame with its final column names
df_p

Unnamed: 0,Country,Years,WEB Residential Energy Indicators indexed
0,Germany,2000,100
1,Germany,2000,100
2,Germany,2000,100
3,Germany,2000,100
4,Germany,2000,100
...,...,...,...
135,United States,2018,79
136,United States,2018,59
137,United States,2018,58
138,United States,2018,116


### 2.4 Do further necessary adjustments

In [None]:
# Show column types (display is provided by the IPython/Jupyter session)
display(df_p.dtypes)

In [None]:
# Coerce column types when needed.
# Bracket assignment targets the column unambiguously; the copy=False keyword
# is dropped because converting the year labels to int creates new data anyway.
df_p['Years'] = df_p['Years'].astype(int)

In [None]:
# Preview the first rows after type coercion
df_p.head()

In [None]:
# Summarise the long frame
df_p.describe()

In [498]:
# Show the indicator key used in the output file name
Key_ind

'e_resd'

In [499]:
# Final check of the frame that is about to be written
df_p

Unnamed: 0,Country,Years,WEB Residential Energy Indicators indexed
0,Germany,2000,100
1,Germany,2000,100
2,Germany,2000,100
3,Germany,2000,100
4,Germany,2000,100
...,...,...,...
135,United States,2018,79
136,United States,2018,59
137,United States,2018,58
138,United States,2018,116


In [500]:
# Write the long-format frame to result_df/prev<KEY>.csv (the row index is written too)
output_path = 'result_df/prev' + Key_ind + '.csv'
df_p.to_csv(output_path)

In [51]:
# Show the current working directory (IPython automagic for %pwd)
pwd

'/Users/paul/Desktop/TU/1datascience/groupwork/DataAnalysis/Big_DF'