# Generate a DataFrame for each indicator, in a homogeneous format
This **pipeline** shows how to generate a consistent DataFrame, and its .csv file, from the *source* folder. It is a generic version, which means that minor changes will be needed for each source file.

## Preliminaries

In [402]:
# Imports
import pandas as pd
import numpy as np

In [403]:
# Load the raw file of one indicator together with the indicator metadata table
source_name = 'web_serv_energinten'
df_meta = pd.read_csv('Indicators_metadata.csv')
# index_col=0 takes the first CSV column as the index, so no 'Unnamed: 0' column shows up
df = pd.read_csv(f'source_data/Energy/{source_name}.csv', index_col=0)

In [404]:
# Look up the metadata fields of this indicator, keyed by its source-file name
df_meta_temp = df_meta.set_index('SOURCE FILE')
Name_ind = df_meta_temp.loc[source_name, 'INDICATOR']
Key_ind = df_meta_temp.loc[source_name, 'KEY']
Units_ind = df_meta_temp.loc[source_name, 'UNIT']
Origin_ind = df_meta_temp.loc[source_name, 'SITE']
Desc_ind = df_meta_temp.loc[source_name, 'DESCRIPTION']

### Preview of the DataFrame

In [405]:
# Check column names: list the column labels of the raw frame
df.columns

Index(['End use', 'Indicator', '2000', '2005', '2010', '2015', '2016', '2017',
       '2018'],
      dtype='object')

In [406]:
# Preview the first rows of the raw frame
df.head()

Unnamed: 0_level_0,End use,Indicator,2000,2005,2010,2015,2016,2017,2018
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Australia,Total Services,Per value added energy intensity (index 2000 -...,100,99,95,91,89,88,87
Austria,Total Services,Per value added energy intensity (index 2000 -...,100,100,79,70,67,71,67
Belgium,Total Services,Per value added energy intensity (index 2000 -...,100,107,116,100,99,96,94
Canada,Total Services,Per value added energy intensity (index 2000 -...,100,89,78,75,74,76,77
Czech Republic,Total Services,Per value added energy intensity (index 2000 -...,100,90,85,71,71,72,68


In [407]:
# Reset index to return DF to a tidy state IF NEEDED:
# the index (here 'Country') becomes a regular column again
df=df.reset_index()
# Optional template step: rename the country column to 'Country' when the source labels it differently
# df=df.rename(columns={'Country Name':'Country'})

In [408]:
# List the distinct country names present in the source
df['Country'].unique()

array(['Australia', 'Austria', 'Belgium', 'Canada', 'Czech Republic',
       'Denmark', 'Finland', 'France', 'Germany', 'Greece', 'Hungary',
       'Ireland', 'Italy', 'Japan', 'Korea', 'Luxembourg', 'Mexico',
       'Netherlands', 'New Zealand', 'Poland', 'Portugal',
       'Slovak Republic', 'Spain', 'Sweden', 'Switzerland', 'Turkey',
       'United Kingdom', 'United States', 'Brazil', 'Chile', 'Lithuania',
       'Morocco', 'Belarus'], dtype=object)

In [409]:
# Keep only the countries of interest. isin() silently ignores names that are
# absent from this source, so the result may hold fewer countries than listed.
# .copy() makes the selection an independent frame, so the later column
# assignment and column drop do not raise pandas' SettingWithCopyWarning.
df = df[df['Country'].isin(['China', 'Germany', 'India', 'United States'])].copy()

In [410]:
# Inspect the rows left after filtering by country
df

Unnamed: 0,Country,End use,Indicator,2000,2005,2010,2015,2016,2017,2018
8,Germany,Total Services,Per value added energy intensity (index 2000 -...,100,97,91,81,83,78,71
27,United States,Total Services,Per value added energy intensity (index 2000 -...,100,90,86,81,79,76,76


In [411]:
# Check column characteristics (summary statistics per column)
# NOTE(review): the saved output lists only 2015-2018, so the remaining year
# columns are presumably stored with a non-numeric dtype — confirm with df.dtypes
df.describe()

Unnamed: 0,2015,2016,2017,2018
count,2.0,2.0,2.0,2.0
mean,81.0,81.0,77.0,73.5
std,0.0,2.828427,1.414214,3.535534
min,81.0,79.0,76.0,71.0
25%,81.0,80.0,76.5,72.25
50%,81.0,81.0,77.0,73.5
75%,81.0,82.0,77.5,74.75
max,81.0,83.0,78.0,76.0


In [412]:
# Replace country names for consistency with other DataFrames.
# assign() builds a new frame instead of writing into the filtered selection,
# which avoids pandas' SettingWithCopyWarning.
# NOTE(review): this runs after the country filter, which keeps 'United States'
# but not 'US' — a row labelled 'US' would already be gone by now; consider
# harmonising names before filtering.
df = df.assign(Country=df['Country'].replace({
    'US': 'United States',
    # 'Deutschland': 'Germany',
    # 'Indien': 'India',
}))

In [413]:
# Inspect the frame after harmonising country names
df

Unnamed: 0,Country,End use,Indicator,2000,2005,2010,2015,2016,2017,2018
8,Germany,Total Services,Per value added energy intensity (index 2000 -...,100,97,91,81,83,78,71
27,United States,Total Services,Per value added energy intensity (index 2000 -...,100,90,86,81,79,76,76


In [414]:
# Drop the columns at positions 1 and 2 (here 'End use' and 'Indicator'),
# leaving 'Country' and the year columns. Adapt the slice for each source
# file, e.g. to also remove year columns before 2000.
# drop() returns a new frame, which avoids in-place mutation of a filtered
# selection (SettingWithCopyWarning).
df = df.drop(columns=df.columns[1:3])

In [415]:
# Optional template step (currently disabled): drop the last three columns,
# e.g. to remove years after 2019
#df.drop(df.iloc[:, -3:], inplace = True, axis = 1) 
# Inspect the remaining columns
df

Unnamed: 0,Country,2000,2005,2010,2015,2016,2017,2018
8,Germany,100,97,91,81,83,78,71
27,United States,100,90,86,81,79,76,76


### Set Index temporarily to ease manipulation and guarantee final homogeneity

In [416]:
# Use 'Country' as the (temporary) index of the frame
index = ['Country']
df=df.set_index(index)

In [417]:
# Inspect the frame indexed by country
df

Unnamed: 0_level_0,2000,2005,2010,2015,2016,2017,2018
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Germany,100,97,91,81,83,78,71
United States,100,90,86,81,79,76,76


In [418]:
# Drop every row that contains at least one NaN.
# dropna() returns a new frame, so the result must be assigned back for the
# rows to actually be removed from df.
df = df.dropna()
df

Unnamed: 0_level_0,2000,2005,2010,2015,2016,2017,2018
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Germany,100,97,91,81,83,78,71
United States,100,90,86,81,79,76,76


### Reform the DF to a suitable form

In [419]:
# Reset index to return DF to a tidy state: 'Country' becomes a regular
# column again; the result is stored under a new name, df_p
df_p=df.reset_index()

In [420]:
# Inspect the wide-format frame before reshaping
df_p

Unnamed: 0,Country,2000,2005,2010,2015,2016,2017,2018
0,Germany,100,97,91,81,83,78,71
1,United States,100,90,86,81,79,76,76


In [421]:
# List the column labels ('Country' plus one column per year)
df_p.columns

Index(['Country', '2000', '2005', '2010', '2015', '2016', '2017', '2018'], dtype='object')

In [422]:
# Reshape from wide (one column per year) to long: one row per country/year
# pair, with the former column labels in 'variable' and the cell contents in 'value'
df_p = pd.melt(df_p, id_vars=['Country'])

In [423]:
# Inspect the long-format frame
df_p

Unnamed: 0,Country,variable,value
0,Germany,2000,100
1,United States,2000,100
2,Germany,2005,97
3,United States,2005,90
4,Germany,2010,91
5,United States,2010,86
6,Germany,2015,81
7,United States,2015,81
8,Germany,2016,83
9,United States,2016,79


In [424]:
# Show the indicator name read from the metadata
Name_ind

'WEB Services Energy Indicators'

In [425]:
# Give the melted columns their final labels: the year column becomes 'Years',
# the measurement column is named after the indicator and its unit
df_p = df_p.rename(columns={
    'variable': 'Years',
    'value': Name_ind+" "+Units_ind,
})

In [426]:
# Inspect the frame after renaming the columns
df_p

Unnamed: 0,Country,Years,WEB Services Energy Indicators indexed
0,Germany,2000,100
1,United States,2000,100
2,Germany,2005,97
3,United States,2005,90
4,Germany,2010,91
5,United States,2010,86
6,Germany,2015,81
7,United States,2015,81
8,Germany,2016,83
9,United States,2016,79


### Make further necessary adjustments

In [None]:
# Show the dtype of each column to see which ones need coercing
display(df_p.dtypes)

In [None]:
# Coerce column types when needed.
# The year labels come from the CSV header and are strings; cast them to integers.
# Bracket assignment is the documented way to replace a column, and the `copy`
# keyword is omitted because it is deprecated in recent pandas releases.
df_p['Years'] = df_p['Years'].astype(int)

In [None]:
# Preview the first rows after type coercion
df_p.head()

In [None]:
# Summary statistics of the long-format frame
df_p.describe()

In [427]:
# Show the indicator key, which is used to build the output file name
Key_ind

'e_serv'

In [428]:
# Final look at the frame before it is written to disk
df_p

Unnamed: 0,Country,Years,WEB Services Energy Indicators indexed
0,Germany,2000,100
1,United States,2000,100
2,Germany,2005,97
3,United States,2005,90
4,Germany,2010,91
5,United States,2010,86
6,Germany,2015,81
7,United States,2015,81
8,Germany,2016,83
9,United States,2016,79


In [429]:
# Export the long-format frame; the file name is 'prev' followed by the indicator key
df_p.to_csv(f'result_df/prev{Key_ind}.csv')

In [51]:
# NOTE(review): leftover working-directory check (IPython automagic for %pwd).
# Its saved output exposes an absolute local path — consider removing this
# cell and clearing its output before sharing the notebook.
pwd

'/Users/paul/Desktop/TU/1datascience/groupwork/DataAnalysis/Big_DF'