# Merge the generated *.csv*
This **pipeline** shows how to generate a general dataframe, and its .csv file, from all the *.csv* files in the *select_df* folder. The files need to be manually selected and copy/pasted from the *result_df* folder.

## Preliminaries
Build the list of .csv, then dataframes, to be merged

In [1]:
# Imports
import pandas as pd
import numpy as np
import glob

In [2]:
# Identify files
# glob returns matches in arbitrary (OS-dependent) order; sorting makes the
# load order, and therefore the merge/column order downstream, reproducible.
# The pattern has no '**', so a recursive search is not needed here.
csv_list = sorted(glob.glob('select_df/*.csv'))

In [3]:
# Load every listed CSV into its own DataFrame, keyed by the file path with the
# folder prefix and the '.csv' extension stripped.
# NOTE: the prefix that is stripped is the backslash form ('select_df\\'); a path
# using '/' as separator keeps its folder prefix in the key.
df_dict = {
    path.replace('select_df\\', '').replace('.csv', ''): pd.read_csv(path)
    for path in csv_list
}

In [4]:
# Check the keys (one per csv transformed/stored in df_dict)
df_dict.keys()

dict_keys(['preva_econ', 'preva_eprod', 'preva_inten', 'preva_sharo', 'preva_shart', 'prevb_sharr', 'prevb_sharw', 'prevc_egen', 'prevc_egeno', 'prevc_icapg', 'prevc_icaps', 'prevc_icapw', 'prevc_pecon', 'prevc_r-con', 'preve_serv', 'prevf_icaps', 'prevf_icapw'])

In [5]:
# Check the columns of the selected DataFrames
# Gather the column labels of every loaded frame. Iterating over the stored
# frames directly avoids rebuilding the dict keys from the file paths.
df_columns = np.array([label for frame in df_dict.values() for label in frame.columns])
# Find and display the unique values (np.unique also sorts them)
df_columns_u = np.unique(df_columns)
df_columns_u

array(['Country', 'Electricity generation TWh',
       'Electricity generation from other TWh',
       'Energy intensity level of primary energy (MJ/2011 USD PPP) [6.1_PRIMARY.ENERGY.INTENSITY]',
       'Primary Energy: Consumption EJ',
       'Renewable Energy Geothermal (Installed capacity)MW',
       'Renewable Energy Solar (Installed capacity) MW',
       'Renewable Energy Wind  (Installed capacity) MW',
       'Renewable electricity share of total electricity output (%) [4.1_SHARE.RE.IN.ELECTRICITY]',
       'Renewable energy share of TFEC (%) [2.1_SHARE.TOTAL.RE.IN.TFEC]',
       'Renewable installed PV Power GW',
       'Renewable installed Wind Power GW',
       'Renewable share electricity production',
       'Renewables:Consumption EJ',
       'Total electricity output (GWh) [4.1.1_TOTAL.ELECTRICITY.OUTPUT]',
       'Total final energy consumption (TFEC) (TJ) [1.1_TOTAL.FINAL.ENERGY.CONSUM]',
       'Unnamed: 0', 'WEB Services Energy Indicators indexed',
       'Wind and sola

In [None]:
# Columns to be erased using:
# del df['column name'] 

## Merge the stored DataFrames

In [6]:
# Seed the result with an empty frame that only carries the two merge-key columns
key_columns = ['Country', 'Years']
df_result = pd.DataFrame(columns=key_columns)

In [7]:
# Merge all the dataframes in the dict
# Start from an empty key-only frame inside this cell so that re-running it
# does not merge the inputs into an already-merged df_result.
df_result = pd.DataFrame(columns=['Country', 'Years'])
for df in df_dict.values():
    # Outer join: keep every (Country, Years) pair present in any input frame.
    # Non-key columns that share a label on both sides get the default
    # '_x'/'_y' suffixes.
    # NOTE(review): the repeated suffixed labels seen in the saved preview may
    # be rejected by newer pandas releases — confirm against the installed version.
    df_result = pd.merge(df_result, df, how="outer", on=["Country", "Years"])

In [8]:
# Preview the first rows of the merged DataFrame
df_result.head()

Unnamed: 0.1,Unnamed: 0_x,Country,Years,Total final energy consumption (TFEC) (TJ) [1.1_TOTAL.FINAL.ENERGY.CONSUM],Unnamed: 0_y,Total electricity output (GWh) [4.1.1_TOTAL.ELECTRICITY.OUTPUT],Unnamed: 0_x.1,Energy intensity level of primary energy (MJ/2011 USD PPP) [6.1_PRIMARY.ENERGY.INTENSITY],Unnamed: 0_y.1,Renewable electricity share of total electricity output (%) [4.1_SHARE.RE.IN.ELECTRICITY],...,Unnamed: 0_x.2,Primary Energy: Consumption EJ,Unnamed: 0_y.2,Renewables:Consumption EJ,Unnamed: 0_x.3,WEB Services Energy Indicators indexed,Unnamed: 0_y.3,Renewable installed PV Power GW,Unnamed: 0,Renewable installed Wind Power GW
0,0.0,China,2000,30187188.73,0.0,1355738,0.0,10.23305051,0.0,16.63949819,...,2,42.453161,2,0.031489,,,2,0.0,3,0.352
1,1.0,China,2001,30940867.04,1.0,1480949,1.0,9.833583306,1.0,18.95851917,...,6,44.841977,6,0.033018,,,6,0.0,7,0.406
2,2.0,China,2002,32272543.97,2.0,1654164,2.0,9.621842057,2.0,17.6186279,...,10,48.844607,10,0.035471,,,10,0.0,11,0.473
3,3.0,China,2003,35505254.39,3.0,1910755,3.0,9.994209983,3.0,15.03704033,...,14,56.875222,14,0.041101,,,14,0.0,15,0.568
4,4.0,China,2004,41469975.08,4.0,2203502,4.0,10.35599738,4.0,16.22326642,...,18,66.545716,18,0.05783,,,18,0.0,19,0.764


In [11]:
# List the column labels of the merged DataFrame
# NOTE(review): this cell's execution count (11) is higher than that of the
# column clean-up cell that follows it (10), so the saved output presumably
# shows the frame after that clean-up — re-run the notebook in order to confirm.
df_result.columns

Index(['Country', 'Years',
       'Total final energy consumption (TFEC) (TJ) [1.1_TOTAL.FINAL.ENERGY.CONSUM]',
       'Total electricity output (GWh) [4.1.1_TOTAL.ELECTRICITY.OUTPUT]',
       'Energy intensity level of primary energy (MJ/2011 USD PPP) [6.1_PRIMARY.ENERGY.INTENSITY]',
       'Renewable electricity share of total electricity output (%) [4.1_SHARE.RE.IN.ELECTRICITY]',
       'Renewable energy share of TFEC (%) [2.1_SHARE.TOTAL.RE.IN.TFEC]',
       'Renewable share electricity production',
       'Wind and solar share electricity production%',
       'Electricity generation TWh', 'Electricity generation from other TWh',
       'Renewable Energy Geothermal (Installed capacity)MW',
       'Renewable Energy Solar (Installed capacity) MW',
       'Renewable Energy Wind  (Installed capacity) MW',
       'Primary Energy: Consumption EJ', 'Renewables:Consumption EJ',
       'WEB Services Energy Indicators indexed',
       'Renewable installed PV Power GW', 'Renewable installed W

In [10]:
# Erase unwanted columns (this needs to be solved beforehand, when creating the csv's)
# Select by prefix instead of deleting fixed labels: this covers 'Unnamed: 0'
# and all of its merge-suffixed variants, and it is safe to re-run because it
# does not fail when no such column is left.
is_leftover_index = df_result.columns.astype(str).str.startswith('Unnamed: 0')
df_result = df_result.loc[:, ~is_leftover_index]

## Generate a *.csv* from the resulting DataFrame

In [12]:
# Write the merged frame without its row index: a saved index is read back by
# pd.read_csv as an extra 'Unnamed: 0' column, which is the kind of column this
# notebook has to remove after merging.
df_result.to_csv('General_Dataframe.csv', index=False)