In [1]:
import pandas as pd
import importlib as il
import sys, os

# Make the project's `src` directory (two levels above the current working
# directory) importable so the project modules below can be found.
src = os.path.abspath(os.path.join('..', '..', 'src'))
# Guard the append: re-running this cell must not pile up duplicate entries
# on sys.path.
if src not in sys.path:
    sys.path.append(src)

# project modules
import ingest.ingest as ing
import modules.aggregate as agg
from modules.helpers import show_all, print_na

In [2]:
# Toggles printing of the rows with NA values at each stage of building the dataset.
v_print_na = False
# Toggles quiet mode for reading the data into the notebook.
quiet_read_in = True
# Toggles verbose mode for reading the data into the notebook.
verbose_read_in = False

In [3]:
# Paths are relative to the notebook's working directory.
schema_file = '../ingest/schema.json'
data_dir = '../ingest/data_in/'

# Build the ingester from the schema file, then read and parse everything in
# `data_dir`. The result is looked up by dataset name further down.
ingest = ing.Ingest(schema_file)
parsed_datasets = ingest.read_and_parse(
    data_dir,
    q=quiet_read_in,
    v=verbose_read_in,
)

In [4]:
# Show the names of the datasets that were read in (`ingest.ds_names`).
print('Imported datasets:\n', ingest.ds_names)

Imported datasets:
 ['areas_countries_data', 'areas_zones_data', 'building_statistics_fin', 'climatetrace_countries', 'countries_data', 'heating_demand_data', 'manual_continents_data', 'ne_countries_continents', 'on_site_heat_data']


In [5]:
# Bind every parsed dataset to a notebook-level variable of the same name.
# The order of the names on the left must match the order of the keys on the
# right.
(
    areas_countries_data,
    areas_zones_data,
    building_statistics_fin,
    climatetrace_countries,
    countries_data,
    heating_demand_data,
    manual_continents_data,
    ne_countries_continents,
    on_site_heat_data,
) = (
    parsed_datasets[dataset_name]
    for dataset_name in (
        'areas_countries_data',
        'areas_zones_data',
        'building_statistics_fin',
        'climatetrace_countries',
        'countries_data',
        'heating_demand_data',
        'manual_continents_data',
        'ne_countries_continents',
        'on_site_heat_data',
    )
)

In [11]:
# `combined_continents` is the full list of country codes with their
# continents, built from the two continent datasets.
combined_continents = agg.Aggregate.combine_continents(
    ne_countries_continents,
    manual_continents_data,
    columns=['iso3_code'],
)
# To inspect the dataset at this stage, uncomment:
#combined_continents.info()
#show_all(combined_continents.sort_values('iso3_code'))

In [28]:
# Heated floor area factors are added to each country based on its continent.
# Only 'ohfa_factor' is carried over, that is the "On-site heated floor area
# factor, the percentage of non-district heated floor area, product of oheg
# and hfa".
print('Heated floor area factors columns:\n', list(on_site_heat_data.columns))

df, other = combined_continents, on_site_heat_data
on, cols = 'continent_name', ['ohfa_factor']
ohfa_factor_countries = agg.Aggregate.add_cols(df, other, cols=cols, on=on)
# Report how many NA values ended up in the newly added column(s).
print_na(ohfa_factor_countries, cols, v=v_print_na)
# To inspect the dataset at this stage, uncomment:
#ohfa_factor_countries.info()
#show_all(ohfa_factor_countries.sort_values('iso3_code'))

Heated floor area factors columns:
 ['continent_name', 'oheg_factor', 'hfa_factor', 'ohfa_factor']

Number of NA values in new column ohfa_factor:
 0
***


In [29]:
# Reconcile the country list with Climate Trace: the Climate Trace countries
# are the base, and continent name plus ohfa_factor are attached by ISO3 code.
df, other = climatetrace_countries, ohfa_factor_countries
on, cols = 'iso3_code', ['continent_name', 'ohfa_factor']
meta_countries = agg.Aggregate.add_cols(df, other, cols=cols, on=on)
# Report how many NA values ended up in the newly added column(s).
print_na(meta_countries, cols, v=v_print_na)
# To inspect the dataset at this stage, uncomment:
#meta_countries.info()
#show_all(meta_countries.sort_values('iso3_code'))


Number of NA values in new column continent_name:
 3
***

Number of NA values in new column ohfa_factor:
 3
***


In [30]:
# Country metadata is added: country area (m2_area).
# NOTE(review): `meta_countries` is both the input and the output of this
# cell, so re-running it on its own feeds it its own result -- confirm
# add_cols tolerates that, or re-run from the cell that first builds it.
df, other = meta_countries, areas_countries_data
on, cols = 'iso3_code', ['m2_area']
meta_countries = agg.Aggregate.add_cols(df, other, cols=cols, on=on)
# Report how many NA values ended up in the newly added column(s).
print_na(meta_countries, cols, v=v_print_na)
# To inspect the dataset at this stage, uncomment:
#meta_countries.info()
#show_all(meta_countries.sort_values('iso3_code'))


Number of NA values in new column m2_area:
 4
***


In [31]:
# Country metadata is added:
#   - pop_density: population density
#   - emission_factor: emission factor in tonnes CO2 per GWh
# NOTE(review): `meta_countries` is both the input and the output of this
# cell, so re-running it on its own feeds it its own result -- confirm
# add_cols tolerates that, or re-run from the cell that first builds it.
df, other = meta_countries, countries_data
on, cols = 'iso3_code', ['pop_density', 'emission_factor']
meta_countries = agg.Aggregate.add_cols(df, other, cols=cols, on=on)
# Report how many NA values ended up in the newly added column(s).
print_na(meta_countries, cols, v=v_print_na)
# To inspect the dataset at this stage, uncomment:
#meta_countries.info()
#show_all(meta_countries.sort_values('iso3_code'))


Number of NA values in new column pop_density:
 3
***

Number of NA values in new column emission_factor:
 3
***


In [33]:
# Building statistics are added: the gross floor area recorded in
# `building_statistics_fin` for `year` is attached to every row of
# `meta_countries` as a constant column named 'building_statistics_fin_<year>'.

year = 2019
col_name = 'building_statistics_fin_' + str(year)
# Select rows and column in a single .loc call instead of chained indexing.
data = building_statistics_fin.loc[
    building_statistics_fin['observation_year'] == year,
    'gross_floor_area',
]
if data.empty:
    # Same exception type the bare positional lookup used to raise, but with
    # a message that says what is actually missing.
    raise IndexError(
        'building_statistics_fin has no row for observation_year {}'.format(year)
    )
# A scalar is broadcast to every row, so no Python list of repeated values is
# needed. `assign` returns a new frame, so the frame produced by the previous
# cell is not mutated in place. If several rows match `year`, the first one is
# used (as before).
meta_countries = meta_countries.assign(**{col_name: data.iloc[0]})
# To inspect the dataset at this stage, uncomment:
#meta_countries.info()
#show_all(meta_countries.sort_values('iso3_code'))

In [34]:
# Climate zone areas are added.
# The country-level 'm2_area' column is renamed to 'country_area' first; the
# 'm2_area' column added here comes from the climate-zone dataset.
df = meta_countries.rename(columns=dict(m2_area='country_area'))
other = areas_zones_data
on, cols = 'iso3_code', ['climate_zone', 'm2_area']
zoned_countries = agg.Aggregate.add_cols(df, other, cols=cols, on=on)
# Report how many NA values ended up in the newly added column(s).
print_na(zoned_countries, cols, v=v_print_na)
# To inspect the dataset at this stage, uncomment:
#zoned_countries.info()
#show_all(zoned_countries.sort_values('iso3_code'))


Number of NA values in new column climate_zone:
 37
***

Number of NA values in new column m2_area:
 37
***


In [35]:
# Heating demand data is added, matched on the climate zone of each row.
df, other = zoned_countries, heating_demand_data
on, cols = 'climate_zone', ['heating_demand_factor']
demand_factor_countries = agg.Aggregate.add_cols(df, other, cols=cols, on=on)
# Report how many NA values ended up in the newly added column(s).
print_na(demand_factor_countries, cols, v=v_print_na)
# To inspect the dataset at this stage, uncomment:
#demand_factor_countries.info()
#show_all(demand_factor_countries.sort_values('iso3_code'))


Number of NA values in new column heating_demand_factor:
 37
***


In [36]:
# Continue with a copy of the assembled dataset under the name `data`,
# leaving `demand_factor_countries` itself as the result of the previous stage.
data = demand_factor_countries.copy()

In [37]:
# Display the assembled dataset, sorted by ISO3 country code, using the
# project's show_all helper.
show_all(data.sort_values('iso3_code'))

Unnamed: 0,country_name,iso3_code,continent_name,ohfa_factor,country_area,pop_density,emission_factor,building_statistics_fin_2019,climate_zone,m2_area,heating_demand_factor
0,Aruba,ABW,North America,0.63,181938400.0,0.000624,199.4,494625956.0,1.0,123830500.0,2e-05
1,Afghanistan,AFG,Asia,0.56,643858000000.0,4.889e-05,199.4,494625956.0,3.0,622801200000.0,5.2e-05
1,Afghanistan,AFG,Asia,0.56,643858000000.0,4.889e-05,199.4,494625956.0,2.0,19482730000.0,2.3e-05
1,Afghanistan,AFG,Asia,0.56,643858000000.0,4.889e-05,199.4,494625956.0,5.0,1573999000.0,0.00016
2,Angola,AGO,Africa,0.45,1247422000000.0,2.346e-05,199.4,494625956.0,1.0,1237890000000.0,2e-05
2,Angola,AGO,Africa,0.45,1247422000000.0,2.346e-05,199.4,494625956.0,2.0,9410578000.0,2.3e-05
3,Anguilla,AIA,North America,0.63,83303310.0,0.00014013,199.4,494625956.0,1.0,83303310.0,2e-05
4,Åland Islands,ALA,Northern Europe,0.27,1506313000.0,1.919e-05,199.4,494625956.0,4.0,1074806000.0,0.000133
4,Åland Islands,ALA,Northern Europe,0.27,1506313000.0,1.919e-05,199.4,494625956.0,5.0,1379091.0,0.00016
5,Albania,ALB,Europe,0.54,28735420000.0,9.973e-05,199.4,494625956.0,4.0,8305079000.0,0.000133
