In [62]:
import importlib as il
import os
import sys

import numpy as np
import pandas as pd

# Put the project's `src` directory on the import path so the project modules
# imported below can be resolved. The location is relative to the notebook's
# working directory.
src = os.path.abspath(os.path.join('..', '..', 'src'))
# Guard the append: re-running this cell must not pile up duplicate entries.
if src not in sys.path:
    sys.path.append(src)

# project modules
import ingest.ingest as ing
import modules.aggregate as agg
import modules.calculate as calc
from modules.helpers import show_all, print_na

In [2]:
# Notebook-wide switches.
# Read-in verbosity: these two flags are forwarded to the ingest step as its
# `q` (quiet) and `v` (verbose) arguments.
quiet_read_in = True
verbose_read_in = False
# Toggles printing of the rows with NA values at each stage of building the dataset.
v_print_na = False

In [3]:
# Create the ingester from the schema file, then read and parse the input data.
schema_in_file = '../ingest/schema.json'
data_dir = '../ingest/data_in/'
ingest = ing.Ingest(schema_in_file)
# `parsed_datasets` is indexed by dataset name in the cells that follow.
parsed_datasets = ingest.read_and_parse(
    data_dir,
    v=verbose_read_in,
    q=quiet_read_in,
)

In [4]:
# Show which dataset names the ingester reports.
print('Imported datasets:\n', ingest.ds_names, sep=' ', end='\n')

Imported datasets:
 ['areas_countries_data', 'areas_zones_data', 'building_statistics_fin', 'climatetrace_countries', 'countries_data', 'heating_demand_data', 'manual_continents_data', 'ne_countries_continents', 'on_site_heat_data']


In [5]:
# Bind every parsed dataset to a notebook variable of the same name.
(
    areas_countries_data,
    areas_zones_data,
    building_statistics_fin,
    climatetrace_countries,
    countries_data,
    heating_demand_data,
    manual_continents_data,
    ne_countries_continents,
    on_site_heat_data,
) = (
    parsed_datasets[dataset_name]
    for dataset_name in (
        'areas_countries_data',
        'areas_zones_data',
        'building_statistics_fin',
        'climatetrace_countries',
        'countries_data',
        'heating_demand_data',
        'manual_continents_data',
        'ne_countries_continents',
        'on_site_heat_data',
    )
)

In [6]:
# Combine the two continent tables into one full list of country codes and
# their continents.
combined_continents = agg.Aggregate.combine_continents(
    ne_countries_continents,
    manual_continents_data,
    columns=['iso3_code'],
)
# To inspect the dataset at this stage, uncomment:
#combined_continents.info()
#show_all(combined_continents.sort_values('iso3_code'))

In [7]:
# Heated floor area factors are attached to each country through its continent.
# Only 'ohfa_factor' is pulled in: the "On-site heated floor area factor, the
# percentage of non-district heated floor area, product of oheg and hfa".
print('Heated floor area factors columns:\n', list(on_site_heat_data.columns))

df, other = combined_continents, on_site_heat_data
on, cols = 'continent_name', ['ohfa_factor']
ohfa_factor_countries = agg.Aggregate.add_cols(df, other, on=on, cols=cols)
print_na(ohfa_factor_countries, cols, v=v_print_na)
# To inspect the dataset at this stage, uncomment:
#ohfa_factor_countries.info()
#show_all(ohfa_factor_countries.sort_values('iso3_code'))

Heated floor area factors columns:
 ['continent_name', 'oheg_factor', 'hfa_factor', 'ohfa_factor']

Number of NA values in new column ohfa_factor:
 0
***


In [8]:
# Reconcile with the Climate Trace country list: that list is the base frame,
# and the continent name and ohfa factor are added to it by ISO3 code.
df, other = climatetrace_countries, ohfa_factor_countries
on, cols = 'iso3_code', ['continent_name', 'ohfa_factor']
meta_countries = agg.Aggregate.add_cols(df, other, on=on, cols=cols)
print_na(meta_countries, cols, v=v_print_na)
# To inspect the dataset at this stage, uncomment:
#meta_countries.info()
#show_all(meta_countries.sort_values('iso3_code'))


Number of NA values in new column continent_name:
 3
***

Number of NA values in new column ohfa_factor:
 3
***


In [9]:
# Country metadata is added: country area (m2_area).
# NOTE(review): this cell feeds `meta_countries` back into itself, so running it
# again without re-running the previous cell operates on the already-extended
# frame - confirm how `add_cols` behaves in that case.
df, other = meta_countries, areas_countries_data
on, cols = 'iso3_code', ['m2_area']
meta_countries = agg.Aggregate.add_cols(df, other, on=on, cols=cols)
print_na(meta_countries, cols, v=v_print_na)
# To inspect the dataset at this stage, uncomment:
#meta_countries.info()
#show_all(meta_countries.sort_values('iso3_code'))


Number of NA values in new column m2_area:
 4
***


In [10]:
# Country metadata is added:
#   - pop_density: population density
#   - emission_factor: emission factor in tonnes CO2 per GWh
# NOTE(review): this cell feeds `meta_countries` back into itself, so running it
# again without re-running the earlier cells operates on the already-extended
# frame - confirm how `add_cols` behaves in that case.
df, other = meta_countries, countries_data
on, cols = 'iso3_code', ['pop_density', 'emission_factor']
meta_countries = agg.Aggregate.add_cols(df, other, on=on, cols=cols)
print_na(meta_countries, cols, v=v_print_na)
# To inspect the dataset at this stage, uncomment:
#meta_countries.info()
#show_all(meta_countries.sort_values('iso3_code'))


Number of NA values in new column pop_density:
 3
***

Number of NA values in new column emission_factor:
 3
***


In [11]:
# Building statistics are added: the gross floor area recorded for the chosen
# observation year is attached to every country row as a constant column.

year = 2019
floor_area_for_year = building_statistics_fin.loc[
    building_statistics_fin['observation_year'] == year, 'gross_floor_area'
]
if floor_area_for_year.empty:
    # Fail with a clear message instead of an opaque IndexError further down.
    raise ValueError(
        'No building statistics found for observation year {}'.format(year)
    )
col_name = 'building_statistics_fin_' + str(year)
# Assigning a scalar broadcasts it to every row; no per-row list is needed.
# Only the first matching record is used, as before.
meta_countries[col_name] = floor_area_for_year.iloc[0]
# To inspect the dataset at this stage, uncomment:
#meta_countries.info()
#show_all(meta_countries.sort_values('iso3_code'))

In [12]:
# Climate zone areas are added.
# The country-level 'm2_area' is renamed to 'country_area' on a copy first, so
# the 'm2_area' column pulled in here refers to the climate zone area.
df, other = meta_countries.rename(columns={'m2_area': 'country_area'}), areas_zones_data
on, cols = 'iso3_code', ['climate_zone', 'm2_area']
zoned_countries = agg.Aggregate.add_cols(df, other, on=on, cols=cols)
print_na(zoned_countries, cols, v=v_print_na)
# To inspect the dataset at this stage, uncomment:
#zoned_countries.info()
#show_all(zoned_countries.sort_values('iso3_code'))


Number of NA values in new column climate_zone:
 37
***

Number of NA values in new column m2_area:
 37
***


In [13]:
# The heating demand factor is looked up by climate zone and added to each row.
df, other = zoned_countries, heating_demand_data
on, cols = 'climate_zone', ['heating_demand_factor']
demand_factor_countries = agg.Aggregate.add_cols(df, other, on=on, cols=cols)
print_na(demand_factor_countries, cols, v=v_print_na)
# To inspect the dataset at this stage, uncomment:
#demand_factor_countries.info()
#show_all(demand_factor_countries.sort_values('iso3_code'))


Number of NA values in new column heating_demand_factor:
 37
***


In [18]:
# Take an independent copy of the merged frame as the working dataset for the
# calculation cells, then show it.
data = demand_factor_countries.copy(deep=True)
# For the complete listing sorted by country code, uncomment:
#show_all(data.sort_values('iso3_code'))
display(data)

Unnamed: 0,country_name,iso3_code,continent_name,ohfa_factor,country_area,pop_density,emission_factor,building_statistics_fin_2019,climate_zone,m2_area,heating_demand_factor
0,Aruba,ABW,North America,0.63,1.819384e+08,0.000624,199.4,494625956.0,1.0,1.238305e+08,0.000020
1,Afghanistan,AFG,Asia,0.56,6.438580e+11,0.000049,199.4,494625956.0,3.0,6.228012e+11,0.000052
1,Afghanistan,AFG,Asia,0.56,6.438580e+11,0.000049,199.4,494625956.0,2.0,1.948273e+10,0.000023
1,Afghanistan,AFG,Asia,0.56,6.438580e+11,0.000049,199.4,494625956.0,5.0,1.573999e+09,0.000160
2,Angola,AGO,Africa,0.45,1.247422e+12,0.000023,199.4,494625956.0,1.0,1.237890e+12,0.000020
...,...,...,...,...,...,...,...,...,...,...,...
249,South Africa,ZAF,Africa,0.45,1.219709e+12,0.000048,199.4,494625956.0,3.0,1.683765e+11,0.000052
249,South Africa,ZAF,Africa,0.45,1.219709e+12,0.000048,199.4,494625956.0,1.0,1.458865e+11,0.000020
249,South Africa,ZAF,Africa,0.45,1.219709e+12,0.000048,199.4,494625956.0,2.0,9.054464e+11,0.000023
250,Zambia,ZMB,Africa,0.45,7.504916e+11,0.000022,199.4,494625956.0,1.0,7.504916e+11,0.000020


In [69]:
# Calculate floor area estimates
#
# `calc` is the `modules.calculate` module and `np` is numpy, both imported in
# the first cell; nothing is imported here so the cell does not silently rebind
# `calc` to a different module object.
# NOTE(review): the previous version of this cell did `import calculate as calc`;
# this assumes that is the same module as `modules.calculate` - confirm.
il.reload(calc)  # pick up edits made to the module during this session

# Column names in `data` handed to the estimator.
# NOTE(review): presumably 'pd_i' / 'ca_i' are the per-row population density
# and area inputs - confirm against `calc.estimate_floor_area`.
cm = {
    'pd_i': 'pop_density',
    'ca_i': 'm2_area'
}
year = 2019
ref_iso3_code = 'FIN'

# Reference values for the reference country. `meta_countries` still carries the
# whole-country area under 'm2_area'; in `data` that column is 'country_area'
# and 'm2_area' holds the climate zone area.
ref_floor_area = building_statistics_fin.loc[
    building_statistics_fin['observation_year'] == year, 'gross_floor_area'
]
ref_country = meta_countries.loc[meta_countries['iso3_code'] == ref_iso3_code]
if ref_floor_area.empty or ref_country.empty:
    # Fail with a clear message instead of an opaque IndexError.
    raise ValueError(
        'Missing reference data for country {} and year {}'.format(ref_iso3_code, year)
    )
rv = {
    'fa_r': ref_floor_area.iloc[0],
    'ca_r': ref_country['m2_area'].iloc[0],
    'pd_r': ref_country['pop_density'].iloc[0]
}
estimated_floor_area = calc.estimate_floor_area(data, cm, rv)
print_na(estimated_floor_area, ['estimated_floor_area'], v=v_print_na)


Number of NA values in new column estimated_floor_area:
 37
***


In [78]:
# Check calculation on the reference country: compare the summed estimate with
# the recorded floor area, and the summed zone areas with the country area.
ca_ir = np.sum(data.loc[data['iso3_code'] == ref_iso3_code, 'm2_area'])
efa_r = np.sum(
    estimated_floor_area.loc[
        estimated_floor_area['iso3_code'] == ref_iso3_code,
        'estimated_floor_area',
    ]
)
# Relative errors: `me` for the area measurement, `ee` for the floor area estimate.
me = (rv['ca_r'] - ca_ir) / rv['ca_r']
ee = (rv['fa_r'] - efa_r) / rv['fa_r']
print(
    'Floor area estimate for reference country: {}\n'
    'Actual floor area for reference country: {}\n'
    'Estimate error: {}\n*'.format(efa_r, rv['fa_r'], ee)
)
print(
    'Whole country measured area: {}\n'
    'Zone ares sums: {}\n'
    'Area measurement error:{}\n*'.format(rv['ca_r'], ca_ir, me)
)
# If the estimation error is the same size as the measurement error, the calculation worked
errors_match = np.isclose(ee, me)
print('Estimate error is large part due to measurement error: {}\n***'.format(errors_match))

Floor area estimate for reference country: 494409695.1783241
Actual floor area for reference country: 494625956.0
Estimate error: 0.0004372209324087647
*
Whole country measured area: 336940111431.491
Zone ares sums: 336792794161.805
Area measurement error:0.00043722093240889997
*
Estimate error is large part due to measurement error: True
***


In [None]:
# Calculate emissions estimates

In [None]:
# Aggregate data
#data.groupby(['iso3_code'])['climate_zone'].sum()

In [None]:
# Prepare data for publication