In [1]:
import pandas as pd
import numpy as np
from functools import partial
import datetime as dt
import importlib as il
import sys, os
src = os.path.abspath(os.path.join('..', '..', 'src'))
sys.path.append(src)

# project modules
import ingest.ingest as ing
import modules.aggregate as agg
import modules.calculate as calc
import modules.assemble as asse
from modules.helpers import show_all, print_na

In [2]:
# Configurations

v_print_na = False  # Toggles printing rows with NA values at each stage of building the dataset
quiet_read_in = True  # Toggles quiet mode for reading in data to notebook
verbose_read_in = False  # Toggles verbose mode for reading in data to notebook
data_dir = os.path.join('..', 'ingest', 'data_in') + '/'
schema_in_file = os.path.join('..', 'ingest', 'schema.json')
# archive timestamp, always 8 digits (YYYYMMDD)
now = dt.datetime.now()
# The earlier `len(str(now.month)) > 2` test could never be true, so October-December
# were rendered as '010'-'012', and the day was not zero-padded at all.
# strftime pads both fields, which keeps the stamp unambiguous and sortable.
month = now.strftime('%m')
archival_time = now.strftime('%Y%m%d')
# Unaggregated estimates
unaggregated_estimates_file = os.path.join('..', 'data_out/') + archival_time + '_unaggregated_estimates.csv'

In [3]:
# archive timestamp, always 8 digits (YYYYMMDD)
now = dt.datetime.now()
# The earlier `len(str(now.month)) > 2` test could never be true, so October-December
# were rendered as '010'-'012', and the day was not zero-padded at all.
# strftime pads both fields, which keeps the stamp unambiguous and sortable.
month = now.strftime('%m')
archival_time = now.strftime('%Y%m%d')
# Aggregated estimates
aggregated_estimates_file = os.path.join('..', 'data_out/') + archival_time + '_aggregated_estimates.csv'

In [4]:
# archive timestamp, always 8 digits (YYYYMMDD)
now = dt.datetime.now()
# The earlier `len(str(now.month)) > 2` test could never be true, so October-December
# were rendered as '010'-'012', and the day was not zero-padded at all.
# strftime pads both fields, which keeps the stamp unambiguous and sortable.
month = now.strftime('%m')
archival_time = now.strftime('%Y%m%d')
# Production data
prod_data_estimates_file = os.path.join('..', 'data_out/') + archival_time + '_prod_data_estimates.csv'
# Note that the following path contains a folder that is not tracked by git;
# make sure it exists before writing to this location
delivery_file_name = os.path.join('..', 'prod_data/') + 'buildings_avoin_' + archival_time + '.csv'

In [5]:
# Build the ingester from the schema file set in the configuration cell
ingest = ing.Ingest(schema_in_file)
# Load the input data from data_dir; the result is indexed by dataset name further down.
# v / q pass on the verbose / quiet toggles from the configuration cell.
# NOTE(review): presumably every file described by the schema is read and parsed here -
# confirm in ingest.ingest
parsed_datasets = ingest.read_and_parse(data_dir, v=verbose_read_in, q=quiet_read_in)

In [6]:
# Sanity check: list the dataset names reported by the ingester before unpacking them
print('Imported datasets:\n', ingest.ds_names)

Imported datasets:
 ['areas_countries_data', 'areas_zones_data', 'building_statistics_fin', 'climatetrace_countries', 'countries_data', 'heating_demand_data', 'manual_continents_data', 'ne_countries_continents', 'on_site_heat_data']


In [7]:
# Give every parsed dataset its own notebook-level name.
# The targets and the lookup keys are listed in the same order; a missing key
# raises KeyError.
(
    areas_countries_data,
    areas_zones_data,
    building_statistics_fin,
    climatetrace_countries,
    countries_data,
    heating_demand_data,
    manual_continents_data,
    ne_countries_continents,
    on_site_heat_data,
) = (
    parsed_datasets[dataset_name]
    for dataset_name in (
        'areas_countries_data',
        'areas_zones_data',
        'building_statistics_fin',
        'climatetrace_countries',
        'countries_data',
        'heating_demand_data',
        'manual_continents_data',
        'ne_countries_continents',
        'on_site_heat_data',
    )
)

In [8]:
# combined_continents is the full list of country codes and their continents,
# built from the ne_countries_continents dataset and the manually maintained continent data.
# NOTE(review): columns=['iso3_code'] is presumably the key used to combine the two - confirm
# in modules.aggregate
combined_continents = agg.Aggregate.combine_continents(ne_countries_continents, manual_continents_data, columns=['iso3_code'])
# Uncomment the following lines to see the dataset at this stage
#combined_continents.info()
#show_all(combined_continents.sort_values('iso3_code'))

In [9]:
# Heated floor area factors are attached to each country through its continent.
# Of the available factor columns only 'ohfa_factor' is carried over, that is the
# "On-site heated floor area factor, the percentage of non-district heated
# floor area, product of oheg and hfa".
print('Heated floor area factors columns:\n', list(on_site_heat_data.columns))

# left-hand frame / frame providing the new column
df, other = combined_continents, on_site_heat_data
# join key / columns to bring over
on, cols = 'continent_name', ['ohfa_factor']
ohfa_factor_countries = agg.Aggregate.add_cols(df, other, on=on, cols=cols)
# Report the number of NA values in the added column
print_na(ohfa_factor_countries, cols, v=v_print_na)
# Uncomment the following lines to see the dataset at this stage
#ohfa_factor_countries.info()
#show_all(ohfa_factor_countries.sort_values('iso3_code'))

Heated floor area factors columns:
 ['continent_name', 'oheg_factor', 'hfa_factor', 'ohfa_factor']

Number of NA values in column ohfa_factor:
 0
***


In [10]:
# Reconcile the country list with the Climate Trace country list: the Climate Trace
# countries are the base frame, continent and factor are looked up by ISO3 code.
# left-hand frame / frame providing the new columns
df, other = climatetrace_countries, ohfa_factor_countries
# join key / columns to bring over
on, cols = 'iso3_code', ['continent_name', 'ohfa_factor']
meta_countries = agg.Aggregate.add_cols(df, other, on=on, cols=cols)
# Report the number of NA values in each added column
print_na(meta_countries, cols, v=v_print_na)
# Uncomment the following lines to see the dataset at this stage
#meta_countries.info()
#show_all(meta_countries.sort_values('iso3_code'))


Number of NA values in column continent_name:
 3
***

Number of NA values in column ohfa_factor:
 3
***


In [11]:
# Country metadata is added: country area (m2_area)

# left-hand frame / frame providing the new column
df, other = meta_countries, areas_countries_data
# join key / columns to bring over
on, cols = 'iso3_code', ['m2_area']
meta_countries = agg.Aggregate.add_cols(df, other, on=on, cols=cols)
# Report the number of NA values in the added column
print_na(meta_countries, cols, v=v_print_na)
# Uncomment the following lines to see the dataset at this stage
#meta_countries.info()
#show_all(meta_countries.sort_values('iso3_code'))


Number of NA values in column m2_area:
 4
***


In [12]:
# Country metadata is added:
#   - pop_density: population density
#   - emission_factor: emission factor in tonnes CO2 per GWh

# left-hand frame / frame providing the new columns
df, other = meta_countries, countries_data
# join key / columns to bring over
on, cols = 'iso3_code', ['pop_density', 'emission_factor']
meta_countries = agg.Aggregate.add_cols(df, other, on=on, cols=cols)
# Report the number of NA values in each added column
print_na(meta_countries, cols, v=v_print_na)
# Uncomment the following lines to see the dataset at this stage
#meta_countries.info()
#show_all(meta_countries.sort_values('iso3_code'))


Number of NA values in column pop_density:
 3
***

Number of NA values in column emission_factor:
 3
***


In [13]:
# Climate zone areas are added

# The whole-country area is renamed to 'country_area' first, so that the
# 'm2_area' brought in from the zone data does not collide with it.
df, other = meta_countries.rename(columns={'m2_area': 'country_area'}), areas_zones_data
# join key / columns to bring over
on, cols = 'iso3_code', ['climate_zone', 'm2_area']
zoned_countries = agg.Aggregate.add_cols(df, other, on=on, cols=cols)
# Report the number of NA values in each added column
print_na(zoned_countries, cols, v=v_print_na)
# Uncomment the following lines to see the dataset at this stage
#zoned_countries.info()
#show_all(zoned_countries.sort_values('iso3_code'))


Number of NA values in column climate_zone:
 37
***

Number of NA values in column m2_area:
 37
***


In [14]:
# Heating demand data is added based on climate zones

# left-hand frame / frame providing the new column
df, other = zoned_countries, heating_demand_data
# join key / columns to bring over
on, cols = 'climate_zone', ['heating_demand_factor']
demand_factor_countries = agg.Aggregate.add_cols(df, other, on=on, cols=cols)
# Report the number of NA values in the added column
print_na(demand_factor_countries, cols, v=v_print_na)
# Uncomment the following lines to see the dataset at this stage
#demand_factor_countries.info()
#show_all(demand_factor_countries.sort_values('iso3_code'))


Number of NA values in column heating_demand_factor:
 37
***


In [15]:
# Floor area estimation; the reference values come from one country and one year.
data = demand_factor_countries.copy()
year = 2019
ref_iso3_code = 'FIN'
# column map: which columns of `data` hold the population density and the area
cm = {'pd_i': 'pop_density', 'ca_i': 'm2_area'}

# Row selectors for the reference year and the reference country
ref_year_rows = building_statistics_fin['observation_year'] == year
ref_country_rows = meta_countries['iso3_code'] == ref_iso3_code
# reference values: the first matching row of each selection
rv = {
    'fa_r': building_statistics_fin.loc[ref_year_rows, 'gross_floor_area'].values[0],
    'ca_r': meta_countries.loc[ref_country_rows, 'm2_area'].values[0],
    'pd_r': meta_countries.loc[ref_country_rows, 'pop_density'].values[0],
}
estimated_floor_area = calc.estimate_floor_area(data, cm, rv)
# Report the number of NA values in the estimate
print_na(estimated_floor_area, ['estimated_floor_area'], v=v_print_na)


Number of NA values in column estimated_floor_area:
 37
***


In [16]:
# Check calculation against the reference country
ref_zone_rows = data['iso3_code'] == ref_iso3_code
ref_estimate_rows = estimated_floor_area['iso3_code'] == ref_iso3_code
# Summed zone areas and summed floor area estimates of the reference country
ca_ir = np.sum(data.loc[ref_zone_rows, 'm2_area'])
efa_r = np.sum(estimated_floor_area.loc[ref_estimate_rows, 'estimated_floor_area'])
# Relative errors: estimate vs. actual floor area, zone area sum vs. whole-country area
ee = (rv['fa_r'] - efa_r) / rv['fa_r']
me = (rv['ca_r'] - ca_ir) / rv['ca_r']
print(
    'Floor area estimate for reference country: {}\n'
    'Actual floor area for reference country: {}\n'
    'Estimate error: {}\n*'.format(efa_r, rv['fa_r'], ee)
)
print(
    'Whole country measured area: {}\n'
    'Zone ares sums: {}\n'
    'Area measurement error:{}\n*'.format(rv['ca_r'], ca_ir, me)
)
# If the estimation error is the same size as the measurement error, the calculation worked
print('Estimate error is large part due to measurement error: {}\n***'.format(np.isclose(ee, me)))

Floor area estimate for reference country: 494409695.1783241
Actual floor area for reference country: 494625956.0
Estimate error: 0.0004372209324087647
*
Whole country measured area: 336940111431.491
Zone ares sums: 336792794161.805
Area measurement error:0.00043722093240889997
*
Estimate error is large part due to measurement error: True
***


In [17]:
# Calculate heat demand estimates
# Re-import the calculation module so that edits made to it during the session are picked up
il.reload(calc)
# Work on a copy of the floor area estimates
data = estimated_floor_area.copy()
# column map: names of the columns in `data` that hold the floor area and the
# heating demand factor
cm = {
    'area': 'estimated_floor_area',
    'heating_demand_factor': 'heating_demand_factor',
}
estimated_heat_demand = calc.calculate_heat_demand(data, cm)
# Report the number of NA values in the new column
print_na(estimated_heat_demand, ['estimated_heat_demand'], v=v_print_na)


Number of NA values in column estimated_heat_demand:
 37
***


In [18]:
# Calculate emissions estimates
# Re-import the calculation module so that edits made to it during the session are picked up
il.reload(calc)
# Work on a copy of the heat demand estimates
data = estimated_heat_demand.copy()
# column map: names of the columns in `data` that hold the floor area, the on-site
# heated floor area factor, the heating demand factor and the emission factor
# NOTE(review): how these four are combined is defined in calc.calculate_emissions - not visible here
cm = {
    'area': 'estimated_floor_area',
    'ohfa': 'ohfa_factor',
    'heating_demand': 'heating_demand_factor',
    'emission': 'emission_factor',
}
estimated_emissions = calc.calculate_emissions(data, cm)
# Report the number of NA values in the new column
print_na(estimated_emissions, ['estimated_emissions'], v=v_print_na)


Number of NA values in column estimated_emissions:
 37
***


In [19]:
# The emissions estimates, before any aggregation, are the "unaggregated" dataset
# (copied so that the estimate frame itself stays untouched)
unaggregated = estimated_emissions.copy()
# Uncomment the following 3 lines to see info about the unaggregated dataset
#display(unaggregated)
#unaggregated.info()
#print_na(unaggregated, unaggregated.columns, v=v_print_na)
# Uncomment the following line to save the unaggregated dataset
#unaggregated.to_csv(unaggregated_estimates_file)

In [20]:
# Reducer handed to reduce_cols: a sum that does not skip NA values, so a group
# containing any NA yields NA instead of a partial total.
s_na_sum = partial(pd.Series.sum, skipna=False)
# Grouping key and the estimate columns to reduce
# NOTE(review): presumably this yields one row per iso3_code with 'total_'-prefixed
# columns - confirm in modules.assemble
index, cols = 'iso3_code', ['estimated_floor_area', 'estimated_heat_demand', 'estimated_emissions']
aggregated = asse.reduce_cols(unaggregated, index, cols, s_na_sum)
# Uncomment the following 3 lines to see info about the aggregated dataset
#display(aggregated)
#aggregated.info()
#print_na(aggregated, aggregated.columns, v=v_print_na)
# Uncomment the following line to save the aggregated dataset
#aggregated.to_csv(aggregated_estimates_file)

In [21]:
# Prepare data for publication
# column renames applied to the aggregated dataset
cm = {'iso3_code': 'iso3_country', 'total_estimated_emissions': 'tCO2eq'}
# constant reporting period added to every row: 2019-01-01,2019-12-31
add_dates = {'begin_date': '2019-01-01', 'end_date': '2019-12-31'}
# columns that are not part of the published dataset
drop_cols = ['total_estimated_floor_area', 'total_estimated_heat_demand']
# rename/assign/drop each return a new frame, so `aggregated` is left untouched
# without an explicit copy. The scalar date strings are broadcast to every row,
# which replaces building a per-row list for each constant column.
prod_data = (
    aggregated
    .rename(columns=cm)
    .assign(**add_dates)
    .drop(columns=drop_cols)
)
# Uncomment the following 3 lines to see info about the prod_data dataset
#display(prod_data)
#prod_data.info()
#print_na(prod_data, prod_data.columns, v=v_print_na)
# Uncomment the following line to save the prod_data dataset
#prod_data.to_csv(prod_data_estimates_file)
# File name for delivery
#prod_data.to_csv(delivery_file_name)