In [1]:
import sys
# Make the parent directory importable (the next cell imports from `src`).
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if './../' not in sys.path:
    sys.path.append('./../')

In [2]:
import pandas as pd
from pathlib import Path
from src.input_data.table import Table
from src.io import read_jsonl

# Example statements

In [3]:
# Load records from the se_direct test split.
# Prerequisite: download the data into the matching directory before running
# this cell and the ones that follow.
dfile = Path("./../training_data/se_direct/test.jsonl")
data = read_jsonl(
    dfile.parent,
    dfile.name,
    lines_to_read=100,  # presumably caps how many JSONL lines are read -- confirm in src.io
)
# Show which keys a single record carries.
data[2].keys()

dict_keys(['structured_data', 'source', 'cell_annotations', 'unstructured_data', 'ud_type', 'augmentation_status', 'current_hash'])

In [4]:
# Was this record produced by augmentation, or is it an original table?
augmentation_status = data[2]['augmentation_status']
augmentation_status

'augmented'

In [5]:
# Wrap record 2 in a Table. Note that this record's 'augmentation_status' is
# 'augmented', so this is an augmented table rather than an original one.
# The table content is taken from 'unstructured_data' and the statements
# from 'structured_data'.
item = data[2]
t = Table(
    table=pd.DataFrame(item['unstructured_data']),
    annotations=item['structured_data'],
)
t.to_dataframe()

Unnamed: 0,0,1,2,3
0,Australia - Scope 1,2021,2020,2022
1,Emissions from fuel consumption (tCO$_{2}$e),17957,14574,19561
2,Emissions from Killara Feedlot cattle (tCO$_{2...,37462,-,44826
3,Fuel consumption (GJ),255873,207569,278969
4,Australia - Scope 2,,,
5,Electricity consumption from the grid (GJ),24201,41051,29104
6,Total renewable electricity (MWh),-,-,"8,084 (100%)"
7,Voluntary LGCs,-,-,6581
8,Electricity consumption from the grid (MWh),6722,11403,8084
9,Mandatory LGCs 1,-,-,1503


In [6]:
# statements (annotations) attached to this table
t.annotations

[{'property': ['Australia - Scope 1 : Emissions from fuel consumption (tCO$_{2}$e) ',
   'time'],
  'property_value': ['17,957 ', '2021 '],
  'unit': ['', ''],
  'subject': ['', ''],
  'subject_value': ['', ''],
  'predicate_hash': ['595552ba529257f919c4da9696e07990',
   '595552ba529257f919c4da9696e07990']},
 {'property': ['Australia - Scope 1 : Emissions from fuel consumption (tCO$_{2}$e) ',
   'time'],
  'property_value': ['14,574 ', '2020 '],
  'unit': ['', ''],
  'subject': ['', ''],
  'subject_value': ['', ''],
  'predicate_hash': ['3c47364a596f1ca1d3850cd41d87568c',
   '3c47364a596f1ca1d3850cd41d87568c']},
 {'property': ['Australia - Scope 1 : Emissions from fuel consumption (tCO$_{2}$e) ',
   'time'],
  'property_value': ['19,561 ', '2022 '],
  'unit': ['', ''],
  'subject': ['', ''],
  'subject_value': ['', ''],
  'predicate_hash': ['de51e3a418aa32420ca753dbc6726e74',
   'de51e3a418aa32420ca753dbc6726e74']},
 {'property': ['Australia - Scope 1 : Emissions from Killara Feedlot c

# Example of converting a labels table to statements

In [7]:
# Load records from the se_indirect_2d test split.
# Prerequisite: download the data into the matching directory before running
# this cell and the ones that follow.
dfile = Path("./../training_data/se_indirect_2d/test.jsonl")
data = read_jsonl(
    dfile.parent,
    dfile.name,
    lines_to_read=100,  # presumably caps how many JSONL lines are read -- confirm in src.io
)
# Show which keys a single record carries.
data[0].keys()

dict_keys(['table', 'collection', 'annotations', 'verifications', 'classification', 'cell_annotations', 'ds_table', 'source', 'verified', 'augmentation_status'])

In [8]:
# Was this record produced by augmentation, or is it an original table?
augmentation_status = data[0]['augmentation_status']
augmentation_status

'original'

In [9]:
# Wrap record 0 (an original table) in a Table, passing the table content,
# its (statement) annotations and the per-cell labels table.
item = data[0]
t = Table(
    table=pd.DataFrame(item['table']),
    annotations=item['annotations'],
    cell_annotations=pd.DataFrame(item['cell_annotations']),
)
t.to_dataframe()

Unnamed: 0,0,1,2,3,4,5,6,7
0,Emissions in scope,Base year emissions (kiloton CO$_{2}$e),Reduction target (%),Base year,Target year,Scope,Use of offsets or removals,SBTi status
1,Scope 1,75,,,,,,
2,Scope 2 (market-based),185,,,,,,
3,Scope 3:,,35 (all scopes),2016,2022,Company-wide,,Verified (2017)
4,Business travel,154,,,,,,
5,Downstream transportation,146,,,,,,
6,Total,560,,,,,,


In [10]:
# at this point the table has no statements yet: its annotations list is empty
t.annotations

[]

In [11]:
# labels table: one label per table cell (e.g. property, property_value, key, empty)
t.cell_annotations

Unnamed: 0,0,1,2,3,4,5,6,7
0,header_1,unit_value,key,time_property,key,key,key,key
1,property,property_value,empty,empty,empty,empty,empty,empty
2,property,property_value,empty,empty,empty,empty,empty,empty
3,property,empty,key_value,time_value,key_value,key_value,key_value,key_value
4,property,property_value,empty,empty,empty,empty,empty,empty
5,property,property_value,empty,empty,empty,empty,empty,empty
6,property,property_value,empty,empty,empty,empty,empty,empty


In [12]:
# convert the labels table to statements: the annotations list was empty
# before this call and holds the derived statements afterwards.
# NOTE(review): the call appears to update `t` in place -- confirm that
# re-running this cell is safe.
t.convert_to_structured_data()
t.annotations

[{'property': ['Emissions in scope : Scope 1 '],
  'property_value': ['75 '],
  'unit': ['Base year emissions (kiloton CO$_{2}$e) '],
  'subject': [''],
  'subject_value': ['']},
 {'property': ['Emissions in scope : Scope 2 (market-based) '],
  'property_value': ['185 '],
  'unit': ['Base year emissions (kiloton CO$_{2}$e) '],
  'subject': [''],
  'subject_value': ['']},
 {'property': ['Emissions in scope : Business travel '],
  'property_value': ['154 '],
  'unit': ['Base year emissions (kiloton CO$_{2}$e) '],
  'subject': [''],
  'subject_value': ['']},
 {'property': ['Emissions in scope : Downstream transportation '],
  'property_value': ['146 '],
  'unit': ['Base year emissions (kiloton CO$_{2}$e) '],
  'subject': [''],
  'subject_value': ['']},
 {'property': ['Emissions in scope : Total '],
  'property_value': ['560 '],
  'unit': ['Base year emissions (kiloton CO$_{2}$e) '],
  'subject': [''],
  'subject_value': ['']}]