In [57]:
import os
from pathlib import Path
from shutil import copyfile

%load_ext autoreload
%autoreload 2

ROOT = Path("msunique/Data")
DATA_DIR = Path("data")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# Flatten ROOT/<company>/<year>.json into DATA_DIR/<company>_<year>.json.
# copyfile() does not create missing directories, so make sure the target
# exists first; otherwise the first copy fails on a fresh checkout.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# `folder` / `filename` rather than `dir` / `file`, so the `dir` builtin is not
# clobbered for the cells that run afterwards.
for folder, _, filenames in os.walk(ROOT):
    folder = Path(folder)
    for filename in filenames:
        if filename.endswith('.json'):
            path = folder / filename
            # .name (not .stem) keeps a directory name that contains a dot
            # intact, e.g. "Co.Ltd" stays "Co.Ltd" instead of becoming "Co".
            dest = DATA_DIR / f"{folder.name}_{filename}"
            print(f"Copying {path} to {dest}")
            copyfile(path, dest)


Copying msunique/Data/Raiffeisen/2022.json to data/Raiffeisen_2022.json
Copying msunique/Data/Raiffeisen/2023.json to data/Raiffeisen_2023.json
Copying msunique/Data/Raiffeisen/2021.json to data/Raiffeisen_2021.json
Copying msunique/Data/ABB/2022.json to data/ABB_2022.json
Copying msunique/Data/ABB/2023.json to data/ABB_2023.json
Copying msunique/Data/ABB/2021.json to data/ABB_2021.json
Copying msunique/Data/Siemens/2022.json to data/Siemens_2022.json
Copying msunique/Data/Siemens/2023.json to data/Siemens_2023.json
Copying msunique/Data/Siemens/2021.json to data/Siemens_2021.json
Copying msunique/Data/IBM/2022.json to data/IBM_2022.json
Copying msunique/Data/IBM/2023.json to data/IBM_2023.json
Copying msunique/Data/IBM/2021.json to data/IBM_2021.json
Copying msunique/Data/PostFinance/2022.json to data/PostFinance_2022.json
Copying msunique/Data/PostFinance/2023.json to data/PostFinance_2023.json
Copying msunique/Data/PostFinance/2021.json to data/PostFinance_2021.json


In [60]:
from src.ingestion.report import Report

# Load the ABB 2021 report from its flattened JSON copy in DATA_DIR.
# The log printed by this cell suggests that loading also parses and embeds
# the report's texts.
report = Report.from_json(DATA_DIR/"ABB_2021.json")

Parsing texts for  ABB   2021  ...
Embedding texts for  ABB   2021  ...


In [55]:
# NOTE(review): `texts` is not assigned in any cell visible here; judging by
# the output it is a list of text strings from the ABB report. Confirm where
# it is defined so the notebook survives Restart & Run All.
texts

['ABB\n—\nAnnual\nreport',
 '1',
 '\nHighlights 2021\n===\n## Operational performance\nStrongly increased demand for\nABB\'s offering from the low level\nin the previous year period when\nthe adverse business impact\nof the COVID-19 pandemic was\nsignificant.\nOrders +20% (+17% comparable(1)\nand revenues +11% (+8% compa-\nrable) increased in all Business Ar-\neas and regions.\nAdverse impact from imbalances\nin the supply chain to some ex-\ntent hampered the ability to con-\nvert orders into actual deliveries,\nresulting in an order backlog of\n$16.6 billion, +16% (+21% compa-\nrable), year-on-year.\nStrong improvement in Opera-\ntional EBITA margin(1) to 14.2%,\n+310 basis points, higher in all\nBusiness Areas.\nLifted long-term targets as ABB\nexpects to drive through-the-cycle\nrevenue growth to 4-7% (3-5% or-\nganic and 1-2% acquired), in con-\nstant currency, and sharpened Op-\nerational EBITA margin target to\nbe at least 15% as from 2023, in\nany given year.\n## Portfolio manag

In [56]:
# NOTE(review): `metadatas` is not assigned in any cell visible here; judging
# by the output it is a list of dicts with a page number and the markdown
# headers found on that page. Confirm where it is defined.
metadatas

[{'page_number': 1, 'markdown_header': {}},
 {'page_number': 2, 'markdown_header': {}},
 {'page_number': 3,
  'markdown_header': {'Header 2': ['Operational performance',
    'Portfolio management',
    'Capital allocation']}},
 {'page_number': 4, 'markdown_header': {}},
 {'page_number': 5, 'markdown_header': {'Header 2': ['Key figures']}},
 {'page_number': 6, 'markdown_header': {}},
 {'page_number': 7, 'markdown_header': {}},
 {'page_number': 8, 'markdown_header': {}},
 {'page_number': 9, 'markdown_header': {}},
 {'page_number': 10,
  'markdown_header': {'Header 2': ['Positioned for stronger growth',
    'Cultural change'],
   'Header 3': ['Financial performance']}},
 {'page_number': 11,
  'markdown_header': {'Header 2': ['Strengthening our portfolio']}},
 {'page_number': 12,
  'markdown_header': {'Header 2': ['Groundbreaking innovations',
    'Progress on sustainability']}},
 {'page_number': 13,
  'markdown_header': {'Header 3': ['Strong future prospects']}},
 {'page_number': 14, 'mar

In [23]:
import re
from collections import defaultdict

# Markdown heading marker ("#", "##", "###") -> metadata label for that level.
patterns = {"#" * level: f"Header {level}" for level in (1, 2, 3)}

# One compiled regex per heading level, keyed by the regex itself; group 1
# captures the heading text that follows the marker and a single space.
compiled_patterns = dict(
    (re.compile(f"^{marker} (.+)"), label)
    for marker, label in patterns.items()
)


In [43]:
# Build one {"metadata", "text"} record per page.
# NOTE(review): `data` is not assigned in any cell visible here; it is indexed
# as data["analyzeResult"]["pages"], so it is presumably the parsed JSON of one
# report. Load it explicitly before this cell.
content = []
for page in data["analyzeResult"]['pages']:
    # Raw text of every line on the page, in document order.
    page_lines = [entry['content'] for entry in page["lines"]]

    # Heading label -> heading texts found on this page.
    headers = defaultdict(list)
    for text in page_lines:
        for regex, label in compiled_patterns.items():
            found = regex.match(text)
            if found is not None:
                headers[label].append(found.group(1))

    # Page number first, then one key per heading level that occurred.
    metadata = {"page_number": page["pageNumber"], **headers}
    content.append({"metadata": metadata, "text": "\n".join(page_lines)})

In [44]:
# Spot-check a single record of the rebuilt list (index 4, i.e. the fifth page).
content[4]

{'metadata': {'page_number': 5, 'Header 2': ['Key figures']},
 'text': '## Key figures\n$ in millions, unless otherwise indicated\nFY 2021\nFY 2020\nUS$\nComparable(4)\nOrders\n31,868\n26,512\n+20%\n+17%\nOrder backlog (end December)\n16,607\n14,303\n+16%\n+21%\nRevenues\n28,945\n26,134\n+11%\n+8%\nIncome from operations\n5,718\n1,593\n+259%\nOperational EBITA(1)\n4,122\n2,899\n+42%\n+37%(5)\nas % of operational revenues\n14.2%\n11.1%\n+3.1 pts\nIncome from continuing operations, net of tax\n4,730\n345\nn.a.\nNet income attributable to ABB\n4,546\n5,146\n-12%\nBasic Earnings per share ($)\n2.27\n2.44\n-7%(2)\nDividend per share\n0.82\n0.80\nCash flow from operating activities(3)\n3,330\n1,693\n+97%\nCash flow from operating activities in continuing\noperations\n3,338\n1,875\n+78%\nNet (cash) debt (end December)(1)\n(98)\n112\nFY2021\nFY2020\nChange\nCO2e own operations emissions, kt scope 1 and 2\n405 kt\n561 kt\n-28%\nLost Time Injury Frequency Rate (LTIFR),\nfrequency / 200,000 worki

In [36]:
# Inspect which fields a page dict carries.
# NOTE(review): `page` is only bound as the loop variable of the loop over
# data["analyzeResult"]['pages'], so this shows whichever page that loop
# visited last and fails on a fresh kernel if run before it.
page.keys()

dict_keys(['pageNumber', 'angle', 'width', 'height', 'unit', 'words', 'lines', 'spans'])