_**Imports for document extraction, text processing, and writing the text into chunks**_

In [17]:
import os
from pathlib import Path
import re
import PyPDF2
import pandas as pd

_**Necessary functions for the document preprocessing and text clean up**_

In [2]:
def extract_text_from_pdf(pdf_path: Path) -> str:
    """
    Extract text from a PDF file
    """
    if pdf_path.suffix.lower() == ".pdf":
        text = ""
        with pdf_path.open("rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text()
        return text



def clean_text(text: str) -> str:
    """
    Clean the text by removing unnecesary sections
    """
    match = re.search(r'Bibliography|Acknowledgements|Index|Contents|Carbon', text, re.IGNORECASE)
    if match:
        text = text[match.start():]
    return text




def chunk_text(text: str, max_chunk_size: int = 2500) -> list[str]:
    """
    Chunk the text into smaller chunks of max_chunk_size
    """
    paragraphs = text.split(".\n")
    chunks=[]
    current_chunk=""
    for paragraph in paragraphs:
        if len(current_chunk) + len(paragraph) + 1 > max_chunk_size:
            chunks.append(current_chunk.strip())
            current_chunk = paragraph + "\n\n"
        else:
            current_chunk += paragraph + "\n\n"
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

_**Adding document text to a list of chunks which will then be processed by BERTopic**_

In [3]:
folder_year_2013_2016 = Path("ESG_reports/Siemens/2013-2016")
folder_year_2017_2020 = Path("ESG_reports/Siemens/2017-2020")
folder_year_2021_2024 = Path("ESG_reports/Siemens/2021-2024")
document_text_2016 = []
document_text_2020 = []
document_text_2024 = []

In [4]:
for file in folder_year_2013_2016.glob("*.pdf"):
    text = extract_text_from_pdf(file)
    cleaned_text = clean_text(text)
    chunks = chunk_text(text)
    for chunk in chunks:
        document_text_2016.append(chunk)
print("Done")

for file in folder_year_2017_2020.glob("*.pdf"):
    text = extract_text_from_pdf(file)
    cleaned_text = clean_text(text)
    chunks = chunk_text(text)
    for chunk in chunks:
        document_text_2020.append(chunk)
print("Done 2")

for file in folder_year_2021_2024.glob("*.pdf"):
    text = extract_text_from_pdf(file)
    cleaned_text = clean_text(text)
    chunks = chunk_text(text)
    for chunk in chunks:
        document_text_2024.append(chunk)
print("Done 3")

Done
Done 2
Done 3


_**Necessary imports for BERTopic**_

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from umap import UMAP
from bertopic.representation import KeyBERTInspired
from flair.embeddings import TransformerDocumentEmbeddings
from bertopic import BERTopic

_**Configurations for the steps of BERTopic**_

In [7]:
cluster_model = HDBSCAN(min_cluster_size=20, min_samples=10,metric='euclidean')
umap_model = UMAP(random_state=42)
vectorizer_model = CountVectorizer(stop_words='english')
representation_model=KeyBERTInspired()

In [8]:
topic_model = BERTopic(embedding_model='all-MiniLM-L6-v2',
                       umap_model=umap_model, vectorizer_model=vectorizer_model)
topics, probabilities = topic_model.fit_transform(document_text_2020)

In [9]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,391,-1_siemens_fiscal_tax_board,"[siemens, fiscal, tax, board, business, 2018, ...",[Our reportable segments may do business with ...
1,0,67,0_100_gmbh_siemens_nited,"[100, gmbh, siemens, nited, states, spain, lim...","[T\n.NET Houston, LLC, Austin, TX\n /\n U\nnit..."
2,1,56,1_emissions_environmental_carbon_portfolio,"[emissions, environmental, carbon, portfolio, ...",[The calculation of the reduction of carbon di...
3,2,55,2_human_rights_suppliers_chain,"[human, rights, suppliers, chain, supply, sust...",[Index according to the ten principles of the ...
4,3,51,3_sustainability_sustainable_world_development,"[sustainability, sustainable, world, developme...",[Business to Society® – measuring our \nsocial...
5,4,40,4_audit_group_statements_financial,"[audit, group, statements, financial, report, ...",[DETAILED DISCUSSION OF THE AUDIT \nOF THE FI...
6,5,36,5_100_gmbh_arrangements_contractual,"[100, gmbh, arrangements, contractual, siemens...","[2\n Contr\nol due to rights to appoint, reas..."
7,6,35,6_shares_cash_income_capital,"[shares, cash, income, capital, stock, bonds, ...",[The Company may not repurchase its own shares...
8,7,34,7_contract_estimates_contracts_costs,"[contract, estimates, contracts, costs, procee...",[Provisions for proceedings out of or in conne...
9,8,32,8_stock_awards_target_granted,"[stock, awards, target, granted, attainment, p...",[4 For one half of the Siemens Stock Awards 2...


In [14]:
df=topic_model.get_document_info(document_text_2020)
df

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,Annual Report \n20\n18\nsiemens.com\nA.1 p 2 ...,9,9_income_assets_fiscal_year,"[income, assets, fiscal, year, 2017, million, ...","[The tax rate for fiscal 2017 was 26\n %, posi...",income - assets - fiscal - year - 2017 - milli...,1.000000,False
1,T\no further increase the entrepreneurial free...,20,20_healthcare_healthineers_services_providers,"[healthcare, healthineers, services, providers...","[For fiscal 2019, Siemens Healthineers expects...",healthcare - healthineers - services - provide...,0.782722,False
2,Our reportable segments may do business with e...,-1,-1_siemens_fiscal_tax_board,"[siemens, fiscal, tax, board, business, 2018, ...",[Our reportable segments may do business with ...,siemens - fiscal - tax - board - business - 20...,0.000000,True
3,In line with common practice in the financial ...,-1,-1_siemens_fiscal_tax_board,"[siemens, fiscal, tax, board, business, 2018, ...",[Our reportable segments may do business with ...,siemens - fiscal - tax - board - business - 20...,0.000000,False
4,A.2.4 Capit al structure\nSustainable revenue...,38,38_dividend_debt_income_net,"[dividend, debt, income, net, shareholders, pl...","[At the Annual Shareholders’ Meeting, the Mana...",dividend - debt - income - net - shareholders ...,1.000000,False
...,...,...,...,...,...,...,...,...
1380,Profile of required skills and expertise\nThe ...,12,12_board_supervisory_shall_managing,"[board, supervisory, shall, managing, committe...",[Taking the Company’s international orientatio...,board - supervisory - shall - managing - commi...,0.739982,False
1381,Diversity\nWith regard to the composition of t...,12,12_board_supervisory_shall_managing,"[board, supervisory, shall, managing, committe...",[Taking the Company’s international orientatio...,board - supervisory - shall - managing - commi...,0.962484,False
1382,Limits on age and on length of membership\nIn ...,12,12_board_supervisory_shall_managing,"[board, supervisory, shall, managing, committe...",[Taking the Company’s international orientatio...,board - supervisory - shall - managing - commi...,0.963465,True
1383,The Supervisory Board is of the opinion that i...,12,12_board_supervisory_shall_managing,"[board, supervisory, shall, managing, committe...",[Taking the Company’s international orientatio...,board - supervisory - shall - managing - commi...,1.000000,False


In [18]:
def concatenate_documents_by_topic(df: pd.DataFrame, n_docs: int = 3) -> pd.DataFrame:

    """

    Groups the DataFrame by 'Topic', and for each topic concatenates up to `n_docs`

    documents into a single string like: "Document 1: ... Document 2: ...".

 

 

    Parameters:

        df (pd.DataFrame): DataFrame with 'Topic' and 'Document' columns.

        n_docs (int): Number of documents to include per topic.

 

 

    Returns:

        pd.DataFrame: A DataFrame with columns ['Topic', 'ConcatenatedDocuments'].

    """

    result = []

 

 

    for topic, group in df.groupby("Topic"):

        docs = group["Document"].head(n_docs).tolist()

        concatenated = "\n\n".join([f"Document {i+1}: {doc}" for i, doc in enumerate(docs)])

        result.append({"Topic": topic, "ConcatenatedDocuments": concatenated})

 
    return pd.DataFrame(result)

In [21]:
df_2=concatenate_documents_by_topic(df)
df_2

Unnamed: 0,Topic,ConcatenatedDocuments
0,-1,Document 1: Our reportable segments may do bus...
1,0,Document 1: 2 of t\nhe German \nCommercial Cod...
2,1,Document 1: Combating climate change is one of...
3,2,Document 1: WWW.SIEMENS.COM/INTEGRITY-INITIATI...
4,3,Document 1: Sustainability \nInformation 2017\...
5,4,Document 1: Our Consolidated Financial Stateme...
6,5,Document 1: 2 \n Contr\nol due to rights to ap...
7,6,Document 1: The main factors for the change in...
8,7,Document 1: Cost overruns or additional paymen...
9,8,Document 1: Beneficiaries receive one free sha...


In [24]:
first_value = df_2["ConcatenatedDocuments"].iloc[0]

<class 'str'>


In [37]:
open_ai_key="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"
import openai

# Option 1: Store your API key as an environment variable
# export OPENAI_API_KEY="your_api_key_here" (in terminal or .env file)
openai.api_key = open_ai_key

def summarize_paragraph(paragraph, model="gpt-4.1-mini"):
    prompt = f"Summarize the following paragraph in 2-3 sentences:\n\n{paragraph}"

    response = openai.responses.create(
        model=model,
        input=[
            {"role": "user", "content": prompt}
        ]
    )
    
    summary = response.output[0].content[0].text
    return summary


summary = summarize_paragraph(first_value)
print("Summary:\n",summary)

Summary:
 The Siemens Group's report outlines its organizational structure, non-financial policies, and enhanced financial framework effective from April 2019, targeting annual revenue growth of 4-5% and improved profit margins across various business segments. The company uses measures like return on equity (ROE) and return on capital employed (ROCE) to manage capital efficiency, aiming for a long-term ROCE between 15-20%. Additionally, the method for calculating capital employed and interim ROCE is detailed, with adjustments following the adoption of IFRS 9 standards.
