_**Imports for document extraction, text processing, and writing the text into chunks**_

In [1]:
import os
from pathlib import Path
import re
import PyPDF2
import pandas as pd

ModuleNotFoundError: No module named 'PyPDF2'

_**Necessary functions for the document preprocessing and text clean up**_

In [128]:
def extract_text_from_pdf(pdf_path: Path) -> str:
    """
    Extract text from a PDF file
    """
    if pdf_path.suffix.lower() == ".pdf":
        text = ""
        with pdf_path.open("rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text()
        return text



def clean_text(text: str) -> str:
    """
    Clean the text by removing unnecesary sections
    """
    match = re.search(r'Bibliography|Acknowledgements|Index|Contents|Carbon', text, re.IGNORECASE)
    if match:
        text = text[match.start():]
    return text




def chunk_text(text: str, max_chunk_size: int = 2500) -> list[str]:
    """
    Chunk the text into smaller chunks of max_chunk_size
    """
    paragraphs = text.split(".\n")
    chunks=[]
    current_chunk=""
    for paragraph in paragraphs:
        if len(current_chunk) + len(paragraph) + 1 > max_chunk_size:
            chunks.append(current_chunk.strip())
            current_chunk = paragraph + "\n\n"
        else:
            current_chunk += paragraph + "\n\n"
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

_**Adding document text to a list of chunks which will then be processed by BERTopic**_

In [129]:
folder_year_2013_2016 = Path("ESG_reports/Siemens/2013-2016")
folder_year_2017_2020 = Path("ESG_reports/Siemens/2017-2020")
folder_year_2021_2024 = Path("ESG_reports/Siemens/2021-2024")
document_text_2016 = []
document_text_2020 = []
document_text_2024 = []

In [130]:
for file in folder_year_2013_2016.glob("*.pdf"):
    text = extract_text_from_pdf(file)
    cleaned_text = clean_text(text)
    chunks = chunk_text(text)
    for chunk in chunks:
        document_text_2016.append(chunk)
print("Done")

for file in folder_year_2017_2020.glob("*.pdf"):
    text = extract_text_from_pdf(file)
    cleaned_text = clean_text(text)
    chunks = chunk_text(text)
    for chunk in chunks:
        document_text_2020.append(chunk)
print("Done 2")

for file in folder_year_2021_2024.glob("*.pdf"):
    text = extract_text_from_pdf(file)
    cleaned_text = clean_text(text)
    chunks = chunk_text(text)
    for chunk in chunks:
        document_text_2024.append(chunk)
print("Done 3")

Done
Done 2
Done 3


_**Necessary imports for BERTopic**_

In [132]:
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from umap import UMAP
from bertopic.representation import KeyBERTInspired
from flair.embeddings import TransformerDocumentEmbeddings
from bertopic import BERTopic

_**Configurations for the steps of BERTopic**_

In [133]:
cluster_model = HDBSCAN()
umap_model = UMAP(random_state=42)
vectorizer_model = CountVectorizer(stop_words='english')
representation_model=KeyBERTInspired()

In [134]:
topic_model = BERTopic(embedding_model='all-MiniLM-L6-v2', nr_topics=20,
                       umap_model=umap_model, vectorizer_model=vectorizer_model)
topics, probabilities = topic_model.fit_transform(document_text_2020)

In [153]:
df_topic_info=topic_model.get_topic_info()
df_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,391,-1_siemens_fiscal_board_tax,"[siemens, fiscal, board, tax, business, 2018, ...",[Our reportable segments may do business with ...
1,0,119,0_human_rights_sustainability_business,"[human, rights, sustainability, business, siem...",[Global agreements \nSiemens has been an activ...
2,1,103,1_financial_instruments_rate_assets,"[financial, instruments, rate, assets, cash, l...",[3\n \n Report\ned in the following line items...
3,2,100,2_environmental_water_emissions_energy,"[environmental, water, emissions, energy, port...",[Direct comparison with a reference technology...
4,3,85,3_stock_compensation_awards_000,"[stock, compensation, awards, 000, target, boa...",[10\n Prof. Dr. Russwurm left the Managing Bo...
5,4,84,4_fiscal_year_revenue_orders,"[fiscal, year, revenue, orders, income, billio...","[Orders for fiscal 2017 were € 85.7 billion,..."
6,5,67,5_100_gmbh_siemens_nited,"[100, gmbh, siemens, nited, states, spain, lim...","[T\n.NET Houston, LLC, Austin, TX\n /\n U\nnit..."
7,6,57,6_100_limited_siemens_india,"[100, limited, siemens, india, private, chenna...",[11\n Siemens\n \nAG is a shareholder with unl...
8,7,56,7_audit_group_statements_financial,"[audit, group, statements, financial, report, ...",[Dr\n.-Ing. E. h.) 5/6 83 3/\n3 100\nDame Nem...
9,8,50,8_board_supervisory_chairman_managing,"[board, supervisory, chairman, managing, posit...","[The Chairman’s Committee makes proposals, in..."


In [141]:
total=df_topic_info["Count"].sum()
outliers=df_topic_info["Count"].iloc[0]
percentage_outliers=(outliers/total)*100
print(f"We have a total of {total} documents.\n{outliers} are outliers.\nAbout {round(percentage_outliers,2)}% are outliers")

We have a total of 1385 documents.
391 are outliers.
About 28.23% are outliers


In [155]:
df_document_info=topic_model.get_document_info(document_text_2020)

In [156]:
#Removed the outliers
df_document_info["topic"] = topics  # Add the topic assignments to the DataFrame
df_clean = df_document_info[df_document_info["topic"] != -1]  # Keep only documents with valid topics
df_clean

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document,topic
0,Annual Report \n20\n18\nsiemens.com\nA.1 p 2 ...,4,4_fiscal_year_revenue_orders,"[fiscal, year, revenue, orders, income, billio...","[Orders for fiscal 2017 were € 85.7 billion,...",fiscal - year - revenue - orders - income - bi...,1.000000,False,4
1,T\no further increase the entrepreneurial free...,16,16_healthcare_healthineers_services_wind,"[healthcare, healthineers, services, wind, pro...","[For fiscal 2019, Siemens Healthineers expects...",healthcare - healthineers - services - wind - ...,0.782722,False,16
4,A.2.4 Capit al structure\nSustainable revenue...,9,9_shares_income_cash_capital,"[shares, income, cash, capital, shareholders, ...",[The Company may not repurchase its own shares...,shares - income - cash - capital - shareholder...,1.000000,False,9
5,"At the Annual Shareholders’ Meeting, the Manag...",9,9_shares_income_cash_capital,"[shares, income, cash, capital, shareholders, ...",[The Company may not repurchase its own shares...,shares - income - cash - capital - shareholder...,1.000000,False,9
10,"Orders 13,717 13,329 3 % 12 %\nRevenue 12,44...",4,4_fiscal_year_revenue_orders,"[fiscal, year, revenue, orders, income, billio...","[Orders for fiscal 2017 were € 85.7 billion,...",fiscal - year - revenue - orders - income - bi...,1.000000,False,4
...,...,...,...,...,...,...,...,...,...
1379,Long-term succession planning \nfor t\nhe Man...,8,8_board_supervisory_chairman_managing,"[board, supervisory, chairman, managing, posit...","[The Chairman’s Committee makes proposals, in...",board - supervisory - chairman - managing - po...,1.000000,False,8
1380,Profile of required skills and expertise\nThe ...,8,8_board_supervisory_chairman_managing,"[board, supervisory, chairman, managing, posit...","[The Chairman’s Committee makes proposals, in...",board - supervisory - chairman - managing - po...,0.739982,False,8
1381,Diversity\nWith regard to the composition of t...,8,8_board_supervisory_chairman_managing,"[board, supervisory, chairman, managing, posit...","[The Chairman’s Committee makes proposals, in...",board - supervisory - chairman - managing - po...,0.962484,False,8
1382,Limits on age and on length of membership\nIn ...,8,8_board_supervisory_chairman_managing,"[board, supervisory, chairman, managing, posit...","[The Chairman’s Committee makes proposals, in...",board - supervisory - chairman - managing - po...,0.963465,True,8


In [157]:
def concatenate_documents_by_topic(df: pd.DataFrame, n_docs: int = 3) -> pd.DataFrame:
    """
    Groups the DataFrame by 'Topic', and for each topic concatenates up to `n_docs`
    documents into a single string like: "Document 1: ... Document 2: ...".

    Parameters:
        df (pd.DataFrame): DataFrame with 'Topic' and 'Document' columns.
        n_docs (int): Number of documents to include per topic.

    Returns:
        pd.DataFrame: A DataFrame with columns ['Topic', 'ConcatenatedDocuments'].
    """
    
    result = []

    for topic, group in df.groupby("Topic"):
        docs = group["Document"].head(n_docs).tolist()
        concatenated = "\n\n".join([f"Document {i+1}: {doc}" for i, doc in enumerate(docs)])
        result.append({"Topic": topic, "ConcatenatedDocuments": concatenated})

    return pd.DataFrame(result)

In [158]:
df_concatenated_docs=concatenate_documents_by_topic(df_clean)
print(df_concatenated_docs)

    Topic                              ConcatenatedDocuments
0       0  Document 1: Sustainability \nInformation 2017\...
1       1  Document 1: Cost overruns or additional paymen...
2       2  Document 1: Combating climate change is one of...
3       3  Document 1: Remuneration system for Managing B...
4       4  Document 1: Annual Report  \n20\n18\nsiemens.c...
5       5  Document 1: 2 of t\nhe German \nCommercial Cod...
6       6  Document 1: 2 \n Contr\nol due to rights to ap...
7       7  Document 1: Our Consolidated Financial Stateme...
8       8  Document 1: The regular terms of office of sev...
9       9  Document 1: A.2.4  Capit al structure\nSustain...
10     10  Document 1: Audits by tax authorities and chan...
11     11  Document 1: With this, Siemens lays the founda...
12     12  Document 1: Portfolio measures, at-equity inve...
13     13  Document 1: Overall, the actual development fo...
14     14  Document 1: R & D activities of the Division f...
15     15  Document 1: S

In [159]:
print(df_topic_info)

    Topic  Count                                         Name  \
0      -1    391                  -1_siemens_fiscal_board_tax   
1       0    119       0_human_rights_sustainability_business   
2       1    103          1_financial_instruments_rate_assets   
3       2    100       2_environmental_water_emissions_energy   
4       3     85              3_stock_compensation_awards_000   
5       4     84                 4_fiscal_year_revenue_orders   
6       5     67                     5_100_gmbh_siemens_nited   
7       6     57                  6_100_limited_siemens_india   
8       7     56           7_audit_group_statements_financial   
9       8     50        8_board_supervisory_chairman_managing   
10      9     46                 9_shares_income_cash_capital   
11     10     43  10_compliance_business_corruption_integrity   
12     11     35       11_diversity_people_employees_training   
13     12     29       12_goodwill_cash_generating_impairment   
14     13     26       13

In [150]:
df_concatenated_docs_merge=pd.merge(df_concatenated_docs,df_topic_info[["Topic","Representation"]], on='Topic', how="left")
df_concatenated_docs_merge

Unnamed: 0,Topic,ConcatenatedDocuments,Representation
0,0,Document 1: 2 of t\nhe German \nCommercial Cod...,"[human, rights, sustainability, business, siem..."
1,1,Document 1: Combating climate change is one of...,"[financial, instruments, rate, assets, cash, l..."
2,2,Document 1: WWW.SIEMENS.COM/INTEGRITY-INITIATI...,"[environmental, water, emissions, energy, port..."
3,3,Document 1: Sustainability \nInformation 2017\...,"[stock, compensation, awards, 000, target, boa..."
4,4,Document 1: Our Consolidated Financial Stateme...,"[fiscal, year, revenue, orders, income, billio..."
5,5,Document 1: 2 \n Contr\nol due to rights to ap...,"[100, gmbh, siemens, nited, states, spain, lim..."
6,6,Document 1: The main factors for the change in...,"[100, limited, siemens, india, private, chenna..."
7,7,Document 1: Cost overruns or additional paymen...,"[audit, group, statements, financial, report, ..."
8,8,Document 1: Beneficiaries receive one free sha...,"[board, supervisory, chairman, managing, posit..."
9,9,Document 1: Annual Report \n20\n18\nsiemens.c...,"[shares, income, cash, capital, shareholders, ..."


In [104]:
first_value = df_concatenated_docs["ConcatenatedDocuments"].iloc[0]

In [105]:
open_ai_key="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"
import openai

# Option 1: Store your API key as an environment variable
# export OPENAI_API_KEY="your_api_key_here" (in terminal or .env file)
openai.api_key = open_ai_key

def summarize_paragraph(paragraph, model="gpt-4.1-nano"):
    prompt = f"Provide short and descriptive title that describes the main ESG goals:\n\n{paragraph}"

    response = openai.responses.create(
        model=model,
        input=[
            {"role": "user", "content": prompt}
        ]
    )
    
    summary = response.output[0].content[0].text
    return summary

df_concatenated_docs['Summary'] = df_concatenated_docs['ConcatenatedDocuments'].apply(summarize_paragraph)

# Print the DataFrame with summaries
print(df_concatenated_docs[['Topic', 'Summary']])

    Topic                                            Summary
0       0  "Main ESG Goals Focused on Sustainable Busines...
1       1  "Siemens’ ESG Goals: Climate Action, Decarboni...
2       2  Sustainable Supply Chain Management and Respon...
3       3  "Siemens’ Commitment to Sustainable Developmen...
4       4  "ESG Goals Focused on Financial Integrity, Con...
5       5  "ESG Goals: Ensuring Responsible Governance, M...
6       6  "Strategic Financial Management and Shareholde...
7       7  "ESG and Financial Risk Management in Project ...
8       8  ESG Goals Focused on Performance and Responsib...
9       9  "Siemens' ESG Goals Focused on Sustainable Inn...
10     10  ESG Goals Focused on Market Resilience, Operat...
11     11  "Managing Financial Risks and Currency Exposur...
12     12    ESG Goals in Corporate Governance and Diversity
13     13  "ESG Goals: Enhancing Energy Efficiency, Waste...
14     14  "Siemens' ESG Goals Focused on Sustainable Gro...
15     15  "Promoting Oc

In [108]:
df_topics_summaries=pd.merge(df_topic_info,df_concatenated_docs[["Topic","Summary"]], on='Topic', how="left")
print(df_topics_summaries)

    Topic  Count                                         Name  \
0      -1    391                  -1_siemens_fiscal_board_tax   
1       0    119       0_human_rights_sustainability_business   
2       1    103          1_financial_instruments_rate_assets   
3       2    100       2_environmental_water_emissions_energy   
4       3     85              3_stock_compensation_awards_000   
5       4     84                 4_fiscal_year_revenue_orders   
6       5     67                     5_100_gmbh_siemens_nited   
7       6     57                  6_100_limited_siemens_india   
8       7     56           7_audit_group_statements_financial   
9       8     50        8_board_supervisory_chairman_managing   
10      9     46                 9_shares_income_cash_capital   
11     10     43  10_compliance_business_corruption_integrity   
12     11     35       11_diversity_people_employees_training   
13     12     29       12_goodwill_cash_generating_impairment   
14     13     26       13