_**Imports for document extraction, text processing, and writing the text into chunks**_

In [95]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('intfloat/e5-base-v2')
input_texts = [
    'query: how much protein should a female eat',
    'query: summit define',
    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "passage: Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments."
]
embeddings = model.encode(input_texts, normalize_embeddings=True)
print(embeddings)



Error while downloading from https://cdn-lfs.hf.co/repos/99/30/9930a5f938d94a0e280918c741b627e2951e95b397da975ed97b7642e379cab5/d0d559c47d5f71b1d280b13b62a2657f3e3bc70c0786f9ab91a36545e6a8f693?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1746904088&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NjkwNDA4OH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy85OS8zMC85OTMwYTVmOTM4ZDk0YTBlMjgwOTE4Yzc0MWI2MjdlMjk1MWU5NWIzOTdkYTk3NWVkOTdiNzY0MmUzNzljYWI1L2QwZDU1OWM0N2Q1ZjcxYjFkMjgwYjEzYjYyYTI2NTdmM2UzYmM3MGMwNzg2ZjlhYjkxYTM2NTQ1ZTZhOGY2OTM%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=TqAfk8JD-jkrBGbc4EaXo4lCETwmhYKp66NJHiqG5f850Ig3XCIc2vZWygB-4YWOPWJVKM8VCjvXAxwjUTBipy7O1YaNvI9hHi4rdPAmiD-9dmShEZ%7E1casd-qr6dxyqDkDm3ReLuLpErE4jUBCKbDH8D8lvrjrO58LLRST%7EpsBkAZC1MB7c2L7uCtKygLLU6ScRvKiF7-g7tIlmRvz9c1Y7CCbxTsXHBPF2mofhV9zJh8uFJ8Keo%7EWGrIOL-y5wNALLE

OSError: intfloat/e5-base-v2 does not appear to have a file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt or flax_model.msgpack.

In [1]:
import os
from pathlib import Path
import re
import PyPDF2
import pandas as pd

_**Necessary functions for the document preprocessing and text clean up**_

In [39]:
def extract_text_from_pdf(pdf_path: Path) -> str:
    """
    Extract text from a PDF file
    """
    if pdf_path.suffix.lower() == ".pdf":
        text = ""
        with pdf_path.open("rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text()
        return text



def clean_text(text: str) -> str:
    """
    Clean the text by removing unnecesary sections
    """
    match = re.search(r'Bibliography|Acknowledgements|Index|Contents|Carbon', text, re.IGNORECASE)
    if match:
        text = text[match.start():]
    return text


def chunk_text(text: str, max_token_size: int = 128) -> list[str]:
    """
    Chunk the text into smaller chunks of approximately max_token_size tokens
    Uses a simple estimation method where 1 token ≈ 4 characters
    """
    paragraphs = text.split(".\n")
    chunks = []
    current_chunk = ""
    current_token_count = 0
    
    # Simple token estimation function
    def estimate_tokens(text: str) -> int:
        # Roughly 4 characters per token for English text
        return len(text) // 4
    
    for paragraph in paragraphs:
        paragraph_tokens = estimate_tokens(paragraph)
        
        # Check if adding this paragraph would exceed the token limit
        if current_token_count + paragraph_tokens + 1 > max_token_size:
            chunks.append(current_chunk.strip())
            current_chunk = paragraph + "\n\n"
            current_token_count = paragraph_tokens + 2  # +2 for the newlines
        else:
            current_chunk += paragraph + "\n\n"
            current_token_count += paragraph_tokens + 2  # +2 for the newlines
    
    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks


def remove_empty_strings_from_list(input_list: list[str]) -> list[str]:
    """
    Removes all empty strings (e.g., "", "   ") from a list of strings.
    It also removes strings that are None.
    """
    return [s for s in input_list if s and s.strip()]

_**Adding document text to a list of chunks which will then be processed by BERTopic**_

In [3]:
folder_year_2013 = Path("ESG_reports/Siemens/2013")
folder_year_2014 = Path("ESG_reports/Siemens/2014")
folder_year_2015 = Path("ESG_reports/Siemens/2015")
folder_year_2016 = Path("ESG_reports/Siemens/2016")
folder_year_2017 = Path("ESG_reports/Siemens/2017")
folder_year_2018 = Path("ESG_reports/Siemens/2018")
folder_year_2019 = Path("ESG_reports/Siemens/2019")
folder_year_2020 = Path("ESG_reports/Siemens/2020")
folder_year_2021 = Path("ESG_reports/Siemens/2021")
folder_year_2022 = Path("ESG_reports/Siemens/2022")
folder_year_2023 = Path("ESG_reports/Siemens/2023")
folder_year_2024 = Path("ESG_reports/Siemens/2024")
document_text_2013 = []
document_text_2014 = []
document_text_2015 = []
document_text_2016 = []
document_text_2017 = []
document_text_2018 = []
document_text_2019 = []
document_text_2020 = []
document_text_2021 = []
document_text_2022 = []
document_text_2023 = []
document_text_2024 = []

In [41]:
def process_pdf_folder(folder_path: Path) -> list[str]:
    document_chunks = []
    for file in folder_path.glob("*.pdf"):
        text = extract_text_from_pdf(file)
        cleaned_text = clean_text(text)
        chunks = chunk_text(cleaned_text)
        document_chunks.extend(chunks)
    print(len(document_chunks))
    return document_chunks

document_text_2013 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2013))
document_text_2014 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2014))
document_text_2015 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2015))
document_text_2016 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2016))
document_text_2017 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2017))
document_text_2018 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2018))
document_text_2019 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2019))
document_text_2020 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2020))
document_text_2021 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2021))
document_text_2022 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2022))
document_text_2023 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2023))
document_text_2024 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2024))

1453
994
628
609
832
869
840
317
394
462
590
611


In [50]:
dictionary_year_chunks = {
    2013: document_text_2013,
    2014: document_text_2014,
    2015: document_text_2015,
    2016: document_text_2016,
    2017: document_text_2017,
    2018: document_text_2018,
    2019: document_text_2019,
    2020: document_text_2020,
    2021: document_text_2021,
    2022: document_text_2022,
    2023: document_text_2023,
    2024: document_text_2024
}

years = []
flat_chunks = []

# Iterate through the dictionary in sorted order
for year, chunks in sorted(dictionary_year_chunks.items()):
    # For each string in the chunks list
    for chunk in chunks:
        # Add the year to years list
        years.append(year)
        # Add the chunk to flat_chunks list
        flat_chunks.append(chunk)

df_chunks = pd.DataFrame(dictionary_year_chunks.items(), columns=['Year', 'Chunk'])

In [None]:
print(flat_chunks[0])


carbon 
 dioxide per kilo
watt-hour,” notes Kirill 
Gamburger.   PAGE 211
2Protecting the environment Generating power more efficiently12Lots of electricity from  
hot exhaust gases 
The retrofitted Unit  6 at the Kir ishi facility 
is a state-of-the-art combined cycle power 
plant of the kind that Siemens has installed 
in many parts of the world over the last few 
years. Our advanced techno  logy enables 
the highly efficient use of fuel (natural gas 
in Kirishi’s case). When the fuel is burned, thermal energy is converted into mechani -
cal energy, which drives a gas turbine – as in 
a jet engine. The gas turbine is connected via a shaft to a generator, which produces electricity. When emitted, the turbine’s ex -
haust gases have a temperature of  be-
tween 500°C and 600°C. At Kirishi, these gases no long
er go unused but are har -
nessed to produce steam, which is then 
used to generate additional energy. Thanks 
to this technology, combined cycle power 
plants are the most efficie

_**Necessary imports for BERTopic**_

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from umap import UMAP
from bertopic import BERTopic

_**Configurations for the steps of BERTopic**_

In [27]:
cluster_model = HDBSCAN()
umap_model = UMAP(random_state=42)
vectorizer_model = CountVectorizer(stop_words='english')

In [63]:
topic_model = BERTopic(embedding_model='all-MiniLM-L6-v2', nr_topics=30,
                       umap_model=umap_model, vectorizer_model=vectorizer_model)
topics, probabilities = topic_model.fit_transform(flat_chunks)



In [65]:
topics_over_time = topic_model.topics_over_time(flat_chunks, years)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)

In [64]:
df_topic_info=topic_model.get_topic_info()
df_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2099,-1_siemens_fiscal_board_business,"[siemens, fiscal, board, business, financial, ...","[Year ended September 30,\n(in millions of €)..."
1,0,1272,0_sustainability_environmental_gri_emissions,"[sustainability, environmental, gri, emissions...","[So far, with regards to project business, cus..."
2,1,866,1_board_stock_managing_compensation,"[board, stock, managing, compensation, supervi...",[14 Peter Y. Solmssen resigned from the Manag...
3,2,659,2_assets_tax_million_income,"[assets, tax, million, income, cash, financial...",[The determination of the recoverable amount o...
4,3,463,3_employees_diversity_people_learning,"[employees, diversity, people, learning, sieme...",[Siemens employee exits (in thousands)\nFiscal...
5,4,387,4_compliance_business_integrity_corruption,"[compliance, business, integrity, corruption, ...",[Compliance indicators and \nwhistle-blowing\n...
6,5,386,5_financial_audit_statements_risk,"[financial, audit, statements, risk, report, m...",[The Audit Committee met six times. In the pr...
7,6,369,6_growth_markets_year_fiscal,"[growth, markets, year, fiscal, orders, revenu...",[Profit and Profit margin by Business\nProfit ...
8,7,287,7_siemens_100_ag_india,"[siemens, 100, ag, india, company, private, li...",[PROCEEDINGS OUT OF OR IN CONNECTION \nWITH A...
9,8,250,8_100_gmbh_siemens_limited,"[100, gmbh, siemens, limited, healthcare, grün...",[10\n Ex\nemption pursuant to Section 264 (3) ...


In [67]:
total=df_topic_info["Count"].sum()
outliers=df_topic_info["Count"].iloc[0]
percentage_outliers=(outliers/total)*100
print(f"We have a total of {total} documents.\n{outliers} are outliers.\nAbout {round(percentage_outliers,2)}% are outliers")

We have a total of 8585 documents.
2099 are outliers.
About 24.45% are outliers


In [68]:
df_document_info=topic_model.get_document_info(flat_chunks)

In [69]:
#Removed the outliers
df_document_info["topic"] = topics  # Add the topic assignments to the DataFrame
df_clean = df_document_info[df_document_info["topic"] != -1]  # Keep only documents with valid topics
df_clean

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document,topic
1,Efficiency\nTwo gas turbinesPrior to the upgra...,15,15_power_energy_gas_consumption,"[power, energy, gas, consumption, division, tu...",[A.1.1.2 BUSINES S DESCRIPTION\nThe Power and...,power - energy - gas - consumption - division ...,1.000000,False,15
2,We’ve sold more than 300 of our SGT5-4000F tur...,15,15_power_energy_gas_consumption,"[power, energy, gas, consumption, division, tu...",[A.1.1.2 BUSINES S DESCRIPTION\nThe Power and...,power - energy - gas - consumption - division ...,1.000000,False,15
3,"Extremel\ny reliable and easy to operate, our ...",15,15_power_energy_gas_consumption,"[power, energy, gas, consumption, division, tu...",[A.1.1.2 BUSINES S DESCRIPTION\nThe Power and...,power - energy - gas - consumption - division ...,1.000000,False,15
4,2\n –\n The Kirishi power plant was \nbuilt in...,15,15_power_energy_gas_consumption,"[power, energy, gas, consumption, division, tu...",[A.1.1.2 BUSINES S DESCRIPTION\nThe Power and...,power - energy - gas - consumption - division ...,1.000000,False,15
5,You trust us so much that OGK-2 has commission...,15,15_power_energy_gas_consumption,"[power, energy, gas, consumption, division, tu...",[A.1.1.2 BUSINES S DESCRIPTION\nThe Power and...,power - energy - gas - consumption - division ...,0.969781,False,15
...,...,...,...,...,...,...,...,...,...
8580,In a limited assurance engagement the assuranc...,5,5_financial_audit_statements_risk,"[financial, audit, statements, risk, report, m...",[The Audit Committee met six times. In the pr...,financial - audit - statements - risk - report...,1.000000,False,5
8581,"Within the scope of our assurance engagement, ...",5,5_financial_audit_statements_risk,"[financial, audit, statements, risk, report, m...",[The Audit Committee met six times. In the pr...,financial - audit - statements - risk - report...,0.392662,False,5
8582,Intended Use of the Assurance Report\nWe issue...,5,5_financial_audit_statements_risk,"[financial, audit, statements, risk, report, m...",[The Audit Committee met six times. In the pr...,financial - audit - statements - risk - report...,0.868776,False,5
8583,"Munich, 2 December 2024\nPricewaterhouseCooper...",5,5_financial_audit_statements_risk,"[financial, audit, statements, risk, report, m...",[The Audit Committee met six times. In the pr...,financial - audit - statements - risk - report...,0.348257,False,5


In [70]:
def concatenate_documents_by_topic(df: pd.DataFrame, n_docs: int = 20) -> pd.DataFrame:
    """
    Groups the DataFrame by 'Topic', and for each topic concatenates up to `n_docs`
    documents into a single string like: "Document 1: ... Document 2: ...".

    Parameters:
        df (pd.DataFrame): DataFrame with 'Topic' and 'Document' columns.
        n_docs (int): Number of documents to include per topic.

    Returns:
        pd.DataFrame: A DataFrame with columns ['Topic', 'ConcatenatedDocuments'].
    """
    
    result = []

    for topic, group in df.groupby("Topic"):
        docs = group["Document"].head(n_docs).tolist()
        concatenated = "\n\n".join([f"TextSample {i+1}: {doc}" for i, doc in enumerate(docs)])
        result.append({"Topic": topic, "ConcatenatedDocuments": concatenated})

    return pd.DataFrame(result)

In [71]:
df_concatenated_docs=concatenate_documents_by_topic(df_clean)
print(df_concatenated_docs)

    Topic                              ConcatenatedDocuments
0       0  TextSample 1: Our Code of Conduct for Siemens ...
1       1  TextSample 1: 2  October 1, 2012 – Sep\ntember...
2       2  TextSample 1: At the end of fiscal 2013, the n...
3       3  TextSample 1: 6 Continuing and discontinued op...
4       4  TextSample 1: Empower our diverse and \nengage...
5       5  TextSample 1: For further information on our f...
6       6  TextSample 1: Revenue growth  \nThe most impor...
7       7  TextSample 1: 2 Average number of employees in...
8       8  TextSample 1: 10\n Ex\nemption pursuant to Sec...
9       9  TextSample 1: Sports and painting are my great...
10     10  TextSample 1: Corpor\nate  Go\nvernance  155 C...
11     11  TextSample 1: B.3.7 Transfer of responsibility...
12     12  TextSample 1: 2\n Contr\nol due to contractual...
13     13  TextSample 1: A\ndditional  Inf\normation\n 24...
14     14  TextSample 1: Asian competitors are generally ...
15     15  TextSample 1:

In [72]:
df_concatenated_docs_merge=pd.merge(df_concatenated_docs,df_topic_info[["Topic","Representation"]], on='Topic', how="left")
df_concatenated_docs_merge

Unnamed: 0,Topic,ConcatenatedDocuments,Representation
0,0,TextSample 1: Our Code of Conduct for Siemens ...,"[sustainability, environmental, gri, emissions..."
1,1,"TextSample 1: 2 October 1, 2012 – Sep\ntember...","[board, stock, managing, compensation, supervi..."
2,2,"TextSample 1: At the end of fiscal 2013, the n...","[assets, tax, million, income, cash, financial..."
3,3,TextSample 1: 6 Continuing and discontinued op...,"[employees, diversity, people, learning, sieme..."
4,4,TextSample 1: Empower our diverse and \nengage...,"[compliance, business, integrity, corruption, ..."
5,5,TextSample 1: For further information on our f...,"[financial, audit, statements, risk, report, m..."
6,6,TextSample 1: Revenue growth \nThe most impor...,"[growth, markets, year, fiscal, orders, revenu..."
7,7,TextSample 1: 2 Average number of employees in...,"[siemens, 100, ag, india, company, private, li..."
8,8,TextSample 1: 10\n Ex\nemption pursuant to Sec...,"[100, gmbh, siemens, limited, healthcare, grün..."
9,9,TextSample 1: Sports and painting are my great...,"[digital, solutions, production, mobility, tec..."


In [83]:
open_ai_key=os.getenv("OPENAI_API_KEY")
import openai

In [None]:
# Option 1: Store your API key as an environment variable
# export OPENAI_API_KEY="your_api_key_here" (in terminal or .env file)
openai.api_key = open_ai_key

def summarize_documents(documents, representations, model="gpt-4.1-nano"):
    prompt = f"""Below are 20 text samples from a single topic cluster derived from Siemens AG sustainability reports between 2013 and 2024. 
    These are the most representative passages in the cluster.
Please:
	1.	Determine whether this cluster reflects a distinct, meaningful sustainability topic (yes/no).
	2.	If yes, provide a short label (3–6 words) summarizing the topic.
	3.	Write a 1–2 sentence summary of what this cluster is about.
	4.	If relevant, list which UN SDGs (by number and name) this topic relates to.
	5.	If the topic is generic, boilerplate, or non-substantive, say: “This cluster contains generic or administrative content and does not reflect a real sustainability trend.”
    
    <Keywords>{representations}</Keywords>
    <TextSamples>{documents}</TextSamples>
    """
    
    response = openai.responses.create(
        model=model,
        input=[
            {"role": "user", "content": prompt}
        ]
    )
    
    summary = response.output[0].content[0].text
    return summary

df_concatenated_docs_merge['Summary'] = df_concatenated_docs_merge.apply(lambda x: summarize_documents(x['ConcatenatedDocuments'], x['Representation']), axis=1)

# Print the DataFrame with summaries
print(df_concatenated_docs_merge[['Topic', 'Summary']])

    Topic                                            Summary
0       0  1. **Does this cluster reflect a distinct, mea...
1       1  1. **Does this cluster reflect a distinct, mea...
2       2  1. **Does this cluster reflect a distinct, mea...
3       3  1. Yes, this cluster reflects a distinct, mean...
4       4  1. **Does this cluster reflect a distinct, mea...
5       5  1. **Does this cluster reflect a distinct, mea...
6       6  1. Yes, this cluster reflects a distinct, mean...
7       7  1. **Does this cluster reflect a distinct, mea...
8       8  1. **Does this cluster reflect a distinct, mea...
9       9  1. **Does this cluster reflect a distinct, mea...
10     10  1. Does this cluster reflect a distinct, meani...
11     11  1. **Does this cluster reflect a distinct, mea...
12     12  1. **Does this cluster reflect a distinct, mea...
13     13  1. **Does this cluster reflect a distinct, mea...
14     14  1. **Does this cluster reflect a distinct, mea...
15     15  1. **Does thi

In [86]:
print(df_concatenated_docs_merge['Summary'].loc[6])

1. Yes, this cluster reflects a distinct, meaningful sustainability topic.
2. **Sustainable Business Performance and Financial Metrics**
3. This cluster revolves around Siemens' focus on financial performance, growth strategies, market positioning, and economic analysis, emphasizing revenue growth, profitability, capital efficiency, and market dynamics across regions and sectors.
4. Relevant UN SDGs include:
   - SDG 8: Decent Work and Economic Growth
   - SDG 9: Industry, Innovation, and Infrastructure
   - SDG 12: Responsible Consumption and Production
   - SDG 13: Climate Action (indirectly through sustainable economic growth)
5. This cluster contains generic or administrative content and does not reflect a real sustainability trend.


In [85]:
df_topics_summaries=pd.merge(df_topic_info,df_concatenated_docs_merge[["Topic","Summary"]], on='Topic', how="left")
df_topics_summaries

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,Summary
0,-1,297,-1_siemens_board_fiscal_compensation,"[siemens, board, fiscal, compensation, 2018, m...","[Assets\nSep 30,\n(in millions of €) 2019 2018...",
1,0,275,0_business_sustainability_siemens_compliance,"[business, sustainability, siemens, compliance...",[At Siemens we offer secure reporting channels...,"""Advancing Business Sustainability Through Hum..."
2,1,126,1_stock_shares_income_target,"[stock, shares, income, target, share, awards,...",[Stock awards are tied to performance criteria...,"ESG Goals: Sustainable Financial Growth, Respo..."
3,2,84,2_financial_instruments_rate_credit,"[financial, instruments, rate, credit, cash, v...",[The Company limits default risks resulting fr...,Sustainable Financial Management and Risk Miti...
4,3,82,3_water_environmental_emissions_energy,"[water, environmental, emissions, energy, port...",[The calculation of the reduction of carbon di...,"""Comprehensive ESG Goals for Decarbonization, ..."
5,4,75,4_audit_statements_financial_group,"[audit, statements, financial, group, accounti...",[In accordance with Sec. 322 (3) Sentence 1 HG...,"""ESG Transparency and Financial Integrity: Str..."
6,5,68,5_100_siemens_gmbh_nited,"[100, siemens, gmbh, nited, states, spain, lim...","[T\n.NET Houston, LLC, Austin, TX\n /\n U\nnit...","""Main ESG Goals: Sustainable Energy, Responsib..."
7,6,64,6_board_supervisory_managing_chairman,"[board, supervisory, managing, chairman, membe...","[The Chairman’s Committee makes proposals, in...","""Strengthening Governance and Leadership for S..."
8,7,53,7_year_fiscal_revenue_orders,"[year, fiscal, revenue, orders, growth, busine...",[Fiscal year % Change\n(in millions of €) 2019...,"""Strategic Growth and Profitability amid Marke..."
9,8,48,8_100_india_limited_siemens,"[100, india, limited, siemens, private, chenna...",[11\n Siemens\n \nAG is a shareholder with unl...,"""Global Renewable Energy Investment and Govern..."
