_**Imports for document extraction, text processing, and writing the text into chunks**_

In [1]:
import os
from pathlib import Path
import re
import PyPDF2
import pandas as pd

_**Necessary functions for the document preprocessing and text clean up**_

In [2]:
def extract_text_from_pdf(pdf_path: Path) -> str:
    """
    Extract text from a PDF file
    """
    if pdf_path.suffix.lower() == ".pdf":
        text = ""
        with pdf_path.open("rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text()
        return text



def clean_text(text: str) -> str:
    """
    Clean the text by removing unnecesary sections
    """
    match = re.search(r'Bibliography|Acknowledgements|Index|Contents|Carbon', text, re.IGNORECASE)
    if match:
        text = text[match.start():]
    return text


def chunk_text(text: str, max_token_size: int = 128) -> list[str]:
    """
    Chunk the text into smaller chunks of approximately max_token_size tokens
    Uses a simple estimation method where 1 token ≈ 4 characters
    """
    paragraphs = text.split(".\n")
    chunks = []
    current_chunk = ""
    current_token_count = 0
    
    # Simple token estimation function
    def estimate_tokens(text: str) -> int:
        # Roughly 4 characters per token for English text
        return len(text) // 4
    
    for paragraph in paragraphs:
        paragraph_tokens = estimate_tokens(paragraph)
        
        # Check if adding this paragraph would exceed the token limit
        if current_token_count + paragraph_tokens + 1 > max_token_size:
            chunks.append(current_chunk.strip())
            current_chunk = paragraph + "\n\n"
            current_token_count = paragraph_tokens + 2  # +2 for the newlines
        else:
            current_chunk += paragraph + "\n\n"
            current_token_count += paragraph_tokens + 2  # +2 for the newlines
    
    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks


def remove_empty_strings_from_list(input_list: list[str]) -> list[str]:
    """
    Removes all empty strings (e.g., "", "   ") from a list of strings.
    It also removes strings that are None.
    """
    return [s for s in input_list if s and s.strip()]

_**Adding document text to a list of chunks which will then be processed by BERTopic**_

In [3]:
folder_year_2013 = Path("ESG_reports/Siemens/2013")
folder_year_2014 = Path("ESG_reports/Siemens/2014")
folder_year_2015 = Path("ESG_reports/Siemens/2015")
folder_year_2016 = Path("ESG_reports/Siemens/2016")
folder_year_2017 = Path("ESG_reports/Siemens/2017")
folder_year_2018 = Path("ESG_reports/Siemens/2018")
folder_year_2019 = Path("ESG_reports/Siemens/2019")
folder_year_2020 = Path("ESG_reports/Siemens/2020")
folder_year_2021 = Path("ESG_reports/Siemens/2021")
folder_year_2022 = Path("ESG_reports/Siemens/2022")
folder_year_2023 = Path("ESG_reports/Siemens/2023")
folder_year_2024 = Path("ESG_reports/Siemens/2024")
document_text_2013 = []
document_text_2014 = []
document_text_2015 = []
document_text_2016 = []
document_text_2017 = []
document_text_2018 = []
document_text_2019 = []
document_text_2020 = []
document_text_2021 = []
document_text_2022 = []
document_text_2023 = []
document_text_2024 = []

In [4]:
def process_pdf_folder(folder_path: Path) -> list[str]:
    document_chunks = []
    for file in folder_path.glob("*.pdf"):
        text = extract_text_from_pdf(file)
        #Disabling cleaning since I'm not sure if it's working properly, might be cutting too much text?
        # since we are clustering chunks, then filtering the chunks, we don't need to clean 
        #cleaned_text = clean_text(text)
        cleaned_text=text
        chunks = chunk_text(cleaned_text)
        document_chunks.extend(chunks)
    print(len(document_chunks))
    return document_chunks

document_text_2013 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2013))
document_text_2014 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2014))
document_text_2015 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2015))
document_text_2016 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2016))
document_text_2017 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2017))
document_text_2018 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2018))
document_text_2019 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2019))
document_text_2020 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2020))
document_text_2021 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2021))
document_text_2022 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2022))
document_text_2023 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2023))
document_text_2024 = remove_empty_strings_from_list(process_pdf_folder(folder_year_2024))

1466
995
631
612
834
872
841
324
395
462
590
611


In [5]:
dictionary_year_chunks = {
    2013: document_text_2013,
    2014: document_text_2014,
    2015: document_text_2015,
    2016: document_text_2016,
    2017: document_text_2017,
    2018: document_text_2018,
    2019: document_text_2019,
    2020: document_text_2020,
    2021: document_text_2021,
    2022: document_text_2022,
    2023: document_text_2023,
    2024: document_text_2024
}

years = []
flat_chunks = []

# Iterate through the dictionary in sorted order
for year, chunks in sorted(dictionary_year_chunks.items()):
    # For each string in the chunks list
    for chunk in chunks:
        # Add the year to years list
        years.append(year)
        # Add the chunk to flat_chunks list
        flat_chunks.append(chunk)

df_chunks = pd.DataFrame(dictionary_year_chunks.items(), columns=['Year', 'Chunk'])
df_chunks

Unnamed: 0,Year,Chunk
0,2013,[Industrial productivity\nIntelligent infrastr...
1,2014,[siemens.comto the Siemens Annual Report 2014\...
2,2015,[Sustainability \nInformation 2015\n As adden...
3,2016,[Sustainability \nInformation 2016\nsiemens.co...
4,2017,[Sustainability \nInformation 2017\nsiemens.co...
5,2018,[Annual Report \n20\n18\nsiemens.com\nA.1 p 2...
6,2019,[Sustainability \nInformation 2019\nsiemens.co...
7,2020,[Sustainability \ninformation \n2020Foreword\...
8,2021,[Sustainability \nreport\n2021 Technology to ...
9,2022,[Sustainability \nreport\n2022 Technology to ...


_**Necessary imports for BERTopic**_

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


_**Configurations for the steps of BERTopic**_

In [7]:
umap_model = UMAP(random_state=42)
vectorizer_model = CountVectorizer(stop_words='english')


#https://maartengr.github.io/BERTopic/faq.html#which-embedding-model-should-i-choose
#based on the doc above, we can try all-mpnet-base-v2 for better results
#or we can try all-MiniLM-L6-v2 for faster results for development

embedding_model = 'all-mpnet-base-v2'
nr_topics = 30

In [8]:
topic_model = BERTopic(embedding_model='all-MiniLM-L6-v2', nr_topics=nr_topics,
                       umap_model=umap_model, vectorizer_model=vectorizer_model)
topics, probabilities = topic_model.fit_transform(flat_chunks)



In [9]:
topics_over_time = topic_model.topics_over_time(flat_chunks, years)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)

In [10]:
df_topic_info=topic_model.get_topic_info()
df_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2163,-1_siemens_fiscal_financial_year,"[siemens, fiscal, financial, year, million, in...",[The largest equity instrument allocated to le...
1,0,1192,0_financial_cash_million_income,"[financial, cash, million, income, shares, sta...",[The change in Income taxes resulted from hig...
2,1,936,1_sustainability_environmental_gri_emissions,"[sustainability, environmental, gri, emissions...","[For consistency reasons, we generally apply g..."
3,2,797,2_compliance_human_rights_risk,"[compliance, human, rights, risk, business, ri...",[Global agreements \nSiemens has been an activ...
4,3,563,3_board_supervisory_siemens_chairman,"[board, supervisory, siemens, chairman, ag, co...",[Details regarding the work of the Supervisory...
5,4,408,4_people_diversity_learning_siemens,"[people, diversity, learning, siemens, program...","[5.1 Working at Siemens5.2 \nDiversity, Equity..."
6,5,348,5_compensation_board_managing_benefit,"[compensation, board, managing, benefit, pensi...",[10\n Prof. Dr. Russwurm left the Managing Bo...
7,6,300,6_growth_markets_year_revenue,"[growth, markets, year, revenue, orders, fisca...","[C.9.1.2 MARKET DEVEL OPMENT\nIn fiscal 2015,..."
8,7,300,7_digital_solutions_customers_production,"[digital, solutions, customers, production, bu...",[→Innovation and development cycles have \nsho...
9,8,276,8_100_gmbh_siemens_spain,"[100, gmbh, siemens, spain, limited, healthcar...",[10\n Ex\nemption pursuant to Section 264 (3) ...


In [11]:
total=df_topic_info["Count"].sum()
outliers=df_topic_info["Count"].iloc[0]
percentage_outliers=(outliers/total)*100
print(f"We have a total of {total} documents.\n{outliers} are outliers.\nAbout {round(percentage_outliers,2)}% are outliers")

We have a total of 8622 documents.
2163 are outliers.
About 25.09% are outliers


In [12]:
df_document_info=topic_model.get_document_info(flat_chunks)

In [13]:
#Removed the outliers
df_document_info["topic"] = topics  # Add the topic assignments to the DataFrame
df_clean = df_document_info[df_document_info["topic"] != -1]  # Keep only documents with valid topics
df_clean

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document,topic
0,Industrial productivity\nIntelligent infrastru...,7,7_digital_solutions_customers_production,"[digital, solutions, customers, production, bu...",[→Innovation and development cycles have \nsho...,digital - solutions - customers - production -...,0.723011,False,7
1,WWW.SIEMENS.COM/AR/REPORT-INDUSTRYWelcome to S...,14,14_healthcare_medical_care_imaging,"[healthcare, medical, care, imaging, diagnosti...",[Under the new organizational structure report...,healthcare - medical - care - imaging - diagno...,0.559207,False,14
2,"N\naresh Trehan, founder of “Medanta –\n \nTh...",14,14_healthcare_medical_care_imaging,"[healthcare, medical, care, imaging, diagnosti...",[Under the new organizational structure report...,healthcare - medical - care - imaging - diagno...,1.000000,False,14
3,WWW.SIEMENS.COM/AR/REPORT-HEALTHCARE\nWhat mak...,18,18_vienna_city_häupl_michael,"[vienna, city, häupl, michael, transportation,...",[What initiatives would you like to launch in ...,vienna - city - häupl - michael - transportati...,1.000000,False,18
5,WWW.SIEMENS.COM/AR/STRATEGYGurgaon\nVienna4Gen...,15,15_power_gas_energy_division,"[power, gas, energy, division, consumption, tu...",[A.1.1.2 BUSINES S DESCRIPTION\nThe Power and...,power - gas - energy - division - consumption ...,1.000000,False,15
...,...,...,...,...,...,...,...,...,...
8617,In a limited assurance engagement the assuranc...,2,2_compliance_human_rights_risk,"[compliance, human, rights, risk, business, ri...",[Global agreements \nSiemens has been an activ...,compliance - human - rights - risk - business ...,0.759931,False,2
8618,"Within the scope of our assurance engagement, ...",2,2_compliance_human_rights_risk,"[compliance, human, rights, risk, business, ri...",[Global agreements \nSiemens has been an activ...,compliance - human - rights - risk - business ...,0.513443,False,2
8619,Intended Use of the Assurance Report\nWe issue...,2,2_compliance_human_rights_risk,"[compliance, human, rights, risk, business, ri...",[Global agreements \nSiemens has been an activ...,compliance - human - rights - risk - business ...,0.908644,False,2
8620,"Munich, 2 December 2024\nPricewaterhouseCooper...",2,2_compliance_human_rights_risk,"[compliance, human, rights, risk, business, ri...",[Global agreements \nSiemens has been an activ...,compliance - human - rights - risk - business ...,1.000000,False,2


In [14]:
def concatenate_documents_by_topic(df: pd.DataFrame, n_docs: int = 20) -> pd.DataFrame:
    """
    Groups the DataFrame by 'Topic', and for each topic concatenates up to `n_docs`
    documents into a single string like: "Document 1: ... Document 2: ...".

    Parameters:
        df (pd.DataFrame): DataFrame with 'Topic' and 'Document' columns.
        n_docs (int): Number of documents to include per topic.

    Returns:
        pd.DataFrame: A DataFrame with columns ['Topic', 'ConcatenatedDocuments'].
    """
    
    result = []

    for topic, group in df.groupby("Topic"):
        docs = group["Document"].head(n_docs).tolist()
        concatenated = "\n\n".join([f"TextSample {i+1}: {doc}" for i, doc in enumerate(docs)])
        result.append({"Topic": topic, "ConcatenatedDocuments": concatenated})

    return pd.DataFrame(result)

In [15]:
df_concatenated_docs=concatenate_documents_by_topic(df_clean)
print(df_concatenated_docs)

    Topic                              ConcatenatedDocuments
0       0  TextSample 1: Revenue growth  \nThe most impor...
1       1  TextSample 1: Is Siemens following a separate ...
2       2  TextSample 1: Empower our diverse and \nengage...
3       3  TextSample 1: Ralf P\n. Thomas a full member o...
4       4  TextSample 1: I’m aware that you, our sharehol...
5       5  TextSample 1: At our meeting on November 7, 20...
6       6  TextSample 1: 2 Average number of employees in...
7       7  TextSample 1: Industrial productivity\nIntelli...
8       8  TextSample 1: 10\n Ex\nemption pursuant to Sec...
9       9  TextSample 1: Corpor\nate  Go\nvernance  155 C...
10     10  TextSample 1: B.3.7 Transfer of responsibility...
11     11  TextSample 1: SFS’ business is geared to the S...
12     12  TextSample 1: 2\n  Includes los\ns of variable...
13     13  TextSample 1: 9\n Ex\nemption pursuant to Sect...
14     14  TextSample 1: WWW.SIEMENS.COM/AR/REPORT-INDUST...
15     15  TextSample 1:

In [16]:
df_concatenated_docs_merge=pd.merge(df_concatenated_docs,df_topic_info[["Topic","Representation"]], on='Topic', how="left")
df_concatenated_docs_merge

Unnamed: 0,Topic,ConcatenatedDocuments,Representation
0,0,TextSample 1: Revenue growth \nThe most impor...,"[financial, cash, million, income, shares, sta..."
1,1,TextSample 1: Is Siemens following a separate ...,"[sustainability, environmental, gri, emissions..."
2,2,TextSample 1: Empower our diverse and \nengage...,"[compliance, human, rights, risk, business, ri..."
3,3,TextSample 1: Ralf P\n. Thomas a full member o...,"[board, supervisory, siemens, chairman, ag, co..."
4,4,"TextSample 1: I’m aware that you, our sharehol...","[people, diversity, learning, siemens, program..."
5,5,"TextSample 1: At our meeting on November 7, 20...","[compensation, board, managing, benefit, pensi..."
6,6,TextSample 1: 2 Average number of employees in...,"[growth, markets, year, revenue, orders, fisca..."
7,7,TextSample 1: Industrial productivity\nIntelli...,"[digital, solutions, customers, production, bu..."
8,8,TextSample 1: 10\n Ex\nemption pursuant to Sec...,"[100, gmbh, siemens, spain, limited, healthcar..."
9,9,TextSample 1: Corpor\nate Go\nvernance 155 C...,"[safety, health, work, occupational, employees..."


In [17]:
open_ai_key=os.getenv("OPENAI_API_KEY")
import openai
import json

In [19]:
nano_model='gpt-4.1-nano'
mini_model='gpt-4.1-mini'
full_model='gpt-4.1'
#use full model for your thesis results

def summarize_documents(documents, representations, model=mini_model):
    prompt = f"""Below are 20 text samples and corresponding keywords from a single topic cluster derived from Siemens AG sustainability reports between 2013 and 2024. 
    These are representative passages in the cluster.
Please provide your analysis in JSON format with the following structure:
{{
    "is_meaningful_topic": true/false,
    "topic_label": "short label (3-6 words)",
    "summary": "1-2 sentence summary",
    "primary_sdg": {{"number": "X", "name": "SDG name"}},
    "is_generic": true/false,
    "justification": "2-3 sentences explaining your rationale for the choices made",
    "confidence_score": X.X (a number between 0.0-1.0 indicating your confidence in this analysis)
}}

If the topic is generic, set "is_meaningful_topic" to false, "is_generic" to true, and include "summary": "This cluster contains generic or administrative content and does not reflect a real sustainability trend."

If "is_meaningful_topic" is true, set "primary_sdg" to {{"number": "X", "name": "SDG name"}}. If "is_meaningful_topic" is false, set "primary_sdg" to null.

Note: Please select only ONE primary SDG that best matches the topic.
    
    <Keywords>{representations}</Keywords>
    <TextSamples>{documents}</TextSamples>
"""

    response = openai.responses.create(
        model=model,
        input=[
            {"role": "user", "content": prompt}
        ]
    )
    
    # Extract the text from the response
    response_text = response.output[0].content[0].text
    
    # Try to parse the JSON from the response
    try:
        # Remove any potential markdown code block formatting
        if "```json" in response_text:
            response_text = response_text.split("```json")[1].split("```")[0].strip()
        elif "```" in response_text:
            response_text = response_text.split("```")[1].split("```")[0].strip()
        
        # Parse the JSON
        summary_json = json.loads(response_text)
        return summary_json
    except json.JSONDecodeError:
        # If parsing fails, return a structured error message
        return {
            "error": True,
            "message": "Failed to parse JSON from model response",
            "raw_response": response_text
        }

# Create JSON output for all topics
topics_json = []
for index, row in df_concatenated_docs_merge.iterrows():
    topic_data = {
        "topic_id": row.get('Topic', f"Topic_{index}"),
        "summary_data": summarize_documents(row['ConcatenatedDocuments'], row['Representation'])
    }
    topics_json.append(topic_data)

# Convert to JSON string
json_output = json.dumps(topics_json, indent=2)

# Print or save the JSON
print(json_output)

# Optionally save to file
# with open("sustainability_topics_summary.json", "w") as f:
#     f.write(json_output)

[
  {
    "topic_id": 0,
    "summary_data": {
      "is_meaningful_topic": false,
      "topic_label": "Financial and Shareholder Reporting",
      "summary": "This cluster mainly contains financial reporting data such as revenue, earnings, EBITDA, share statistics, dividends, and capital structure, with no distinct sustainability theme or trend.",
      "primary_sdg": null,
      "is_generic": true,
      "justification": "The text samples are predominantly excerpts of financial statements, shareholder letters, and financial metrics. They focus on profitability, capital efficiency, revenue growth, dividends, and key performance indicators. These contents represent generic financial disclosure rather than sustainability-related topics, and thus do not map cleanly to any Sustainable Development Goal (SDG).",
      "confidence_score": 0.95
    }
  },
  {
    "topic_id": 1,
    "summary_data": {
      "is_meaningful_topic": true,
      "topic_label": "Siemens Environmental Portfolio and 

In [20]:
# Create a DataFrame from the JSON output

# If the JSON is still in the topics_json variable from the previous cell
# If not, you can uncomment the file loading code below
# with open("sustainability_topics_summary.json", "r") as f:
#     topics_data = json.load(f)
topics_data = topics_json

# Initialize an empty list to store flattened data
flattened_data = []

# Process each topic entry
for topic_entry in topics_data:
    topic_id = topic_entry["topic_id"]
    summary_data = topic_entry["summary_data"]
    
    # Create a dictionary with topic_id and flattened summary data
    topic_dict = {"Topic": topic_id}
    
    # Check if there was an error parsing JSON
    if "error" in summary_data:
        topic_dict["error"] = summary_data["error"]
        topic_dict["error_message"] = summary_data["message"]
        topic_dict["raw_response"] = summary_data["raw_response"]
    else:
        # Add all the summary fields to the dictionary
        topic_dict["is_meaningful_topic"] = summary_data.get("is_meaningful_topic")
        topic_dict["topic_label"] = summary_data.get("topic_label")
        topic_dict["summary"] = summary_data.get("summary")
        topic_dict["is_generic"] = summary_data.get("is_generic")
        topic_dict["justification"] = summary_data.get("justification", "")
        topic_dict["confidence_score"] = summary_data.get("confidence_score")
        
        # Handle primary SDG
        if "primary_sdg" in summary_data and summary_data["primary_sdg"]:
            sdg = summary_data["primary_sdg"]
            topic_dict["sdg_number"] = sdg.get("number", "")
            topic_dict["sdg_name"] = sdg.get("name", "")
        else:
            topic_dict["sdg_number"] = ""
            topic_dict["sdg_name"] = ""
    
    flattened_data.append(topic_dict)

# Create a DataFrame from the flattened data
df_summary_results = pd.DataFrame(flattened_data)

# Display the DataFrame
display(df_summary_results)

# Optional: Save to CSV for further analysis
# df_summary_results.to_csv("sustainability_topics_analysis.csv", index=False)

# You can also group by Topic if needed
# df_summary_by_topic = df_summary_results.groupby('Topic').first().reset_index()
# display(df_summary_by_topic)

Unnamed: 0,Topic,is_meaningful_topic,topic_label,summary,is_generic,justification,confidence_score,sdg_number,sdg_name,error,error_message,raw_response
0,0,False,Financial and Shareholder Reporting,This cluster mainly contains financial reporti...,True,The text samples are predominantly excerpts of...,0.95,,,,,
1,1,True,Siemens Environmental Portfolio and Sustainabi...,This topic cluster centers on Siemens AG's int...,False,The cluster consistently references Siemens' E...,0.95,13.0,Climate Action,,,
2,2,True,Corporate Compliance and Ethics,This cluster focuses on Siemens’ comprehensive...,False,The text samples and keywords consistently con...,0.95,16.0,"Peace, Justice and Strong Institutions",,,
3,3,False,Corporate Governance and Board Management,This cluster contains information related to b...,True,The text samples overwhelmingly cover administ...,0.95,,,,,
4,4,,,,,,,,,True,Failed to parse JSON from model response,"{\n ""is_meaningful_topic"": true,\n ""topic_la..."
5,5,True,Board Compensation and Benefits,This topic cluster focuses on the compensation...,False,The text samples consistently discuss details ...,0.95,16.0,"Peace, Justice and Strong Institutions",,,
6,6,True,Global Market and Economic Growth,This topic cluster discusses Siemens AG's mark...,False,The cluster predominantly contains economic an...,0.9,8.0,Decent Work and Economic Growth,,,
7,7,True,Industrial Digital Manufacturing Solutions,This topic cluster focuses on Siemens' use of ...,False,The cluster centers explicitly on digital and ...,0.95,9.0,"Industry, Innovation and Infrastructure",,,
8,8,False,Subsidiaries and Corporate Structure,"This cluster predominantly lists subsidiaries,...",True,The text samples largely consist of detailed l...,0.95,,,,,
9,9,True,Occupational Health and Safety,This cluster focuses on Siemens AG's commitmen...,False,The cluster is focused specifically on health ...,0.95,8.0,Decent Work and Economic Growth,,,


In [21]:
print(df_summary_results.loc[0]['justification'])

The text samples are predominantly excerpts of financial statements, shareholder letters, and financial metrics. They focus on profitability, capital efficiency, revenue growth, dividends, and key performance indicators. These contents represent generic financial disclosure rather than sustainability-related topics, and thus do not map cleanly to any Sustainable Development Goal (SDG).


In [22]:
# Merge df_summary_results with df_topic_info on the Topic field
# Assuming the column in df_topic_info that corresponds to Topic is also named "Topic"
merged_df = df_topic_info.merge(df_summary_results, on="Topic", how="left")

# Display the merged DataFrame
display(merged_df)

# Optional: Save to CSV if needed
# merged_df.to_csv("merged_sustainability_topics.csv", index=False)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,is_meaningful_topic,topic_label,summary,is_generic,justification,confidence_score,sdg_number,sdg_name,error,error_message,raw_response
0,-1,2163,-1_siemens_fiscal_financial_year,"[siemens, fiscal, financial, year, million, in...",[The largest equity instrument allocated to le...,,,,,,,,,,,
1,0,1192,0_financial_cash_million_income,"[financial, cash, million, income, shares, sta...",[The change in Income taxes resulted from hig...,False,Financial and Shareholder Reporting,This cluster mainly contains financial reporti...,True,The text samples are predominantly excerpts of...,0.95,,,,,
2,1,936,1_sustainability_environmental_gri_emissions,"[sustainability, environmental, gri, emissions...","[For consistency reasons, we generally apply g...",True,Siemens Environmental Portfolio and Sustainabi...,This topic cluster centers on Siemens AG's int...,False,The cluster consistently references Siemens' E...,0.95,13.0,Climate Action,,,
3,2,797,2_compliance_human_rights_risk,"[compliance, human, rights, risk, business, ri...",[Global agreements \nSiemens has been an activ...,True,Corporate Compliance and Ethics,This cluster focuses on Siemens’ comprehensive...,False,The text samples and keywords consistently con...,0.95,16.0,"Peace, Justice and Strong Institutions",,,
4,3,563,3_board_supervisory_siemens_chairman,"[board, supervisory, siemens, chairman, ag, co...",[Details regarding the work of the Supervisory...,False,Corporate Governance and Board Management,This cluster contains information related to b...,True,The text samples overwhelmingly cover administ...,0.95,,,,,
5,4,408,4_people_diversity_learning_siemens,"[people, diversity, learning, siemens, program...","[5.1 Working at Siemens5.2 \nDiversity, Equity...",,,,,,,,,True,Failed to parse JSON from model response,"{\n ""is_meaningful_topic"": true,\n ""topic_la..."
6,5,348,5_compensation_board_managing_benefit,"[compensation, board, managing, benefit, pensi...",[10\n Prof. Dr. Russwurm left the Managing Bo...,True,Board Compensation and Benefits,This topic cluster focuses on the compensation...,False,The text samples consistently discuss details ...,0.95,16.0,"Peace, Justice and Strong Institutions",,,
7,6,300,6_growth_markets_year_revenue,"[growth, markets, year, revenue, orders, fisca...","[C.9.1.2 MARKET DEVEL OPMENT\nIn fiscal 2015,...",True,Global Market and Economic Growth,This topic cluster discusses Siemens AG's mark...,False,The cluster predominantly contains economic an...,0.9,8.0,Decent Work and Economic Growth,,,
8,7,300,7_digital_solutions_customers_production,"[digital, solutions, customers, production, bu...",[→Innovation and development cycles have \nsho...,True,Industrial Digital Manufacturing Solutions,This topic cluster focuses on Siemens' use of ...,False,The cluster centers explicitly on digital and ...,0.95,9.0,"Industry, Innovation and Infrastructure",,,
9,8,276,8_100_gmbh_siemens_spain,"[100, gmbh, siemens, spain, limited, healthcar...",[10\n Ex\nemption pursuant to Section 264 (3) ...,False,Subsidiaries and Corporate Structure,"This cluster predominantly lists subsidiaries,...",True,The text samples largely consist of detailed l...,0.95,,,,,


In [23]:
# Filter out non-meaningful topics from the merged DataFrame
filtered_df = merged_df[merged_df['is_meaningful_topic'] == True]

# Display the count of meaningful topics vs all topics
print(f"Total topics: {len(merged_df)}")
print(f"Meaningful topics: {len(filtered_df)}")

# Display the filtered DataFrame
display(filtered_df)

# Optional: Save to CSV if needed
# filtered_df.to_csv("meaningful_sustainability_topics.csv", index=False)

# Print the topic numbers of meaningful topics
meaningful_topics = filtered_df['Topic'].tolist()
print("Meaningful topic numbers:")
print(meaningful_topics)

# Count of meaningful topics
print(f"\nNumber of meaningful topics: {len(meaningful_topics)}")

# Alternatively, display in a more readable format if there are many topics
print("\nMeaningful topic numbers (sorted):")
print(sorted(meaningful_topics))


Total topics: 30
Meaningful topics: 17


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,is_meaningful_topic,topic_label,summary,is_generic,justification,confidence_score,sdg_number,sdg_name,error,error_message,raw_response
2,1,936,1_sustainability_environmental_gri_emissions,"[sustainability, environmental, gri, emissions...","[For consistency reasons, we generally apply g...",True,Siemens Environmental Portfolio and Sustainabi...,This topic cluster centers on Siemens AG's int...,False,The cluster consistently references Siemens' E...,0.95,13,Climate Action,,,
3,2,797,2_compliance_human_rights_risk,"[compliance, human, rights, risk, business, ri...",[Global agreements \nSiemens has been an activ...,True,Corporate Compliance and Ethics,This cluster focuses on Siemens’ comprehensive...,False,The text samples and keywords consistently con...,0.95,16,"Peace, Justice and Strong Institutions",,,
6,5,348,5_compensation_board_managing_benefit,"[compensation, board, managing, benefit, pensi...",[10\n Prof. Dr. Russwurm left the Managing Bo...,True,Board Compensation and Benefits,This topic cluster focuses on the compensation...,False,The text samples consistently discuss details ...,0.95,16,"Peace, Justice and Strong Institutions",,,
7,6,300,6_growth_markets_year_revenue,"[growth, markets, year, revenue, orders, fisca...","[C.9.1.2 MARKET DEVEL OPMENT\nIn fiscal 2015,...",True,Global Market and Economic Growth,This topic cluster discusses Siemens AG's mark...,False,The cluster predominantly contains economic an...,0.9,8,Decent Work and Economic Growth,,,
8,7,300,7_digital_solutions_customers_production,"[digital, solutions, customers, production, bu...",[→Innovation and development cycles have \nsho...,True,Industrial Digital Manufacturing Solutions,This topic cluster focuses on Siemens' use of ...,False,The cluster centers explicitly on digital and ...,0.95,9,"Industry, Innovation and Infrastructure",,,
10,9,167,9_safety_health_work_occupational,"[safety, health, work, occupational, employees...",[→ Managers and employees can take advantage o...,True,Occupational Health and Safety,This cluster focuses on Siemens AG's commitmen...,False,The cluster is focused specifically on health ...,0.95,8,Decent Work and Economic Growth,,,
11,10,156,10_cybersecurity_data_security_privacy,"[cybersecurity, data, security, privacy, ai, p...","[Proactively dealing with threats, \nincidents...",True,Corporate Cybersecurity and Data Privacy,The cluster focuses on Siemens' comprehensive ...,False,The cluster contains detailed references to cy...,0.95,9,"Industry, Innovation and Infrastructure",,,
12,11,122,11_rate_currency_financial_foreign,"[rate, currency, financial, foreign, fair, ins...",[The Company had interest rate swap contracts ...,True,Currency & Financial Risk Management,This topic cluster centers on Siemens AG's man...,False,The cluster is clearly focused on specific con...,0.92,9,"Industry, Innovation and Infrastructure",,,
15,14,109,14_healthcare_medical_care_imaging,"[healthcare, medical, care, imaging, diagnosti...",[Under the new organizational structure report...,True,Affordable High-Quality Healthcare Solutions,The cluster focuses on Siemens’ contributions ...,False,The collection of texts clearly revolves aroun...,0.95,3,Good Health and Well-being,,,
16,15,107,15_power_gas_energy_division,"[power, gas, energy, division, consumption, tu...",[A.1.1.2 BUSINES S DESCRIPTION\nThe Power and...,True,Efficient Power Plant Modernization,This cluster discusses Siemens’ efforts to mod...,False,The content centers on improving energy effici...,0.95,7,Affordable and Clean Energy,,,


Meaningful topic numbers:
[1, 2, 5, 6, 7, 9, 10, 11, 14, 15, 16, 17, 18, 20, 21, 22, 23]

Number of meaningful topics: 17

Meaningful topic numbers (sorted):
[1, 2, 5, 6, 7, 9, 10, 11, 14, 15, 16, 17, 18, 20, 21, 22, 23]


In [24]:
#Option 1: Using topic_label as custom labels

# Create a dictionary mapping from Topic ID to topic_label
topic_label_mapping = {}

# Loop through the filtered dataframe (or use merged_df if you want all topics)
for index, row in filtered_df.iterrows():
    topic_id = row['Topic']
    if pd.notna(row['topic_label']):  # Ensure the topic_label is not NaN
        topic_label_mapping[topic_id] = row['topic_label']

# Set the custom labels in the topic model
topic_model.set_topic_labels(topic_label_mapping)

# View the custom labels
print(topic_model.custom_labels_)

['-1_siemens_fiscal_financial_year', '0_financial_cash_million_income', 'Siemens Environmental Portfolio and Sustainability', 'Corporate Compliance and Ethics', '3_board_supervisory_siemens_chairman', '4_people_diversity_learning_siemens', 'Board Compensation and Benefits', 'Global Market and Economic Growth', 'Industrial Digital Manufacturing Solutions', '8_100_gmbh_siemens_spain', 'Occupational Health and Safety', 'Corporate Cybersecurity and Data Privacy', 'Currency & Financial Risk Management', '12_arrangements_contractual_ol_circumstances', '13_emption_264_immateriality_pursuant', 'Affordable High-Quality Healthcare Solutions', 'Efficient Power Plant Modernization', 'Employee Turnover and Diversity', 'Water Management and Sustainability', 'Smart Sustainable Urban Infrastructure', '19_000_500_140_dr', 'Atmospheric Pollutant Emissions Management', 'Nuclear Facility Decommissioning & Waste Management', 'Customer Satisfaction & Key Account Mgmt', 'Baseline comparison for CO2 reduction

In [25]:
#Option 2: Using SDG number and name concatenated

# Create a dictionary mapping from Topic ID to SDG info (number + name)
sdg_label_mapping = {}

# Loop through the filtered dataframe (or use merged_df if you want all topics)
for index, row in filtered_df.iterrows():
    topic_id = row['Topic']
    sdg_number = row['sdg_number']
    sdg_name = row['sdg_name']
    
    # Only create a label if both SDG number and name are available
    if pd.notna(sdg_number) and pd.notna(sdg_name) and sdg_number != "" and sdg_name != "":
        sdg_label = f"SDG {sdg_number}: {sdg_name}"
        sdg_label_mapping[topic_id] = sdg_label
    else:
        # Optional: use topic_label as fallback if no SDG info is available
        if pd.notna(row['topic_label']):
            sdg_label_mapping[topic_id] = row['topic_label']

# Set the custom labels in the topic model
topic_model.set_topic_labels(sdg_label_mapping)

# View the custom labels
print(topic_model.custom_labels_)

['-1_siemens_fiscal_financial_year', '0_financial_cash_million_income', 'SDG 13: Climate Action', 'SDG 16: Peace, Justice and Strong Institutions', '3_board_supervisory_siemens_chairman', '4_people_diversity_learning_siemens', 'SDG 16: Peace, Justice and Strong Institutions', 'SDG 8: Decent Work and Economic Growth', 'SDG 9: Industry, Innovation and Infrastructure', '8_100_gmbh_siemens_spain', 'SDG 8: Decent Work and Economic Growth', 'SDG 9: Industry, Innovation and Infrastructure', 'SDG 9: Industry, Innovation and Infrastructure', '12_arrangements_contractual_ol_circumstances', '13_emption_264_immateriality_pursuant', 'SDG 3: Good Health and Well-being', 'SDG 7: Affordable and Clean Energy', 'SDG 8: Decent Work and Economic Growth', 'SDG 6: Clean Water and Sanitation', 'SDG 11: Sustainable Cities and Communities', '19_000_500_140_dr', 'SDG 13: Climate Action', 'SDG 12: Responsible Consumption and Production', 'SDG 9: Industry, Innovation and Infrastructure', 'SDG 13: Climate Action',

In [26]:
#Option 3: Combining SDG information with topic labels

# Create a dictionary mapping from Topic ID to combined labels
combined_label_mapping = {}

# Loop through the filtered dataframe (or use merged_df if you want all topics)
for index, row in filtered_df.iterrows():
    topic_id = row['Topic']
    sdg_number = row['sdg_number']
    sdg_name = row['sdg_name']
    topic_label = row['topic_label']
    
    # Start with an empty label
    combined_label = ""
    
    # Add SDG info if available
    if pd.notna(sdg_number) and pd.notna(sdg_name) and sdg_number != "" and sdg_name != "":
        combined_label += f"SDG {sdg_number}: {sdg_name}"
    
    # Add topic label if available
    if pd.notna(topic_label) and topic_label != "":
        # Add a separator if we already have SDG info
        if combined_label:
            combined_label += " - "
        combined_label += f"{topic_label}"
    
    # Only add to mapping if we have some label content
    if combined_label:
        combined_label_mapping[topic_id] = combined_label
    
# Set the custom labels in the topic model
topic_model.set_topic_labels(combined_label_mapping)

# View the custom labels
print(topic_model.custom_labels_)

['-1_siemens_fiscal_financial_year', '0_financial_cash_million_income', 'SDG 13: Climate Action - Siemens Environmental Portfolio and Sustainability', 'SDG 16: Peace, Justice and Strong Institutions - Corporate Compliance and Ethics', '3_board_supervisory_siemens_chairman', '4_people_diversity_learning_siemens', 'SDG 16: Peace, Justice and Strong Institutions - Board Compensation and Benefits', 'SDG 8: Decent Work and Economic Growth - Global Market and Economic Growth', 'SDG 9: Industry, Innovation and Infrastructure - Industrial Digital Manufacturing Solutions', '8_100_gmbh_siemens_spain', 'SDG 8: Decent Work and Economic Growth - Occupational Health and Safety', 'SDG 9: Industry, Innovation and Infrastructure - Corporate Cybersecurity and Data Privacy', 'SDG 9: Industry, Innovation and Infrastructure - Currency & Financial Risk Management', '12_arrangements_contractual_ol_circumstances', '13_emption_264_immateriality_pursuant', 'SDG 3: Good Health and Well-being - Affordable High-Qu

In [27]:
topics_over_time = topic_model.topics_over_time(flat_chunks, years)
#we set custom labels to True to use the labels we set before
topic_model.visualize_topics_over_time(topics_over_time, custom_labels=True, topics=meaningful_topics)

In [28]:
print(topics_over_time)

     Topic                                              Words  Frequency  \
0       -1             2013, siemens, financial, fiscal, 2012        418   
1        0               2013, financial, cash, 2012, million        284   
2        1  environmental, sustainability, portfolio, emis...         66   
3        2         compliance, risks, risk, business, conduct         71   
4        3          board, supervisory, siemens, chairman, ag        132   
..     ...                                                ...        ...   
269     20                      ozone, ods, metric, tons, r11          5   
270     21  waste, hazardous, construction, demolition, re...          2   
271     22      score, customer, key, customers, satisfaction          4   
272     23  comparison, counterfactual, refers, solution, ...          4   
273     28            shs, siemens, bargaining, gbk, brightly          8   

     Timestamp                                               Name  
0         2013     

In [29]:
#If you have repeated SDGs, you could merge the topics for which the SDG is duplicate, and merge them, see docs below for how
#https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.merge_models

#VISUALIZATIONS

#you can try more visualizations, check out the docs:
#https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html#visualize-probabilities-or-distribution

In [30]:
# Group topics by both sdg_number and sdg_name, collecting actual topic values
grouped_topics = filtered_df.groupby(['sdg_number', 'sdg_name'])['Topic'].apply(list).tolist()

# Review the result
print(f"Found {len(grouped_topics)} distinct SDG groups")
for i, group_idx in enumerate(filtered_df.groupby(['sdg_number', 'sdg_name']).groups):
    sdg_number, sdg_name = group_idx
    topics = filtered_df[
        (filtered_df['sdg_number'] == sdg_number) & 
        (filtered_df['sdg_name'] == sdg_name)
    ]['Topic'].tolist()
    
    print(f"Group {i+1}: SDG {sdg_number} - {sdg_name}")
    print(f"  Topics: {topics}")
    print()

print(grouped_topics)
print(meaningful_topics)

Found 9 distinct SDG groups
Group 1: SDG 11 - Sustainable Cities and Communities
  Topics: [18]

Group 2: SDG 12 - Responsible Consumption and Production
  Topics: [21]

Group 3: SDG 13 - Climate Action
  Topics: [1, 20, 23]

Group 4: SDG 16 - Peace, Justice and Strong Institutions
  Topics: [2, 5]

Group 5: SDG 3 - Good Health and Well-being
  Topics: [14]

Group 6: SDG 6 - Clean Water and Sanitation
  Topics: [17]

Group 7: SDG 7 - Affordable and Clean Energy
  Topics: [15]

Group 8: SDG 8 - Decent Work and Economic Growth
  Topics: [6, 9, 16]

Group 9: SDG 9 - Industry, Innovation and Infrastructure
  Topics: [7, 10, 11, 22]

[[18], [21], [1, 20, 23], [2, 5], [14], [17], [15], [6, 9, 16], [7, 10, 11, 22]]
[1, 2, 5, 6, 7, 9, 10, 11, 14, 15, 16, 17, 18, 20, 21, 22, 23]


In [31]:
# Create a DataFrame with document-year mapping
years_list = []
chunks_list = []

# Flattening the dictionary into two lists
for year, chunks in sorted(dictionary_year_chunks.items()):
    for chunk in chunks:
        years_list.append(year)
        chunks_list.append(chunk)

# Create a simple dataframe with document-year mapping
df_years = pd.DataFrame({
    'Document': chunks_list,
    'Year': years_list
})

# Join df_clean with df_summary_results on the Topic column
# This adds all the LLM-generated analysis to your document data
df_with_llm = pd.merge(df_clean, df_summary_results, left_on='topic', right_on='Topic', how='left')

# Now join with the year information
# This assumes document text in df_with_llm matches exactly with chunks_list
document_info_with_llm_data = pd.merge(df_with_llm, df_years, on='Document', how='left')

# Optional: Drop duplicate Topic column if needed (since we joined on 'topic' and 'Topic')
if 'Topic' in document_info_with_llm_data.columns and 'topic' in document_info_with_llm_data.columns:
    document_info_with_llm_data = document_info_with_llm_data.drop(columns=['Topic'])

# Display the first few rows to verify
display(document_info_with_llm_data)

# Check if we have any NaN values in the Year column
missing_years = document_info_with_llm_data['Year'].isna().sum()
print(f"Number of rows with missing years: {missing_years}")

# Optional if button doesn't show

Unnamed: 0,Document,Topic_x,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document,topic,Topic_y,...,summary,is_generic,justification,confidence_score,sdg_number,sdg_name,error,error_message,raw_response,Year
0,Industrial productivity\nIntelligent infrastru...,7,7_digital_solutions_customers_production,"[digital, solutions, customers, production, bu...",[→Innovation and development cycles have \nsho...,digital - solutions - customers - production -...,0.723011,False,7,7,...,This topic cluster focuses on Siemens' use of ...,False,The cluster centers explicitly on digital and ...,0.95,9,"Industry, Innovation and Infrastructure",,,,2013
1,WWW.SIEMENS.COM/AR/REPORT-INDUSTRYWelcome to S...,14,14_healthcare_medical_care_imaging,"[healthcare, medical, care, imaging, diagnosti...",[Under the new organizational structure report...,healthcare - medical - care - imaging - diagno...,0.559207,False,14,14,...,The cluster focuses on Siemens’ contributions ...,False,The collection of texts clearly revolves aroun...,0.95,3,Good Health and Well-being,,,,2013
2,"N\naresh Trehan, founder of “Medanta –\n \nTh...",14,14_healthcare_medical_care_imaging,"[healthcare, medical, care, imaging, diagnosti...",[Under the new organizational structure report...,healthcare - medical - care - imaging - diagno...,1.000000,False,14,14,...,The cluster focuses on Siemens’ contributions ...,False,The collection of texts clearly revolves aroun...,0.95,3,Good Health and Well-being,,,,2013
3,WWW.SIEMENS.COM/AR/REPORT-HEALTHCARE\nWhat mak...,18,18_vienna_city_häupl_michael,"[vienna, city, häupl, michael, transportation,...",[What initiatives would you like to launch in ...,vienna - city - häupl - michael - transportati...,1.000000,False,18,18,...,This cluster highlights Vienna’s development a...,False,The cluster’s recurring emphasis on smart and ...,0.95,11,Sustainable Cities and Communities,,,,2013
4,WWW.SIEMENS.COM/AR/STRATEGYGurgaon\nVienna4Gen...,15,15_power_gas_energy_division,"[power, gas, energy, division, consumption, tu...",[A.1.1.2 BUSINES S DESCRIPTION\nThe Power and...,power - gas - energy - division - consumption ...,1.000000,False,15,15,...,This cluster discusses Siemens’ efforts to mod...,False,The content centers on improving energy effici...,0.95,7,Affordable and Clean Energy,,,,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8898,In a limited assurance engagement the assuranc...,2,2_compliance_human_rights_risk,"[compliance, human, rights, risk, business, ri...",[Global agreements \nSiemens has been an activ...,compliance - human - rights - risk - business ...,0.759931,False,2,2,...,This cluster focuses on Siemens’ comprehensive...,False,The text samples and keywords consistently con...,0.95,16,"Peace, Justice and Strong Institutions",,,,2024
8899,"Within the scope of our assurance engagement, ...",2,2_compliance_human_rights_risk,"[compliance, human, rights, risk, business, ri...",[Global agreements \nSiemens has been an activ...,compliance - human - rights - risk - business ...,0.513443,False,2,2,...,This cluster focuses on Siemens’ comprehensive...,False,The text samples and keywords consistently con...,0.95,16,"Peace, Justice and Strong Institutions",,,,2024
8900,Intended Use of the Assurance Report\nWe issue...,2,2_compliance_human_rights_risk,"[compliance, human, rights, risk, business, ri...",[Global agreements \nSiemens has been an activ...,compliance - human - rights - risk - business ...,0.908644,False,2,2,...,This cluster focuses on Siemens’ comprehensive...,False,The text samples and keywords consistently con...,0.95,16,"Peace, Justice and Strong Institutions",,,,2024
8901,"Munich, 2 December 2024\nPricewaterhouseCooper...",2,2_compliance_human_rights_risk,"[compliance, human, rights, risk, business, ri...",[Global agreements \nSiemens has been an activ...,compliance - human - rights - risk - business ...,1.000000,False,2,2,...,This cluster focuses on Siemens’ comprehensive...,False,The text samples and keywords consistently con...,0.95,16,"Peace, Justice and Strong Institutions",,,,2024


Number of rows with missing years: 0


In [34]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

# Create a combined SDG label for better visualization
document_info_with_llm_data['sdg_label'] = document_info_with_llm_data.apply(
    lambda x: f"SDG {x['sdg_number']}: {x['sdg_name']}" if pd.notna(x['sdg_number']) and x['sdg_number'] != "" else "No SDG", 
    axis=1
)

# Clean up the SDG labels to standardize them (without dropping rows)
# Extract the SDG number to help with matching duplicates
document_info_with_llm_data['sdg_number_clean'] = document_info_with_llm_data['sdg_number'].astype(str).str.strip()

# Create a standardized label for visualization (keep original data intact)
# This will be used only for grouping in the visualization
document_info_with_llm_data['sdg_viz_label'] = document_info_with_llm_data.apply(
    lambda x: f"SDG {x['sdg_number_clean']}: {x['sdg_name']}" 
    if pd.notna(x['sdg_number']) and x['sdg_number'] != "" else "No SDG", 
    axis=1
)

# Filter to include only meaningful sustainability topics with SDG assignments
sdg_data = document_info_with_llm_data[
    (document_info_with_llm_data['sdg_label'] != "No SDG") & 
    (document_info_with_llm_data['is_meaningful_topic'] == True)
].copy()

# Count document chunks by SDG and Year, using the standardized visualization label
sdg_counts = sdg_data.groupby(['sdg_viz_label', 'Year']).size().reset_index(name='count')

# Pivot the data for plotting
sdg_pivot = sdg_counts.pivot(index='Year', columns='sdg_viz_label', values='count').fillna(0)

# Get unique SDG labels and years for plotting
sdg_labels = sdg_pivot.columns.tolist()
years = sdg_pivot.index.tolist()

# Create line chart with Plotly
fig_line = go.Figure()

# Add traces for each SDG
for sdg in sdg_labels:
    fig_line.add_trace(go.Scatter(
        x=years,
        y=sdg_pivot[sdg],
        mode='lines+markers',
        name=sdg,
        hovertemplate='Year: %{x}<br>Count: %{y}<extra></extra>'
    ))

# Update layout
fig_line.update_layout(
    title='SDG Trends Over Time (2013-2024)',
    xaxis_title='Year',
    yaxis_title='Number of Document Chunks',
    legend_title='SDG',
    hovermode='closest',
    height=600,
    width=1000
)

# Display the line chart
fig_line.show()

# Create a stacked area chart for relative proportions
# Calculate percentages for each year
sdg_pivot_pct = sdg_pivot.div(sdg_pivot.sum(axis=1), axis=0) * 100

# Create stacked area chart
fig_area = go.Figure()

# Add traces for each SDG
for sdg in sdg_labels:
    fig_area.add_trace(go.Scatter(
        x=years,
        y=sdg_pivot_pct[sdg],
        mode='lines',
        name=sdg,
        stackgroup='one',  # This makes it a stacked area chart
        hovertemplate='Year: %{x}<br>Percentage: %{y:.1f}%<extra></extra>'
    ))

# Update layout
fig_area.update_layout(
    title='Relative Proportion of SDGs Over Time',
    xaxis_title='Year',
    yaxis_title='Percentage of Document Chunks (%)',
    legend_title='SDG',
    hovermode='closest',
    height=600,
    width=1000
)

# Display the stacked area chart
fig_area.show()

# Create a heatmap to visualize SDG intensity by year
pivot_for_heatmap = sdg_counts.pivot(index='sdg_viz_label', columns='Year', values='count').fillna(0)

# Create heatmap
fig_heatmap = px.imshow(
    pivot_for_heatmap,
    labels=dict(x="Year", y="SDG", color="Count"),
    x=years,
    y=sdg_labels,
    color_continuous_scale="YlGnBu",
    aspect="auto"
)

# Update layout
fig_heatmap.update_layout(
    title='SDG Occurrence Heatmap by Year',
    xaxis_title='Year',
    yaxis_title='SDG',
    height=600,
    width=1000
)

# Add text annotations on the heatmap
for i, sdg in enumerate(sdg_labels):
    for j, year in enumerate(years):
        value = pivot_for_heatmap.loc[sdg, year]
        if value > 0:  # Only show values greater than 0
            fig_heatmap.add_annotation(
                x=year,
                y=sdg,
                text=str(int(value)),
                showarrow=False,
                font=dict(color="black" if value < pivot_for_heatmap.max().max()/2 else "white")
            )

# Display the heatmap
fig_heatmap.show()