In [1]:
import nltk
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('punkt_tab')

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import re

# from textblob import TextBlob
from pathlib import Path
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import contractions
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk import pos_tag

In [3]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
# Shared NLTK normalizers: build them a single time and reuse them everywhere.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

# Step 1: Cleaning Functions

In [4]:
# Salutations (lower-case) looked for near the beginning of a speech.
_ADDRESS_PHRASES = (
    "mr. president",                      # formal address with period
    "mr president",                       # formal address without period
    "ladies and gentlemen",               # common formal address
    "your excellency",
    "your excellence",
    "your majesties",                     # NOTE(review): was listed twice; maybe "your majesty" was also intended
    "president of the general assembly",
    "excellencies",                       # plural form of address
    "esteemed president",                 # honorific address
    "distinguished delegates",            # assembly address
    "allah",                              # religious invocation used as an opening
)


def take_out_mentions_to_the_president(text, search_sentences=2):
    """Drop the text that precedes the last opening salutation of a speech.

    The first ``search_sentences`` sentences are scanned for the salutations in
    ``_ADDRESS_PHRASES`` (case-insensitive, any run of whitespace between
    words). For every phrase found, its first occurrence is recorded; the text
    is then cut at the *latest* of those positions. The returned text therefore
    starts AT that salutation (the phrase itself is kept), not after it.

    Sentences are delimited by ``.``, ``!`` or ``?`` followed by whitespace, so
    an abbreviation such as "Mr." followed by a space counts as a sentence end.

    Args:
        text (str): The input text to process.
        search_sentences (int): Number of leading sentences to scan.
            Defaults to 2, the window used by the original implementation.

    Returns:
        str: ``text`` from the latest salutation onwards, or ``text`` unchanged
        when no salutation is found in the search window.
    """
    # Offsets are computed on the stripped text; remember how much leading
    # whitespace was dropped so they can be mapped back onto `text`.
    body = text.strip()
    leading = len(text) - len(text.lstrip())

    # Sentence boundaries expressed as offsets into `body`. Searching a slice
    # of `body` (instead of re-joined sentences) keeps every offset valid even
    # when sentences are separated by more than one whitespace character.
    boundaries = [gap.start() for gap in re.finditer(r'(?<=[.!?])\s+', body)]
    if search_sentences <= 0:
        window_end = 0
    elif len(boundaries) >= search_sentences:
        window_end = boundaries[search_sentences - 1]
    else:
        window_end = len(body)
    search_area = body[:window_end]

    # First occurrence of each salutation inside the window.
    starts = []
    for phrase in _ADDRESS_PHRASES:
        phrase_pattern = r'\s+'.join(re.escape(word) for word in phrase.split(' '))
        found = re.search(phrase_pattern, search_area, flags=re.IGNORECASE)
        if found:
            starts.append(found.start())

    if not starts:
        # No salutation found: return the text untouched.
        return text

    # Cut at the latest salutation, translated back to `text` coordinates.
    return text[leading + max(starts):]

In [5]:
def simple_clean(text):
    """Remove PDF-conversion artifacts and UN document patterns from a text.

    The steps run in a fixed order; earlier steps rely on newlines and tabs
    still being present, so whitespace is only collapsed afterwards. A final
    whitespace pass guarantees the result has no leading/trailing blanks and
    no runs of spaces left behind by the later removal steps.

    Args:
        text (str): Input text to be cleaned.

    Returns:
        str: Cleaned text with unwanted patterns removed and whitespace
        collapsed to single spaces. Case is preserved.
    """

    # 1: Remove UN document reference numbers (e.g. "20/26 15-29876")
    text = re.sub(r'\b\d{1,4}\s*/\s*\d{1,4}\s+\d{2,4}-\d{4,8}\b', ' ', text)

    # 2: Remove UN meeting record references (e.g. "A/70/PV.24")
    text = re.sub(r'\b[a-z]\s*/\s*\d+\s*/pv\s*\.\s*\d+\b', ' ', text, flags=re.IGNORECASE)

    # 3: Remove dates in DD/MM/YYYY format (e.g. "31/12/2023")
    text = re.sub(r'\d{2}\/\d{2}\/\d{4}', '', text)

    # 4: Remove form feed characters (often from PDF conversion)
    text = re.sub(r'\x0c', '', text)

    # 5: Remove parenthetical document references (e.g. "(A/70/123)" or "(A/70/123, annex)")
    text = re.sub(r'\(\s*[a-z]\s*/\s*\d+\s*/\s*\d+\s*(?:,\s*annex)?\s*\)', '', text, flags=re.IGNORECASE)

    # 6: Remove standalone line numbers/page numbers (e.g. "42" on its own line)
    text = re.sub(r'^\s*\d+\s*$\n?', '', text, flags=re.MULTILINE)

    # 7: Replace every newline with a single space
    text = re.sub(r'\n', ' ', text, flags=re.MULTILINE)

    # 8: Remove numbered list prefixes: digits, a dot and a TAB (e.g. "1.<TAB>Some text")
    text = re.sub(r'\d+\.\t', '', text)

    # 9: Remove Unicode BOM (Byte Order Mark) character if present
    text = text.replace('\ufeff', '')

    # 10: Normalize whitespace - collapse runs of whitespace into one space and trim
    text = re.sub(r'\s+', ' ', text).strip()

    # 11: Remove hyphens that are not followed by a word character.
    # NOTE(review): the lookbehind sits after the hyphen and therefore never
    # rejects anything, so a trailing hyphen as in "long- term" is removed too
    # (giving "long term"). Left unchanged on purpose.
    text = re.sub(r'\s*-(?!\w)(?<!\w)-*\s*', ' ', text)

    # 12: Remove words/numbers between parentheses
    text = re.sub(r'\([^)]*\)', '', text)

    # 13: Remove markdown-style bold/italic/blockquote symbols
    text = re.sub(r'\*\*+', '', text)       # removes **, **** etc.
    text = re.sub(r'>+', '', text)          # removes >, >>, etc.
    text = re.sub(r'-{2,}', '', text)       # removes --, --- etc.
    text = re.sub(r'[=*_~#`]+', '', text)   # removes *, _, =, #, ~, ` etc.

    # 14: Steps 11-13 can leave double, leading or trailing spaces behind
    # (e.g. "you (applause) very" -> "you  very"); normalize once more.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [6]:
def clean_speeches(text, use_simple_clean=True, disregard_mentions_to_president=True, Remove_Stopwords = False, expand=False, Remove_Punctuations = False):
    """Run the selected cleaning steps over one speech.

    Enabled steps are applied in this order: contraction expansion, basic
    cleaning, salutation trimming, stop-word removal, punctuation removal.
    Afterwards runs of spaces/tabs are always collapsed to a single space.

    Args:
        text (str): Original text.
        use_simple_clean (bool): Apply ``simple_clean``.
        disregard_mentions_to_president (bool): Apply
            ``take_out_mentions_to_the_president``.
        Remove_Stopwords (bool): Apply ``remove_stopwords``.
        expand (bool): Expand contractions with ``contractions.fix``.
        Remove_Punctuations (bool): Apply ``remove_punctuations``.

    Returns:
        str: The cleaned text.

    Note:
        ``remove_stopwords`` and ``remove_punctuations`` must be defined
        before their flags are switched on; they are only looked up when used.
    """
    # (flag, step) pairs in execution order. The lambdas defer name lookup,
    # so a helper is only required when its flag is actually set.
    pipeline = (
        (expand, lambda s: contractions.fix(s)),
        (use_simple_clean, lambda s: simple_clean(s)),
        (disregard_mentions_to_president, lambda s: take_out_mentions_to_the_president(s)),
        (Remove_Stopwords, lambda s: remove_stopwords(s)),
        (Remove_Punctuations, lambda s: remove_punctuations(s)),
    )

    for enabled, step in pipeline:
        if enabled:
            text = step(text)

    # Squeeze spaces/tabs left over from the removals (newlines are kept).
    return re.sub(r'[ \t]+', ' ', text)

# Step 2: Load the speeches

In [7]:
# Per-user project folders. Both are Path objects so that expressions such as
# `base_path / "Final_df.csv"` work whichever user is selected (a plain str
# would raise TypeError on the `/` operator).
base_path_alvaro = Path(r"C:\Users\Alvaro Millan Ruiz\OneDrive\Escritorio\BDS\Block_5\NLP\Project")
base_path_gilda = Path("/Users/szonjapike/Desktop/BDS/Block_5/NLP/TXT/")

base_path = base_path_alvaro # change according to user

In [8]:
# Read the prepared speeches table and discard the cleaned-text columns stored
# in the CSV; they are recomputed further down in this notebook.
final_df = (
    pd.read_csv(base_path / "Final_df.csv")
    .drop(
        columns=[
            "cleaned_speeches_no_postagging_expanded",
            "cleaned_speeches_no_postagging_no_expanded",
            "cleaned_speeches_postagging_no_expanded",
            "cleaned_speeches_postagging_expanded",
        ]
    )
)
final_df

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level
0,45,1990,AFG,"﻿Allow me, first of all, Sir, to congratulate ...",159,4982,1
1,45,1990,AGO,"﻿First I would like to congratulate you, Sir, ...",77,2970,2
2,45,1990,ALB,﻿It is a special pleasure for me to speak at t...,112,3783,2
3,45,1990,ARE,"﻿\nMr. President, on behalf of the delegation ...",115,3407,4
4,45,1990,ARG,"﻿At the outset, let me convey to you, Sir, my ...",81,2816,2
...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2


In [28]:
# nltk.download('wordnet')
tqdm.pandas()  # adds `progress_apply` to pandas objects

# Lightly cleaned copy of every speech for keyword matching: contractions are
# expanded and the opening salutation trimmed, while stop words and
# punctuation are kept.
final_df['speeches_for_keyword_search'] = final_df['Speech'].progress_apply(
    lambda speech: clean_speeches(
        speech,
        use_simple_clean=True,
        disregard_mentions_to_president=True,
        Remove_Stopwords=False,
        expand=True,
        Remove_Punctuations=False,
    )
)

final_df

100%|██████████| 6439/6439 [00:22<00:00, 291.29it/s]


Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search
0,45,1990,AFG,"﻿Allow me, first of all, Sir, to congratulate ...",159,4982,1,"Allow me, first of all, Sir, to congratulate y..."
1,45,1990,AGO,"﻿First I would like to congratulate you, Sir, ...",77,2970,2,"First I would like to congratulate you, Sir, o..."
2,45,1990,ALB,﻿It is a special pleasure for me to speak at t...,112,3783,2,It is a special pleasure for me to speak at th...
3,45,1990,ARE,"﻿\nMr. President, on behalf of the delegation ...",115,3407,4,"Mr. President, on behalf of the delegation of ..."
4,45,1990,ARG,"﻿At the outset, let me convey to you, Sir, my ...",81,2816,2,President of the General Assembly at its forty...
...,...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2,"Excellencies, I extend my congratulations to H..."
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1,"Ladies and Gentlemen, It is a happy coincidenc..."
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3,President of the 79th Session of the UN Genera...
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2,"LADIES AND GENTLEMEN, I congratulate you, Your..."


# Step 3: Look for the keywords

In [None]:
# Climate keywords according to the literature (all lower-case; matching is
# case-insensitive). "bali roadmap" replaces the former "ball roadmap" typo,
# and the official spelling "bali road map" is included as well. Each keyword
# appears once so that a match is never reported twice.
climate_keywords = ["climate change", "global warming", "cap and trade", "unfccc", "paris accord", "emissions trading scheme", "global average temperature",
                    "kyoto protocol", "climate resilience", "carbon dioxide", "climate politics", "framework convention on climate change",
                    "bali roadmap", "bali road map",
                    "greenhouse gas", "ghg", "greenhouse effect", "ipcc", "climate mitigation", "climate action", "emissions", "temperature", "extreme weather",
                    "global environmental change", "climate variability", "greenhouse", "low carbon", "ghge", "renewable energy", "carbon emission",
                    "co2", "climate pollutant", "climate pollutants", "carbon tax", "carbon footprint", "carbon neutrality", "net-zero", "net zero",
                    "climate crisis", "climate summit", "climate catastrophe", "climate justice", "climate emergency",
                    ]

len(climate_keywords)

In [None]:
# Climate keywords grouped by theme. A keyword may belong to several themes
# (e.g. "carbon tax" is listed under both "carbon" and "climate_policy").
climate_keywords_dict = {
    # Core Climate Science
    "climate_science": [
        "climate change", "global warming", "global average temperature",
        "climate variability", "global environmental change", "greenhouse effect",
        "temperature", "extreme weather", "climate resilience"
    ],

    # Carbon & CO2
    "carbon": [
        "carbon dioxide", "co2", "low carbon", "carbon emission",
        "carbon tax", "carbon footprint", "carbon neutrality",
        "net-zero", "net zero"
    ],

    # Greenhouse Gases
    "greenhouse_gases": [
        "greenhouse gas", "ghg", "ghge", "greenhouse", "climate pollutant",
        "climate pollutants"
    ],

    # Climate Policy & Agreements
    "climate_policy": [
        "paris accord", "kyoto protocol", "unfccc",
        "framework convention on climate change", "climate politics",
        "cap and trade", "emissions trading scheme", "carbon tax",
        "climate mitigation", "climate action"
    ],

    # Institutions & Reports
    # ("bali roadmap" replaces the former "ball roadmap" typo; the official
    # spelling "bali road map" is included as well)
    "institutions": [
        "ipcc", "bali roadmap", "bali road map"
    ],

    # Energy & Emissions
    "energy_emissions": [
        "emissions", "renewable energy", "carbon emission"
    ]
}

In [None]:
def extract_keyword_info(text, keywords):
    """
    Extracts climate-related keyword information from a text string.

    Each keyword is matched as a whole word/phrase, case-insensitively, with
    an optional trailing "s" so that simple plurals are found as well.

    Parameters:
        text (str): The speech or text in which to search for keywords.
            Anything that is not a string (e.g. NaN for a missing speech) is
            treated as containing no keyword.
        keywords (list of str): A list of climate-related keywords or phrases to match against the text.

    Returns:
        pd.Series: Two elements - the list of matched keywords (in the order
        given, each reported once) and a bool telling whether any matched.
    """
    # Missing speeches reach this function as NaN when applied to a column.
    if not isinstance(text, str):
        return pd.Series([[], False])

    matches = []
    # dict.fromkeys drops repeated keywords while keeping their order.
    for kw in dict.fromkeys(keywords):
        pattern = r'\b' + re.escape(kw) + r's?\b'  # accounts for plural forms
        if re.search(pattern, text, flags=re.IGNORECASE):
            matches.append(kw)
    contains_keyword = len(matches) > 0
    return pd.Series([matches, contains_keyword])

In [34]:
# One row in, two values out: the matched keywords and an any-match flag.
final_df[['matched_climate_keywords', 'contains_climate_keyword']] = (
    final_df['speeches_for_keyword_search']
    .apply(extract_keyword_info, args=(climate_keywords,))
)

final_df

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search,matched_climate_keywords,contains_climate_keyword
0,45,1990,AFG,"﻿Allow me, first of all, Sir, to congratulate ...",159,4982,1,"Allow me, first of all, Sir, to congratulate y...",[],False
1,45,1990,AGO,"﻿First I would like to congratulate you, Sir, ...",77,2970,2,"First I would like to congratulate you, Sir, o...",[],False
2,45,1990,ALB,﻿It is a special pleasure for me to speak at t...,112,3783,2,It is a special pleasure for me to speak at th...,[],False
3,45,1990,ARE,"﻿\nMr. President, on behalf of the delegation ...",115,3407,4,"Mr. President, on behalf of the delegation of ...",[],False
4,45,1990,ARG,"﻿At the outset, let me convey to you, Sir, my ...",81,2816,2,President of the General Assembly at its forty...,[],False
...,...,...,...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2,"Excellencies, I extend my congratulations to H...","[climate change, climate action, extreme weather]",True
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1,"Ladies and Gentlemen, It is a happy coincidenc...",[climate change],True
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3,President of the 79th Session of the UN Genera...,"[climate change, climate action, emissions, ex...",True
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2,"LADIES AND GENTLEMEN, I congratulate you, Your...","[climate change, extreme weather]",True


In [30]:
# Define climate-related keywords (add variants as needed).
# NOTE(review): this re-binds `climate_keywords`; unlike the list defined
# earlier in the notebook it does not contain the "climate crisis" family of
# terms, so cells run after this one use the shorter list.
# "bali roadmap" replaces the former "ball roadmap" typo; the official
# spelling "bali road map" is included as well.
climate_keywords = ["climate change", "global warming", "cap and trade", "unfccc", "paris accord", "emissions trading scheme", "global average temperature",
                    "kyoto protocol", "climate resilience", "carbon dioxide", "climate politics", "framework convention on climate change",
                    "bali roadmap", "bali road map",
                    "greenhouse gas", "ghg", "greenhouse effect", "ipcc", "climate mitigation", "climate action", "emissions", "temperature", "extreme weather",
                    "global environmental change", "climate variability", "greenhouse", "low carbon", "ghge", "renewable energy", "carbon emission",
                    "co2", "climate pollutant", "climate pollutants", "carbon tax", "carbon footprint", "carbon neutrality", "net-zero", "net zero"
                    ]

# One regex matching any keyword as a whole word/phrase (optional plural "s"),
# the same matching rule that extract_keyword_info applies per keyword.
pattern = re.compile(
    r'\b(?:' + '|'.join(map(re.escape, climate_keywords)) + r')s?\b',
    flags=re.IGNORECASE,
)

# Filter rows where speeches contain any keyword (missing speeches never match)
climate_speeches = final_df[final_df['speeches_for_keyword_search'].str.contains(pattern, na=False)]
climate_speeches

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search
5,45,1990,ATG,"﻿Please accept my country's congratulations, S...",115,3134,3,"Please accept my country's congratulations, Si..."
7,45,1990,AUT,"﻿\n\nI am pleased, Sir, to congratulate you on...",162,4743,4,"I am pleased, Sir, to congratulate you on your..."
11,45,1990,BFA,"﻿\nMr. President, events in Eastern and Centra...",91,2948,1,"Mr. President, events in Eastern and Central E..."
12,45,1990,BGD,"﻿\nMr. President, warm felicitations are due y...",180,3107,1,"Mr. President, warm felicitations are due you ..."
26,45,1990,CHN,﻿I should like to begin by warmly congratulati...,115,2992,1,I should like to begin by warmly congratulatin...
...,...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2,"Excellencies, I extend my congratulations to H..."
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1,"Ladies and Gentlemen, It is a happy coincidenc..."
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3,President of the 79th Session of the UN Genera...
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2,"LADIES AND GENTLEMEN, I congratulate you, Your..."


In [14]:
pip install transformers

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
   ---------------------------------------- 0.0/10.5 MB ? eta -:--:--
   ---- ----------------------------------- 1.0/10.5 MB 4.7 MB/s eta 0:00:02
   ------- -------------------------------- 1.8/10.5 MB 5.1 MB/s eta 0:00:02
   --------------- ------------------------ 3.9/10.5 MB 6.4 MB/s eta 0:00:02
   ------------------------ --------------- 6.3/10.5 MB 7.5 MB/s eta 0:00:01
   --------------------------------- ------ 8.7/10.5 MB 8.3 MB/s eta 0:00:01
   --------


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Concept vocabularies: concept name -> phrases that signal the concept
concepts = dict(
    climate=["climate change", "climate action", "the climate"],
    emissions=["emissions", "dangerous emissions", "carbon emissions"],
)

# ClimateBERT tokenizer and encoder, both loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained("Climatebert/distilroberta-base-climate-f")
model = AutoModel.from_pretrained("Climatebert/distilroberta-base-climate-f")

# Function to get embedding of a sentence
def get_sentence_embedding(sentence):
    """Embed text as the mean of the model's last-layer token vectors.

    The mean is weighted by the tokenizer's attention mask, so padding
    positions (present when a list of sentences of different lengths is
    passed) do not dilute the result. For a single sentence there is no
    padding and this is the plain mean over all tokens.

    Args:
        sentence (str or list of str): Text to embed; inputs longer than the
            tokenizer's limit are truncated.

    Returns:
        numpy.ndarray: The pooled embedding with size-1 dimensions squeezed
        out (a 1-D vector for a single sentence).
    """
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Shape the mask like the hidden states so it can zero out padding vectors.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return (token_sum / token_count).squeeze().numpy()

# Average embedding of the sentences that mention a concept
def compute_concept_embedding(sentences, concept_terms):
    """Average the embeddings of all sentences that mention a concept.

    A sentence is selected when its lower-cased text contains at least one of
    ``concept_terms`` as a substring.

    Args:
        sentences: Iterable of sentence strings.
        concept_terms: Terms to look for (compared against lower-cased text).

    Returns:
        The element-wise mean of the selected sentences' embeddings, or
        ``None`` when no sentence mentions any of the terms.
    """
    # Pass 1: pick the sentences that mention the concept.
    selected = []
    for sentence in sentences:
        for term in concept_terms:
            if term in sentence.lower():
                selected.append(sentence)
                break

    if len(selected) == 0:
        return None

    # Pass 2: embed the selected sentences and average them.
    vectors = list(map(get_sentence_embedding, selected))
    return np.mean(vectors, axis=0)



tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.15M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/752 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at Climatebert/distilroberta-base-climate-f and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
