In [1]:
import nltk
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('punt_tab')

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import re

# from textblob import TextBlob
from pathlib import Path
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import contractions
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk import pos_tag

In [2]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
# Initialize stemmer/lemmatizer (run once)
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Step 1: Cleaning Functions

In [3]:
def take_out_mentions_to_the_president(text):
    """Remove introductory phrases addressing the president or other dignitaries from text.
    Args:
        text (str): The input text to process    
    Returns:
        str: The text with introductory address phrases removed, or original text if none found
    """
    
    # Split into sentences (simple regex)
    # The regex looks for sentence-ending punctuation followed by whitespace
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    
    # Get first 4 sentences as search area - addresses are typically at the beginning
    search_area = ' '.join(sentences[:2]).lower()
    
    # Phrases to search for (lowercase)
    # We look for various forms of presidential/dignitary addresses
    # Store the index where each phrase is found (-1 if not found)
    possible_starts = [
        search_area.lower().find("mr. president"),  # Formal address with period
        search_area.lower().find("mr president"),   # Formal address without period
        search_area.lower().find("ladies and gentlemen"),  # Common formal address
        search_area.lower().find("your excellency"),  # Covers "your excellence" and "your excellency"
        search_area.lower().find("your excellence"),  # Covers "your excellence" and "your excellency"
        search_area.lower().find("your majesties"),  # 
        search_area.lower().find("your majesties"),  # 
        search_area.lower().find("president of the general assembly"),
        search_area.lower().find("excellencies"),     # Plural form of address
        search_area.lower().find("esteemed president"),  # Honorific address
        search_area.lower().find("distinguished delegates"),  # Assembly address
        search_area.lower().find("allah"),  # Address to members of an assembly
    ]
    
    # Filter out phrases that weren't found (-1) and keep only valid indices
    starts = [start for start in possible_starts if start != -1]
    
    # Get the latest occurrence of any address phrase
    start = max(starts) if len(starts) > 0 else -1
    
    if start == -1:
        # No address phrases found - log this and return full text
        # print(f"Could not find probable start in the text of {url}")
        start = 0
    
    # Return text starting from after the address phrase
    return text[start:]

In [4]:
def simple_clean(text):
    """Cleans text by removing common formatting artifacts from PDF conversions and UN document patterns.
    
    Args:
        text (str): Input text to be cleaned
        
    Returns:
        str: Cleaned text with unwanted patterns removed
    """
    
    # Convert to lowercase for consistent processing 
    # text = text.lower()
  
    # 1: Remove UN document reference numbers (e.g. "20/26 15-29876")
    text = re.sub(r'\b\d{1,4}\s*/\s*\d{1,4}\s+\d{2,4}-\d{4,8}\b', ' ', text)
    
    # 2: Remove UN meeting record references (e.g. "A/70/PV.24")
    text = re.sub(r'\b[a-z]\s*/\s*\d+\s*/pv\s*\.\s*\d+\b', ' ', text, flags=re.IGNORECASE)
    
    # 3: Remove dates in DD/MM/YYYY format (e.g. "31/12/2023")
    text = re.sub(r'\d{2}\/\d{2}\/\d{4}', '', text) 
  
    # 4: Remove form feed characters (often from PDF conversion)
    text = re.sub(r'\x0c', '', text) 
  
    # 5: Remove parenthetical document references (e.g. "(A/70/123)" or "(A/70/123, annex)")
    text = re.sub(r'\(\s*[a-z]\s*/\s*\d+\s*/\s*\d+\s*(?:,\s*annex)?\s*\)', '', text, flags=re.IGNORECASE)
    
    # 6: Remove standalone line numbers/page numbers (e.g. "42" on its own line)
    text = re.sub(r'^\s*\d+\s*$\n?', '', text, flags=re.MULTILINE)
    
    # 7: Normalize newlines - replace all with single spaces
    text = re.sub(r'\n', ' ', text, flags=re.MULTILINE)

    # 8: Remove numbered list prefixes (e.g. "1.    Some text")
    text = re.sub(r'\d+\.\t', '', text)

    # 9: Remove Unicode BOM (Byte Order Mark) character if present
    text = text.replace('\ufeff', '')
    
    # 10: Normalize whitespace - collapse multiple spaces into one and trim
    text = re.sub(r'\s+', ' ', text).strip()

    # 11. Remove standalone hyphens
    text = re.sub(r'\s*-(?!\w)(?<!\w)-*\s*', ' ', text)

    # 12: Remove punctuations
    #text = text.replace('—', '').replace(',', '').replace(':', '').replace('’', '').replace('“', '').replace("”", '').replace(";", '').replace("''", '')
    
    # 12: Remove words/numbers between parenthesis
    text = re.sub(r'\([^)]*\)', '', text)

    #13: Remove markdown-style bold/italic/blockquote symbols (all below are subpoints rather that main ones)
    text = re.sub(r'\*\*+', '', text)       # removes **, **** etc.
    text = re.sub(r'>+', '', text)          # removes >, >>, etc.
    text = re.sub(r'-{2,}', '', text)       # removes --, --- etc.
    text = re.sub(r'[=*_~#`]+', '', text)   # removes *, _, =, #, ~, ` etc.
    
    return text

In [5]:
def clean_speeches(text, use_simple_clean=True, disregard_mentions_to_president=True, Remove_Stopwords = False, expand=False, Remove_Punctuations = False):
    """Load a text file and apply cleaning operations. 
    Args:
        text: Original text
        use_simple_clean (bool): Whether to apply basic text cleaning
        disregard_mentions_to_president (bool): Whether to remove mentions to the president
        Returns:
        str: Cleaned text from the file"""
    
    if expand: 
        text = contractions.fix(text) 

    # Apply basic text cleaning if the flag is set
    if use_simple_clean:
        text = simple_clean(text)  # Assumes simple_clean is a custom function defined elsewhere

    # Remove mentions to the president if the flag is set
    if disregard_mentions_to_president:
        text = take_out_mentions_to_the_president(text)  # Also assumes this function is defined elsewhere
    
    # Remove Stopwrods
    if Remove_Stopwords:
        text = remove_stopwords(text)

    # Remove Punctuations
    if Remove_Punctuations:
        text = remove_punctuations(text)
        
    # Replace only spaces/tabs induced from the stopwords
    text = re.sub(r'[ \t]+', ' ', text)

    # Return the cleaned (or original) text
    return text

# Step 2: Load the speeches

In [3]:
base_path_alvaro = Path(r"C:\Users\Alvaro Millan Ruiz\OneDrive\Escritorio\BDS\Block_5\NLP\Project")
base_path_gilda = "/Users/szonjapike/Desktop/BDS/Block_5/NLP/TXT/"

base_path = base_path_gilda # change according to user

In [4]:
path_dataset_gilda = "/Users/szonjapike/Desktop/BDS/Block_5/NLP/Project/"
base_path = path_dataset_gilda

In [11]:
final_df = pd.read_csv(base_path + "Final_df.csv")
final_df = final_df.drop(columns=["cleaned_speeches_no_postagging_expanded",
                                   "cleaned_speeches_no_postagging_no_expanded",
                                     "cleaned_speeches_postagging_no_expanded",
                                       "cleaned_speeches_postagging_expanded"])
final_df

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level
0,45,1990,NZL,"﻿\nI should like to congratulate you, Sir, on ...",145,3512,4
1,45,1990,EGY,"﻿First of all, I would like to extend may dele...",76,2532,1
2,45,1990,CPV,﻿I should like to begin by offering Mr. de Mar...,71,2432,2
3,45,1990,UGA,"﻿In 1967, I had the privilege of addressing th...",132,3508,1
4,45,1990,NLD,"﻿\nMay I, first of all, extend to Mr. de Marco...",167,4134,4
...,...,...,...,...,...,...,...
6434,79,2024,CIV,Mr President of the 79th Session of the United...,41,1206,2
6435,79,2024,HND,"Madam President of this General Assembly, \nM...",54,1367,2
6436,79,2024,MKD,Distinguished President of the United Nations ...,101,2804,3
6437,79,2024,KOR,"Mr. President,\nMr. Secretary-General,\nDistin...",31,821,4


In [12]:
# nltk.download('wordnet')
tqdm.pandas()  # Enable pandas integration

final_df['speeches_for_keyword_search'] = final_df['Speech'].progress_apply(
    lambda x: clean_speeches(
        x, 
        expand=True,
        use_simple_clean=True, 
        disregard_mentions_to_president=True, 
        Remove_Stopwords=False,        # No removal of stopwords
        Remove_Punctuations= False
    )
)

final_df 

  0%|          | 4/6439 [00:00<03:59, 26.90it/s]

100%|██████████| 6439/6439 [00:59<00:00, 108.25it/s]


Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search
0,45,1990,NZL,"﻿\nI should like to congratulate you, Sir, on ...",145,3512,4,"I should like to congratulate you, Sir, on you..."
1,45,1990,EGY,"﻿First of all, I would like to extend may dele...",76,2532,1,"First of all, I would like to extend may deleg..."
2,45,1990,CPV,﻿I should like to begin by offering Mr. de Mar...,71,2432,2,I should like to begin by offering Mr. de Marc...
3,45,1990,UGA,"﻿In 1967, I had the privilege of addressing th...",132,3508,1,"In 1967, I had the privilege of addressing thi..."
4,45,1990,NLD,"﻿\nMay I, first of all, extend to Mr. de Marco...",167,4134,4,"May I, first of all, extend to Mr. de Marco my..."
...,...,...,...,...,...,...,...,...
6434,79,2024,CIV,Mr President of the 79th Session of the United...,41,1206,2,"Ladies and Gentlemen, Heads of State and Gover..."
6435,79,2024,HND,"Madam President of this General Assembly, \nM...",54,1367,2,"Madam President of this General Assembly, Mr. ..."
6436,79,2024,MKD,Distinguished President of the United Nations ...,101,2804,3,"Ladies and Gentlemen, Being invited to address..."
6437,79,2024,KOR,"Mr. President,\nMr. Secretary-General,\nDistin...",31,821,4,"Mr. President, Mr. Secretary-General, Distingu..."


# Step 3: Look for the keywords

In [5]:
# Climate keyword according to the literature
climate_keywords = ["climate change", "global warming", "cap and trade", "unfccc", "paris accord", "emissions trading scheme", "global average temperature",
                    "kyoto protocol", "climate resilience", "carbon dioxide", "climate politics", "framework convention on climate change", "ball roadmap", 
                    "greenhouse gas", "ghg", "greenhouse effect", "ipcc", "climate mitigation", "climate action", "emissions", "temperature", "extreme weather", 
                    "global environmental change", "climate variability", "greenhouse", "low carbon", "ghge", "renewable energy", "carbon emission",
                    "co2", "climate pollutant", "climate pollutants", "carbon tax", "carbon footprint", "carbon neutrality", "net-zero", "net zero", "climate crisis",
                    "climate crisis", "climate summit", "climate catastrophe", "climate justice", "climate emergency", 
                    ]

len(climate_keywords)

43

In [6]:
# This bigger dictionary can be used later for the the ClimateBERT
climate_keywords_dict = {
    # Core Climate Science
    "climate_science": [
        "climate change", "global warming", "global average temperature",
        "climate variability", "global environmental change", "greenhouse effect",
        "temperature", "extreme weather", "climate resilience"
    ],
    
    # Carbon & CO2
    "carbon": [
        "carbon dioxide", "co2", "low carbon", "carbon emission",
        "carbon tax", "carbon footprint", "carbon neutrality",
        "net-zero", "net zero"
    ],
    
    # Greenhouse Gases
    "greenhouse_gases": [
        "greenhouse gas", "ghg", "ghge", "greenhouse", "climate pollutant",
        "climate pollutants"
    ],
    
    # Climate Policy & Agreements
    "climate_policy": [
        "paris accord", "kyoto protocol", "unfccc",
        "framework convention on climate change", "climate politics",
        "cap and trade", "emissions trading scheme", "carbon tax",
        "climate mitigation", "climate action"
    ],
    
    # Institutions & Reports
    "institutions": [
        "ipcc", "ball roadmap"
    ],
    
    # Energy & Emissions
    "energy_emissions": [
        "emissions", "renewable energy", "carbon emission"
    ]
}

In [5]:
def extract_keyword_info(text, keywords):
    """
    Extracts climate-related keyword information from a text string.

    Parameters:
        text (str): The speech or text in which to search for keywords.
        keywords (list of str): A list of climate-related keywords or phrases to match against the text.
    """
    matches = []
    for kw in keywords:
        pattern = r'\b' + re.escape(kw) + r's?\b'  # Now accounts for plural forms
        if re.search(pattern, text, flags=re.IGNORECASE):
            matches.append(kw)
    contains_keyword = len(matches) > 0
    return pd.Series([matches, contains_keyword])

In [16]:
final_df[['matched_climate_keywords', 'contains_climate_keyword']] = final_df['speeches_for_keyword_search'].apply(
    lambda text: extract_keyword_info(text, climate_keywords)
)

final_df

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search,matched_climate_keywords,contains_climate_keyword
0,45,1990,NZL,"﻿\nI should like to congratulate you, Sir, on ...",145,3512,4,"I should like to congratulate you, Sir, on you...","[climate change, greenhouse gas, emissions, gr...",True
1,45,1990,EGY,"﻿First of all, I would like to extend may dele...",76,2532,1,"First of all, I would like to extend may deleg...",[],False
2,45,1990,CPV,﻿I should like to begin by offering Mr. de Mar...,71,2432,2,I should like to begin by offering Mr. de Marc...,[],False
3,45,1990,UGA,"﻿In 1967, I had the privilege of addressing th...",132,3508,1,"In 1967, I had the privilege of addressing thi...",[],False
4,45,1990,NLD,"﻿\nMay I, first of all, extend to Mr. de Marco...",167,4134,4,"May I, first of all, extend to Mr. de Marco my...",[],False
...,...,...,...,...,...,...,...,...,...,...
6434,79,2024,CIV,Mr President of the 79th Session of the United...,41,1206,2,"Ladies and Gentlemen, Heads of State and Gover...","[climate change, global warming, climate action]",True
6435,79,2024,HND,"Madam President of this General Assembly, \nM...",54,1367,2,"Madam President of this General Assembly, Mr. ...",[],False
6436,79,2024,MKD,Distinguished President of the United Nations ...,101,2804,3,"Ladies and Gentlemen, Being invited to address...","[climate change, greenhouse gas, climate actio...",True
6437,79,2024,KOR,"Mr. President,\nMr. Secretary-General,\nDistin...",31,821,4,"Mr. President, Mr. Secretary-General, Distingu...",[],False


In [17]:
# Define climate-related keywords (add variants as needed)
# keywords = ['CO2', 'carbon dioxide', 'emissions', 'greenhouse gas', 'GHG', 'climate change']

climate_keywords = ["climate change", "global warming", "cap and trade", "unfccc", "paris accord", "emissions trading scheme", "global average temperature",
                    "kyoto protocol", "climate resilience", "carbon dioxide", "climate politics", "framework convention on climate change", "ball roadmap", 
                    "greenhouse gas", "ghg", "greenhouse effect", "ipcc", "climate mitigation", "climate action", "emissions", "temperature", "extreme weather", 
                    "global environmental change", "climate variability", "greenhouse", "low carbon", "ghge", "renewable energy", "carbon emission",
                    "co2", "climate pollutant", "climate pollutants", "carbon tax", "carbon footprint", "carbon neutrality", "net-zero", "net zero"
                    ]

# Create a regex pattern to match any keyword
pattern = re.compile('|'.join(map(re.escape, climate_keywords)), flags=re.IGNORECASE)

# Filter rows where speeches contain any keyword
climate_speeches = final_df[final_df['speeches_for_keyword_search'].str.contains(pattern, na=False)]
climate_speeches

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search,matched_climate_keywords,contains_climate_keyword
0,45,1990,NZL,"﻿\nI should like to congratulate you, Sir, on ...",145,3512,4,"I should like to congratulate you, Sir, on you...","[climate change, greenhouse gas, emissions, gr...",True
12,45,1990,ITA,"﻿Sir, on taking the floor on behalf of the Eur...",151,4568,4,"Sir, on taking the floor on behalf of the Euro...",[],False
17,45,1990,FJI,﻿\nMr. President it gives me great pleasure to...,101,2666,2,President of the General Assembly at its forty...,[global warming],True
19,45,1990,PNG,﻿On behalf of the Government and people\nof Pa...,113,2923,2,On behalf of the Government and people of Papu...,"[greenhouse effect, greenhouse]",True
23,45,1990,FIN,"﻿\nI take great pleasure in seeing you, Sir, a...",92,2080,4,"I take great pleasure in seeing you, Sir, a di...","[carbon dioxide, emissions]",True
...,...,...,...,...,...,...,...,...,...,...
6431,79,2024,CHE,Mr President\nSecretary-General\nExcellencies\...,66,1314,4,Ladies and gentlemen On the morning of 3 March...,[climate change],True
6432,79,2024,CUB,"**Mr President,** \n**Mr Secretary-General,**...",76,2433,3,"Mr President, Mr Secretary-General, Let me beg...","[climate change, global average temperature, t...",True
6434,79,2024,CIV,Mr President of the 79th Session of the United...,41,1206,2,"Ladies and Gentlemen, Heads of State and Gover...","[climate change, global warming, climate action]",True
6436,79,2024,MKD,Distinguished President of the United Nations ...,101,2804,3,"Ladies and Gentlemen, Being invited to address...","[climate change, greenhouse gas, climate actio...",True


In [7]:
climate_speeches = pd.read_csv(base_path + "climate_speeches.csv")

In [8]:
from nltk.tokenize import sent_tokenize
# Function to extract sentences containing climate-related keywords
def extract_climate_sentences(text):
    if not isinstance(text, str):
        return []
    sentences = sent_tokenize(text)
    return [s for s in sentences if any(kw in s.lower() for kw in climate_keywords)]

climate_speeches['climate_sentences'] = climate_speeches['speeches_for_keyword_search'].apply(extract_climate_sentences)


In [9]:
# Drop the speeches that don't talk about climate change
climate_speeches = climate_speeches[climate_speeches['contains_climate_keyword']]

In [10]:
from nltk.tokenize import sent_tokenize


def extract_climate_sentences(text):
    if not isinstance(text, str):
        return []
    sentences = sent_tokenize(text)
    return [s for s in sentences if any(kw in s.lower() for kw in climate_keywords)]

climate_speeches['climate_sentences'] = climate_speeches['speeches_for_keyword_search'].apply(extract_climate_sentences)
climate_speeches 

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search,matched_climate_keywords,contains_climate_keyword,climate_sentences
0,45,1990,NZL,"﻿\nI should like to congratulate you, Sir, on ...",145,3512,4,"I should like to congratulate you, Sir, on you...","['climate change', 'greenhouse gas', 'emission...",True,"[But many of those are small islands, atoll co..."
2,45,1990,FJI,﻿\nMr. President it gives me great pleasure to...,101,2666,2,President of the General Assembly at its forty...,['global warming'],True,"[The effects of global warming, especially on ..."
3,45,1990,PNG,﻿On behalf of the Government and people\nof Pa...,113,2923,2,On behalf of the Government and people of Papu...,"['greenhouse effect', 'greenhouse']",True,"[We are greatly concerned that, in spite of om..."
4,45,1990,FIN,"﻿\nI take great pleasure in seeing you, Sir, a...",92,2080,4,"I take great pleasure in seeing you, Sir, a di...","['carbon dioxide', 'emissions']",True,"[I am thinking, for example, of policies to re..."
5,45,1990,DEU,"﻿I wish first of all to offer you, Mr. Preside...",292,6149,4,"Mr. President, my warm congratulations on your...","['carbon dioxide', 'greenhouse effect', 'emiss...",True,[Another key problem is the changing of the wo...
...,...,...,...,...,...,...,...,...,...,...,...
3189,79,2024,CHE,Mr President\nSecretary-General\nExcellencies\...,66,1314,4,Ladies and gentlemen On the morning of 3 March...,['climate change'],True,[Climate change and loss of biodiversity are h...
3190,79,2024,CUB,"**Mr President,** \n**Mr Secretary-General,**...",76,2433,3,"Mr President, Mr Secretary-General, Let me beg...","['climate change', 'global average temperature...",True,"[Climate change is advancing inexorably., In J..."
3191,79,2024,CIV,Mr President of the 79th Session of the United...,41,1206,2,"Ladies and Gentlemen, Heads of State and Gover...","['climate change', 'global warming', 'climate ...",True,"[Mr President, The progress achieved by our co..."
3192,79,2024,MKD,Distinguished President of the United Nations ...,101,2804,3,"Ladies and Gentlemen, Being invited to address...","['climate change', 'greenhouse gas', 'climate ...",True,[With the alarming level of greenhouse gas emi...


In [14]:
climate_speeches[(climate_speeches['Income Level'] == 1) & (climate_speeches['Year'] == 2019)]

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search,matched_climate_keywords,contains_climate_keyword,climate_sentences
2213,74,2019,RWA,The international community stands at a crossr...,30,666,1,The international community stands at a crossr...,"['climate change', 'global warming', 'climate ...",True,[Never before have we had such well-defined ro...
2216,74,2019,ETH,We are facing unprecedented challenges. Climat...,80,1787,1,We are facing unprecedented challenges. Climat...,"['climate change', 'climate action']",True,[Climate change is real and poses a real threa...
2221,74,2019,BDI,"At the beginning of my statement, my delegatio...",85,3411,1,"At the beginning of my statement, my delegatio...","['climate change', 'climate action', 'climate ...",True,[The great qualities and skills that we all re...
2224,74,2019,MLI,"At the outset, as I take the floor before the ...",85,3153,1,"At the outset, as I take the floor before the ...","['climate change', 'climate action']",True,[This is particularly true with regard to the ...
2226,74,2019,NER,"At the outset, I would like to express my warm...",96,3057,1,"At the outset, I would like to express my warm...","['climate change', 'climate action', 'emission...",True,"[The theme of this year’s session, “Galvanizin..."
2228,74,2019,SSD,It is my honour to address the General Assembl...,77,2452,1,It is my honour to address the General Assembl...,"['climate change', 'climate action']",True,[The theme he has selected for this session — ...
2240,74,2019,AFG,It is an honour for me to and stand at this pr...,95,2251,1,It is an honour for me to and stand at this pr...,"['climate change', 'framework convention on cl...",True,"[I congratulate you, Mr. President, on assumin..."
2247,74,2019,COD,I would first like to congratulate Ambassador ...,127,3894,1,President of the General Assembly at its seven...,"['climate change', 'climate action', 'renewabl...",True,"[The theme of the current session, “Galvanizin..."
2249,74,2019,TJK,It gives me great pleasure to join in the hear...,58,1734,1,President of the General Assembly at its seven...,"['climate change', 'emissions', 'temperature',...",True,"[However, we are also dealing with an array of..."
2257,74,2019,CAF,As I join the great family of the internationa...,60,1972,1,"Mr. President, Mr. Tijjani Muhammad-Bande, my ...","['climate change', 'framework convention on cl...",True,[The seventy-fourth session is being held at a...


# Climate BERT

In [14]:
! pip install transformers sentence-transformers

Collecting transformers
  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-macosx_10_12_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl.metadata (3.8 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface-hub<1.0,>=0.30.0->transformers)
  Using cached hf_xet-1.1.3-cp37-abi3-macosx_10_12_x86_64.whl.metadata (879 bytes)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m979.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading sentence_tr

In [11]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Load ClimateBERT
tokenizer = AutoTokenizer.from_pretrained("Climatebert/distilroberta-base-climate-f")
model = AutoModel.from_pretrained("Climatebert/distilroberta-base-climate-f")

# Define concept vocabularies
concepts = {
    "climate": ["climate change", "climate action", "the climate"],
    "emissions": ["emissions", "dangerous emissions", "carbon emissions"]
}

# Function to get embedding of a sentence
def get_sentence_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Function to get average concept embedding from sentence list
def compute_concept_embedding(sentences, concept_terms):
    filtered = [s for s in sentences if any(term in s.lower() for term in concept_terms)]
    if not filtered:
        return None
    embeddings = [get_sentence_embedding(s) for s in filtered]
    return np.mean(embeddings, axis=0)


Some weights of RobertaModel were not initialized from the model checkpoint at Climatebert/distilroberta-base-climate-f and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Trying Embeddings on a small subset

In [19]:
# Picking up two countries of the same income level
countries = ['GMB','ESP']  # Example from Income Level 1
years = [2015, 2019]         # Short range for simplicity

subset_embeddings = climate_speeches[
    (climate_speeches['ISO-Code'].isin(countries)) &
    (climate_speeches['Year'].isin(years)) &
    (climate_speeches['contains_climate_keyword'])
].copy()

In [30]:
subset_embeddings.iloc[0]['climate_sentences']

['To begin with, protecting our environment must emphatically remain a major priority for this global body because registered climate changes over the course of history have significantly impacted negatively on our planet Earth and demand our immediate and permanent attention.',
 '“This body must act with greater commitment and resolve to establish sustainable conservation programmes, with production patterns aimed at effectively combating climate change, restoring our ecosystems, promoting forest management and reversing desertification and land degradation.']

In [31]:
subset_embeddings.iloc[1]['climate_sentences']

['I am thinking of the challenge of climate change.']

In [21]:
# Parse climate sentences
import ast
subset_embeddings['climate_sentences'] = subset_embeddings['climate_sentences'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)


In [22]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("climatebert/distilroberta-base-climate-f")
model = AutoModel.from_pretrained("climatebert/distilroberta-base-climate-f")

def get_sentence_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def get_document_embedding(sentences):
    if not isinstance(sentences, list) or len(sentences) == 0:
        return None
    embeddings = [get_sentence_embedding(s) for s in sentences]
    return np.mean(embeddings, axis=0)

Some weights of RobertaModel were not initialized from the model checkpoint at climatebert/distilroberta-base-climate-f and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Apply document level embedding
subset_embeddings['climate_embedding'] = subset_embeddings['climate_sentences'].apply(get_document_embedding)

In [24]:
# Aggregate to Country-Year Level
agg = subset_embeddings.dropna(subset=['climate_embedding']).groupby(['ISO-Code', 'Year'])['climate_embedding'].apply(
    lambda vecs: np.mean(np.stack(vecs), axis=0)
).reset_index()



In [25]:
agg.iloc[0]['climate_embedding']

array([ 6.28137961e-02, -1.26688816e-02,  7.72315189e-02, -2.70027295e-02,
        5.73057711e-01, -1.10701799e-01,  3.09718046e-02,  6.61706254e-02,
        1.35978445e-01, -4.94874306e-02, -6.62455186e-02,  6.69420511e-02,
       -1.85810849e-02, -6.94348589e-02,  7.77413324e-02,  2.88998932e-01,
       -1.66073497e-02,  6.12636916e-02,  1.00362957e-01, -1.55157745e-01,
       -3.70285213e-02,  2.85723228e-02,  5.11427037e-02, -1.08260699e-02,
       -3.84135805e-02,  6.49955496e-02,  1.29769864e-02,  3.76029871e-02,
        1.74974967e-02,  9.30849090e-03, -1.35733604e-01, -1.05362304e-01,
        7.95035735e-02,  4.28649150e-02, -3.65385860e-02,  7.50335827e-02,
        6.43023774e-02, -1.50109544e-01,  1.98140979e-01,  1.18922018e-01,
        1.97497327e-02, -2.35543981e-01,  5.79413362e-02,  1.04218625e-01,
       -6.28283992e-02,  7.23872408e-02, -1.63482100e-01, -7.74684474e-02,
        8.08516145e-02, -5.01379669e-02, -1.25766128e-01,  1.41181901e-01,
       -3.02212294e-02, -

In [26]:
# Compare country embeddings using cosine similarity

from sklearn.metrics.pairwise import cosine_similarity

# Make matrix of embeddings
embedding_matrix = np.stack(agg['climate_embedding'].values)

# Compute cosine similarity between country-year entries
similarity_matrix = cosine_similarity(embedding_matrix)

# Attach labels
labels = agg['ISO-Code'] + "_" + agg['Year'].astype(str)
sim_df = pd.DataFrame(similarity_matrix, index=labels, columns=labels)


In [27]:
print(sim_df.round(2))

          ESP_2015  ESP_2019  GMB_2015  GMB_2019
ESP_2015      1.00      0.97      0.95      0.97
ESP_2019      0.97      1.00      0.99      1.00
GMB_2015      0.95      0.99      1.00      0.99
GMB_2019      0.97      1.00      0.99      1.00


## Example of usage of embeddings

In [10]:
sample_sentences = [
    "We must act urgently to address climate change and protect future generations.",
    "Carbon emissions have reached dangerous levels.",
    "Climate action must be fair and equitable.",
    "We pledge to reduce emissions by 40% by 2030."
]

# Compute concept embeddings
climate_emb = compute_concept_embedding(sample_sentences, concepts["climate"])
emissions_emb = compute_concept_embedding(sample_sentences, concepts["emissions"])

# Compare the two with cosine similarity
similarity = cosine_similarity([climate_emb], [emissions_emb])[0][0]
print(f"Cosine similarity between 'climate' and 'emissions': {similarity:.4f}")


Cosine similarity between 'climate' and 'emissions': 0.9772
