In [1]:
import nltk
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('punt_tab')

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import re
from pathlib import Path
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import contractions
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk import pos_tag

In [2]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
# Initialize stemmer/lemmatizer (run once)
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Step 1: Cleaning Functions

In [3]:
def take_out_mentions_to_the_president(text):
    """Remove introductory phrases addressing the president or other dignitaries from text.
    Args:
        text (str): The input text to process    
    Returns:
        str: The text with introductory address phrases removed, or original text if none found
    """
    
    # Split into sentences (simple regex)
    # The regex looks for sentence-ending punctuation followed by whitespace
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    
    # Get first 4 sentences as search area - addresses are typically at the beginning
    search_area = ' '.join(sentences[:2]).lower()
    
    # Phrases to search for (lowercase)
    # We look for various forms of presidential/dignitary addresses
    # Store the index where each phrase is found (-1 if not found)
    possible_starts = [
        search_area.lower().find("mr. president"),  # Formal address with period
        search_area.lower().find("mr president"),   # Formal address without period
        search_area.lower().find("ladies and gentlemen"),  # Common formal address
        search_area.lower().find("your excellency"),  # Covers "your excellence" and "your excellency"
        search_area.lower().find("your excellence"),  # Covers "your excellence" and "your excellency"
        search_area.lower().find("your majesties"),  # 
        search_area.lower().find("your majesties"),  # 
        search_area.lower().find("president of the general assembly"),
        search_area.lower().find("excellencies"),     # Plural form of address
        search_area.lower().find("esteemed president"),  # Honorific address
        search_area.lower().find("distinguished delegates"),  # Assembly address
        search_area.lower().find("allah"),  # Address to members of an assembly
    ]
    
    # Filter out phrases that weren't found (-1) and keep only valid indices
    starts = [start for start in possible_starts if start != -1]
    
    # Get the latest occurrence of any address phrase
    start = max(starts) if len(starts) > 0 else -1
    
    if start == -1:
        # No address phrases found - log this and return full text
        # print(f"Could not find probable start in the text of {url}")
        start = 0
    
    # Return text starting from after the address phrase
    return text[start:]

In [4]:
def simple_clean(text):
    """Cleans text by removing common formatting artifacts from PDF conversions and UN document patterns.
    
    Args:
        text (str): Input text to be cleaned
        
    Returns:
        str: Cleaned text with unwanted patterns removed
    """
    
    # Convert to lowercase for consistent processing 
    # text = text.lower()
  
    # 1: Remove UN document reference numbers (e.g. "20/26 15-29876")
    text = re.sub(r'\b\d{1,4}\s*/\s*\d{1,4}\s+\d{2,4}-\d{4,8}\b', ' ', text)
    
    # 2: Remove UN meeting record references (e.g. "A/70/PV.24")
    text = re.sub(r'\b[a-z]\s*/\s*\d+\s*/pv\s*\.\s*\d+\b', ' ', text, flags=re.IGNORECASE)
    
    # 3: Remove dates in DD/MM/YYYY format (e.g. "31/12/2023")
    text = re.sub(r'\d{2}\/\d{2}\/\d{4}', '', text) 
  
    # 4: Remove form feed characters (often from PDF conversion)
    text = re.sub(r'\x0c', '', text) 
  
    # 5: Remove parenthetical document references (e.g. "(A/70/123)" or "(A/70/123, annex)")
    text = re.sub(r'\(\s*[a-z]\s*/\s*\d+\s*/\s*\d+\s*(?:,\s*annex)?\s*\)', '', text, flags=re.IGNORECASE)
    
    # 6: Remove standalone line numbers/page numbers (e.g. "42" on its own line)
    text = re.sub(r'^\s*\d+\s*$\n?', '', text, flags=re.MULTILINE)
    
    # 7: Normalize newlines - replace all with single spaces
    text = re.sub(r'\n', ' ', text, flags=re.MULTILINE)

    # 8: Remove numbered list prefixes (e.g. "1.    Some text")
    text = re.sub(r'\d+\.\t', '', text)

    # 9: Remove Unicode BOM (Byte Order Mark) character if present
    text = text.replace('\ufeff', '')
    
    # 10: Normalize whitespace - collapse multiple spaces into one and trim
    text = re.sub(r'\s+', ' ', text).strip()

    # 11. Remove standalone hyphens
    text = re.sub(r'\s*-(?!\w)(?<!\w)-*\s*', ' ', text)

    # 12: Remove punctuations
    #text = text.replace('—', '').replace(',', '').replace(':', '').replace('’', '').replace('“', '').replace("”", '').replace(";", '').replace("''", '')
    
    # 12: Remove words/numbers between parenthesis
    text = re.sub(r'\([^)]*\)', '', text)

    #13: Remove markdown-style bold/italic/blockquote symbols (all below are subpoints rather that main ones)
    text = re.sub(r'\*\*+', '', text)       # removes **, **** etc.
    text = re.sub(r'>+', '', text)          # removes >, >>, etc.
    text = re.sub(r'-{2,}', '', text)       # removes --, --- etc.
    text = re.sub(r'[=*_~#`]+', '', text)   # removes *, _, =, #, ~, ` etc.
    # I want to replace this simbol • with a space 
    text = text.replace('•', ' ')            # replaces • with a space
    
    return text

In [5]:
def clean_speeches(text, use_simple_clean=True, disregard_mentions_to_president=True, expand=False):
    """Load a text file and apply cleaning operations. 
    Args:
        text: Original text
        use_simple_clean (bool): Whether to apply basic text cleaning
        disregard_mentions_to_president (bool): Whether to remove mentions to the president
        Returns:
        str: Cleaned text from the file"""
    
    if expand: 
        text = contractions.fix(text) 

    # Apply basic text cleaning if the flag is set
    if use_simple_clean:
        text = simple_clean(text)  # Assumes simple_clean is a custom function defined elsewhere

    # Remove mentions to the president if the flag is set
    if disregard_mentions_to_president:
        text = take_out_mentions_to_the_president(text)  # Also assumes this function is defined elsewhere

    # Return the cleaned (or original) text
    return text

# Step 2: Load the speeches

In [6]:
base_path_alvaro = Path(r"C:\Users\Alvaro Millan Ruiz\OneDrive\Escritorio\BDS\Block_5\NLP\Project")
base_path_gilda = "/Users/szonjapike/Desktop/BDS/Block_5/NLP/TXT/"

base_path = base_path_alvaro # change according to user

In [7]:
final_df = pd.read_csv(base_path / "Final_df.csv")
final_df = final_df.drop(columns=["cleaned_speeches_no_postagging_expanded", "cleaned_speeches_no_postagging_no_expanded", "cleaned_speeches_postagging_no_expanded", "cleaned_speeches_postagging_expanded"])
final_df

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level
0,45,1990,AFG,"﻿Allow me, first of all, Sir, to congratulate ...",159,4982,1
1,45,1990,AGO,"﻿First I would like to congratulate you, Sir, ...",77,2970,2
2,45,1990,ALB,﻿It is a special pleasure for me to speak at t...,112,3783,2
3,45,1990,ARE,"﻿\nMr. President, on behalf of the delegation ...",115,3407,4
4,45,1990,ARG,"﻿At the outset, let me convey to you, Sir, my ...",81,2816,2
...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2


In [8]:
# nltk.download('wordnet')
tqdm.pandas()  # Enable pandas integration

final_df['speeches_for_keyword_search'] = final_df['Speech'].progress_apply(
    lambda x: clean_speeches(
        x, 
        expand=True,
        use_simple_clean=True, 
        disregard_mentions_to_president=True
    )
)

final_df 


100%|██████████| 6439/6439 [00:18<00:00, 349.40it/s]


Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search
0,45,1990,AFG,"﻿Allow me, first of all, Sir, to congratulate ...",159,4982,1,"Allow me, first of all, Sir, to congratulate y..."
1,45,1990,AGO,"﻿First I would like to congratulate you, Sir, ...",77,2970,2,"First I would like to congratulate you, Sir, o..."
2,45,1990,ALB,﻿It is a special pleasure for me to speak at t...,112,3783,2,It is a special pleasure for me to speak at th...
3,45,1990,ARE,"﻿\nMr. President, on behalf of the delegation ...",115,3407,4,"Mr. President, on behalf of the delegation of ..."
4,45,1990,ARG,"﻿At the outset, let me convey to you, Sir, my ...",81,2816,2,President of the General Assembly at its forty...
...,...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2,"Excellencies, I extend my congratulations to H..."
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1,"Ladies and Gentlemen, It is a happy coincidenc..."
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3,President of the 79th Session of the UN Genera...
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2,"LADIES AND GENTLEMEN, I congratulate you, Your..."


# Step 3: Look for the keywords

In [9]:
# pd.set_option('display.max_rows', None)

In [10]:
bigrams = pd.read_csv(base_path / "bigrams.csv")
bigrams = bigrams[bigrams['collocation'].str.startswith('carbon')].sort_values(by='count', ascending=False).reset_index(drop=True)
bigrams[bigrams['count'] > 0]

Unnamed: 0,collocation,count,count_nested,length,lambda,z
0,carbon dioxide,159,0,2,14.161921,17.241072
1,carbon emission,135,0,2,7.127955,72.000705
2,carbon neutrality,79,0,2,8.264164,61.051747
3,carbon sink,34,0,2,9.063508,39.212652
4,carbon footprint,32,0,2,9.705604,34.59078
5,carbon market,25,0,2,4.199908,20.769518
6,carbon neutral,16,0,2,6.561462,25.528848
7,carbon tax,15,0,2,5.565553,21.418842
8,carbon capture,14,0,2,6.49626,23.769407
9,carbon-negative country,13,0,2,5.032398,11.492781


In [11]:
# Escape the entire phrase and match it literally
pattern = re.compile(re.escape('ozone layer'), flags=re.IGNORECASE)
climate_speeches = final_df[final_df['speeches_for_keyword_search'].str.contains(pattern, na=False)]
climate_speeches

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search
11,45,1990,BFA,"﻿\nMr. President, events in Eastern and Centra...",91,2948,1,"Mr. President, events in Eastern and Central E..."
27,45,1990,CMR,﻿\nThis forty-fifth session of the General Ass...,94,3009,2,This forty-fifth session of the General Assemb...
28,45,1990,COD,"﻿Mr. President, the forty-fifth session of the...",88,3865,1,"Mr. President, the forty-fifth session of the ..."
29,45,1990,COG,﻿\nThe convening of the forty-fifth session of...,134,4335,2,The convening of the forty-fifth session of th...
32,45,1990,CPV,﻿I should like to begin by offering Mr. de Mar...,71,2432,2,I should like to begin by offering Mr. de Marc...
...,...,...,...,...,...,...,...,...
5724,76,2021,DJI,"Praise be to God, peace and blessings be upon ...",57,1619,2,"Mr. President, First of all, I would like to w..."
5796,76,2021,MNG,I would like to convey my heartfelt congratula...,110,3480,2,President of the General Assembly and to wish ...
6002,77,2022,OMN,We would like to extend to the President of th...,44,1337,4,President of the General Assembly and his frie...
6118,78,2023,FRA,It is an honour for me to speak before the Gen...,105,2297,4,It is an honour for me to speak before the Gen...


In [12]:
# Climate keyword according to the literature
climate_keywords = ["climate change", "global warming", "cap and trade", "unfccc", "paris accord", "emissions trading scheme", "global average temperature",
                    "kyoto protocol", "climate resilience", "carbon dioxide", "climate politics", "framework convention on climate change", "ball roadmap", 
                    "greenhouse gas", "ghg", "greenhouse effect", "ipcc", "climate mitigation", "climate action", "emissions", "temperature", 
                    "extreme weather", "global environmental change", "climate variability", "low carbon", "ghge", "renewable energy", "carbon emission", "co2", 
                    "climate pollutant", "climate pollutants", "carbon tax", "carbon footprint", "carbon neutrality", "net-zero", "net zero", "climate crisis",
                    "climate crisis", "climate summit", "climate catastrophe", "climate justice", "climate emergency", "climate funding", "climate fund", "climate fianncing",
                    "climate peace", "climate agreement", "climate security", "climate ambition", "climate issue", "climate impact", "climate conference", "climate event", 
                    "climate conference", "climate challenge", "climate trust", "climate negotiation", "climate catastrophe", "climate risk", "climate goal", "climate change-related", 
                    "climate regime", "climate resilient", "climate policy", "carbon market", "carbon sink", "green climate", "green economy", "emission reduction",
                    "emissions reduction", "emission reduction", "carbon neutral", "carbon sink", "ozone layer"
                    ]

In [13]:
climate_keywords_dict = {
    # 1. Climate Science & Impacts
    "science_impacts": [
        "climate change", "global warming", "global average temperature",
        "climate variability", "extreme weather", "climate impact",
        "greenhouse effect", "temperature", "climate catastrophe",
        "climate risk", "ozone layer", "global environmental change",
        "climate change-related", "climate risk"
    ],

    # 2. Policy & Agreements
    "policy_agreements": [
        "paris accord", "kyoto protocol", "unfccc", "climate policy",
        "framework convention on climate change", "climate agreement",
        "climate regime", "climate negotiation", "climate ambition",
        "climate security", "ball roadmap", "ipcc"
    ],

    # 3. Carbon & Emissions
    "carbon_emissions": [
        "carbon dioxide", "co2", "carbon emission", "carbon tax",
        "carbon footprint", "carbon neutrality", "carbon neutral",
        "carbon market", "carbon sink", "low carbon", "emissions",
        "emission reduction", "emissions reduction", "ghg", "ghge",
        "climate pollutant", "climate pollutants", "carbon neutral",
        "greenhouse gas"
    ],

    # 4. Climate Action & Solutions
    "action_solutions": [
        "climate action", "climate mitigation", "renewable energy",
        "climate resilience", "climate resilient", "green economy",
        "climate funding", "climate fund", "climate financing",
        "net-zero", "net zero", "green climate", "cap and trade",
        "emissions trading scheme", "climate fianncing"
    ],

    # 5. Sociopolitical Climate Issues
    "sociopolitical": [
        "climate justice", "climate emergency", "climate crisis",
        "climate summit", "climate politics", "climate peace",
        "climate challenge", "climate goal", "climate issue",
        "climate event", "climate conference", "climate trust"
    ]
}

In [14]:
def extract_keyword_info(text, keywords):
    """
    Extracts climate-related keyword information from a text string.

    Parameters:
        text (str): The speech or text in which to search for keywords.
        keywords (list of str): A list of climate-related keywords or phrases to match against the text.
    """
    matches = []
    for kw in keywords:
        pattern = r'\b' + re.escape(kw) + r's?\b' # Match also plural forms
        if re.search(pattern, text, flags=re.IGNORECASE):
            matches.append(kw)
    contains_keyword = len(matches) > 0
    return pd.Series([matches, contains_keyword])

In [15]:
final_df[['matched_climate_keywords', 'contains_climate_keyword']] = final_df['speeches_for_keyword_search'].progress_apply(
    lambda text: extract_keyword_info(text, climate_keywords)
)

df_sentiment = final_df[final_df['contains_climate_keyword'] != False]
df_sentiment

100%|██████████| 6439/6439 [01:57<00:00, 54.90it/s]


Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search,matched_climate_keywords,contains_climate_keyword
5,45,1990,ATG,"﻿Please accept my country's congratulations, S...",115,3134,3,"Please accept my country's congratulations, Si...",[global warming],True
7,45,1990,AUT,"﻿\n\nI am pleased, Sir, to congratulate you on...",162,4743,4,"I am pleased, Sir, to congratulate you on your...","[carbon dioxide, emissions]",True
11,45,1990,BFA,"﻿\nMr. President, events in Eastern and Centra...",91,2948,1,"Mr. President, events in Eastern and Central E...","[carbon dioxide, greenhouse effect, ozone layer]",True
12,45,1990,BGD,"﻿\nMr. President, warm felicitations are due y...",180,3107,1,"Mr. President, warm felicitations are due you ...",[climate change],True
26,45,1990,CHN,﻿I should like to begin by warmly congratulati...,115,2992,1,I should like to begin by warmly congratulatin...,[climate change],True
...,...,...,...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2,"Excellencies, I extend my congratulations to H...","[climate change, climate action, temperature, ...",True
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1,"Ladies and Gentlemen, It is a happy coincidenc...",[climate change],True
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3,President of the 79th Session of the UN Genera...,"[climate change, climate action, emissions, ex...",True
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2,"LADIES AND GENTLEMEN, I congratulate you, Your...","[climate change, extreme weather]",True


# Step 4: Add sentences with the keywords as a column

In [16]:
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_climate_sentences(speech, keywords):
    """
    Filters the sentences in a speech that contain any of the specified climate-related keywords.
    Args:
        speech (str): The speech text to analyze.
        keywords (list of str): A list of climate-related keywords to search for.
    """
    doc = nlp(speech)
    keyword_sentences = []
    for sent in doc.sents:
        sent_text = sent.text.strip()
        if any(kw.lower() in sent_text.lower() for kw in keywords):
            keyword_sentences.append(sent_text)
    return keyword_sentences


df_sentiment['climate_sentences'] = df_sentiment.progress_apply(
    lambda row: extract_climate_sentences(row['speeches_for_keyword_search'], row['matched_climate_keywords']),
    axis=1
)

df_sentiment

100%|██████████| 3250/3250 [17:28<00:00,  3.10it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sentiment['climate_sentences'] = df_sentiment.progress_apply(


Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search,matched_climate_keywords,contains_climate_keyword,climate_sentences
5,45,1990,ATG,"﻿Please accept my country's congratulations, S...",115,3134,3,"Please accept my country's congratulations, Si...",[global warming],True,[Our oceans are polluted by various contaminan...
7,45,1990,AUT,"﻿\n\nI am pleased, Sir, to congratulate you on...",162,4743,4,"I am pleased, Sir, to congratulate you on your...","[carbon dioxide, emissions]",True,[One of the most urgent priority measures one ...
11,45,1990,BFA,"﻿\nMr. President, events in Eastern and Centra...",91,2948,1,"Mr. President, events in Eastern and Central E...","[carbon dioxide, greenhouse effect, ozone layer]",True,[We must urgently orient research towards tech...
12,45,1990,BGD,"﻿\nMr. President, warm felicitations are due y...",180,3107,1,"Mr. President, warm felicitations are due you ...",[climate change],True,[We hope the proposed conventions on climate c...
26,45,1990,CHN,﻿I should like to begin by warmly congratulati...,115,2992,1,I should like to begin by warmly congratulatin...,[climate change],True,[No matter how the international climate chang...
...,...,...,...,...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2,"Excellencies, I extend my congratulations to H...","[climate change, climate action, temperature, ...",True,[The effects of climate change are being lived...
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1,"Ladies and Gentlemen, It is a happy coincidenc...",[climate change],True,[This includes not only immediate humanitarian...
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3,President of the 79th Session of the UN Genera...,"[climate change, climate action, emissions, ex...",True,"[As signatory to the Paris Agreement, we are c..."
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2,"LADIES AND GENTLEMEN, I congratulate you, Your...","[climate change, extreme weather]",True,"[Mr. President, Climate Change and extreme wea..."


In [24]:
from collections import defaultdict
import pandas as pd

# Step 1: Flatten the climate_sentences and track their original row index
sentences_with_index = [
    (idx, sentence) 
    for idx, row in df_sentiment.iterrows() 
    for sentence in row['climate_sentences']
]

# Step 2: Pre-cache category matches (lightning fast)
category_matches = defaultdict(lambda: defaultdict(list))

for idx, sentence in sentences_with_index:
    for category, keywords in climate_keywords_dict.items():
        # Future improvement:
            # count the number a specific keyword appears in the sentence and assign the sentence to the cateory with the most matches
            # If there is a draw, asisgn to both categories
        if any(kw.lower() in sentence.lower() for kw in keywords):
            category_matches[idx][category].append(sentence)

# Step 3: Assign categorized sentences back to DataFrame
for category in climate_keywords_dict.keys():
    df_sentiment[f'cat_{category}'] = df_sentiment.index.map(
        lambda x: category_matches[x].get(category, [])
    )

df_sentiment

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sentiment[f'cat_{category}'] = df_sentiment.index.map(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sentiment[f'cat_{category}'] = df_sentiment.index.map(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sentiment[f'cat_{category}'] = df_sentiment.index.map(
A value is trying to be set on 

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search,matched_climate_keywords,contains_climate_keyword,climate_sentences,cat_science_impacts,cat_policy_agreements,cat_carbon_emissions,cat_action_solutions,cat_sociopolitical
5,45,1990,ATG,"﻿Please accept my country's congratulations, S...",115,3134,3,"Please accept my country's congratulations, Si...",[global warming],True,[Our oceans are polluted by various contaminan...,[Our oceans are polluted by various contaminan...,[],[],[],[]
7,45,1990,AUT,"﻿\n\nI am pleased, Sir, to congratulate you on...",162,4743,4,"I am pleased, Sir, to congratulate you on your...","[carbon dioxide, emissions]",True,[One of the most urgent priority measures one ...,[],[],[One of the most urgent priority measures one ...,[],[]
11,45,1990,BFA,"﻿\nMr. President, events in Eastern and Centra...",91,2948,1,"Mr. President, events in Eastern and Central E...","[carbon dioxide, greenhouse effect, ozone layer]",True,[We must urgently orient research towards tech...,[We must urgently orient research towards tech...,[],[We must urgently orient research towards tech...,[],[]
12,45,1990,BGD,"﻿\nMr. President, warm felicitations are due y...",180,3107,1,"Mr. President, warm felicitations are due you ...",[climate change],True,[We hope the proposed conventions on climate c...,[We hope the proposed conventions on climate c...,[],[],[],[]
26,45,1990,CHN,﻿I should like to begin by warmly congratulati...,115,2992,1,I should like to begin by warmly congratulatin...,[climate change],True,[No matter how the international climate chang...,[No matter how the international climate chang...,[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2,"Excellencies, I extend my congratulations to H...","[climate change, climate action, temperature, ...",True,[The effects of climate change are being lived...,[The effects of climate change are being lived...,[],[],"[We must do more to turn the tide, to honour o...",[In this era of unprecedented sea level rise i...
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1,"Ladies and Gentlemen, It is a happy coincidenc...",[climate change],True,[This includes not only immediate humanitarian...,[This includes not only immediate humanitarian...,[],[],[],[]
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3,President of the 79th Session of the UN Genera...,"[climate change, climate action, emissions, ex...",True,"[As signatory to the Paris Agreement, we are c...","[Extreme weather such as flooding, fires and d...",[],[As South Africa we remain committed to contri...,"[As signatory to the Paris Agreement, we are c...",[The climate crisis is now a full-blown climat...
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2,"LADIES AND GENTLEMEN, I congratulate you, Your...","[climate change, extreme weather]",True,"[Mr. President, Climate Change and extreme wea...","[Mr. President, Climate Change and extreme wea...",[],[],[],[]


In [25]:
def extract_climate_sentences_with_context(speech, keywords):
    """
    Extracts sentences containing climate keywords along with one sentence before and after each.
    Returns a list of context windows (strings of 1–3 sentences).
    """
    doc = list(nlp(speech).sents)  # Convert to list to access by index
    context_windows = []

    for i, sent in enumerate(doc):
        sent_text = sent.text.strip()

        if any(kw.lower() in sent_text.lower() for kw in keywords):
            # Grab the previous, current, and next sentence if available
            prev_sent = doc[i-1].text.strip() if i > 0 else ''
            next_sent = doc[i+1].text.strip() if i < len(doc) - 1 else ''
            
            context = ' '.join([s for s in [prev_sent, sent_text, next_sent] if s])
            context_windows.append(context)

    return context_windows


df_sentiment_1 = df_sentiment.copy() 

df_sentiment_1['climate_sentences_extended'] = df_sentiment_1.progress_apply(
    lambda row: extract_climate_sentences_with_context(row['speeches_for_keyword_search'], row['matched_climate_keywords']),
    axis=1
)


  0%|          | 0/3250 [00:00<?, ?it/s]

100%|██████████| 3250/3250 [17:20<00:00,  3.12it/s]


In [26]:
from collections import defaultdict
import pandas as pd

# Step 1: Flatten the climate_sentences and track their original row index
sentences_with_index = [
    (idx, sentence) 
    for idx, row in df_sentiment_1.iterrows() 
    for sentence in row['climate_sentences_extended']
]

# Step 2: Pre-cache category matches (lightning fast)
category_matches = defaultdict(lambda: defaultdict(list))

for idx, sentence in sentences_with_index:
    for category, keywords in climate_keywords_dict.items():
        if any(kw.lower() in sentence.lower() for kw in keywords):
            category_matches[idx][category].append(sentence)

# Step 3: Assign categorized sentences back to DataFrame
for category in climate_keywords_dict.keys():
    df_sentiment_1[f'cat_extended_{category}'] = df_sentiment_1.index.map(
        lambda x: category_matches[x].get(category, [])
    )

df_sentiment_1

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search,matched_climate_keywords,contains_climate_keyword,...,cat_policy_agreements,cat_carbon_emissions,cat_action_solutions,cat_sociopolitical,climate_sentences_extended,cat_extended_science_impacts,cat_extended_policy_agreements,cat_extended_carbon_emissions,cat_extended_action_solutions,cat_extended_sociopolitical
5,45,1990,ATG,"﻿Please accept my country's congratulations, S...",115,3134,3,"Please accept my country's congratulations, Si...",[global warming],True,...,[],[],[],[],"[Rain forests, the most valuable of mankind's ...","[Rain forests, the most valuable of mankind's ...",[],[],[],[]
7,45,1990,AUT,"﻿\n\nI am pleased, Sir, to congratulate you on...",162,4743,4,"I am pleased, Sir, to congratulate you on your...","[carbon dioxide, emissions]",True,...,[],[One of the most urgent priority measures one ...,[],[],[Studies undertaken at the Vienna-based Intern...,[],[],[Studies undertaken at the Vienna-based Intern...,[],[]
11,45,1990,BFA,"﻿\nMr. President, events in Eastern and Centra...",91,2948,1,"Mr. President, events in Eastern and Central E...","[carbon dioxide, greenhouse effect, ozone layer]",True,...,[],[We must urgently orient research towards tech...,[],[],[The same applies to the survival of our plane...,[The same applies to the survival of our plane...,[],[The same applies to the survival of our plane...,[],[]
12,45,1990,BGD,"﻿\nMr. President, warm felicitations are due y...",180,3107,1,"Mr. President, warm felicitations are due you ...",[climate change],True,...,[],[],[],[],[The Conference must produce results that will...,[The Conference must produce results that will...,[],[],[],[]
26,45,1990,CHN,﻿I should like to begin by warmly congratulati...,115,2992,1,I should like to begin by warmly congratulatin...,[climate change],True,...,[],[],[],[],"[Looking forward into the 1990s, we see a worl...","[Looking forward into the 1990s, we see a worl...",[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2,"Excellencies, I extend my congratulations to H...","[climate change, climate action, temperature, ...",True,...,[],[],"[We must do more to turn the tide, to honour o...",[In this era of unprecedented sea level rise i...,[Please be assured of Samoa’s support in the s...,[Please be assured of Samoa’s support in the s...,[],[],[Unless substantial investments are made to mi...,[This is a red line for many SIDS like Samoa. ...
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1,"Ladies and Gentlemen, It is a happy coincidenc...",[climate change],True,...,[],[],[],[],"[For this reason, the Republic of Yemen renews...","[For this reason, the Republic of Yemen renews...",[],[],[],[]
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3,President of the 79th Session of the UN Genera...,"[climate change, climate action, emissions, ex...",True,...,[],[As South Africa we remain committed to contri...,"[As signatory to the Paris Agreement, we are c...",[The climate crisis is now a full-blown climat...,[A few days ago South Africa also endorsed the...,[The impacts have been swift and devastating. ...,[],"[Extreme weather such as flooding, fires and d...",[A few days ago South Africa also endorsed the...,[The AfCFTA will further integrate regional ec...
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2,"LADIES AND GENTLEMEN, I congratulate you, Your...","[climate change, extreme weather]",True,...,[],[],[],[],"[Furthermore, Zambia recognises the efforts of...","[Furthermore, Zambia recognises the efforts of...",[],[],"[This, and similar scenarios in our region, un...",[]


In [27]:
df_sentiment.to_csv(base_path_alvaro / "df_sentiment_analysis_new.csv", index=False, encoding='utf-8')

In [28]:
df_sentiment_1.to_csv(base_path_alvaro / "df_sentiment_analysis_extended.csv", index=False, encoding='utf-8')