In [1]:
import nltk
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('punt_tab')

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import re
from pathlib import Path
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import contractions
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk import pos_tag

In [2]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
# Initialize stemmer/lemmatizer (run once)
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Step 1: Cleaning Functions

In [3]:
def take_out_mentions_to_the_president(text):
    """Remove introductory phrases addressing the president or other dignitaries from text.
    Args:
        text (str): The input text to process    
    Returns:
        str: The text with introductory address phrases removed, or original text if none found
    """
    
    # Split into sentences (simple regex)
    # The regex looks for sentence-ending punctuation followed by whitespace
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    
    # Get first 4 sentences as search area - addresses are typically at the beginning
    search_area = ' '.join(sentences[:2]).lower()
    
    # Phrases to search for (lowercase)
    # We look for various forms of presidential/dignitary addresses
    # Store the index where each phrase is found (-1 if not found)
    possible_starts = [
        search_area.lower().find("mr. president"),  # Formal address with period
        search_area.lower().find("mr president"),   # Formal address without period
        search_area.lower().find("ladies and gentlemen"),  # Common formal address
        search_area.lower().find("your excellency"),  # Covers "your excellence" and "your excellency"
        search_area.lower().find("your excellence"),  # Covers "your excellence" and "your excellency"
        search_area.lower().find("your majesties"),  # 
        search_area.lower().find("your majesties"),  # 
        search_area.lower().find("president of the general assembly"),
        search_area.lower().find("excellencies"),     # Plural form of address
        search_area.lower().find("esteemed president"),  # Honorific address
        search_area.lower().find("distinguished delegates"),  # Assembly address
        search_area.lower().find("allah"),  # Address to members of an assembly
    ]
    
    # Filter out phrases that weren't found (-1) and keep only valid indices
    starts = [start for start in possible_starts if start != -1]
    
    # Get the latest occurrence of any address phrase
    start = max(starts) if len(starts) > 0 else -1
    
    if start == -1:
        # No address phrases found - log this and return full text
        # print(f"Could not find probable start in the text of {url}")
        start = 0
    
    # Return text starting from after the address phrase
    return text[start:]

In [4]:
def simple_clean(text):
    """Cleans text by removing common formatting artifacts from PDF conversions and UN document patterns.
    
    Args:
        text (str): Input text to be cleaned
        
    Returns:
        str: Cleaned text with unwanted patterns removed
    """
    
    # Convert to lowercase for consistent processing 
    text = text.lower()
  
    # 1: Remove UN document reference numbers (e.g. "20/26 15-29876")
    text = re.sub(r'\b\d{1,4}\s*/\s*\d{1,4}\s+\d{2,4}-\d{4,8}\b', ' ', text)
    
    # 2: Remove UN meeting record references (e.g. "A/70/PV.24")
    text = re.sub(r'\b[a-z]\s*/\s*\d+\s*/pv\s*\.\s*\d+\b', ' ', text, flags=re.IGNORECASE)
    
    # 3: Remove dates in DD/MM/YYYY format (e.g. "31/12/2023")
    text = re.sub(r'\d{2}\/\d{2}\/\d{4}', '', text) 
  
    # 4: Remove form feed characters (often from PDF conversion)
    text = re.sub(r'\x0c', '', text) 
  
    # 5: Remove parenthetical document references (e.g. "(A/70/123)" or "(A/70/123, annex)")
    text = re.sub(r'\(\s*[a-z]\s*/\s*\d+\s*/\s*\d+\s*(?:,\s*annex)?\s*\)', '', text, flags=re.IGNORECASE)
    
    # 6: Remove standalone line numbers/page numbers (e.g. "42" on its own line)
    text = re.sub(r'^\s*\d+\s*$\n?', '', text, flags=re.MULTILINE)
    
    # 7: Normalize newlines - replace all with single spaces
    text = re.sub(r'\n', ' ', text, flags=re.MULTILINE)

    # 8: Remove numbered list prefixes (e.g. "1.    Some text")
    text = re.sub(r'\d+\.\t', '', text)

    # 9: Remove Unicode BOM (Byte Order Mark) character if present
    text = text.replace('\ufeff', '')
    
    # 10: Normalize whitespace - collapse multiple spaces into one and trim
    text = re.sub(r'\s+', ' ', text).strip()

    # 11. Remove standalone hyphens
    text = re.sub(r'\s*-(?!\w)(?<!\w)-*\s*', ' ', text)

    # 12: Remove punctuations
    #text = text.replace('—', '').replace(',', '').replace(':', '').replace('’', '').replace('“', '').replace("”", '').replace(";", '').replace("''", '')
    
    # 12: Remove words/numbers between parenthesis
    text = re.sub(r'\([^)]*\)', '', text)

    #13: Remove markdown-style bold/italic/blockquote symbols (all below are subpoints rather that main ones)
    text = re.sub(r'\*\*+', '', text)       # removes **, **** etc.
    text = re.sub(r'>+', '', text)          # removes >, >>, etc.
    text = re.sub(r'-{2,}', '', text)       # removes --, --- etc.
    text = re.sub(r'[=*_~#`]+', '', text)   # removes *, _, =, #, ~, ` etc.
    
    return text

In [5]:
def remove_stopwords(text):
    """Remove common stopwords from text"""
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words 
                 if word.lower() not in stop_words and len(word) > 2]
    return ' '.join(filtered_words)                          

In [6]:
def remove_punctuations(text):
    return text.replace('.', '').replace('?', '').replace('!', '')

In [7]:
def stem_text(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)


def get_wordnet_pos(word):
    """Map POS tag to first character used by WordNetLemmatizer"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)


def lem_text(text):
    words = word_tokenize(text)
    stemmed_words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]
    return ' '.join(stemmed_words)
    

In [8]:
def clean_speeches(text, use_simple_clean=True, disregard_mentions_to_president=True, Remove_Stopwords = False, expand=False, Remove_Punctuations = False, 
                        lemma=False):
    """Load a text file and apply cleaning operations. 
    Args:
        text: Original text
        use_simple_clean (bool): Whether to apply basic text cleaning
        disregard_mentions_to_president (bool): Whether to remove mentions to the president
        Returns:
        str: Cleaned text from the file"""
    
    if expand: 
        text = contractions.fix(text) 

    # Apply basic text cleaning if the flag is set
    if use_simple_clean:
        text = simple_clean(text)  # Assumes simple_clean is a custom function defined elsewhere

    # Remove mentions to the president if the flag is set
    if disregard_mentions_to_president:
        text = take_out_mentions_to_the_president(text)  # Also assumes this function is defined elsewhere
    
    # Remove Stopwrods
    if Remove_Stopwords:
        text = remove_stopwords(text)

    # Remove Punctuations
    if Remove_Punctuations:
        text = remove_punctuations(text)

    # Lemmatize everything
    if lemma:
        text = lem_text(text)

    # Replace only spaces/tabs induced from the stopwords
    text = re.sub(r'[ \t]+', ' ', text)

    # Return the cleaned (or original) text
    return text

# Step 2: Load the speeches

In [9]:
base_path_alvaro = Path(r"C:\Users\Alvaro Millan Ruiz\OneDrive\Escritorio\BDS\Block_5\NLP\Project")
base_path_gilda = "/Users/szonjapike/Desktop/BDS/Block_5/NLP/TXT/"

base_path = base_path_alvaro # change according to user

In [10]:
final_df = pd.read_csv(base_path / "Final_df.csv")
final_df = final_df.drop(columns=["cleaned_speeches_no_postagging_expanded", "cleaned_speeches_no_postagging_no_expanded", "cleaned_speeches_postagging_no_expanded", "cleaned_speeches_postagging_expanded"])
final_df

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level
0,45,1990,AFG,"﻿Allow me, first of all, Sir, to congratulate ...",159,4982,1
1,45,1990,AGO,"﻿First I would like to congratulate you, Sir, ...",77,2970,2
2,45,1990,ALB,﻿It is a special pleasure for me to speak at t...,112,3783,2
3,45,1990,ARE,"﻿\nMr. President, on behalf of the delegation ...",115,3407,4
4,45,1990,ARG,"﻿At the outset, let me convey to you, Sir, my ...",81,2816,2
...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2


In [11]:
speeches = final_df[(final_df['ISO-Code'] == 'CUB') & (final_df['Year'] == 2024)]['Speech']
speech_text = speeches.iloc[0]
print(speech_text)


**Mr President,**  
**Mr Secretary-General,**

Let me begin by reaffirming Cuba’s solidarity with and support for the brotherly **Palestinian people**, victims of more than **75 years of colonial occupation**, flagrant violations of their legitimate national rights, subjected to cruelty, aggression, collective punishment, and apartheid.

In the past **eleven months**, the Israeli military has killed more than **40,000 civilians**. In that indiscriminate and disproportionate massacre, **more children than men and women have died**. This is being carried out with the complicity and weapons provided by the **United States government**, and with the **complicit silence of others**. We pay tribute to the **more than 220 UN staff members** who were also killed.

Cuba’s position is clear and unequivocal. President **Miguel Díaz-Canel Bermúdez** has declared:  
> “History will not forgive the indifferent. And we will not be among them.”

This is a wound on the conscience of humanity.

> **The 

In [12]:
# nltk.download('wordnet')
tqdm.pandas()  # Enable pandas integration

final_df['speeches_for_keyword_search'] = final_df['Speech'].progress_apply(
    lambda x: clean_speeches(
        x, 
        expand=True,
        use_simple_clean=True, 
        disregard_mentions_to_president=True, 
        Remove_Stopwords=False,        # No removal of stopwords
        Remove_Punctuations= False,      # No removal of puctuations for sentiment analysis
        lemma=False
    )
)

final_df 


100%|██████████| 6439/6439 [00:25<00:00, 248.55it/s]


Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search
0,45,1990,AFG,"﻿Allow me, first of all, Sir, to congratulate ...",159,4982,1,"allow me, first of all, sir, to congratulate y..."
1,45,1990,AGO,"﻿First I would like to congratulate you, Sir, ...",77,2970,2,"first i would like to congratulate you, sir, o..."
2,45,1990,ALB,﻿It is a special pleasure for me to speak at t...,112,3783,2,it is a special pleasure for me to speak at th...
3,45,1990,ARE,"﻿\nMr. President, on behalf of the delegation ...",115,3407,4,"mr. president, on behalf of the delegation of ..."
4,45,1990,ARG,"﻿At the outset, let me convey to you, Sir, my ...",81,2816,2,president of the general assembly at its forty...
...,...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2,"excellencies, i extend my congratulations to h..."
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1,"ladies and gentlemen, it is a happy coincidenc..."
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3,president of the 79th session of the un genera...
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2,"ladies and gentlemen, i congratulate you, your..."


In [13]:
speeches = final_df[(final_df['ISO-Code'] == 'CUB') & (final_df['Year'] == 2024)]['speeches_for_keyword_search']
speech_text = speeches.iloc[0]
print(speech_text)

mr president, mr secretary-general, let me begin by reaffirming cuba’s solidarity with and support for the brotherly palestinian people, victims of more than 75 years of colonial occupation, flagrant violations of their legitimate national rights, subjected to cruelty, aggression, collective punishment, and apartheid. in the past eleven months, the israeli military has killed more than 40,000 civilians. in that indiscriminate and disproportionate massacre, more children than men and women have died. this is being carried out with the complicity and weapons provided by the united states government, and with the complicit silence of others. we pay tribute to the more than 220 un staff members who were also killed. cuba’s position is clear and unequivocal. president miguel díaz-canel bermúdez has declared: “history will not forgive the indifferent. and we will not be among them.” this is a wound on the conscience of humanity. the genocide against the palestinian people must end, unconditi

# Step 3: Look for the keywords

In [22]:
# Climate keyword according to the literature
climate_keywords = ["climate change", "global warming", "cap and trade", "unfccc", "paris accord", "emissions trading scheme", "global average temperature",
                    "kyoto protocol", "climate resilience", "carbon dioxide", "climate politics", "framework convention on climate change", "ball roadmap", 
                    "greenhouse gas", "ghg", "greenhouse effect", "ipcc", "climate mitigation", "climate action", "emissions", "temperature", "extreme weather", 
                    "global environmental change", "climate variability", "greenhouse", "low carbon", "ghge", "renewable energy", "carbon emission",
                    "co2", "climate pollutant", "climate pollutants", "carbon tax", "carbon footprint", "carbon neutrality", "net-zero", "net zero"
                    ]


In [23]:
def extract_keyword_info(text, keywords):
    """
    Extracts climate-related keyword information from a text string.

    Parameters:
        text (str): The speech or text in which to search for keywords.
        keywords (list of str): A list of climate-related keywords or phrases to match against the text.
    """
    matches = []
    for kw in keywords:
        pattern = r'\b' + re.escape(kw) + r'\b'
        if re.search(pattern, text):
            matches.append(kw)
    contains_keyword = len(matches) > 0
    return pd.Series([matches, contains_keyword])

In [24]:
final_df[['matched_climate_keywords', 'contains_climate_keyword']] = final_df['speeches_for_keyword_search'].apply(
    lambda text: extract_keyword_info(text, climate_keywords)
)

final_df

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search,matched_climate_keywords,contains_climate_keyword
0,45,1990,AFG,"﻿Allow me, first of all, Sir, to congratulate ...",159,4982,1,"allow me, first of all, sir, to congratulate y...",[],False
1,45,1990,AGO,"﻿First I would like to congratulate you, Sir, ...",77,2970,2,"first i would like to congratulate you, sir, o...",[],False
2,45,1990,ALB,﻿It is a special pleasure for me to speak at t...,112,3783,2,it is a special pleasure for me to speak at th...,[],False
3,45,1990,ARE,"﻿\nMr. President, on behalf of the delegation ...",115,3407,4,"mr. president, on behalf of the delegation of ...",[],False
4,45,1990,ARG,"﻿At the outset, let me convey to you, Sir, my ...",81,2816,2,president of the general assembly at its forty...,[],False
...,...,...,...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2,"excellencies, i extend my congratulations to h...","[climate change, climate action, extreme weather]",True
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1,"ladies and gentlemen, it is a happy coincidenc...",[climate change],True
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3,president of the 79th session of the un genera...,"[climate change, climate action, emissions, ex...",True
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2,"ladies and gentlemen, i congratulate you, your...","[climate change, extreme weather]",True


In [25]:
df_sentiment = final_df[final_df['contains_climate_keyword'] != False]
df_sentiment

Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search,matched_climate_keywords,contains_climate_keyword
5,45,1990,ATG,"﻿Please accept my country's congratulations, S...",115,3134,3,"please accept my country's congratulations, si...",[global warming],True
7,45,1990,AUT,"﻿\n\nI am pleased, Sir, to congratulate you on...",162,4743,4,"i am pleased, sir, to congratulate you on your...","[carbon dioxide, emissions, greenhouse]",True
11,45,1990,BFA,"﻿\nMr. President, events in Eastern and Centra...",91,2948,1,"mr. president, events in eastern and central e...","[carbon dioxide, greenhouse effect, greenhouse]",True
12,45,1990,BGD,"﻿\nMr. President, warm felicitations are due y...",180,3107,1,"mr. president, warm felicitations are due you ...",[climate change],True
28,45,1990,COD,"﻿Mr. President, the forty-fifth session of the...",88,3865,1,"mr. president, the forty-fifth session of the ...","[climate change, carbon dioxide, greenhouse ef...",True
...,...,...,...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2,"excellencies, i extend my congratulations to h...","[climate change, climate action, extreme weather]",True
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1,"ladies and gentlemen, it is a happy coincidenc...",[climate change],True
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3,president of the 79th session of the un genera...,"[climate change, climate action, emissions, ex...",True
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2,"ladies and gentlemen, i congratulate you, your...","[climate change, extreme weather]",True


# Step 4: Add sentences with the keywords as a column

In [28]:
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_climate_sentences(speech, keywords):
    """
    Filters the sentences in a speech that contain any of the specified climate-related keywords.
    Args:
        speech (str): The speech text to analyze.
        keywords (list of str): A list of climate-related keywords to search for.
    """
    doc = nlp(speech)
    keyword_sentences = []
    for sent in doc.sents:
        sent_text = sent.text.strip()
        if any(kw.lower() in sent_text.lower() for kw in keywords):
            keyword_sentences.append(sent_text)
    return keyword_sentences

In [30]:
tqdm.pandas()  # patches pandas with progress_apply


df_sentiment['climate_sentences'] = df_sentiment.progress_apply(
    lambda row: extract_climate_sentences(row['speeches_for_keyword_search'], row['matched_climate_keywords']),
    axis=1
)


df_sentiment

  0%|          | 0/3172 [00:00<?, ?it/s]

100%|██████████| 3172/3172 [17:55<00:00,  2.95it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sentiment['climate_sentences'] = df_sentiment.progress_apply(


Unnamed: 0,Session,Year,ISO-Code,Speech,number_sentences,number_tokens,Income Level,speeches_for_keyword_search,matched_climate_keywords,contains_climate_keyword,climate_sentences
5,45,1990,ATG,"﻿Please accept my country's congratulations, S...",115,3134,3,"please accept my country's congratulations, si...",[global warming],True,[our oceans are polluted by various contaminan...
7,45,1990,AUT,"﻿\n\nI am pleased, Sir, to congratulate you on...",162,4743,4,"i am pleased, sir, to congratulate you on your...","[carbon dioxide, emissions, greenhouse]",True,[one of the most urgent priority measures one ...
11,45,1990,BFA,"﻿\nMr. President, events in Eastern and Centra...",91,2948,1,"mr. president, events in eastern and central e...","[carbon dioxide, greenhouse effect, greenhouse]",True,[we must urgently orient research towards tech...
12,45,1990,BGD,"﻿\nMr. President, warm felicitations are due y...",180,3107,1,"mr. president, warm felicitations are due you ...",[climate change],True,[we hope the proposed conventions on climate c...
28,45,1990,COD,"﻿Mr. President, the forty-fifth session of the...",88,3865,1,"mr. president, the forty-fifth session of the ...","[climate change, carbon dioxide, greenhouse ef...",True,[with a view to finding some grounds for under...
...,...,...,...,...,...,...,...,...,...,...,...
6434,79,2024,WSM,"Excellencies, \nI extend my congratulations t...",68,1572,2,"excellencies, i extend my congratulations to h...","[climate change, climate action, extreme weather]",True,[the effects of climate change are being lived...
6435,79,2024,YEM,"Your Majesties, Excellencies, and Highnesses, ...",57,1876,1,"ladies and gentlemen, it is a happy coincidenc...",[climate change],True,[this includes not only immediate humanitarian...
6436,79,2024,ZAF,President of the 79th Session of the UN Genera...,100,1870,3,president of the 79th session of the un genera...,"[climate change, climate action, emissions, ex...",True,"[extreme weather such as flooding, fires and d..."
6437,79,2024,ZMB,"\n YOUR EXCELLENCY PHILEMON YANG, PRESIDENT O...",81,2348,2,"ladies and gentlemen, i congratulate you, your...","[climate change, extreme weather]",True,"[mr. president, climate change and extreme wea..."


In [38]:
df_sentiment.to_csv(base_path_alvaro / "df_sentiment_analysis.csv", index=False, encoding='utf-8')