In [4]:
# Imports
import os
import re
import string
from PyPDF2 import PdfFileReader
import pandas as pd
import nltk

# Variables initialisation
data_text = []
data_title = []

# Path to the folder
folder_to_view = "Corpus_p/COP26/COP26_en"

# List of developed countries (source: ONU)
developed_countries = ["Austria", "Belgium", "Denmark", "Finland", "France", "Germany", "Greece", "Ireland",
                       "Luxembourg", "Netherlands",
                       "Portugal", "Spain", "Sweden", "United_Kingdom", "Bulgaria", "Croatia", "Cyprus",
                       "Czech_Republic", "Estonia",
                       "Hungary", "Latvia", "Lithuania", "Malta", "Poland", "Romania", "Slovakia", "Slovenia",
                       "Iceland", "Norway",
                       "Switzerland", "Australia", "Canada", "Japan", "New_Zealand", "United_States", "Japan", "Italy"]


In [5]:
# Going through every file in the directory
for file in os.listdir(folder_to_view):
    # print(f"{folder_to_view}/{file}")

    # Path to every file in the folder
    file_path = f"{folder_to_view}/{file}"
    pdf = PdfFileReader(file_path)
    data_title.append(file)

    all_text = ""

    # Going through every page in the pdf
    for page_num in range(pdf.numPages):
        pageObj = pdf.getPage(page_num)

        # Extract the text and paste it in the file
        try:
            txt = pageObj.extractText()
        except:
            pass
        else:
            all_text += txt
    data_text.append(all_text)

    # f.close()

In [6]:
# Changing the list to uppercase
developed_countries = [country.upper() for country in developed_countries]

speeches = []

# Cleaning the text
for text in data_text:
    # Remove multiple whitespaces
    text = re.sub(r"\s*\n+\s*", " ", text)
    ' '.join(text.split())    
    # Remove punctuation
    text = "".join([char for char in text if char not in string.punctuation])
    speeches.append(text.lower())


In [7]:
print(speeches)



In [8]:
# Initializing the lists that will make the data frame
countries = []
dev = []

# Cleaning each title
for title in data_title:
    title = re.sub("([^A-Z|_])|(_HLS_EN)", "", title)
    title = re.sub("_(?=[A-Z])", "/", title)
    title = re.sub("_", "", title)
    title = re.sub("/", "_", title)
    countries.append(title)
    # Label developed and non-developed countries
    if title in developed_countries:
        dev.append("D")
    else:
        dev.append("ND")

# Creating a dictionary in order to create a pandas data frame
data_dict = {"Country": countries, "Developed": dev, "Speech": speeches}

# Pandas data frame
df = pd.DataFrame(data_dict)
print(df.head())
print(df.shape)

               Country Developed  \
0               ANGOLA        ND   
1  ANTIGUA_AND_BARBUDA        ND   
2              ARMENIA        ND   
3            AUSTRALIA         D   
4              AUSTRIA         D   

                                              Speech  
0  statement by his excellency joão lourenço pres...  
1   alliance of small island states cop26 world l...  
2  1 address by he president armen sarkissian at ...  
3  580 words there is cause for optimism  18 mont...  
4  1 cop 2 6  world leaders summit glasgow statem...  
(91, 3)


In [10]:
# Save the df as pickle to use in other analysis
# df.to_pickle("./df_speech_not_trated_cop26.pkl")

In [45]:
# List for storing the indexes of the empty speeches
idx_text_empty = []

# Cleaning the df from empty speeches
for idx, text in enumerate(df.Speech):
    if text == '' or text == ' ':
        idx_text_empty.append(idx)
    else:
        pass

df = df.drop(idx_text_empty)

# Reset the index of the data frame
df = df.reset_index(drop=True)

print(df.shape)

(85, 3)


In [46]:
# Explore the data frame
print("Out of {} countries, {} are developed, {} are non-developed.".format(len(df),
                                                                            len(df[df['Developed'] == 'D']),
                                                                            len(df[df['Developed'] == 'ND'])))


Out of 85 countries, 19 are developed, 66 are non-developed.


In [49]:
# Tokenization
def tokenize(input_text):
    tokens = re.split(r"\W", input_text)
    for token in tokens:
        if len(token) < 4:
            tokens.remove(token)
    return tokens


# Tokenized text added to the df
df["Speech_tockenized"] = df["Speech"].apply(lambda x: tokenize(x))

print(df.head())

               Country Developed  \
0               ANGOLA        ND   
1  ANTIGUA_AND_BARBUDA        ND   
2              ARMENIA        ND   
3            AUSTRALIA         D   
4              AUSTRIA         D   

                                              Speech  \
0  statement by his excellency joão lourenço pres...   
1   alliance of small island states cop26 world l...   
2  1 address by he president armen sarkissian at ...   
3  580 words there is cause for optimism  18 mont...   
4  1 cop 2 6  world leaders summit glasgow statem...   

                                   Speech_tockenized  
0  [statement, his, excellency, joão, lourenço, p...  
1  [alliance, small, island, states, cop26, world...  
2  [address, he, president, armen, sarkissian, cl...  
3  [words, there, cause, optimism, 18, months, we...  
4  [cop, 6, world, leaders, summit, glasgow, stat...  


In [50]:
# Import nltk stopwords
stopwords = nltk.corpus.stopwords.words("english")


# Function for removing the stopwords
def remove_stopwords(tokenized_text):
    output_text = [word for word in tokenized_text if word not in stopwords]
    return output_text


# Text without stopwords added to the df
df["Speech_non_stop"] = df["Speech_tockenized"].apply(lambda x: remove_stopwords(x))

print(df.head)

<bound method NDFrame.head of                  Country Developed  \
0                 ANGOLA        ND   
1    ANTIGUA_AND_BARBUDA        ND   
2                ARMENIA        ND   
3              AUSTRALIA         D   
4                AUSTRIA         D   
..                   ...       ...   
80  UNITED_ARAB_EMIRATES        ND   
81        UNITED_KINGDOM         D   
82        UNITED_NATIONS        ND   
83               VANUATU        ND   
84                ZAMBIA        ND   

                                               Speech  \
0   statement by his excellency joão lourenço pres...   
1    alliance of small island states cop26 world l...   
2   1 address by he president armen sarkissian at ...   
3   580 words there is cause for optimism  18 mont...   
4   1 cop 2 6  world leaders summit glasgow statem...   
..                                                ...   
80   1 cop26 national statement your excellencies ...   
81  good afternoon everybody welcome to cop welcom...   


In [51]:
# Import lemmatizer
wn = nltk.WordNetLemmatizer()


# Lemmatizing the text
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text


# Lemmatized text added to the df
df["Speech_lemmatized"] = df["Speech_non_stop"].apply(lambda x: lemmatizing(x))

In [52]:
print(df.head)

<bound method NDFrame.head of                  Country Developed  \
0                 ANGOLA        ND   
1    ANTIGUA_AND_BARBUDA        ND   
2                ARMENIA        ND   
3              AUSTRALIA         D   
4                AUSTRIA         D   
..                   ...       ...   
80  UNITED_ARAB_EMIRATES        ND   
81        UNITED_KINGDOM         D   
82        UNITED_NATIONS        ND   
83               VANUATU        ND   
84                ZAMBIA        ND   

                                               Speech  \
0   statement by his excellency joão lourenço pres...   
1    alliance of small island states cop26 world l...   
2   1 address by he president armen sarkissian at ...   
3   580 words there is cause for optimism  18 mont...   
4   1 cop 2 6  world leaders summit glasgow statem...   
..                                                ...   
80   1 cop26 national statement your excellencies ...   
81  good afternoon everybody welcome to cop welcom...   


In [56]:
df_essential = df[["Country", "Developed", "Speech_lemmatized"]]
print(df_essential.tail)

<bound method NDFrame.head of                  Country Developed  \
0                 ANGOLA        ND   
1    ANTIGUA_AND_BARBUDA        ND   
2                ARMENIA        ND   
3              AUSTRALIA         D   
4                AUSTRIA         D   
..                   ...       ...   
80  UNITED_ARAB_EMIRATES        ND   
81        UNITED_KINGDOM         D   
82        UNITED_NATIONS        ND   
83               VANUATU        ND   
84                ZAMBIA        ND   

                                    Speech_lemmatized  
0   [statement, excellency, joão, lourenço, presid...  
1   [alliance, small, island, state, cop26, world,...  
2   [address, president, armen, sarkissian, climat...  
3   [word, cause, optimism, 18, month, staring, ab...  
4   [cop, 6, world, leader, summit, glasgow, state...  
..                                                ...  
80  [1, cop26, national, statement, excellency, hi...  
81  [good, afternoon, everybody, welcome, welcome,...  
82  [secr

In [57]:
# Save the df as pickle to use in other analysis
df_essential.to_pickle("./df_essential_cop26.pkl")