# Installation to run the code in this notebook (requirement file)

In [None]:
# Third-party packages used by this notebook. The install commands are kept
# commented out; run them once before executing the import cell (inside a
# notebook, prefer `%pip install ...` so the packages land in the kernel's
# own environment).
# NOTE(review): the saved output of the import cell shows
# "ModuleNotFoundError: No module named 'bertopic'", so these were apparently
# not installed in the environment that produced this notebook - confirm.
# NOTE(review): `umap` and `sklearn` are imported below but not listed here;
# presumably they are pulled in as dependencies of bertopic - verify.
# pip install pandas numpy
# pip install ftfy
# pip install cleantext
# pip install bertopic
# pip install -U sentence-transformers

Note: you may need to restart the kernel to use updated packages.


# Importing libraries

In [11]:
# Data and number processing
import pandas as pd 
import numpy as np 

#  fix mojibake errors
from ftfy import fix_encoding

# Topic generation and text processing
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# Importing modules
import os
import sys

# Add the src path to sys.path
sys.path.append(os.path.abspath('../src'))
from modules.modules import (missing_values_calculator_and_shape, text_cleaning_process)

ModuleNotFoundError: No module named 'bertopic'

# Data Preprocessing (EDA - Exploratory Data Analysis and Data Cleaning)

In [3]:
# Display option: never truncate the number of rows shown for a frame.
pd.set_option('display.max_rows', None)

# Read the EMPOLITICON speech dataset from the project's data folder and
# preview its first five rows.
df = pd.read_csv(
    '../data/Dataset - EMPOLITICON NLP and ML Based Approach for Context and Emotion Classification of Political Speeches From Transcripts.csv'
)
df.head()

Unnamed: 0,Country,Date,Speaker,Headline,Text_of_Speech,Designation,Running President/PM,Speech Link,Emotion,Context
0,Russia,16/07/2021,Vladimir Putin,Meeting of APEC Economic Leaders,"Madam Chair,\n\nColleagues,\n\nFirst of all, ...",President,Vladimir Putin,http://en.kremlin.ru/events/president/transcri...,OPTIMISM,DEVELOPMENT
1,Russia,2021-09-05 00:00:00,Vladimir Putin,Victory Parade on Red Square,"Citizens of Russia,\n\nDear veterans,\n\nComra...",President,Vladimir Putin,http://en.kremlin.ru/events/president/transcri...,JOY,NATIONALISM
2,Russia,2021-08-04 00:00:00,Vladimir Putin,Meeting on the results of implementing Preside...,"Good afternoon, colleagues.\n\nLet’s start.\n\...",President,Vladimir Putin,http://en.kremlin.ru/events/president/transcri...,NEUTRAL,DEVELOPMENT
3,Russia,21-11-2020,Vladimir Putin,G20 Summit,"Colleagues,\n\nThe scope of problems humanity ...",President,Vladimir Putin,http://en.kremlin.ru/events/president/transcri...,NEUTRAL,DEVELOPMENT
4,Russia,20-11-2020,Vladimir Putin,Address to participants in Nuremberg Lessons f...,"Colleagues, friends,\n\nFirst of all, I would ...",President,Vladimir Putin,http://en.kremlin.ru/events/president/transcri...,UPSET,EXTREMISM


In [4]:
# Checking NaN values: the project helper's rendered output lists, for every
# column of df, its null count, null percentage and data type.
missing_values_calculator_and_shape(df)

Unnamed: 0,NAMES,NULL VALUE COUNT,NULL VALUES IN PERCENTAGES (%),DATA TYPE
0,Country,0,0.0,object
1,Date,0,0.0,object
2,Speaker,0,0.0,object
3,Headline,0,0.0,object
4,Text_of_Speech,0,0.0,object
5,Designation,0,0.0,object
6,Running President/PM,0,0.0,object
7,Speech Link,0,0.0,object
8,Emotion,0,0.0,object
9,Context,0,0.0,object


For the moment, and for simplicity, only the Russian speeches are taken into account; the other countries will be analysed in the future. Also, for future analysis, I will convert the Date column into an actual datetime format (it is currently stored as `object`).

To do that, I will use the `pd.to_datetime()` function from the pandas library to convert the 'Date' column into a datetime format. This will allow for easier manipulation and analysis of the date data. Furthermore, I will manually remove the time component from the date, as it is not needed for the analysis.

## Fixing the date format

In [None]:
# Date parsing is currently disabled: every line in this cell is commented
# out, so 'Date' remains an object (string) column in the cells that follow.
# NOTE(review): the data preview shows mixed date strings in 'Date'
# (e.g. '16/07/2021', '2021-09-05 00:00:00', '21-11-2020'). A single
# pd.to_datetime call with errors='coerce' may turn some of these into NaT or
# swap day and month - verify the parsed values before enabling this cell.
# df.loc[:, 'Date'] = pd.to_datetime(df['Date'], 
#                                    errors='coerce',
#                                    dayfirst=True)
# df.iloc[:5]

## Filtering to get only the Russian section

In [5]:
# Keep only the rows whose 'Country' is 'Russia' and renumber them from 0.
russian_df = df.loc[df['Country'].eq('Russia')].reset_index(drop=True)
# Optional previews: russian_df.head(), russian_df.tail()

Let us reduce the dataset to the date and the speech text only, so we can analyse the speeches and extract the topics from scratch.

In [6]:
# Reduce the Russian subset to the two columns used by the cleaning steps.
# .copy() makes r_df an independent frame: the later in-place assignments
# (`r_df.loc[:, 'Text_of_Speech'] = ...`) then unambiguously modify r_df
# itself instead of a selection derived from russian_df, which is the
# pattern that triggers pandas' SettingWithCopyWarning.
r_df = russian_df[['Date', 'Text_of_Speech']].copy()
r_df.head()

Unnamed: 0,Date,Text_of_Speech
0,16/07/2021,"Madam Chair,\n\nColleagues,\n\nFirst of all, ..."
1,2021-09-05 00:00:00,"Citizens of Russia,\n\nDear veterans,\n\nComra..."
2,2021-08-04 00:00:00,"Good afternoon, colleagues.\n\nLet’s start.\n\..."
3,21-11-2020,"Colleagues,\n\nThe scope of problems humanity ..."
4,20-11-2020,"Colleagues, friends,\n\nFirst of all, I would ..."


## Fixing mojibake errors (character-encoding errors)

In [None]:
# Repair character-encoding damage in every speech with ftfy's fix_encoding;
# values that are not strings are passed through unchanged.
r_df.loc[:, 'Text_of_Speech'] = r_df['Text_of_Speech'].map(
    lambda text: text if not isinstance(text, str) else fix_encoding(text)
)
r_df.head()

Unnamed: 0,Date,Text_of_Speech
0,16/07/2021,"Madam Chair,\n\nColleagues,\n\nFirst of all, ..."
1,2021-09-05 00:00:00,"Citizens of Russia,\n\nDear veterans,\n\nComra..."
2,2021-08-04 00:00:00,"Good afternoon, colleagues.\n\nLet’s start.\n\..."
3,21-11-2020,"Colleagues,\n\nThe scope of problems humanity ..."
4,20-11-2020,"Colleagues, friends,\n\nFirst of all, I would ..."


## Cleaning special characters, punctuation, and numbers

In [10]:
# Run the project's text_cleaning_process helper over each speech and write
# the cleaned text back into the same column, then preview five rows.
r_df.loc[:, 'Text_of_Speech'] = (
    r_df['Text_of_Speech'].apply(text_cleaning_process)
)
r_df.head()

Unnamed: 0,Date,Text_of_Speech
0,16/07/2021,madam chair\n\ncolleagues\n\nfirst of all i wo...
1,2021-09-05 00:00:00,citizens of russia\n\ndear veterans\n\ncomrade...
2,2021-08-04 00:00:00,good afternoon colleagues\n\nlet’s start\n\nas...
3,21-11-2020,colleagues\n\nthe scope of problems humanity h...
4,20-11-2020,colleagues friends\n\nfirst of all i would lik...


# Topic Generation

In [None]:
# Draft BERTopic pipeline - disabled: every line below is commented out, so
# this cell currently does nothing.
# NOTE(review): `data['Speech']` is not defined anywhere in the visible
# notebook; the cleaned text lives in r_df['Text_of_Speech'] - presumably
# that is the intended input; confirm before enabling.
# NOTE(review): a custom vectorizer_model is passed together with
# n_gram_range; check the BERTopic documentation for which one takes effect.
# # Create a list of speeches
# docs = data['Speech'].tolist()

# # Load transformer embedding model
# emb_minilm = SentenceTransformer("all-MiniLM-L6-v2")

# # Define vectorizer (no stop word removal!)
# vectorizer_model = CountVectorizer(ngram_range=(2, 2))

# # Build BERTopic model
# topic_model = BERTopic(
#     embedding_model=emb_minilm,
#     vectorizer_model=vectorizer_model,
#     n_gram_range=(2, 2),
#     top_n_words=10,
#     nr_topics=7,
#     umap_model=UMAP(random_state=1)
# )

# # Fit model
# topics, probabilities = topic_model.fit_transform(docs)
