# Basic program analysis
---

This notebook runs some basic exploratory analyses on electoral programs stored as Markdown files.

## Setup

### Import libraries

In [None]:
import os
from ipywidgets import interact
import plotly.express as px
import plotly.io as pio
import pandas as pd

In [None]:
# Move one level up so that the `utils` package imported in the next cell
# resolves from the working directory (presumably the repository root — confirm).
# Guarded so that re-running this cell does not climb further up the tree.
if not os.path.isdir("utils"):
    os.chdir("..")

In [None]:
from utils.data_utils import DATA_DIR, load_yaml_file, load_markdown_file
from utils.nlp_utils import get_word_cloud, get_sentences, get_words, get_topical_sentences

### Parameters

Set the plotly style:

In [None]:
# Make "plotly_white" the default template for the figures created below.
pio.templates.default = "plotly_white"

Get the party names from the data:

In [None]:
# Notebook-level state filled in by the interactive selectors below;
# everything starts out unset.
data_name = party_data = party_names = selected_party = None

In [None]:
# Every dataset is used as a sub-directory of DATA_DIR (its name is joined into
# the paths of "parties_data.yml", "topics.yml" and "programs/"), so keep
# directories only; this also drops stray files such as macOS' ".DS_Store".
# os.listdir() returns entries in arbitrary order, so sort them to give the
# dataset selector a stable order and a deterministic default.
data_names = sorted(
    name
    for name in os.listdir(DATA_DIR)
    if name != ".DS_Store" and os.path.isdir(os.path.join(DATA_DIR, name))
)

In [None]:
@interact
def set_data(data=data_names):
    """Load the party data of the chosen dataset into the notebook globals.

    Rebinds ``data_name`` to the chosen value, ``party_data`` to the content
    of that dataset's ``parties_data.yml`` and ``party_names`` to the list of
    its keys.
    """
    global data_name, party_data, party_names
    data_name = data
    parties_file = os.path.join(DATA_DIR, data_name, "parties_data.yml")
    party_data = load_yaml_file(parties_file)
    party_names = list(party_data.keys())

Select a party:

In [None]:
@interact
def select_party(party=party_names):
    """Store the party chosen in the widget in the global ``selected_party``.

    NOTE(review): the ``party_names`` default is evaluated once, when this
    cell is executed, so the cell presumably has to be re-run after another
    dataset is picked for the choices to follow — confirm.
    """
    global selected_party
    selected_party = party

Get the topics to analyse:

In [None]:
# Load the topic definitions of the selected dataset from its "topics.yml".
# Relies on `data_name` having been set by the dataset selector above.
topics = load_yaml_file(os.path.join(DATA_DIR, data_name, "topics.yml"))

In [None]:
# Show the available topic names (assumes the YAML file loads as a mapping).
topics.keys()

### Load data

In [None]:
# Load the program of the selected party: "<party>.md" inside the dataset's
# "programs" folder. Relies on both selectors above having been run.
program_txt = load_markdown_file(os.path.join(DATA_DIR, data_name, "programs", f"{selected_party}.md"))

In [None]:
# Preview the first 100 characters of the program text
# (assumes the loader returns a string).
program_txt[:100]

## Analysis

### Key words

Get the sentences:

In [None]:
# Extract the sentences from the program text; the splitting logic lives in
# utils.nlp_utils.
sentences = get_sentences(program_txt)

In [None]:
# Preview the first ten sentences.
sentences[:10]

Get the words:

In [None]:
# Extract the words from the program text; the tokenisation logic lives in
# utils.nlp_utils.
words = get_words(program_txt)

In [None]:
# Preview the first ten words.
words[:10]

In [None]:
from string import punctuation

In [None]:
# List the words that contain at least one ASCII punctuation character
# (presumably a sanity check on what the word extraction lets through).
[w for w in words if any(p in w for p in punctuation)]

Generate a word cloud:

In [None]:
# Build the word cloud from the extracted words (rendering is handled by the
# helper in utils.nlp_utils).
get_word_cloud(words)

### Headers and subheaders

In [None]:
# Outline of the document: every sentence that starts with "#", with each "#"
# (anywhere in the sentence) swapped for three spaces so that deeper headers
# appear further indented.
[s.replace("#", "   ") for s in sentences if s.startswith("#")]

In [None]:
# Top-level headers only: sentences that start with a single "#" followed by
# a space.
main_headers = [s for s in sentences if s.startswith("# ")]
main_headers

### Category count

#### Topics

In [None]:
# Collect the sentences related to each topic; the result is indexed by topic
# name in the cells below. The matching rules live in utils.nlp_utils.
topical_sentences = get_topical_sentences(sentences, topics)

In [None]:
# Number of sentences collected for each topic, keyed by topic name.
topic_sentence_count = {
    topic: len(topical_sentences[topic]) for topic in topics
}

In [None]:
# Display the number of collected sentences per topic.
topic_sentence_count

In [None]:
# Print, for every topic, the share of the program's sentences that were
# collected for it, as a percentage with two decimals.
total_sentences = len(sentences)
for topic in topics:
    share = topic_sentence_count[topic] / total_sentences * 100
    print(f"{topic}: {share:.2f}%")

In [None]:
# Turn the per-topic counts into a table indexed by topic and sorted by topic
# label, with each count also expressed as a percentage of all sentences.
# NOTE: this rebinds `topic_sentence_count` from a dict to a DataFrame, so the
# earlier cells that index it by topic need the dict-building cell re-run
# before they are executed again.
topic_sentence_count = (
    pd.DataFrame(topic_sentence_count, index=["sentence_count"])
    .T
    .assign(
        sentence_percentage=lambda df: df["sentence_count"] / len(sentences) * 100
    )
    .rename_axis("topic")
    .sort_index()
)
topic_sentence_count

In [None]:
# Horizontal bar chart of the percentage of topical sentences per topic.
fig = px.bar(topic_sentence_count, x="sentence_percentage", orientation="h")
layout_options = {
    "title": f"Presença de tópicos no programa do partido {selected_party.upper()}",
    "xaxis_title": "Percentagem de frases topicais no texto",
    "yaxis_title": "Tópico",
    # Order the bars by topic label instead of by row order.
    "yaxis": {"categoryorder": "category descending"},
}
# Bar colour is a trace property, not a layout one; set it with
# fig.update_traces(marker_color=...) if it is ever needed.
# Kept as the cell's last expression so the notebook renders the figure.
fig.update_layout(**layout_options)

#### Rationality vs intentionality