# Basic program analysis
---

This notebook runs some basic exploratory analysis on electoral programs stored in Markdown format.

## Setup

### Import libraries

In [None]:
import os
import spacy
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from ipywidgets import interact

In [None]:
# Move up from the notebook's folder to its parent, presumably the project
# root that holds the `utils` package imported in the next cell (confirm the
# repository layout). The guard makes the cell idempotent: an unconditional
# relative chdir climbs one more level every time the cell is re-run.
if not os.path.isdir("utils"):
    os.chdir("..")

In [None]:
from utils.data_utils import DATA_DIR, load_yaml_file, load_markdown_file
from utils.nlp_utils import get_word_cloud

### Parameters

Get natural language model:

In [None]:
# Load the spaCy pipeline named "pt_core_news_lg" (spaCy's large Portuguese
# news model); the package must already be installed in the environment.
nlp = spacy.load("pt_core_news_lg")

Get the party names from the data:

In [None]:
# Notebook-wide state, all unset to begin with. The interactive `set_data`
# and `select_party` callbacks overwrite these through `global` statements.
data_name = party_data = party_names = selected_party = None

In [None]:
# Candidate datasets are the sub-folders of DATA_DIR: each one is later joined
# with "parties_data.yml", so plain files (e.g. macOS ".DS_Store") and hidden
# folders are skipped. os.listdir returns entries in arbitrary order, hence
# the sort, which keeps the selection list stable between runs.
data_names = sorted(
    name
    for name in os.listdir(DATA_DIR)
    if not name.startswith(".") and os.path.isdir(os.path.join(DATA_DIR, name))
)

In [None]:
@interact
def set_data(data=data_names):
    """Record the chosen dataset and load its party metadata.

    Updates three notebook globals: ``data_name`` (the chosen entry),
    ``party_data`` (the content of that dataset's ``parties_data.yml``) and
    ``party_names`` (the keys of ``party_data``, as a list).
    """
    global data_name, party_data, party_names
    data_name = data
    parties_file = os.path.join(DATA_DIR, data_name, "parties_data.yml")
    party_data = load_yaml_file(parties_file)
    party_names = [*party_data.keys()]

Select a party:

In [None]:
@interact
def select_party(party=party_names):
    """Store the party chosen in the widget in the global ``selected_party``.

    The ``party_names`` default is evaluated once, when this cell runs, so the
    cell has to be re-run after a different dataset is picked for the list of
    choices to reflect it.
    """
    global selected_party
    selected_party = party

### Load data

In [None]:
# Read the selected party's program from
# <DATA_DIR>/<data_name>/programs/<selected_party>.md.
# `data_name` and `selected_party` are still None until the two widget cells
# have been run.
program_txt = load_markdown_file(os.path.join(DATA_DIR, data_name, "programs", f"{selected_party}.md"))

In [None]:
# Preview the first 100 characters of the program text.
program_txt[:100]

In [None]:
# Run the spaCy pipeline over the whole program; the resulting `doc` is what
# the sentence and token extraction cells iterate over.
doc = nlp(program_txt)

## Analysis

### Key words

Get the paragraphs from the text:

In [None]:
# Treat every non-empty line of the program as one paragraph.
paragraphs = [line for line in program_txt.split("\n") if line != ""]

In [None]:
# Preview the first five paragraphs.
paragraphs[:5]

Get the sentences (phrases) from the text:

In [None]:
# One string per sentence found by the spaCy pipeline. Line breaks inside a
# sentence become spaces (deleting them outright would glue together the words
# on either side of the break), and surrounding whitespace is trimmed.
phrases = [sent.text.replace("\n", " ").strip() for sent in doc.sents]
# Preview the first five sentences.
phrases[:5]

Get every word in the text that isn't a stopword or punctuation, and that is tagged as a noun, adjective, verb, interjection or "other" (`X`), based on the [universal POS tags](https://universaldependencies.org/u/pos/):

In [None]:
# Keep the text of each token that is neither a stop word nor punctuation and
# whose universal POS tag is NOUN, ADJ, VERB, INTJ or X, with any line breaks
# removed from the token text.
words = [
    token.text.replace("\n", "")
    for token in doc
    if not (token.is_stop or token.is_punct)
    and token.pos_ in ("NOUN", "ADJ", "VERB", "INTJ", "X")
]
# Drop tokens that ended up empty once their line breaks were removed.
words = [text for text in words if text != ""]
# Preview the first ten kept words.
words[:10]

Draw a word cloud:

In [None]:
# Build a word cloud with the library's default settings from the kept words,
# joined into a single space-separated string.
word_cloud = WordCloud().generate(" ".join(words))
# Render the generated cloud as an image with matplotlib.
plt.imshow(word_cloud)

In [None]:
# Same word list passed to the project helper from utils.nlp_utils.
# NOTE(review): presumably it renders its own word cloud; confirm against the
# helper's implementation.
get_word_cloud(words)