# NLP model tests
---

Here I test a few NLP models and their relevance for the project.

## Setup

### Import libraries

In [None]:
import os
from ipywidgets import interact
import plotly.io as pio
from transformers import pipeline
from tqdm.auto import tqdm
import pandas as pd

In [None]:
# Move the working directory one level up; presumably the notebook lives in a
# sub-folder and the `utils` package / data paths are relative to its parent.
os.chdir(os.pardir)

In [None]:
from utils.data_utils import DATA_DIR, load_yaml_file, load_markdown_file
from utils.nlp_utils import get_sentences, get_sentiment, get_hate_speech
from utils.viz_utils import plot_sentiment, plot_hate_speech

### Parameters

Set the plotly style:

In [None]:
# Make "plotly_white" the default template for the plotly figures created below.
pio.templates.default = "plotly_white"

Select the dataset and get the party names from its data:

In [None]:
# Notebook-level state, filled in later by the interactive widgets:
#   data_name      - name of the selected dataset
#   party_data     - content loaded from that dataset's "parties_data.yml"
#   party_names    - keys of party_data
#   selected_party - party chosen for the analysis
data_name = party_data = party_names = selected_party = None

In [None]:
# List the available datasets. Each dataset is used below as a sub-directory of
# DATA_DIR, so keep only directories and skip hidden entries (e.g. the macOS
# ".DS_Store" file). os.listdir returns entries in arbitrary order, so sort
# them to get a stable order in the selection widget.
data_names = sorted(
    name
    for name in os.listdir(DATA_DIR)
    if not name.startswith(".") and os.path.isdir(os.path.join(DATA_DIR, name))
)

In [None]:
@interact
def set_data(data=data_names):
    """Load the parties file of the chosen dataset into the notebook globals.

    Args:
        data: Name of the dataset; the widget offers the entries of
            ``data_names``.

    Side effects:
        Sets the globals ``data_name`` (the chosen dataset), ``party_data``
        (content of ``<DATA_DIR>/<data>/parties_data.yml``) and
        ``party_names`` (the keys of ``party_data``).
    """
    global data_name, party_data, party_names
    data_name = data
    parties_file = os.path.join(DATA_DIR, data_name, "parties_data.yml")
    party_data = load_yaml_file(parties_file)
    party_names = list(party_data.keys())

Select a party:

In [None]:
@interact
def select_party(party=party_names):
    """Store the party chosen in the widget in the global ``selected_party``.

    The default value ``party_names`` is evaluated once, when this cell runs,
    so the dataset must have been selected (and ``party_names`` filled in)
    before executing this cell.

    Args:
        party: Party picked among the options in ``party_names``.
    """
    global selected_party
    selected_party = party

### Load data

In [None]:
# Load the selected party's program from
# <DATA_DIR>/<data_name>/programs/<selected_party>.md
program_txt = load_markdown_file(os.path.join(DATA_DIR, data_name, "programs", f"{selected_party}.md"))

In [None]:
# Preview the first 100 characters of the loaded program text
program_txt[:100]

In [None]:
# Split the program into sentences and strip the markdown "*" and "#"
# characters from each one.
sentences = [
    sentence.replace("*", "").replace("#", "")
    for sentence in get_sentences(program_txt)
]

In [None]:
# Preview the first 10 cleaned sentences
sentences[:10]

## Test models

### Sentiment analysis

#### Apply to all sentences

In [None]:
# Run the sentiment helper over all sentences. The result is used below as a
# table with `sentence` and `label` columns (labels such as "negativo" /
# "positivo").
sentiments_df = get_sentiment(sentences)
# Display the result
sentiments_df

#### Inspect samples and plot it

In [None]:
# Show up to 10 random sentences labelled "negativo". The sample size is
# capped at the number of matching rows, because sampling more rows than
# available (without replacement) raises a ValueError.
negative_sentences = sentiments_df[sentiments_df.label == "negativo"].sentence
negative_sentences.sample(min(10, len(negative_sentences))).values

In [None]:
# Show up to 10 random sentences labelled "positivo". The sample size is
# capped at the number of matching rows, because sampling more rows than
# available (without replacement) raises a ValueError.
positive_sentences = sentiments_df[sentiments_df.label == "positivo"].sentence
positive_sentences.sample(min(10, len(positive_sentences))).values

In [None]:
# Visualize the sentiment results with the project's plotting helper
plot_sentiment(sentiments_df)

### Hate speech

#### Apply to all sentences

In [None]:
# Run the hate-speech helper over all sentences. The result is used below as a
# table with `sentence` and `label` columns (labels such as "ódio" / "neutro").
hate_df = get_hate_speech(sentences)
# Display the result
hate_df

#### Inspect samples and plot it

In [None]:
# Show up to 10 random sentences labelled "ódio". The sample size is capped at
# the number of matching rows, because sampling more rows than available
# (without replacement) raises a ValueError, and this subset may be small.
hate_sentences = hate_df[hate_df.label == "ódio"].sentence
hate_sentences.sample(min(10, len(hate_sentences))).values

In [None]:
# Show up to 10 random sentences labelled "neutro". The sample size is capped
# at the number of matching rows, because sampling more rows than available
# (without replacement) raises a ValueError.
neutral_sentences = hate_df[hate_df.label == "neutro"].sentence
neutral_sentences.sample(min(10, len(neutral_sentences))).values

In [None]:
# Visualize the hate-speech results with the project's plotting helper
plot_hate_speech(hate_df)

### Summarization

#### Load the model

In [None]:
# Model identifier used for both the model and the tokenizer
sum_model_path = "csebuetnlp/mT5_multilingual_XLSum"
# Build a text2text-generation pipeline for summarization from that model
sum_task = pipeline("text2text-generation", model=sum_model_path, tokenizer=sum_model_path)

#### Apply to the whole program

In [None]:
# Summarize the whole program in a single call, after stripping the markdown
# "*" and "#" characters.
# NOTE(review): the full text is passed at once — confirm it fits the model's
# maximum input length, or whether truncation/chunking is needed.
program_summary = sum_task(program_txt.replace("*", "").replace("#", ""))
# Display the generated summary
program_summary

#### Apply to chapters