# NLP model tests
---

Here I test a few NLP models and their relevance for the project.

## Setup

### Import libraries

In [None]:
import os
from ipywidgets import interact
import plotly.io as pio
from transformers import pipeline
from tqdm.auto import tqdm
import pandas as pd

In [None]:
# Move the working directory one level up (presumably to the project root, so
# that the `utils` imports in the next cell resolve -- TODO confirm).
# NOTE(review): this is relative, so re-running the cell moves up again.
os.chdir(os.pardir)

In [None]:
from utils.data_utils import DATA_DIR, load_yaml_file, load_markdown_file
from utils.nlp_utils import get_sentences

### Parameters

Set the plotly style:

In [None]:
# Make "plotly_white" the default template for figures created in this notebook.
pio.templates.default = "plotly_white"

Get the party names from the data:

In [None]:
# Notebook-level state filled in later by the interactive widgets; everything
# starts out unset.
data_name = party_data = party_names = selected_party = None

In [None]:
# Every entry of the data directory except the macOS ".DS_Store" metadata file.
data_names = [entry for entry in os.listdir(DATA_DIR) if entry != ".DS_Store"]

In [None]:
@interact
def set_data(data=data_names):
    """Load the parties file of the chosen dataset into the notebook globals.

    Wrapped by ``interact`` with the entries of ``data_names`` as choices.

    Args:
        data: Name of the dataset folder inside ``DATA_DIR``.

    Side effects:
        Sets the globals ``data_name``, ``party_data`` (content of
        ``parties_data.yml``) and ``party_names`` (its top-level keys).
    """
    global data_name, party_data, party_names
    data_name = data
    parties_file = os.path.join(DATA_DIR, data_name, "parties_data.yml")
    party_data = load_yaml_file(parties_file)
    party_names = list(party_data.keys())

Select a party:

In [None]:
@interact
def select_party(party=party_names):
    """Store the party chosen in the widget in the global ``selected_party``.

    The default ``party_names`` is evaluated when this cell runs, so the cell
    has to be re-run after the dataset changes to offer the new party list.

    Args:
        party: The selected party name.
    """
    global selected_party
    selected_party = party

### Load data

In [None]:
# The program of the selected party lives at <DATA_DIR>/<dataset>/programs/<party>.md.
program_path = os.path.join(DATA_DIR, data_name, "programs", f"{selected_party}.md")
program_txt = load_markdown_file(program_path)

In [None]:
# Preview the first 100 characters of the loaded program text.
program_txt[:100]

In [None]:
# Split the program into sentences and strip the Markdown markers "*" and "#"
# from each of them in a single pass.
_markdown_markers = str.maketrans("", "", "*#")
sentences = [
    sentence.translate(_markdown_markers) for sentence in get_sentences(program_txt)
]

In [None]:
# Inspect the first 10 cleaned sentences.
sentences[:10]

## Test models

### Sentiment analysis

#### Load the model

In [None]:
# Build a sentiment-analysis pipeline; model and tokenizer come from the same
# checkpoint.
sentiment_model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_task = pipeline(
    task="sentiment-analysis",
    model=sentiment_model_path,
    tokenizer=sentiment_model_path,
)

#### Apply to all sentences

In [None]:
# Classify the sentences one at a time, with a progress bar.
sentiment_outputs = []
for sentence in tqdm(sentences):
    sentiment_outputs.append(sentiment_task(sentence))

In [None]:
# Peek at the raw pipeline outputs of the first 10 sentences.
sentiment_outputs[:10]

#### Get the sentiment distribution

In [None]:
# Flatten the pipeline outputs into columns. Only the first prediction of each
# output is used, and it is paired with the sentence at the same position.
top_predictions = [output[0] for output in sentiment_outputs]
sentiments_dict = {
    "label": [prediction["label"] for prediction in top_predictions],
    "score": [prediction["score"] for prediction in top_predictions],
    "sentence": [sentences[position] for position, _ in enumerate(top_predictions)],
}

In [None]:
# Tabulate the predictions: columns "label", "score" and "sentence".
sentiments_df = pd.DataFrame(sentiments_dict)
sentiments_df

In [None]:
# Show a random sample of at most 10 sentences classified as negative.
# The label comparison ignores case, and the sample size is capped at the number
# of matching sentences so the cell does not raise when fewer than 10 exist.
negative_sentences = sentiments_df[sentiments_df.label.str.lower() == "negative"].sentence
negative_sentences.sample(min(10, len(negative_sentences))).values

In [None]:
# Show a random sample of at most 10 sentences classified as positive.
# The label comparison ignores case, and the sample size is capped at the number
# of matching sentences so the cell does not raise when fewer than 10 exist.
positive_sentences = sentiments_df[sentiments_df.label.str.lower() == "positive"].sentence
positive_sentences.sample(min(10, len(positive_sentences))).values

In [None]:
import plotly.express as px

In [None]:
# Count the sentences per sentiment label and express each count as a percentage.
sentiments_count = sentiments_df.label.value_counts().to_frame().reset_index()
sentiments_count.columns = ["label", "sentence_count"]
sentiments_count["percent"] = sentiments_count.sentence_count / sentiments_count.sentence_count.sum() * 100
# Translate the labels to Portuguese. Lower-casing first keeps the mapping
# working whether the model emits "Positive" or "positive"; an unmatched label
# would otherwise silently become NaN.
sentiments_count["label"] = sentiments_count.label.str.lower().map(
    {"positive": "positivo", "negative": "negativo", "neutral": "neutro"}
)
sentiments_count

In [None]:
# Percentage of sentences per label, in plotting order.
# `labels_order` is defined here so the cell does not depend on a later one.
labels_order = ["neutro", "positivo", "negativo"]
# Look the percentages up by label: a label with no sentences yields 0.0
# instead of raising on an empty selection.
percent_by_label = dict(zip(sentiments_count.label, sentiments_count.percent))
[float(percent_by_label.get(label, 0.0)) for label in labels_order]

In [None]:
# Bar chart with the share of sentences per sentiment.
# The bar order is fixed and pairs positionally with the colours below.
labels_order = ["neutro", "positivo", "negativo"]
# Look the percentages up by label: a label with no sentences is drawn as 0
# instead of raising on an empty selection.
percent_by_label = dict(zip(sentiments_count.label, sentiments_count.percent))
fig = px.bar(
    x=labels_order,
    y=[float(percent_by_label.get(label, 0.0)) for label in labels_order],
    title="Sentiment analysis",
)
fig.update_traces(
    marker_color=["gray", "green", "red"],
    # Show only the percentage (one decimal) on hover, without the trace box.
    hovertemplate="%{y:.1f}%<extra></extra>",
)
fig.update_layout(
    xaxis_title="Sentimento",
    yaxis_title="Percentagem de frases",
)
fig

### Hate speech

#### Load the model

In [None]:
# Build a text-classification pipeline for hate speech; model and tokenizer
# come from the same checkpoint.
hate_model_path = "Hate-speech-CNERG/dehatebert-mono-portugese"
hate_task = pipeline(
    task="text-classification",
    model=hate_model_path,
    tokenizer=hate_model_path,
)

#### Apply to all sentences

In [None]:
# Classify the sentences one at a time, with a progress bar.
hate_outputs = []
for sentence in tqdm(sentences):
    hate_outputs.append(hate_task(sentence))

In [None]:
# Peek at the raw pipeline outputs of the first 10 sentences.
hate_outputs[:10]

#### Get the hate speech distribution

In [None]:
# Flatten the pipeline outputs into columns. Only the first prediction of each
# output is used, and it is paired with the sentence at the same position.
top_hate_predictions = [output[0] for output in hate_outputs]
hate_dict = {
    "label": [prediction["label"] for prediction in top_hate_predictions],
    "score": [prediction["score"] for prediction in top_hate_predictions],
    "sentence": [sentences[position] for position, _ in enumerate(top_hate_predictions)],
}

In [None]:
# Tabulate the predictions: columns "label", "score" and "sentence".
hate_df = pd.DataFrame(hate_dict)
hate_df

In [None]:
# Number of sentences per predicted label.
hate_df.label.value_counts()

In [None]:
# Show a random sample of at most 10 sentences labelled "NON_HATE".
# The sample size is capped at the number of matching sentences so the cell
# does not raise when fewer than 10 exist.
non_hate_sentences = hate_df[hate_df.label == "NON_HATE"].sentence
non_hate_sentences.sample(min(10, len(non_hate_sentences))).values

In [None]:
# Show a random sample of at most 10 sentences labelled "HATE".
# The sample size is capped at the number of matching sentences so the cell
# does not raise when fewer than 10 exist.
hate_sentences = hate_df[hate_df.label == "HATE"].sentence
hate_sentences.sample(min(10, len(hate_sentences))).values

In [None]:
# Summary statistics of the scores of the sentences labelled "HATE".
hate_df[hate_df.label == "HATE"].score.describe()

In [None]:
# Sentences labelled "HATE" with a score above 0.7.
scored_above_07 = hate_df.score > 0.7
labelled_hate = hate_df.label == "HATE"
hate_df.loc[labelled_hate & scored_above_07, "sentence"].values

In [None]:
# Full rows (label, score, sentence) labelled "HATE" with a score above 0.7.
confident_hate_mask = (hate_df.label == "HATE") & (hate_df.score > 0.7)
hate_df.loc[confident_hate_mask]

In [None]:
# Keep the "HATE" label only for predictions with a score above 0.8; everything
# else becomes "NON_HATE". This uses a vectorised boolean mask instead of a
# row-wise `apply`, which also behaves correctly on an empty frame.
# NOTE(review): the exploration cells above use a 0.7 threshold -- confirm that
# 0.8 is the intended cut-off.
is_confident_hate = (hate_df.label == "HATE") & (hate_df.score > 0.8)
hate_df["label"] = hate_df.label.where(is_confident_hate, "NON_HATE")

In [None]:
# Count the sentences per label and express each count as a percentage.
hate_count = (
    hate_df.label.value_counts()
    .rename_axis("label")
    .reset_index(name="sentence_count")
)
total_sentences = hate_count.sentence_count.sum()
hate_count["percent"] = hate_count.sentence_count.div(total_sentences).mul(100)
# Translate the labels to Portuguese.
label_translation = {"HATE": "ódio", "NON_HATE": "neutro"}
hate_count["label"] = hate_count.label.map(label_translation)
hate_count

In [None]:
# TODO Plot

### Summarization

#### Load the model

In [None]:
# Build a text2text-generation pipeline for summarisation; model and tokenizer
# come from the same checkpoint.
sum_model_path = "csebuetnlp/mT5_multilingual_XLSum"
sum_task = pipeline(
    task="text2text-generation",
    model=sum_model_path,
    tokenizer=sum_model_path,
)

#### Apply to the whole program

In [None]:
# Summarise the whole program after stripping the Markdown markers "*" and "#".
# NOTE(review): the full program is passed in one call with no truncation; it is
# presumably far longer than the input length the model was trained on -- confirm
# that the result is meaningful.
plain_program_txt = program_txt.translate(str.maketrans("", "", "*#"))
program_summary = sum_task(plain_program_txt)
program_summary

#### Apply to chapters