# Data visualization of the `polids` outputs
---
This notebook serves as a playground to examine and visualize the outputs of the `polids` package. It's a starting point for exploring the results for a given pipeline run and can serve as inspiration for a future web application or another solution that is fit for production.

## Setup

### Import libraries

In [None]:
import ast
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
from polids.word_cloud.wordcloud import WordCloudGenerator

### Set parameters

In [None]:
# Show what is in the kernel's current working directory (sanity check before changing it)
os.listdir(os.curdir)

In [None]:
# Move the working directory one level up from where the kernel started.
# The starting directory is remembered the first time this cell runs, so
# re-running the cell always lands in the same place instead of climbing one
# more level each time (which would silently break the relative paths below).
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = Path.cwd()
os.chdir(NOTEBOOK_DIR.parent)
os.listdir()

In [None]:
# Location of the pipeline outputs to visualise (relative to the working directory)
data_path = Path("data") / "elections_portugal" / "2025" / "programs_pdf" / "output"

# Folder that receives the exported figures; created on first run
images_path = Path("images")
images_path.mkdir(exist_ok=True)

# Export settings shared by the figures below
image_scale = 4
height = width = 800

### Define auxiliary functions

## Load the data

In [None]:
# Party metadata written by the pipeline; the merges further down use its
# "pdf_file" and "short_name" columns
party_names_csv = data_path.joinpath("party_names.csv")
party_names_df = pd.read_csv(party_names_csv)
party_names_df.head()

In [None]:
# (party id, hex colour) pairs; the party id is also the stem of the "<id>.pdf" file name
party_color_pairs = [
    ("ad", "#F68A1E"),  # Orange from PSD branding
    ("adn", "#003087"),  # Dark blue for nationalism
    ("be", "#D81E05"),  # Red for left-wing
    ("chega", "#0033A0"),  # Dark blue from branding
    ("ergue", "#001F54"),  # Dark blue for conservatism
    ("il", "#00A1D6"),  # Cyan for liberalism
    ("jpp", "#009966"),  # Green for Madeira
    ("livre", "#00A859"),  # Green for environmentalism
    ("nova_direita", "#001F54"),  # Dark blue for conservatism
    ("pan", "#008000"),  # Green for ecologism
    ("pcp", "#003399"),  # Blue for communism
    ("pls", "#FFD600"),  # Yellow for PLS
    ("ppm", "#003399"),  # Blue for monarchical conservatism
    ("ps", "#E30613"),  # Red for social democracy
    ("rir", "#FF6200"),  # Orange for humanism
    ("volt", "#502379"),  # Purple for federalism
]
party_colors = [
    {"party": party, "color": color} for party, color in party_color_pairs
]

# Add the PDF file name so the colours can be joined to the pipeline outputs on "pdf_file"
party_colors_df = pd.DataFrame(party_colors).assign(
    pdf_file=lambda colors: colors["party"] + ".pdf"
)
party_colors_df

In [None]:
# Per-chunk analysis results, enriched with each party's short name and colour.
# Both joins are inner joins on "pdf_file", so rows whose file is missing from
# either lookup table are dropped.
chunk_analysis_df = (
    pd.read_csv(data_path / "chunk_analysis.csv")
    .merge(party_names_df[["pdf_file", "short_name"]], how="inner", on="pdf_file")
    .merge(party_colors_df[["pdf_file", "color"]], how="inner", on="pdf_file")
)
chunk_analysis_df.head()

In [None]:
# Scientific validation results, enriched with each party's short name and colour.
# Both joins are inner joins on "pdf_file", so rows whose file is missing from
# either lookup table are dropped.
party_short_names = party_names_df[["pdf_file", "short_name"]]
party_color_lookup = party_colors_df[["pdf_file", "color"]]
scientific_validations_df = (
    pd.read_csv(data_path / "scientific_validations.csv")
    .merge(party_short_names, how="inner", on="pdf_file")
    .merge(party_color_lookup, how="inner", on="pdf_file")
)
scientific_validations_df.head()

## Produce visualizations

### Number of policy proposals in total per party

In [None]:
# One row per party: number of non-null proposals and the party colour,
# ordered from the most to the fewest proposals
proposals_by_party = scientific_validations_df.groupby("short_name")
number_of_policies_per_party = proposals_by_party.agg(
    proposal=("proposal", "count"),
    color=("color", "first"),
).sort_values("proposal", ascending=False)
number_of_policies_per_party

In [None]:
# Bar chart of the proposal count per party, one bar per party in its own colour
bar_colors = number_of_policies_per_party["color"].tolist()

fig = px.bar(
    number_of_policies_per_party,
    x=number_of_policies_per_party.index,
    y="proposal",
    text="proposal",
    title="Número de propostas por partido",
    labels={"short_name": "Partido", "proposal": "Número de propostas"},
)
fig.update_traces(marker_color=bar_colors)
fig.update_layout(
    xaxis={"title": "Partido", "tickangle": -45},
    yaxis={"title": "Número de propostas"},
    margin={"l": 0, "r": 0, "t": 30, "b": 0},
    showlegend=False,
)

# Export a wide, short PNG (this figure does not use the shared height/width)
fig.write_image(
    images_path / "number_of_policies_per_party.png",
    scale=image_scale,
    height=400,
    width=800,
)
fig.show()

### Number of policy proposals per topic per party

In [None]:
# The CSV stores each chunk's proposals as the text of a Python literal
# (presumably a list - confirm against the pipeline output). Parse it with
# ast.literal_eval rather than eval: literal_eval only accepts literals, so a
# malformed or tampered cell cannot execute arbitrary code.
policy_analysis_df = chunk_analysis_df.assign(
    policy_proposals=chunk_analysis_df["policy_proposals"].apply(ast.literal_eval)
)

# One row per individual proposal
policy_analysis_df = policy_analysis_df.explode(column="policy_proposals")
policy_analysis_df

In [None]:
# Count the non-null proposals for every (party, topic) pair, largest first
topic_counts = (
    policy_analysis_df.groupby(["short_name", "topic"])
    .agg(
        policy_proposals=("policy_proposals", "count"),
        color=("color", "first"),
    )
    .sort_values("policy_proposals", ascending=False)
    .reset_index()
)

# Share of each topic within its own party (ratios add up to 1 per party)
proposals_per_party = topic_counts.groupby("short_name")["policy_proposals"].transform(
    "sum"
)
number_of_policies_per_party_per_topic = topic_counts.assign(
    number_of_policies_ratio=topic_counts["policy_proposals"] / proposals_per_party
)
number_of_policies_per_party_per_topic

In [None]:
# Party name -> colour lookup used to colour the grouped bars
color_map = {
    party: color
    for party, color in zip(
        number_of_policies_per_party_per_topic["short_name"],
        number_of_policies_per_party_per_topic["color"],
    )
}

# Grouped bars: one group per topic, one bar per party
axis_labels = {
    "number_of_policies_ratio": "Percentagem de propostas do partido",
    "topic": "Tema",
    "short_name": "Partido",
}
fig = px.bar(
    number_of_policies_per_party_per_topic,
    x="topic",
    y="number_of_policies_ratio",
    color="short_name",
    color_discrete_map=color_map,
    barmode="group",
    labels=axis_labels,
    title="Percentagem de propostas por tema e partido",
)
fig.update_layout(
    xaxis={"title": "Tema", "tickangle": -45},
    yaxis={"title": "Percentagem de propostas do partido"},
    legend_title="Partido",
    margin={"l": 0, "r": 0, "t": 30, "b": 0},
)

# Export with the shared figure size
output_file = images_path / "number_of_policies_per_party_per_topic.png"
fig.write_image(output_file, scale=image_scale, height=height, width=width)

fig.show()

### Sentiment analysis per party

In [None]:
# Number of chunks per (party, sentiment) pair, as a one-column frame
sentiment_counts = chunk_analysis_df.groupby("short_name")["sentiment"].value_counts()
sentiment_df = sentiment_counts.to_frame()

# Share of each sentiment within its party (level 0 of the index is the party)
chunks_per_party = sentiment_df.groupby(level=0)["count"].transform("sum")
sentiment_df["ratio"] = sentiment_df["count"] / chunks_per_party
sentiment_df

In [None]:
# Unique parties, in the order they appear in the sentiment table
parties = sentiment_df.index.get_level_values("short_name").unique()

# Map each party to its colour. The lookup is built from chunk_analysis_df,
# the same table sentiment_df is derived from, so every plotted party has an
# entry (building it from the proposals-per-party table would leave parties
# without validated proposals uncoloured).
party_to_color = (
    chunk_analysis_df.drop_duplicates("short_name")
    .set_index("short_name")["color"]
    .to_dict()
)

# Grid layout: three columns and as many rows as needed
n_parties = len(parties)
n_cols = 3
n_rows = int(np.ceil(n_parties / n_cols))

# Figure size, defined once so the on-screen layout and the exported PNG agree
fig_height = 250 * n_rows
fig_width = 900

# Bar colour for each sentiment (Portuguese labels)
sentiment_colors_pt = {
    "negativo": "#F44336",  # Red
    "neutro": "#FF9800",  # Orange
    "positivo": "#4CAF50",  # Green
}

# Fixed left-to-right order of the sentiments in every subplot
sentiment_order_pt = ["negativo", "neutro", "positivo"]

# Translation of the sentiment labels found in the data into Portuguese
sentiment_translation = {
    "negative": "negativo",
    "neutral": "neutro",
    "positive": "positivo",
}

# One subplot per party, titled with the party name
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=list(parties))

# Add one bar trace per party
for i, party in enumerate(parties):
    # Fill the grid row by row (plotly rows and columns are 1-based)
    row = i // n_cols + 1
    col = i % n_cols + 1

    # Sentiment rows for this party, indexed by sentiment label
    party_data = sentiment_df.loc[party]

    # Translate the sentiment labels to Portuguese (unknown labels are kept as-is)
    party_data_pt = party_data.copy()
    party_data_pt.index = [
        sentiment_translation.get(idx, idx) for idx in party_data_pt.index
    ]

    # Enforce the fixed order; sentiments the party never shows get a ratio of 0
    ordered_data = party_data_pt.reindex(sentiment_order_pt, fill_value=0)

    fig.add_trace(
        go.Bar(
            x=sentiment_order_pt,
            y=ordered_data["ratio"],
            text=[f"{ratio:.1%}" for ratio in ordered_data["ratio"]],
            textposition="auto",
            name=party,
            marker_color=[sentiment_colors_pt[s] for s in sentiment_order_pt],
        ),
        row=row,
        col=col,
    )

# Overall layout
fig.update_layout(
    title_text="Análise de Sentimento por Partido",
    height=fig_height,
    width=fig_width,
    showlegend=False,
)

# Colour each subplot title with its party's colour; the subplot titles are
# the first annotations of the layout, in the same order as `parties`
for i, party in enumerate(parties):
    if party in party_to_color:
        fig.layout.annotations[i].font.color = party_to_color[party]

# Axis titles and percentage ticks for every subplot
fig.update_xaxes(title_text="Sentimento")
fig.update_yaxes(title_text="Proporção", tickformat=".0%")

# Save the image and show the figure
fig.write_image(
    images_path / "sentiment_analysis_by_party.png",
    scale=image_scale,
    height=fig_height,
    width=fig_width,
)
fig.show()

### Political compass

In [None]:
def normalize_column(df: pd.DataFrame, column: str) -> pd.Series:
    """Rescale a DataFrame column to the range [-1, 1] with min-max scaling.

    The column minimum maps to -1 and the maximum to 1. When every value is
    the same there is no range to scale by, so all rows map to 0.

    Args:
        df: The DataFrame containing the column to normalize.
        column: The name of the column to normalize.

    Returns:
        A float Series named ``column``, indexed like ``df``, with values in
        the range [-1, 1].
    """
    values = df[column]
    min_value = values.min()
    max_value = values.max()

    if max_value == min_value:
        # Constant column: avoid dividing by zero. Return float zeros that keep
        # the column name, so this branch yields the same kind of Series
        # (float dtype, named) as the regular branch below.
        return pd.Series(0.0, index=df.index, name=column)

    return 2 * (values - min_value) / (max_value - min_value) - 1


# Average each compass axis over all of a party's chunks
compass_axes = ["political_compass_economic", "political_compass_social"]
political_compass_df = chunk_analysis_df.groupby("short_name").agg(
    {axis: "mean" for axis in compass_axes}
)

# Rescale each axis independently to [-1, 1] across parties
for axis in compass_axes:
    political_compass_df[axis] = normalize_column(political_compass_df, axis)

political_compass_df

In [None]:
# Party short name -> colour, obtained by joining the names and colours tables on "pdf_file"
party_metadata = party_names_df.merge(party_colors_df, on="pdf_file")
party_to_color: dict[str, str] = dict(
    zip(party_metadata["short_name"], party_metadata["color"])
)

# Both axes span exactly -1 to 1 (the range of the normalized compass values)
x_min, x_max = -1, 1
y_min, y_max = -1, 1

# Colour of every party on the compass; black when a party has no known colour
color_map = {}
for party in political_compass_df.index:
    color_map[party] = party_to_color.get(party, "#000000")


def get_text_position(
    x: float,
    y: float,
    x_min: float,
    x_max: float,
    y_min: float,
    y_max: float,
    padding: float = 0.05,
    party_name: str = "",
) -> str:
    """Choose where to put a marker's label so it is not cut off at the plot edge.

    The label is pushed away from whichever edges the marker is close to:
    a marker near the left edge gets its label on the right, a marker near
    the top gets its label below, and so on. Markers away from every edge
    get the label above them.

    Args:
        x: X coordinate of the marker.
        y: Y coordinate of the marker.
        x_min: Minimum x-axis value.
        x_max: Maximum x-axis value.
        y_min: Minimum y-axis value.
        y_max: Maximum y-axis value.
        padding: Distance from an edge, in axis units, within which the marker
            counts as "near" that edge.
        party_name: Label text; names longer than 10 characters get special
            treatment on the right-hand side of the plot.

    Returns:
        A Plotly ``textposition`` string such as ``"top center"``.
    """
    # Long labels in the right part of the plot always go to the left of the marker
    if len(party_name) > 10 and x > 0.5:
        return "middle left"

    near_left = x <= x_min + padding
    near_right = x >= x_max - padding
    near_top = y >= y_max - padding
    near_bottom = y <= y_min + padding

    # Horizontal part: move the label away from the nearby side edge, if any
    # (the left edge takes precedence when both tests hold)
    if near_left:
        horizontal = "right"
    elif near_right:
        horizontal = "left"
    else:
        horizontal = "center"

    # Vertical part: below the marker near the top edge, above it near the
    # bottom edge; otherwise level with the marker when beside a side edge
    # and above it when horizontally centred
    if near_top:
        vertical = "bottom"
    elif near_bottom or horizontal == "center":
        vertical = "top"
    else:
        vertical = "middle"

    return f"{vertical} {horizontal}"


# Prepare data for plotting: one row per party, with the party name as a column
plot_df = political_compass_df.reset_index()

# Scatter plot with one labelled marker per party
fig = px.scatter(
    plot_df,
    x="political_compass_economic",
    y="political_compass_social",
    color="short_name",
    text="short_name",
    color_discrete_map=color_map,
    title="Bússola política: Posições económicas vs sociais dos partidos",
    labels={
        "political_compass_economic": "Eixo económico (Esquerda ← → Direita)",
        "political_compass_social": "Eixo social (Libertário ↓ ↑ Autoritário)",
        "short_name": "Partido",
    },
)

# Because of color="short_name", Plotly Express draws one trace per party and
# names it after the party. Look each party's coordinates up by that name
# rather than by position, so the label placement stays correct even if the
# trace order ever differs from the row order of plot_df.
party_positions = plot_df.set_index("short_name")
for trace in fig.data:
    party_name = trace.name
    position = party_positions.loc[party_name]
    text_position = get_text_position(
        position["political_compass_economic"],
        position["political_compass_social"],
        x_min,
        x_max,
        y_min,
        y_max,
        padding=0.05,
        party_name=party_name,
    )

    # Use a smaller font for longer party names
    font_size = 10 if len(party_name) > 10 else 12

    trace.update(
        marker=dict(size=15),
        textposition=text_position,
        textfont=dict(size=font_size),
    )

# Dashed lines through the origin: horizontal first, then vertical
axis_lines = [
    (x_min, 0, x_max, 0),
    (0, y_min, 0, y_max),
]
for x0, y0, x1, y1 in axis_lines:
    fig.add_shape(
        type="line",
        x0=x0,
        y0=y0,
        x1=x1,
        y1=y1,
        line=dict(color="black", width=1, dash="dash"),
    )

# Lightly tint the four quadrants: (x0, y0, x1, y1, fill colour)
quadrants = [
    (x_min, 0, 0, y_max, "red"),  # top left
    (0, 0, x_max, y_max, "blue"),  # top right
    (x_min, y_min, 0, 0, "green"),  # bottom left
    (0, y_min, x_max, 0, "yellow"),  # bottom right
]
for x0, y0, x1, y1, fill in quadrants:
    fig.add_shape(
        type="rect",
        x0=x0,
        y0=y0,
        x1=x1,
        y1=y1,
        fillcolor=fill,
        opacity=0.1,
        line_width=0,
    )

# Fix the axis ranges and the figure size
fig.update_layout(
    xaxis_range=[x_min, x_max],
    yaxis_range=[y_min, y_max],
    margin=dict(l=0, r=0, t=30, b=0),
    height=height,
    width=width,
)

# Save the image
fig.write_image(
    images_path / "political_compass.png",
    scale=image_scale,
    height=height,
    width=width,
)
fig.show()

### Frequency of hate speech and target groups per party

In [None]:
# How many chunks were flagged as hate speech and how many were not
chunk_analysis_df["hate_speech_is_hate_speech"].value_counts()

In [None]:
# Inspect the chunks that were flagged as hate speech
chunk_analysis_df.loc[chunk_analysis_df["hate_speech_is_hate_speech"]]

In [None]:
# Keep only the chunks flagged as hate speech; work on a copy so the column
# edits below do not touch chunk_analysis_df
is_hate_speech = chunk_analysis_df["hate_speech_is_hate_speech"]
hate_speech_df = chunk_analysis_df.loc[is_hate_speech].copy()


# Convert string representations of tuples/lists to actual tuples if needed
def parse_target_groups(groups):
    """Turn a raw "targeted groups" cell into a tuple of labels or a single label.

    Args:
        groups: The raw cell value. Handled forms:
            - a string that looks like a tuple or list, e.g. "('a', 'b')" or
              "['a', 'b']": split on commas, with surrounding spaces and
              quotes removed from each element;
            - any other non-blank string: returned unchanged (one label);
            - an actual non-empty list/tuple: returned unchanged;
            - a missing value (None, NaN), a blank string, an empty
              container or an empty "()" / "[]" string.

    Returns:
        A tuple of labels, the original string or container, or the string
        "Unknown" when no group is given.
    """
    unknown = "Unknown"

    if isinstance(groups, str):
        text = groups.strip()
        if not text:
            return unknown
        looks_like_sequence = (text.startswith("(") and text.endswith(")")) or (
            text.startswith("[") and text.endswith("]")
        )
        if looks_like_sequence:
            # Split by comma, then strip spaces and quotes; skip empty pieces
            # (e.g. the trailing comma of a one-element tuple)
            labels = (piece.strip().strip("'\"") for piece in text[1:-1].split(","))
            parsed = tuple(label for label in labels if label)
            return parsed if parsed else unknown
        return groups

    # An empty CSV cell is read back as float NaN, which is truthy and would
    # otherwise slip through the emptiness check below
    if isinstance(groups, float) and groups != groups:
        return unknown

    # None and empty containers
    if not groups:
        return unknown
    return groups


# Parse the raw column into labels / containers of labels
parsed_groups = hate_speech_df["hate_speech_targeted_groups"].apply(
    parse_target_groups
)

# Wrap bare strings in a list so that explode() treats each as a single group,
# then give every target group its own row
hate_speech_df["hate_speech_targeted_groups"] = parsed_groups.apply(
    lambda groups: [groups] if isinstance(groups, str) else groups
)
hate_speech_exploded = hate_speech_df.explode("hate_speech_targeted_groups")

# Label missing or empty groups as "Unknown". explode() yields NaN for empty
# containers and missing cells; NaN is truthy, so it needs an explicit
# pd.isna check. The English sentinel is used because it is a key of the
# translation table below (a Portuguese sentinel here would not be found by
# the translation step and the rows would be lost from the chart).
hate_speech_exploded["hate_speech_targeted_groups"] = hate_speech_exploded[
    "hate_speech_targeted_groups"
].apply(lambda group: "Unknown" if pd.isna(group) or group == "" else group)

# Translation of the target groups to Portuguese; several English labels are
# deliberately merged into the same Portuguese category
target_groups_translation = {
    "ethnicity": "etnia",
    "gender identity": "identidade de género / orientação sexual",
    "immigrants": "origem nacional",
    "national origin": "origem nacional",
    "religion": "religião",
    "gender": "identidade de género / orientação sexual",
    "migration": "origem nacional",
    "political affiliation": "afiliação política",
    "sexual orientation": "identidade de género / orientação sexual",
    "Unknown": "Desconhecido",
}

# Translate target groups to Portuguese. Labels without a translation keep
# their original text instead of becoming NaN, which groupby would silently
# drop from the counts.
hate_speech_exploded["hate_speech_targeted_groups_pt"] = hate_speech_exploded[
    "hate_speech_targeted_groups"
].map(lambda group: target_groups_translation.get(group, group))

# Count instances by party and target group
hate_speech_counts = (
    hate_speech_exploded.groupby(["short_name", "hate_speech_targeted_groups_pt"])
    .size()
    .reset_index(name="count")
)

# Create the stacked bar chart
fig = px.bar(
    hate_speech_counts,
    x="short_name",
    y="count",
    color="hate_speech_targeted_groups_pt",
    title="Contagem de instâncias de discurso de ódio por partido e grupo alvo",
    labels={
        "short_name": "Partido",
        "count": "Número de instâncias de discurso de ódio",
        "hate_speech_targeted_groups_pt": "Grupo alvo",
    },
    text="hate_speech_targeted_groups_pt",  # Add text showing the target group
    barmode="stack",
)

# Customize the layout
fig.update_traces(
    texttemplate="%{text}",  # Show targeted group name in Portuguese
    textposition="inside",  # Position text inside the bars
    insidetextanchor="middle",  # Anchor text in middle of bar segments
)

# Hide the legend since we're showing labels inside bars
fig.update_layout(
    showlegend=False,
    xaxis_tickangle=-45,
    margin=dict(l=0, r=0, t=35, b=0),
    height=height,
    width=width,
)

# Save the image
fig.write_image(
    images_path / "hate_speech_by_party.png",
    scale=image_scale,
    height=height,
    width=width,
)

fig.show()

### Percentage of policy proposals that have a scientific backing

### Word cloud