# Loglikelihood tests

We'll experiment with the loglikelihood of the text before and after a homoglyph-based attack.

This corresponds to Section 4.2.1 in the paper.

## Text with no attacks
First, we'll plot the loglikelihoods of the text with no attacks (i.e., the original version).

In [None]:
import torch

from silver_speak.utils import (add_fill_tokens, align_two_token_sequences,
                                encode_text, get_different_ranges,
                                get_filled_ranges,
                                get_loglikelihoods_of_tokens,
                                total_loglikelihood)

test_text = """Dr. Capy Cosmos, a capybara unlike any other, astounded the scientific community with his groundbreaking research in astrophysics. With his keen sense of observation and unparalleled ability to interpret cosmic data, he uncovered new insights into the mysteries of black holes and the origins of the universe. As he peered through telescopes with his large, round eyes, fellow researchers often remarked that it seemed as if the stars themselves whispered their secrets directly to him. Dr. Cosmos not only became a beacon of inspiration to aspiring scientists but also proved that intellect and innovation can be found in the most unexpected of creatures."""

# Encode the sample text and score each of its tokens.
# The first returned value is a list of (tok_id, loglikelihood) tuples;
# the second holds the model outputs, kept around for later inspection.
original_input_ids = encode_text(test_text)
(
    original_toks_loglikelihoods,
    original_model_outputs,
) = get_loglikelihoods_of_tokens(input_ids=original_input_ids)

# Split the pairs into two parallel tensors: token ids and their loglikelihoods
original_toks = torch.tensor([token for token, _ in original_toks_loglikelihoods])
original_loglikelihoods = torch.tensor(
    [score for _, score in original_toks_loglikelihoods]
)

In [None]:
# Plot the loglikelihoods
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

# Plot the per-token loglikelihoods of the original (unattacked) text
plt.figure(figsize=(10, 5))
plt.plot(original_loglikelihoods, color="blue")
plt.xlabel("Token index")
plt.ylabel("Loglikelihood")
plt.title("Loglikelihood of the tokens in the text")

# Save the figures in 'figures/'
# If __file__ is not defined, define it (this may happen in interactive environments)
if "__file__" not in globals():
    __file__ = Path("visualization.py").resolve()
# Resolve the output directory once (two levels above this file) and reuse it
# for every figure saved below.
figures_dir = Path(__file__).parent.parent / "figures"
# Create the directory, including any missing parents; do nothing if it already exists
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / "loglikelihoods_original.pdf")

## Attacked text
Now, we'll replace 10% of the characters in the text and see how the loglikelihoods change.

In [None]:
from silver_speak.homoglyphs.random_attack import random_attack
# Run the random attack on the sample text, replacing 10% of it.
# The seed is fixed so that the attacked text is the same on every run.
attacked_text = random_attack(
    test_text,
    percentage=0.1,
    random_seed=42,
)

# Encode and score the attacked text the same way as the original text
attacked_input_ids = encode_text(attacked_text)
(
    attacked_toks_loglikelihoods,
    attacked_model_outputs,
) = get_loglikelihoods_of_tokens(input_ids=attacked_input_ids)

In [None]:
# Now, plot the original and attacked loglikelihoods

# Split the (tok_id, loglikelihood) pairs of both texts into parallel tensors.
# The original ones are rebuilt here so that this cell does not depend on the
# tensors created in the first cell.
original_toks = torch.tensor([tok_id for tok_id, loglikelihood in original_toks_loglikelihoods])
original_loglikelihoods = torch.tensor([loglikelihood for tok_id, loglikelihood in original_toks_loglikelihoods])
attacked_toks = torch.tensor([tok_id for tok_id, loglikelihood in attacked_toks_loglikelihoods])
attacked_loglikelihoods = torch.tensor([loglikelihood for tok_id, loglikelihood in attacked_toks_loglikelihoods])

# Align the original tokens against the attacked ones, then spread the original
# loglikelihoods over the aligned positions.
aligned_toks = align_two_token_sequences(reference=attacked_toks, target=original_toks)
aligned_loglikelihoods = add_fill_tokens(reference=aligned_toks, target=original_loglikelihoods, ELEMENT_TO_FILL=torch.nan) # This makes the plot not show the fill tokens

# Calculate the percentage of the tokens that change when we run the attack
num_attacked_tokens = torch.sum(attacked_toks != aligned_toks)
num_total_tokens = len(attacked_toks)
# NOTE: this is a 0-dim tensor holding a fraction in [0, 1], not a value in [0, 100]
percentage_attacked = num_attacked_tokens / num_total_tokens
# Convert to a plain float and format as a percentage, so that the output reads
# e.g. "12.34%" instead of "tensor(0.1234)"
print(f"The percentage of tokens that change when we run the attack is: {percentage_attacked.item():.2%}")

# Plot the original loglikelihoods w.r.t. the attacked text in the same plot, to compare, with different colors
plt.figure(figsize=(7.6, 5))

# Plot the loglikelihoods
plt.plot(aligned_loglikelihoods, label='Original', color='blue')
plt.plot(attacked_loglikelihoods, label='Attacked', alpha=0.7, color='red')

# Draw a light gray overlay to indicate the differing tokens between the two texts
for start, end in get_different_ranges(attacked_toks, aligned_toks):
    # Use axvspan to shade the whole y-range, widened by one index on each side
    plt.axvspan(start - 1, end + 1, color='grey', alpha=0.2, edgecolor=None, linewidth=0, hatch='\\')
# Add it to the legend (an empty fill_between acts as a proxy artist)
plt.fill_between([], [], color='grey', alpha=0.2, label='Differing tokens', hatch='\\')

# Draw a darker gray overlay to indicate the fill tokens. Fill the entire y-axis. Do not draw an outline
for start, end in get_filled_ranges(aligned_toks):
    # Use axvspan to shade the whole y-range, widened by one index on each side
    plt.axvspan(start - 1, end + 1, color='grey', alpha=0.4, edgecolor=None, linewidth=0, hatch='//')
# Add it to the legend (an empty fill_between acts as a proxy artist)
plt.fill_between([], [], color='grey', alpha=0.4, label='New tokens', hatch='//')

plt.xlabel('Token index')
plt.ylabel('Loglikelihood')
plt.legend()
# Set the margins manually so that the plot fills the figure
plt.subplots_adjust(left=0.1, right=1.0, top=0.9, bottom=0.1)
plt.savefig(figures_dir / 'loglikelihoods_attacked.pdf')

## Distributions of the loglikelihoods
Finally, plot the distributions of the loglikelihoods that we see in the original and attacked versions.

In [None]:
import math

# Gather the loglikelihoods of each version of the text, dropping the token ids:
# the first entry is the original text, the second one is the attacked text
loglikelihoods = [
    [score for _, score in scored_tokens]
    for scored_tokens in (
        original_toks_loglikelihoods,
        attacked_toks_loglikelihoods,
    )
]

# Narrow figure with one violin per version of the text; the means are marked
plt.figure(figsize=(2.2, 5))
plt.violinplot(loglikelihoods, showmeans=True)

# Name the two violins on the x-axis and leave the axis itself untitled
plt.xticks([1, 2], ["Original", "Attacked"])
plt.xlabel("")

# Remove the ticks of the y-axis
plt.yticks([])

# Set the margins manually so that the plot spans the full width of the figure
plt.subplots_adjust(left=0.0, right=1.0, top=0.9, bottom=0.1)
plt.savefig(figures_dir / "distributions_loglikelihoods.pdf")