# Experiments on the divergence of RoBERTa embeddings

The purpose of this notebook is to answer this question:
*Does the cross-entropy between the K-means clustering of the RoBERTa output embeddings and the true labels of the dataset increase when we apply homoglyph-based adversarial attacks to the input text?*

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the sequence classification model come from the same checkpoint
checkpoint = "SJTU-CL/RoBERTa-large-ArguGPT-sent"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Move the model to the GPU only when PyTorch reports that CUDA is available
cuda_available = torch.cuda.is_available()
if cuda_available:
    model.to('cuda')

In [None]:
# Show the model architecture followed by its configuration, one per line
print(model, model.config, sep="\n")

In [None]:
# Remove the classification head: keep only the `roberta` submodule of the
# sequence classification model. The embedding code below calls this module
# and reads `last_hidden_state` from its output.
model_without_head = model.roberta

In [None]:
# Display the headless model as this cell's output
model_without_head

In [None]:
# Load the CHEAT dataset and keep only its 'train' split.
from datasets import load_dataset

cheat_splits = load_dataset("silverspeak/cheat")
dataset = cheat_splits['train']

# Ground-truth labels, taken from the 'generated' column
# (the plotting code below draws True as 'AI' and False as 'Human')
real_labels = dataset['generated']

In [None]:
# Get the output embeddings of the model for the dataset
def get_mean_embedding(text, first_token = False):
    """Embed `text` with the headless model and pool the token embeddings.

    Args:
        text: Input handed to the tokenizer. The notebook passes a single
            string; a list of strings is presumably accepted as well, since
            the tokenizer is called with ``padding=True`` (TODO confirm).
        first_token: If True, return the hidden state of the first token of
            each sequence instead of the mean over tokens.

    Returns:
        A CPU tensor with one row per input sequence and one column per
        hidden dimension.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    with torch.no_grad():
        outputs = model_without_head(**inputs)
    hidden_states = outputs.last_hidden_state.cpu()

    if first_token:
        # Take the first token of the output embeddings
        return hidden_states[:, 0, :]

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to a plain mean over all positions
        return hidden_states.mean(dim=1)

    # Mean pooling restricted to real tokens. Padding positions (mask == 0)
    # are excluded so that batched inputs of different lengths are not biased
    # by their padding; for a single unpadded text this equals a plain mean.
    mask = attention_mask.cpu().unsqueeze(-1).to(hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (hidden_states * mask).sum(dim=1) / token_counts

# Embed every text of the dataset, one forward pass per text
embeddings = []
for text in dataset['text']:
    embeddings.append(get_mean_embedding(text))

# Each call returns one row per input text, so concatenating along dim 0
# yields a (number of texts, hidden size) matrix directly. Unlike
# stack + squeeze, this stays two-dimensional even when the dataset
# contains a single text.
embeddings = torch.cat(embeddings, dim=0)
print(f'Shape of the embeddings: {embeddings.shape}')

In [None]:
# Now, apply the homoglyph attack to every text (10% replacement rate) and embed the attacked text
from silverspeak.homoglyphs.random_attack import random_attack

# NOTE(review): random_attack is presumably non-deterministic; no seed is set here,
# so attacked texts may differ between runs - confirm against the silverspeak API.
attacked_embeddings = []
for text in dataset['text']:
    attacked_text = random_attack(text, percentage=0.1)
    attacked_embeddings.append(get_mean_embedding(attacked_text))

# Each call returns one row per input text, so concatenating along dim 0
# yields a (number of texts, hidden size) matrix directly. Unlike
# stack + squeeze, this stays two-dimensional even when the dataset
# contains a single text.
attacked_embeddings = torch.cat(attacked_embeddings, dim=0)
print(f'Shape of the attacked embeddings: {attacked_embeddings.shape}')

## Embedding Projections

Let's see where every text is projected in the embedding space. We will use dimensionality reduction techniques (UMAP, PCA, t-SNE) to visualize the embeddings of the RoBERTa model before and after the adversarial attacks.

In [None]:
from pathlib import Path

# Figures are saved in a 'figures/' directory located two levels above __file__
# (i.e. next to the directory that contains this file).
# Interactive environments may not define __file__; in that case fall back to a
# 'visualization.py' path resolved against the current working directory.
if "__file__" not in globals():
    __file__ = Path("visualization.py").resolve()

figures_dir = Path(__file__).parent.parent / "figures"
# Create the output directory on first use; an existing directory is left untouched
figures_dir.mkdir(exist_ok=True)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_embeddings(embeddings, real_labels, file_name):
    """Scatter-plot 2-D points colored by their real label, save and show the figure.

    Args:
        embeddings: Array of points; column 0 is used as x and column 1 as y.
        real_labels: One label per row of `embeddings`. Rows whose label equals
            True are drawn in red as 'AI', rows whose label equals False in
            blue as 'Human'.
        file_name: Output file, joined onto the module-level `figures_dir`.
    """
    label_array = np.array(real_labels)
    ai_points = embeddings[label_array == True]
    human_points = embeddings[label_array == False]

    # Marker settings shared by both classes
    point_style = dict(alpha=0.5, s=8)
    plt.scatter(ai_points[:, 0], ai_points[:, 1], c='red', label='AI', **point_style)
    plt.scatter(human_points[:, 0], human_points[:, 1], c='blue', label='Human', **point_style)

    # Legend for the real labels
    plt.legend()
    # The file extension passed by the caller decides the output format
    plt.savefig(figures_dir / file_name)
    plt.show()

In [None]:
# Do an UMAP visualization of the embeddings
import umap
import matplotlib.pyplot as plt
import numpy as np

# NOTE: this rebinds the name `umap` from the module to the reducer instance.
# The combined-projection cell further down calls `umap.fit_transform`, so the
# name is intentionally kept as is.
umap = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42, n_components=2, local_connectivity=5)
reduced_umap_embeddings = umap.fit_transform(embeddings.numpy())
# plot_embeddings already saves under figures_dir, so only the bare file name is passed
plot_embeddings(reduced_umap_embeddings, real_labels, file_name='divergence_embeddings_umap_original.pdf')

# Now, do the same for the attacked embeddings. fit_transform is called again,
# i.e. the attacked embeddings get their own projection instead of being mapped
# into the one fitted on the original embeddings.
reduced_attacked_embeddings = umap.fit_transform(attacked_embeddings.numpy())
plot_embeddings(reduced_attacked_embeddings, real_labels, file_name='divergence_embeddings_umap_attacked.pdf')

In [None]:
# Now, do the same for PCA
from sklearn.decomposition import PCA

# random_state pins the result when PCA picks a randomized SVD solver, and
# matches the seed used for the UMAP and t-SNE projections in this notebook.
pca = PCA(n_components=2, random_state=42)
reduced_pca_embeddings = pca.fit_transform(embeddings.numpy())
# plot_embeddings already saves under figures_dir, so only the bare file name is passed
plot_embeddings(reduced_pca_embeddings, real_labels, file_name='divergence_embeddings_pca_original.pdf')

# Now, do the same for the attacked embeddings. fit_transform is called again,
# i.e. the principal components are re-estimated on the attacked embeddings.
reduced_attacked_embeddings = pca.fit_transform(attacked_embeddings.numpy())
plot_embeddings(reduced_attacked_embeddings, real_labels, file_name='divergence_embeddings_pca_attacked.pdf')

In [None]:
# Now, do the same for t-SNE
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, perplexity=30, random_state=42)
reduced_tsne_embeddings = tsne.fit_transform(embeddings.numpy())
# plot_embeddings already saves under figures_dir, so only the bare file name is passed
plot_embeddings(reduced_tsne_embeddings, real_labels, file_name='divergence_embeddings_tsne_original.pdf')

# Now, do the same for the attacked embeddings. fit_transform is called again,
# i.e. the attacked embeddings get their own t-SNE layout.
reduced_attacked_embeddings = tsne.fit_transform(attacked_embeddings.numpy())
plot_embeddings(reduced_attacked_embeddings, real_labels, file_name='divergence_embeddings_tsne_attacked.pdf')

In [None]:
# Other experiment: project the original and the attacked embeddings together
# with a single UMAP fit and look at where each group lands.
# `umap` here is the reducer instance created in the UMAP cell above, not the module.
combined_embeddings = torch.cat([embeddings, attacked_embeddings])
reduced_combined_embeddings = umap.fit_transform(combined_embeddings.numpy())

# The first len(embeddings) rows are the original texts, the remaining rows the attacked ones
n_original = len(embeddings)
reduced_original_embeddings = reduced_combined_embeddings[:n_original]
reduced_attacked_embeddings = reduced_combined_embeddings[n_original:]

# Boolean masks over the real labels, shared by both halves
label_array = np.array(real_labels)
is_human = label_array == False
is_machine = label_array == True
human_embeddings_original = reduced_original_embeddings[is_human]
machine_embeddings_original = reduced_original_embeddings[is_machine]
human_embeddings_attacked = reduced_attacked_embeddings[is_human]
machine_embeddings_attacked = reduced_attacked_embeddings[is_machine]

# One scatter layer per group, drawn in this order; attacked points use an 'x' marker
scatter_layers = (
    (machine_embeddings_original, 'red', 'AI (original)', {}),
    (human_embeddings_original, 'blue', 'Human (original)', {}),
    (machine_embeddings_attacked, 'green', 'AI (attacked)', {'marker': 'x'}),
    (human_embeddings_attacked, 'purple', 'Human (attacked)', {'marker': 'x'}),
)
for points, color, legend_label, extra_style in scatter_layers:
    plt.scatter(points[:, 0], points[:, 1], c=color, label=legend_label, alpha=0.5, s=8, **extra_style)

# Legend for the four groups
plt.legend()
# Save as PDF
plt.savefig(figures_dir / 'divergence_embeddings_umap_combined.pdf')
plt.show()