In [1]:
# import basic libraries
import os
import numpy as np
import pandas as pd
import re
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
import pickle
import plotly.graph_objects as go
from collections import Counter
import plotly.io as pio
import pyperclip
from sklearn.cluster import KMeans

In [4]:
# Load the Latin BERT model ("pnadel/LatinBERT") and the tokenizer used with it.
from transformers import AutoTokenizer, AutoModel

# The tokenizer comes from "pnadel/latin_tokenizer", a different repository id than the model weights.
tokenizer = AutoTokenizer.from_pretrained("pnadel/latin_tokenizer") # alternative id tried earlier: "pnadel/LatinBERT"
if tokenizer.pad_token is None:
    # The tokenizer calls below pass padding=True, so fall back to the EOS token when no pad token is set.
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModel.from_pretrained("pnadel/LatinBERT")

# Test on artificial dataset

In [2]:
# Toy corpus: the bare lemma "liber" followed by ten sentences that each contain a form of it.
latin_sentences = [
    "liber",
    "Liber est magister vitae.",
    "Librum eius in mensa videre potes.",
    "Liber pecuniae multa fortunam facit.",
    "Librum legere amo, ut in alium mundum evadam.",
    "Amicus librum est amicus æternum.",
    "Liber naturae res omnes sustentat.",
    "Amor liber est, non vinculum.",
    "Invenire teipsum in libris est invenire thesaurum.",
    "Liber curiositatis est via ad sapientiam.",
    "Lux in libro quaerenda est, non in lectulo."
]

The following code does this:
* For each document, load a text file with sentences containing the target term (e.g. "liber"). These files contain one sentence per line.
* Go through each sentence in the list of sentences and, for each one:
    * Apply the BERT tokenizer to the sentence (see `sentence_data["sentence_tokens"]`).
    * Assign a sentence-level BERT embedding to it by averaging the last hidden state layer of the model (see `sentence_data["sentence_embeddings"]`).
    * Assign a token-level BERT embedding to each token (see `sentence_data["sentence_token_embeddings"]`).
    * Select the embedding of the target term and save it separately (see `sentence_data["target_embedding"]`).
    * Assign additional metadata to each sentence based on the source document.

In [5]:
# Tokenize the first toy sentence (the bare lemma) and display what the tokenizer returns.
sent = latin_sentences[0]
inputs = tokenizer(sent, return_tensors='pt', padding=True, truncation=True, max_length=512)
inputs

In [6]:
tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

In [7]:
def get_embeddings_data(sent, target):
    """Embed one sentence with the BERT model and pick out the target term's vector.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    sent : str
        Raw sentence. It is lower-cased before tokenization; the original
        string is what gets stored under ``"sentence"``.
    target : str
        Regular expression describing the target term (matched
        case-insensitively), e.g. ``"lib(er|ri|rum|ro)"``.

    Returns
    -------
    dict
        ``"sentence"``, ``"sentence_tokens"``, ``"token_ids"``,
        ``"sentence_embeddings"`` (torch tensor, mean of the last hidden
        state over all token positions), ``"sentence_token_embeddings"``
        (NumPy array with one row per token) and -- only when the target
        term is found -- ``"target_embedding"``.
    """
    text = sent.lower()
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    sentence_data = {}
    sentence_data["sentence"] = sent
    # Convert input_ids to actual tokens
    sentence_data["sentence_tokens"] = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    sentence_data["token_ids"] = inputs['input_ids'][0].tolist()

    with torch.no_grad():
        outputs = model(**inputs)
        sentence_data["sentence_embeddings"] = outputs.last_hidden_state.mean(dim=1)
        sentence_data["sentence_token_embeddings"] = outputs.last_hidden_state[0].numpy()

    token_embeddings = sentence_data["sentence_token_embeddings"]
    pattern = re.compile(target, re.IGNORECASE)

    # First choice: a single token that matches the pattern on its own.
    # There is no early exit, so the last matching token wins (unchanged behaviour).
    for idx, token in enumerate(sentence_data["sentence_tokens"]):
        if pattern.search(token):
            sentence_data["target_embedding"] = token_embeddings[idx]

    # Fallback: the tokenizer may have split the target into several subword
    # pieces, none of which matches alone. Locate the term in the text instead
    # and average the embeddings of the pieces that cover it.
    if "target_embedding" not in sentence_data:
        piece_indices = _target_piece_indices(text, pattern, len(token_embeddings))
        if piece_indices:
            sentence_data["target_embedding"] = token_embeddings[piece_indices].mean(axis=0)
    return sentence_data


def _target_piece_indices(text, pattern, n_tokens):
    """Return indices of the tokens whose character span overlaps the first match of ``pattern`` in ``text``."""
    match = pattern.search(text)
    # Character offsets are only available from "fast" tokenizers.
    if match is None or not getattr(tokenizer, "is_fast", False):
        return []
    offsets = tokenizer(text, truncation=True, max_length=512, return_offsets_mapping=True)["offset_mapping"]
    if len(offsets) != n_tokens:
        # The two tokenizations disagree; do not guess which rows belong to the term.
        return []
    # Zero-width spans (start == end) are skipped so they never count as part of the term.
    return [
        idx
        for idx, (start, end) in enumerate(offsets)
        if start < end and start < match.end() and end > match.start()
    ]

In [8]:
# Sanity check on the raw strings: print the match object (or None) for each toy sentence.
target = "lib(er|ri|rum|ro)"
target_pattern = re.compile(target, re.IGNORECASE)
for sent in latin_sentences:
    found = target_pattern.search(sent)
    print(found)

In [9]:
# Run the embedding function on the first toy sentence (the bare lemma) as a smoke test.
target = "lib(er|ri|rum|ro)"
sentence_data = get_embeddings_data(latin_sentences[0], target)

In [10]:
sentence_data # ["sentence_tokens"]

In [11]:
# Embed every toy sentence. A sentence that fails is reported and skipped,
# so one bad input does not stop the loop but also does not vanish silently.
data = []
target = "lib(er|ri|rum|ro)"
for sent in latin_sentences:
    try:
        data.append(get_embeddings_data(sent, target))
    except Exception as exc:  # Exception (not a bare except) lets KeyboardInterrupt through
        print("Skipping {!r}: {}: {}".format(sent, type(exc).__name__, exc))

In [12]:
len(data)

In [22]:
# Look at the tokens the tokenizer produced for the third entry of `data`.
tokens = data[2]["sentence_tokens"]
tokens

In [23]:
# Stitch the tokens back into one string; every 'Ġ' is swapped for a space.
"".join(piece.replace('Ġ', ' ') for piece in tokens)

In [12]:
# Keep only the entries for which a target-term embedding was stored.
data = [entry for entry in data if "target_embedding" in entry]
len(data)

In [13]:
# Drop entries without a target embedding, then compare the remaining target vectors pairwise.
data = [entry for entry in data if "target_embedding" in entry]
sentences = [entry["sentence"] for entry in data]
target_vectors = [entry["target_embedding"] for entry in data]

# One row per sentence; the table is labelled with the sentences on both axes.
target_embeddings = np.vstack(target_vectors)
pairwise_similarity = cosine_similarity(target_embeddings)
cosine_sim_matrix = pd.DataFrame(pairwise_similarity, index=sentences, columns=sentences)

In [14]:
cosine_sim_matrix

In [None]:
# Heatmap of the pairwise cosine similarities between the target-term embeddings.
# matplotlib is not part of the notebook's import cell, so `plt` was undefined
# here; it is imported locally to make the cell runnable on a fresh kernel.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 10))  # You may want to adjust the size depending on your similarity matrix
cax = ax.matshow(cosine_sim_matrix, cmap='coolwarm')  # Change colormap here
plt.title('Cosine Similarity Matrix')
fig.colorbar(cax)
plt.xticks(np.arange(len(sentences)), sentences,
           rotation=90)  # Rotating x labels may help prevent them from overlapping
plt.yticks(np.arange(len(sentences)), sentences)
plt.show()

In [33]:
data[0]["target_embedding"]

 # TODO: the cells below were carried over from the Czech-language analysis and still have to be modified to work with the Latin data

In [12]:
# Cluster the target-term embeddings with k-means.
# KMeans refuses to fit when asked for more clusters than there are samples,
# which can happen on the small toy corpus, so k is capped at the number of vectors.
k = min(5, len(target_vectors))
kmeans = KMeans(n_clusters=k, random_state=0).fit(target_vectors)
labels = kmeans.labels_

In [13]:
len(labels)

In [14]:
print(labels[:20])

In [15]:
# Hex colour for each cluster label 0-4, listed in label order.
cluster_palette = [
    "#1f77b4",  # muted blue
    "#ff7f0e",  # safety orange
    "#2ca02c",  # cooked asparagus green
    "#d62728",  # brick red
    "#9467bd",  # muted purple
]
cluster_colors = dict(enumerate(cluster_palette))

In [16]:
# Translate each cluster label into its plot colour.
colors = [cluster_colors[cluster_id] for cluster_id in labels]

In [17]:
# Rebuild the pairwise cosine-similarity table from the current `target_vectors`,
# labelled with the sentences on both axes.
target_embeddings = np.vstack(target_vectors)
cosine_sim_matrix = pd.DataFrame(cosine_similarity(target_embeddings), index=sentences, columns=sentences)

In [18]:
cosine_sim_matrix.iloc[8].sort_values(ascending=False)

In [22]:
# Project the target-term embeddings to three dimensions with t-SNE.
# random_state pins the otherwise random layout so re-runs give the same picture;
# perplexity has to stay below the number of samples, so it is capped for tiny inputs.
target_matrix = np.vstack(target_vectors)
tsne = TSNE(n_components=3, perplexity=min(5, len(target_matrix) - 1), random_state=0)
embeddings_tsne = tsne.fit_transform(target_matrix)

In [23]:
# Split the first three t-SNE columns into separate coordinate arrays for plotting.
xs, ys, zs = (embeddings_tsne[:, axis] for axis in range(3))

In [24]:
# 3-D scatter of the t-SNE coordinates, coloured by k-means cluster.
# Two leftovers from the Czech analysis are fixed here: the title named the
# Czech word instead of the Latin target term, and `hover_text` is not defined
# at this point of the notebook, so the sentences themselves are used as hover labels.
title = "Sentences with 'liber' (N={})".format(len(sentences))
fig = go.Figure(data=go.Scatter3d(
    x=xs,
    y=ys,
    z=zs,
    mode='markers',
    marker=dict(
        size=5,
        color=colors,
        opacity=0.3
    ),
    text=sentences,  # one hover label per point
    hoverinfo='text',  # ensure only the text field is displayed on hover
))

fig.update_layout(
    title=title,
    scene=dict(
        xaxis=dict(title='', showgrid=False, showline=False, showticklabels=False, zeroline=False, linecolor='rgba(0,0,0,0)'),
        yaxis=dict(title='', showgrid=False, showline=False, showticklabels=False, zeroline=False, linecolor='rgba(0,0,0,0)'),
        zaxis=dict(title='', showgrid=False, showline=False, showticklabels=False, zeroline=False, linecolor='rgba(0,0,0,0)'),
        bgcolor='rgba(255,255,255,0)'
    ),
    paper_bgcolor='rgba(255,255,255,255)',  # set the color of the area around the axes
    plot_bgcolor='rgba(255,255,255,255)',  # set the color of the entire chart
    autosize=False,
    width=1000,
    height=800,
    margin=dict(l=0, r=0, b=0, t=0)
)


In [25]:
# Render the figure to an HTML string (plotly.js referenced from the CDN) and copy it to the clipboard.
html_code = pio.to_html(fig, include_plotlyjs='cdn')
pyperclip.copy(html_code)

In [26]:
# 3-D scatter of the t-SNE coordinates, coloured by periodical category
# ("exil" -> green, every other category -> red).
# NOTE(review): `periodical_categories` and `hover_text` are first assigned in the
# cell that loads the pickled embeddings further down, so this cell raises a
# NameError when the notebook is run top to bottom on a fresh kernel.
colors = ["green" if x=="exil" else "red" for x in periodical_categories]
title = "Sentences with 'bezpečí' (N={})".format(len(sentences))
fig = go.Figure(data=go.Scatter3d(
    x=xs,
    y=ys,
    z=zs,
    mode='markers',
    marker=dict(
        size=5,
        color=colors,
        opacity=0.3
    ),
    text=hover_text,  # use mapped hover text
    hoverinfo='text',  # ensure only the text field is displayed on hover
))

# Hide grid, axis lines, tick labels and zero lines on all three axes.
fig.update_layout(
    title=title,
    scene=dict(
        xaxis=dict(title='', showgrid=False, showline=False, showticklabels=False, zeroline=False, linecolor='rgba(0,0,0,0)'),
        yaxis=dict(title='', showgrid=False, showline=False, showticklabels=False, zeroline=False, linecolor='rgba(0,0,0,0)'),
        zaxis=dict(title='', showgrid=False, showline=False, showticklabels=False, zeroline=False, linecolor='rgba(0,0,0,0)'),
        bgcolor='rgba(255,255,255,0)'
    ),
    paper_bgcolor='rgba(255,255,255,255)',  # set the color of the area around the axes
    plot_bgcolor='rgba(255,255,255,255)',  # set the color of the entire chart
    autosize=False,
    width=1000,
    height=800,
    margin=dict(l=0, r=0, b=0, t=0)
)

In [27]:
# Render the figure to an HTML string (plotly.js referenced from the CDN) and copy it to the clipboard.
html_code = pio.to_html(fig, include_plotlyjs='cdn')
pyperclip.copy(html_code)

### Analysis of "bezpečnost" embeddings

In [90]:
%%time
# Source and destination folders for the "bezpečnost" sentence files.
input_path = "../data/rawsentences_bezpecnost/"
output_dir = "../data/embeddings_bezpecnost/"
# exist_ok=True tolerates an already existing folder (the case the former bare
# `except: pass` was there for) while real failures, e.g. missing permissions,
# are no longer hidden. Missing parent folders are created as well.
os.makedirs(output_dir, exist_ok=True)
target = r"bezpečnost[ií]?"
# NOTE(review): `files_df` is not defined anywhere in the visible notebook, and
# `get_embeddings_data` as defined above takes (sent, target) -- not
# (row, target, output_dir). This call needs the file-based variant of the function.
files_df.apply(lambda row: get_embeddings_data(row, target, output_dir), axis=1)

In [29]:
# Load every pickled list of sentence records from the embeddings folder.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
embeddings_dir = "../data/embeddings_bezpecnost/"
data = []
for filename in os.listdir(embeddings_dir):
    # The context manager closes each file handle; previously they were left open.
    with open(embeddings_dir + filename, "rb") as handle:
        data.extend(pickle.load(handle))

# filter by year
data = [d for d in data if 1948 <= d["year"] <= 1989]

# Parallel lists derived from the records: vectors, sentences, categories, colours, hover labels.
target_vectors = [d["target_embedding"] for d in data]
sentences = [d["sentence"] for d in data]
periodical_categories = [d["periodical_category"] for d in data]
colors = ["green" if x=="exil" else "red" for x in periodical_categories]
hover_text = [d["sentence"].replace("\n", "") + " ({0}, {1}, {2})".format(d["periodical_title"], str(d["year"]).replace(".0", ""), d["periodical_category"])
              for d in data]

In [30]:
Counter(periodical_categories)

In [31]:
# Project the target-term embeddings to three dimensions with t-SNE.
# random_state pins the otherwise random layout so re-runs give the same picture.
tsne = TSNE(n_components=3, perplexity=5, random_state=0)
embeddings_tsne = tsne.fit_transform(np.vstack(target_vectors))

In [94]:
# Split the first three t-SNE columns into separate coordinate arrays for plotting.
xs, ys, zs = (embeddings_tsne[:, axis] for axis in range(3))

In [95]:
# 3-D scatter of the t-SNE coordinates; hovering a marker shows its entry from `hover_text`.
title = "Sentences with 'bezpečnost' (N={})".format(len(sentences))

marker_style = dict(size=5, color=colors, opacity=0.3)
scatter = go.Scatter3d(
    x=xs,
    y=ys,
    z=zs,
    mode='markers',
    marker=marker_style,
    text=hover_text,   # use mapped hover text
    hoverinfo='text',  # ensure only the text field is displayed on hover
)
fig = go.Figure(data=scatter)

# All three axes share the same "invisible" styling: no grid, lines, tick labels or zero line.
hidden_axis = dict(title='', showgrid=False, showline=False, showticklabels=False,
                   zeroline=False, linecolor='rgba(0,0,0,0)')
scene_style = dict(
    xaxis=dict(hidden_axis),
    yaxis=dict(hidden_axis),
    zaxis=dict(hidden_axis),
    bgcolor='rgba(255,255,255,0)',
)

fig.update_layout(
    title=title,
    scene=scene_style,
    paper_bgcolor='rgba(255,255,255,255)',  # set the color of the area around the axes
    plot_bgcolor='rgba(255,255,255,255)',  # set the color of the entire chart
    autosize=False,
    width=1000,
    height=800,
    margin=dict(l=0, r=0, b=0, t=0),
)


In [96]:
# Render the figure to an HTML string (plotly.js referenced from the CDN) and copy it to the clipboard.
html_code = pio.to_html(fig, include_plotlyjs='cdn')
pyperclip.copy(html_code)

In [32]:
# Narrow the records down to the "samizdat" periodicals and rebuild the derived lists.
# NOTE: this rebinds `data`, so the unfiltered records are only available again
# after re-running the cell that loads the pickled embeddings.
data = [d for d in data if d["periodical_category"] == "samizdat"]

target_vectors = [d["target_embedding"] for d in data]
sentences = [d["sentence"] for d in data]
periodical_categories = [d["periodical_category"] for d in data]
# Every remaining category is "samizdat", so this list is uniformly "red".
colors = ["green" if x=="exil" else "red" for x in periodical_categories]
hover_text = [d["sentence"].replace("\n", "") + " ({0}, {1}, {2})".format(d["periodical_title"], str(d["year"]).replace(".0", ""), d["periodical_category"])
              for d in data]

In [42]:
# Cluster the target-term embeddings into k groups and colour every point by its cluster.
k = 5
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(target_vectors)
labels = kmeans.labels_

# Hex colour for each cluster label 0-4, listed in label order.
cluster_colors = dict(enumerate([
    "#1f77b4",  # muted blue
    "#ff7f0e",  # safety orange
    "#2ca02c",  # cooked asparagus green
    "#d62728",  # brick red
    "#9467bd",  # muted purple
]))
colors = [cluster_colors[cluster_id] for cluster_id in labels]

In [43]:
# Project the target-term embeddings to three dimensions with t-SNE.
# random_state pins the otherwise random layout so re-runs give the same picture.
tsne = TSNE(n_components=3, perplexity=3, random_state=0)
embeddings_tsne = tsne.fit_transform(np.vstack(target_vectors))

In [44]:
# Split the first three t-SNE columns into separate coordinate arrays for plotting.
xs, ys, zs = (embeddings_tsne[:, axis] for axis in range(3))

In [51]:
# 3-D scatter of the t-SNE coordinates using the current `colors`; hover shows the entry from `hover_text`.
title = "Sentences with 'bezpečnost' (N={})".format(len(sentences))

points = go.Scatter3d(
    x=xs,
    y=ys,
    z=zs,
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.3),
    text=hover_text,   # use mapped hover text
    hoverinfo='text',  # ensure only the text field is displayed on hover
)
fig = go.Figure(data=points)

# Build the identical "invisible axis" settings for x, y and z in one go.
axis_settings = dict(title='', showgrid=False, showline=False, showticklabels=False,
                     zeroline=False, linecolor='rgba(0,0,0,0)')
scene_settings = {axis_name: dict(axis_settings) for axis_name in ("xaxis", "yaxis", "zaxis")}
scene_settings["bgcolor"] = 'rgba(255,255,255,0)'

fig.update_layout(
    title=title,
    scene=scene_settings,
    paper_bgcolor='rgba(255,255,255,255)',  # set the color of the area around the axes
    plot_bgcolor='rgba(255,255,255,255)',  # set the color of the entire chart
    autosize=False,
    width=1000,
    height=800,
    margin=dict(l=0, r=0, b=0, t=0),
)

In [53]:
# Render the figure to an HTML string (plotly.js referenced from the CDN) and copy it to the clipboard.
html_code = pio.to_html(fig, include_plotlyjs='cdn')
pyperclip.copy(html_code)