In [113]:
import os
import numpy as np
import pandas as pd
import nltk
import re
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE

import plotly
import plotly.express as px
import plotly.graph_objects as go


In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("fav-kky/FERNET-C5")
model = AutoModelForMaskedLM.from_pretrained("fav-kky/FERNET-C5")

Some weights of the model checkpoint at fav-kky/FERNET-C5 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("fav-kky/FERNET-C5")
model = AutoModel.from_pretrained("fav-kky/FERNET-C5")

In [3]:
path = "../data/rawtexts/"
filenames = os.listdir(path)
len(filenames)

11615

In [8]:
target = "bezpečí"
output_dir = "../data/rawsentences_bezpeci/"
try:
    os.mkdir(output_dir)
except:
    pass
def get_concordances(filename, target, output_dir):
    try:
        with open(path + filename) as f:
            text = f.read()
        text = re.sub("\[pageend\d+\]", " ", text)
        #tokens = nltk.word_tokenize(text)
        matches = re.findall(r'[^\.]*?\b{}\b[^\.]*\.'.format(target), text, re.IGNORECASE)
        if len(matches) > 0:
            matches = [match + '\n' for match in matches]
            pathfn = output_dir + filename
            with open(pathfn, 'w') as f:
                f.writelines(matches)
            return len(matches)
        else:
            return 0
    except:
        return 0

In [None]:
for filename in filenames:
    get_concordances(filename=filename, target = "bezpečí", output_dir = "../data/rawsentences_bezpeci/")

In [72]:
input_path = "../data/rawsentences_bezpeci/"
output_path = "../data/processedtexts_bezpeci/"
try:
    os.mkdir(output_path)
except:
    pass
all_sentences = []
for filename in os.listdir(input_path):
    with open(input_path + filename) as f:
        sentences = f.readlines()
    all_sentences.extend(sentences)
    #processed_sentences = []
    #for sentence in sentences:
    #    sentence = sentence.strip()
    #    processed_sentence = pipe(sentence)[0]['sequence']
    #    processed_sentences.append(processed_sentence)
    #with open(output_path + filename, 'w') as f:
    #    f.writelines(processed_sentences)


In [73]:
len(all_sentences)

2757

In [None]:
%%time
target = r"\bbezpečím?"
data = []

sentences = all_sentences

for sent in sentences:
    inputs = tokenizer(sent, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Convert input_ids to actual tokens
    sentence_data = {"sentence" : sent}
    sentence_data["sentence_tokens"] = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    sentence_data["token_ids"] = inputs['input_ids'][0].tolist()
    with torch.no_grad():
        outputs = model(**inputs)
        sentence_data["sentence_embeddings"] = outputs.last_hidden_state.mean(dim=1)
        sentence_data["sentence_token_embeddings"] = outputs.last_hidden_state[0].numpy()
    # embedding for a target term
    for idx, token in enumerate(sentence_data["sentence_tokens"]):
        if re.search(target, token, re.IGNORECASE):
            sentence_data["target_embedding"] = sentence_data["sentence_token_embeddings"][idx]
            data.append(sentence_data)

In [97]:
target_vectors = [d["target_embedding"] for d in data]
sentences = [d["sentence"] for d in data]

In [98]:
target_embeddings = np.vstack(target_vectors)
cosine_sim_matrix = pd.DataFrame(cosine_similarity(target_embeddings), index=sentences, columns=sentences)

In [102]:
cosine_sim_matrix.iloc[8].sort_values(ascending=False)

 Buďme však realističtí: lidé usilují o mír pro své domovy a o bezpečí; baží po slávě a po potlesku; snaží se získat moc a vliv; hledají přátelství a lásku.\n                                                                                                                                                                                                                                                                1.000000
 Protože nikdo nemůže vyprodukovat všechno, co požaduje, podporuje dosažení všech těchto cílů směna, a právě tak působí i shromažďování majetku, skýtající bezpečí a přinášející moc.\n                                                                                                                                                                                                                                       0.673763
 Je příznakem dnešní doby zmatku, že se ideologie a oceňování vlastních zájmů mění rychleji, nez kdy jindy a že zahraniční politika většiny států postrádá

In [106]:
tsne = TSNE(n_components=3)
embeddings_tsne = tsne.fit_transform(np.vstack(target_vectors))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [111]:
xs = embeddings_tsne[:, 0]
ys = embeddings_tsne[:, 1]
zs = embeddings_tsne[:, 2]

hover_text = sentences

In [120]:
title = "Sentences with 'bezpečí' (N={})".format(len(sentences))
fig = go.Figure(data=go.Scatter3d(
    x=xs,
    y=ys,
    z=zs,
    mode='markers',
    marker=dict(
        size=5,
        color='purple',
        opacity=0.3
    ),
    text=hover_text,  # use mapped hover text
    hoverinfo='text',  # ensure only the text field is displayed on hover
))

fig.update_layout(
    title=title,
    scene=dict(
        xaxis=dict(title='X Axis', showgrid=False, showline=False, zeroline=False),
        yaxis=dict(title='Y Axis', showgrid=False, showline=False, zeroline=False),
        zaxis=dict(title='Z Axis', showgrid=False, showline=False, zeroline=False)
    )
)

In [None]:
cosine_sim_matrix

In [None]:
import plotly
import plotly.express as px