In [2]:
import shutil
import re
import os
# Set the TOKENIZERS_PARALLELISM environment variable to "true" for this
# process; this is done before `transformers` is imported below.
os.environ['TOKENIZERS_PARALLELISM'] = "true"
from transformers import AutoTokenizer, AutoModelForMaskedLM
from cltk.sentence.lat import LatinPunktSentenceTokenizer
import torch
from torch.nn.utils.rnn import pad_sequence

In [3]:
# Names handed to `from_pretrained` for the tokenizer and the masked-LM model.
# ("pnadel/LatinBERT" was previously tried as the tokenizer source as well.)
TOKENIZER_NAME = "pnadel/latin_tokenizer"
MODEL_NAME = "pnadel/LatinBERT"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

In [4]:
#!curl https://sciencedata.dk/public/87394f685b79e7f1ebd4a7ead2b4941c/noscemus_raw.zip -o ../data/large_data/noscemus_raw.zip
# shutil.unpack_archive("../data/large_data/noscemus_raw.zip", "../data/large_data/noscemus_raw")

In [5]:
def text_cleaner(rawtext):
    """Normalise raw text into one line of single-space-separated tokens.

    Steps, in order:
      1. Remove every "¬" that is directly followed by a newline (together
         with that newline), joining the two line halves.
      2. Turn all remaining newlines into spaces.
      3. Replace "ß" with "ss" and "ij" with "ii".
      4. Split on whitespace, lower-case every character of each token except
         its first one, and join the tokens with single spaces.

    Args:
        rawtext: The text to clean, as a ``str``.

    Returns:
        The cleaned text as a ``str``; the empty string when ``rawtext``
        contains no tokens.
    """
    cleantext = (
        rawtext.replace("¬\n", "")
        .replace("\n", " ")
        .replace("ß", "ss")
        .replace("ij", "ii")
    )
    # NOTE(review): "ij" is replaced *before* lower-casing, so an upper-case
    # "IJ" inside a token ends up as "ij", not "ii" — confirm this is intended.
    cleantext = " ".join(t[0] + t[1:].lower() for t in cleantext.split())
    # Raw string literal: "\s" inside a normal string is an invalid escape
    # sequence and triggers a SyntaxWarning on current Python versions.
    cleantext = re.sub(r"\s\s+", " ", cleantext)
    return cleantext

In [6]:
# List the raw corpus files. os.listdir() returns entries in arbitrary order,
# so the listing is sorted to make positional picks (e.g. filenames[3])
# reproducible across runs and machines.
filenames = sorted(os.listdir("../data/large_data/noscemus_raw"))

In [10]:
# Pick the entry at index 3 of the directory listing as the sample file to
# experiment on, and display its name.
fn = filenames[3]
fn

In [11]:
# Read the selected raw file into memory. The encoding is passed explicitly so
# decoding does not depend on the platform's locale default.
# NOTE(review): assumes the corpus files are UTF-8 — confirm.
with open(os.path.join("../data/large_data/noscemus_raw", fn), "r", encoding="utf-8") as f:
    text = f.read()

In [12]:
# Normalise the file contents with text_cleaner. `text` is rebound to the
# cleaned version, so re-running this cell cleans already-cleaned text.
text = text_cleaner(text)
# Preview the first 200 characters of the cleaned text.
text[:200]

In [13]:
# Split the cleaned text into a list of sentences with CLTK's Latin Punkt
# sentence tokenizer.
sentence_splitter = LatinPunktSentenceTokenizer()
corpus = sentence_splitter.tokenize(text)
# Show three of the sentences (indices 1 to 3).
corpus[1:4]

In [14]:
# Encode each sentence on its own; with return_tensors="pt" every item is a
# PyTorch tensor holding the token ids of one sentence.
# (Disabled experiment kept for reference:)
#tokenizer.pad_token = tokenizer.eos_token
tokenized_corpus = [
    tokenizer.encode(sent, return_tensors="pt")
    for sent in corpus
]

In [15]:
# Inspect three of the encoded sentences (indices 1 to 3).
tokenized_corpus[1:4]

In [16]:
# Right-pad every encoded sentence to the length of the longest one and stack
# them into a single (num_sentences, max_len) integer tensor.

# Pad with the tokenizer's own padding id when it defines one; otherwise fall
# back to 0.
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

# Length of the longest encoded sentence (each item has shape (1, seq_len)).
max_len = max(len(seq[0]) for seq in tokenized_corpus)

# pad_sequence keeps the integer dtype of its inputs, so there is no need to
# concatenate float zeros and cast the result back to long afterwards.
padded_corpus = pad_sequence(
    [seq[0] for seq in tokenized_corpus],
    batch_first=True,
    padding_value=pad_token_id,
)

In [17]:
# Check the shape of the padded batch.
padded_corpus.shape

In [87]:
# Pass the padded corpus through the model.

# The padded batch must still correspond to `tokenized_corpus`, because the
# true sentence lengths are taken from it below.
assert len(tokenized_corpus) == padded_corpus.shape[0], (
    "tokenized_corpus and padded_corpus are out of sync; re-run the padding cell"
)

# Attention mask: 1 for real tokens, 0 for the right-hand padding, so the
# model does not attend to padding positions.
sentence_lengths = torch.tensor([seq.shape[1] for seq in tokenized_corpus])
attention_mask = (
    torch.arange(padded_corpus.shape[1]).unsqueeze(0) < sentence_lengths.unsqueeze(1)
).long()

# NOTE(review): the whole corpus is sent as one batch; for long texts this may
# need to be chunked to fit in memory.
with torch.no_grad():
    outputs = model(
        input_ids=padded_corpus,
        attention_mask=attention_mask,
        # Without this flag `outputs.hidden_states` is None and cannot be indexed.
        output_hidden_states=True,
    )

In [89]:
# NOTE: `outputs.hidden_states` is only populated when the forward pass asked
# for hidden states (output_hidden_states=True); otherwise it is None and this
# indexing fails — verify the model call that produced `outputs`.
hidden_states = outputs.hidden_states[-1]  # Access the last layer's hidden states


In [35]:
# Resolve the id of the mask token, registering "[MASK]" first if the
# tokenizer does not define a mask token.
if tokenizer.mask_token_id is None:
    # add_tokens() returns the *number* of tokens added, not an id, so the
    # token is registered as the tokenizer's mask token and its id is read
    # back from the tokenizer afterwards.
    # NOTE(review): if a token is added here, the model's embedding matrix
    # must cover the new id (model.resize_token_embeddings) before masked ids
    # are fed to the model.
    tokenizer.add_special_tokens({"mask_token": "[MASK]"})
mask_token_id = tokenizer.mask_token_id

# Encode the whole cleaned text as a single sequence of token ids.
tokens = tokenizer.encode(text, return_tensors="pt")

In [38]:
# Randomly choose positions to mask and replace them with the mask token id.
MASK_PROB = 0.15  # probability of masking each eligible position
MASK_SEED = 0     # fixed seed so the same positions are chosen on every run

# Dedicated generator: reproducible sampling without touching the global RNG.
mask_rng = torch.Generator().manual_seed(MASK_SEED)
masked_indices = torch.bernoulli(
    torch.full(tokens.shape, MASK_PROB), generator=mask_rng
).bool()

# Never mask the tokenizer's special tokens.
special_positions = torch.tensor(
    tokenizer.get_special_tokens_mask(
        tokens[0].tolist(), already_has_special_tokens=True
    ),
    dtype=torch.bool,
).unsqueeze(0)
masked_indices &= ~special_positions

# Copy of `tokens` with the selected positions overwritten by the mask id.
masked_tokens = tokens.masked_fill(masked_indices, mask_token_id)

In [39]:
# Four short sample sentences, each starting with a form of "audent-".
# Note: this rebinds `corpus` and `tokenized_corpus` from the earlier cells.
corpus = [
    "audentes forsque deusque iuvat",
    "audentis fortuna iuvat, piger ipse sibi opstat",
    "audentes in tela ruunt",
    "audentes facit amissae spes lapsa salutis, succurruntque",
]
# Encode every sample sentence into its own tensor of token ids.
tokenized_corpus = [
    tokenizer.encode(sample, return_tensors="pt")
    for sample in corpus
]

In [40]:
# Display the encoded sample sentences.
tokenized_corpus

In [None]:
# Batch loop (work in progress): read and clean one text per raw corpus file.
# Files that cannot be read are recorded and summarised instead of being
# silently skipped, so path or encoding problems are visible.
failed_files = []
for fn in sorted(os.listdir("../data/large_data/noscemus_raw")):
    # NOTE(review): names are listed from noscemus_raw (where they are opened
    # as-is elsewhere in this notebook) but opened here under sents_lemmata
    # with an extra ".txt" suffix — confirm this is the intended path.
    path = "../data/large_data/sents_lemmata/{}.txt".format(fn)
    try:
        # NOTE(review): assumes UTF-8 files; undecodable ones are reported below.
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
    except (OSError, UnicodeDecodeError) as err:
        failed_files.append((fn, err))
        continue
    text = text_cleaner(text)
    # TODO:
    # text_sentence_vectors = []
    # for each sentence in the text
        # apply the bert vectors on it
        # add the sentence to a list of vectorized sentences
        # text_sentence_vectors.append()
    # save the file vectors into its own folder or to sciencedata

if failed_files:
    print("Could not read {} file(s); first few:".format(len(failed_files)))
    for failed_fn, failed_err in failed_files[:5]:
        print("  {}: {}".format(failed_fn, failed_err))

In [None]:

# Shell escape: run scripts/gen_berts.py with the given --bertPath and
# --tokenizerPath arguments, redirecting its standard output to berts.output.txt.
# NOTE(review): these paths are relative to the current directory (scripts/,
# models/), whereas the data paths in other cells start with "../" — confirm
# the working directory this is meant to be run from.
!python3 scripts/gen_berts.py --bertPath models/latin_bert/ --tokenizerPath models/subword_tokenizer_latin/latin.subword.encoder > berts.output.txt
