In [1]:
from my_functions import *

‎𐤀 CLTK version '1.2.1'. When using the CLTK in research, please cite: https://aclanthology.org/2021.acl-demo.3/

Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`.

⸖ ``LatinSpacyProcess`` using Stanza model by Stanford University from https://stanfordnlp.github.io/stanza/ . Please cite: https://arxiv.org/abs/2003.07082
⸖ ``LatinEmbeddingsProcess`` using word2vec model by University of Oslo from http://vectors.nlpl.eu/ . Please cite: https://aclanthology.org/W17-0237/

⸎ To suppress these messages, instantiate ``NLP()`` with ``suppress_banner=True``.


In [2]:
# Hugging Face checkpoint ids for the two LOGION variants compared below.
# Note: `model_50k` only holds the id until the 50k loading cell rebinds the
# same name to the loaded model object.
model_base, model_50k = (
    "cabrooks/LOGION-base",
    "cabrooks/LOGION-50k_wordpiece",
)

In [3]:
from transformers import BertTokenizer, BertForMaskedLM
# Load the base checkpoint named by `model_base`: the masked-LM model first,
# then its matching tokenizer (the two loads are independent of each other).
model = BertForMaskedLM.from_pretrained(
    model_base,
    output_hidden_states=True,  # request hidden states on the model output
)
tokenizer = BertTokenizer.from_pretrained(model_base)

In [4]:
from transformers import BertTokenizer, BertForMaskedLM
# `model_50k` starts out as the checkpoint id (a str) and is rebound to the
# loaded model at the end of this cell. Capture the id under its own name
# first, so that re-running the cell does not pass a model object to
# `from_pretrained`.
if isinstance(model_50k, str):
    model_50k_name = model_50k

tokenizer_50k = BertTokenizer.from_pretrained(model_50k_name)
model_50k = BertForMaskedLM.from_pretrained(
    model_50k_name,
    output_hidden_states=True,  # request hidden states on the model output
)

# This tokenizer has no predefined maximum length, so asking it to truncate
# emits "Asking to truncate to max_length but no maximum length is provided".
# Cap it at the number of positions the model itself supports; `min` keeps a
# smaller, already-configured limit untouched.
tokenizer_50k.model_max_length = min(
    tokenizer_50k.model_max_length,
    model_50k.config.max_position_embeddings,
)

In [5]:
# Reference lines. Some carry diacritics and some do not.
original = [
    "εξης δ᾽ εζομενοι πολιην αλα τυπτον ερετμοις",
    "αν δε και αυτοι βαντες επι κληισι καθιζον",
    "νῆα μὲν οἵ γε μέλαιναν ἁλὸς βένθοσδε ἔρυσσαν",
    "τοι δε πρυμνησια ελυσαν",
]

# Reworded counterparts, index-aligned with `original`.
paraphrased = [
    "εξης δ᾽ καθεζομενοι πολιην αλα τυπτον ερετμοις",
    "οι δ᾽ αιψ᾽ εισβαινον και επι κληισι καθιζον",
    "νῆα μὲν οὖν πάμπρωτον ἁλὸς βένθοσδε ἔρυσσαν",
    "τε πρυμνησια λυσαι",
]

# Each line is the matching `original` line with a single word exchanged for
# a word taken from another original line (lines 1/2 and lines 3/4 swap).
confound = [
    "εξης δ᾽ εζομενοι πολιην αλα καθιζον ερετμοις",
    "αν δε και αυτοι βαντες επι κληισι τυπτον",
    "νῆα μὲν οἵ γε μέλαιναν ἁλὸς βένθοσδε ελυσαν",
    "τοι δε πρυμνησια ἔρυσσαν",
]

In [6]:
# Run every sentence set through `remove_diacritics`, so that lines typed
# with accents and lines typed without them are treated alike downstream.
# The three names are rebound to the processed lists.
original, paraphrased, confound = [
    remove_diacritics(sentences)
    for sentences in (original, paraphrased, confound)
]

BASE model (cabrooks/LOGION-base)

In [7]:
# Embed each sentence set with the base model and its tokenizer; the results
# are consumed by the `report_similarity` cells that follow.
original_emb_dict, paraphrased_emb_dict, confound_emb_dict = [
    mean_cls_embeddings(sentences, model=model, tokenizer=tokenizer)
    for sentences in (original, paraphrased, confound)
]

In [9]:
# Similarity report for the current embeddings using method "cls".
report_similarity(
    original_emb_dict,
    paraphrased_emb_dict,
    confound_emb_dict,
    method="cls",
)

T-statistic between original-paraphrased and original-original: [3.459114], P-value: [0.01348188]
T-statistic between original-confound and original_original: [4.544662], P-value: [0.00391388]
T-statistic between original-paraphrased and original-confound: [-0.90745294], P-value: [0.39914545]


{'originalvsparaphrased': {'average': 0.91746926, 'variance': 0.002470727},
 'originalvsconfound': {'average': 0.9465302, 'variance': 0.00060602784},
 'originalvsoriginal': {'average': 0.734463, 'variance': 0.00592626}}

In [10]:
# Similarity report for the current embeddings using method "mean".
report_similarity(
    original_emb_dict,
    paraphrased_emb_dict,
    confound_emb_dict,
    method="mean",
)

T-statistic between original-paraphrased and original-original: [4.8184767], P-value: [0.00294465]
T-statistic between original-confound and original_original: [8.784868], P-value: [0.00012063]
T-statistic between original-paraphrased and original-confound: [-1.2986712], P-value: [0.24173224]


{'originalvsparaphrased': {'average': 0.9094701, 'variance': 0.003166628},
 'originalvsconfound': {'average': 0.9551279, 'variance': 0.00054148573},
 'originalvsoriginal': {'average': 0.71563363, 'variance': 0.0016881862}}

50K model (cabrooks/LOGION-50k_wordpiece)

In [11]:
# Embed each sentence set with the 50k model and its tokenizer.
# NOTE: this rebinds the same `*_emb_dict` names used for the base model, so
# the report cells below now describe the 50k model.
original_emb_dict, paraphrased_emb_dict, confound_emb_dict = [
    mean_cls_embeddings(sentences, model=model_50k, tokenizer=tokenizer_50k)
    for sentences in (original, paraphrased, confound)
]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [12]:
# Similarity report for the current embeddings using method "cls".
report_similarity(
    original_emb_dict,
    paraphrased_emb_dict,
    confound_emb_dict,
    method="cls",
)

T-statistic between original-paraphrased and original-original: [3.6031263], P-value: [0.01132393]
T-statistic between original-confound and original_original: [6.2478294], P-value: [0.00077917]
T-statistic between original-paraphrased and original-confound: [-1.4733229], P-value: [0.19109437]


{'originalvsparaphrased': {'average': 0.89549726, 'variance': 0.002392032},
 'originalvsconfound': {'average': 0.9427889, 'variance': 0.00069893344},
 'originalvsoriginal': {'average': 0.75946856, 'variance': 0.0018838275}}

In [13]:
# Similarity report for the current embeddings using method "mean".
report_similarity(
    original_emb_dict,
    paraphrased_emb_dict,
    confound_emb_dict,
    method="mean",
)

T-statistic between original-paraphrased and original-original: [3.6252406], P-value: [0.01102777]
T-statistic between original-confound and original_original: [10.2460985], P-value: [5.0413542e-05]
T-statistic between original-paraphrased and original-confound: [-2.0859687], P-value: [0.08205451]


{'originalvsparaphrased': {'average': 0.8558754, 'variance': 0.004650526},
 'originalvsconfound': {'average': 0.9408885, 'variance': 0.00033232424},
 'originalvsoriginal': {'average': 0.69269204, 'variance': 0.0014280115}}