In [1]:
# prompt: connect to my google drive
# Mount Google Drive inside the Colab runtime at /content/drive.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/BullingerDigitalLMFootnotes/data/external_knowledge

/content/drive/MyDrive/BullingerDigitalLMFootnotes/data/external_knowledge


In [3]:
!pip install transformers faiss-gpu
!pip install jsonlines

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [4]:
# NOTE(review): redundant re-install — jsonlines is already installed by the previous cell (In [3]).
!pip install jsonlines



In [5]:

import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {device}")


Using device: cuda


In [7]:
import faiss
import numpy as np
import jsonlines
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

In [9]:
# Step 1: Pick the checkpoint, then load its tokenizer and encoder
# Alternative checkpoint (Latin-only): "pstroe/roberta-base-latin-cased"
model_name = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"  # Multilingual model

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights first, then move the encoder to the selected device.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [8]:
import faiss
import numpy as np
import jsonlines
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# NOTE(review): DATA_FOLDER is not referenced anywhere in the visible notebook.
DATA_FOLDER = "../../data/external_knowledge"

# Step 1: Pick the checkpoint, then load its tokenizer and encoder
# Alternative checkpoint (Latin-only): "pstroe/roberta-base-latin-cased"
model_name = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"  # Multilingual model

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights first, then move the encoder to the selected device.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

# Function to generate embeddings for text
def embed_text(text, tokenizer, model):
    """Embed text as the attention-masked mean of the model's last hidden state.

    Args:
        text: A string (or list of strings) to embed.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            truncation=True, padding=True)``; its result must provide
            ``attention_mask`` and support ``.to(device)``.
        model: Model whose output exposes ``last_hidden_state``.

    Returns:
        NumPy array of the pooled embedding with size-1 dimensions squeezed out
        (a 1-D vector for a single input text).
    """
    # Follow the model's own device instead of relying on a notebook-global `device`.
    model_device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model_device)
    with torch.no_grad():
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Weight tokens by the attention mask so padding never enters the mean.
        # For a single unpadded text this equals the plain mean over tokens.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return embeddings.squeeze().cpu().numpy()

# Step 2: Read the Bible JSONL file and embed it verse by verse
bible_jsonl = "vulgata_bible.jsonl"  # JSONL file created previously

verses = []      # verse records in file order
embeddings = []  # one vector per verse, in the same order as `verses`

with jsonlines.open(bible_jsonl) as reader:
    for verse in tqdm(reader, total=35820):
        verses.append(verse)  # kept so search hits can be mapped back to their text
        text = verse['text']
        embeddings.append(embed_text(text, tokenizer, model))

# Stack the per-verse vectors into one NumPy array
embeddings = np.array(embeddings)

# Step 3: Build a flat L2 index whose width matches the embeddings
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings)

# Step 4: Write the index to disk and report
faiss.write_index(index, "vulgata_bible_faiss.index")
print("FAISS index saved to disk.")


100%|██████████| 35820/35820 [06:31<00:00, 91.56it/s]


FAISS index saved to disk.


In [20]:
import faiss
import faiss.contrib.torch_utils  # Required for GPU support
import numpy as np
import jsonlines
import torch
from transformers import AutoTokenizer, AutoModel

# Step 1: Load the pre-trained transformer model and tokenizer
# model_name = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"  # Multilingual model
model_name = "pstroe/roberta-base-latin-cased"  # latin model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the notebook-wide `device` (as the other model-loading cells do) rather than a
# hard-coded 'cuda', so this cell also runs on a CPU-only runtime.
model = AutoModel.from_pretrained(model_name).to(device)

# Function to generate embeddings for a batch of texts
def embed_batch(texts, tokenizer, model):
    """Embed a batch of texts as attention-masked mean-pooled vectors.

    Padding tokens are excluded from the mean, so each row matches what the
    same text would produce when embedded on its own.

    Args:
        texts: List of strings to embed together.
        tokenizer: Callable invoked as ``tokenizer(texts, return_tensors="pt",
            truncation=True, padding=True, max_length=512)``; its result must
            provide ``attention_mask`` and support ``.to(device)``.
        model: Model whose output exposes ``last_hidden_state``.

    Returns:
        NumPy array with one row per input text.
    """
    # Run on whatever device the model lives on instead of assuming 'cuda'.
    model_device = next(model.parameters()).device

    # NOTE(review): the recorded run of the embedding loop aborted with a CUDA
    # device-side assert after 31 of 35820 verses. The cause is not established;
    # check token ids and sequence lengths against the model's embedding sizes.
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512).to(model_device)

    with torch.no_grad():
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Zero out padded positions and divide by the real token count per text;
        # a plain .mean(dim=1) would average padding vectors into short texts.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1)
        embeddings = summed / counts

    return embeddings.cpu().numpy()  # Move embeddings back to CPU

# Step 2: Load the Bible data from JSONL and embed in batches
verses = []      # verse records in file order
embeddings = []  # one vector per verse, in the same order as `verses`

# Path to your JSONL file
bible_jsonl = "vulgata_bible.jsonl"

batch_size = 32  # Adjust based on your GPU memory capacity

# Read the verses, embedding each full batch as soon as it is collected
with jsonlines.open(bible_jsonl) as reader:
    batch_texts = []

    for verse in tqdm(reader, total=35820):
        text = verse['text']
        verses.append(verse)  # Keep track of verses for later use
        batch_texts.append(text)

        # If we reach the batch size, process the batch
        if len(batch_texts) == batch_size:
            embeddings.extend(embed_batch(batch_texts, tokenizer, model))
            batch_texts = []  # Reset the batch

    # Handle the last batch if it has fewer than batch_size items
    if batch_texts:
        embeddings.extend(embed_batch(batch_texts, tokenizer, model))

# Convert the embeddings to a NumPy array (done once; the original repeated
# this statement, which only produced a needless second copy)
embeddings = np.array(embeddings)

# Step 3: Create a FAISS index
d = embeddings.shape[1]  # Dimensionality of the embeddings
index = faiss.IndexFlatL2(d)  # L2 distance for similarity search
index.add(embeddings)  # Add the embeddings to the index

  0%|          | 31/35820 [00:02<39:55, 14.94it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [18]:
# Ask PyTorch to release the unused GPU memory it is holding in its cache.
torch.cuda.empty_cache()

In [7]:
# List the contents of the current working directory.
!ls

sample_data


In [10]:
# Load the FAISS index from disk (same file name the index-building cell writes to).
index = faiss.read_index("vulgata_bible_faiss.index")

In [12]:
# Step 3: Read every verse record from the JSONL file, in file order, so that
# search hits (row numbers in the index) can be mapped back to their text
bible_jsonl = "vulgata_bible.jsonl"
with jsonlines.open(bible_jsonl) as reader:
    verses = list(reader)

In [14]:
# Function to generate embeddings for text
def embed_text(text, tokenizer, model):
    """Embed text as the attention-masked mean of the model's last hidden state.

    Args:
        text: A string (or list of strings) to embed.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            truncation=True, padding=True)``; its result must provide
            ``attention_mask`` and support ``.to(device)``.
        model: Model whose output exposes ``last_hidden_state``.

    Returns:
        NumPy array of the pooled embedding with size-1 dimensions squeezed out
        (a 1-D vector for a single input text).
    """
    # Follow the model's own device instead of relying on a notebook-global `device`.
    model_device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model_device)
    with torch.no_grad():
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Weight tokens by the attention mask so padding never enters the mean.
        # For a single unpadded text this equals the plain mean over tokens.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return embeddings.squeeze().cpu().numpy()

In [17]:
# Step 6: Define a function to search for similar verses
def search(query, index, tokenizer, model, verses, top_k=5):
    """Return the verses whose embeddings are closest to the query.

    Args:
        query: Text to embed and look up.
        index: Object with a ``search(vectors, k)`` method returning
            ``(distances, indices)`` (a FAISS index in this notebook).
        tokenizer: Tokenizer passed through to ``embed_text``.
        model: Model passed through to ``embed_text``.
        verses: Sequence of verse dicts with ``book``, ``chapter``, ``verse``
            and ``text`` keys; position ``i`` must correspond to row ``i`` of
            the index.
        top_k: Maximum number of neighbours to return.

    Returns:
        List of dicts (``book``, ``chapter``, ``verse``, ``text``, ``distance``)
        in the order returned by the index. May be shorter than ``top_k`` when
        the index holds fewer vectors.
    """
    # Embed the query as a single-row matrix, the shape index.search expects
    query_embedding = embed_text(query, tokenizer, model).reshape(1, -1)

    # Search for the most similar verses
    distances, indices = index.search(query_embedding, top_k)

    # Return the results with distances
    results = []
    for distance, idx in zip(distances[0], indices[0]):
        # FAISS fills missing neighbours with index -1; without this guard
        # verses[-1] would silently report the last verse as a hit.
        if idx < 0:
            continue
        verse = verses[idx]
        results.append({
            "book": verse['book'],
            "chapter": verse['chapter'],
            "verse": verse['verse'],
            "text": verse['text'],
            "distance": distance
        })
    return results

# Step 7: Query the system
# Sanity check: the query is the opening of a verse that is in the index (the
# recorded top hit is I Petrus 3:7); the 3 nearest verses are requested.
query = "Viri similiter cohabitantes secundum scientiam, quasi infirmiori vasculo muliebri impartientes honorem, tamquam et coheredibus gratiæ vitæ:"
results = search(query, index, tokenizer, model, verses, top_k=3)

# Step 8: Print the results
# One line per hit: "<book> <chapter>:<verse> - <text> (Distance: <d>)"
for result in results:
    print(f"{result['book']} {result['chapter']}:{result['verse']} - {result['text']} (Distance: {result['distance']:.4f})")


I Petrus 3:7 - Viri similiter cohabitantes secundum scientiam, quasi infirmiori vasculo muliebri impartientes honorem, tamquam et coheredibus gratiæ vitæ: ut non impediantur orationes vestræ. (Distance: 4.8385)
Ecclesiasticus 42:13 - de vestimentis enim procedit tinea, et a muliere iniquitas viri. (Distance: 17.1913)
Lucas 10:8 - Et in quamcumque civitatem intraveritis, et susceperint vos, manducate quæ apponuntur vobis: (Distance: 18.0486)


In [36]:
# Step 7: Query the system
# NOTE(review): the recorded distances for this query (83+) are far larger than
# in the earlier query cell (about 5-18), and a tokenizer truncation warning was
# emitted. Presumably `tokenizer`/`model` had been re-assigned to the Latin
# RoBERTa checkpoint (cell In [20]) while `index` still holds vectors from the
# multilingual model — TODO confirm and re-run with the matching model.
query = "Accipitur ergo predicatio hic tropicos pro absolutione ab Ade prevaricatione__5 aut pro salutis manifestatione"
results = search(query, index, tokenizer, model, verses, top_k=10)

# Step 8: Print the results
# One line per hit: "<book> <chapter>:<verse> - <text> (Distance: <d>)"
for result in results:
    print(f"{result['book']} {result['chapter']}:{result['verse']} - {result['text']} (Distance: {result['distance']:.4f})")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


I Petrus 3:7 - Viri similiter cohabitantes secundum scientiam, quasi infirmiori vasculo muliebri impartientes honorem, tamquam et coheredibus gratiæ vitæ: ut non impediantur orationes vestræ. (Distance: 83.1919)
Ecclesiasticus 8:6 - Ne despicias hominem avertentem se a peccato, neque improperes ei. memento quoniam omnes in correptione sumus. (Distance: 84.0194)
II Petrus 2:20 - Si enim refugientes coinquinationes mundi in cognitione Domini nostri, et Salvatoris Iesu Christi, his rursus implicati superantur: facta sunt eis posteriora deteriora prioribus. (Distance: 84.0387)
II Corinthii 4:2 - sed abdicamus occulta dedecoris, non ambulantes in astutia, neque adulterantes verbum Dei, sed in manifestatione veritatis commendantes nosmetipsos ad omnem conscientiam hominum coram Deo. (Distance: 85.9802)
Ecclesiasticus 20:24 - Est qui perdet animam suam præ confusione, et ab imprudenti persona perdet eam: personæ autem acceptione perdet se. (Distance: 89.1021)
Psalmi 30:21 - Abscondes eos in a

In [24]:
# List the contents of the parent directory.
%ls ..

[0m[01;34mdownsized_letters[0m/    footnote_df_test_head.csv  letter_df_test.csv       strat_sample_bible.json
[01;34mexternal_knowledge[0m/   footnote_downsized_df.csv  letter_downsized_df.csv  strat_sample.json
footnote_df.csv       [01;34mhuman[0m/                     literatur.tsv
footnote_df_head.csv  id_to_edition_map.json     [01;34mmodel_responses[0m/
footnote_df_test.csv  letter_df.csv              [01;34mprompts[0m/


In [31]:
# check usefulness:

import pandas as pd
import json

def get_bible_df(split: str,
                 footnote_csv: str = "../footnote_downsized_df.csv",
                 sample_json: str = "../strat_sample_bible.json"):
    """Return the footnote rows that belong to one split of the Bible sample.

    Args:
        split: Key of the split to select in the sample JSON (e.g. ``"dev"``).
        footnote_csv: Path of the footnote CSV; must contain ``letter_id`` and
            ``n_footnote`` columns.
        sample_json: Path of the JSON file mapping each split name to a list of
            entries whose first two items are ``(letter_id, n_footnote)``.

    Returns:
        DataFrame with the rows of the footnote CSV whose
        ``(letter_id, n_footnote)`` pair is listed under ``split``.

    Raises:
        KeyError: If ``split`` is not a key of the sample JSON.
    """
    footnote_df = pd.read_csv(footnote_csv)

    # Explicit encoding: JSON is UTF-8, while open() otherwise uses the platform default.
    with open(sample_json, "r", encoding="utf-8") as f:
        strat_sample_bible = json.load(f)

    # Only the first two items of every entry identify a footnote.
    tuples_df = pd.DataFrame(
        [(entry[0], entry[1]) for entry in strat_sample_bible[split]],
        columns=["letter_id", "n_footnote"],
    )

    return footnote_df.merge(tuples_df, on=['letter_id', 'n_footnote'], how='inner')

# Build the frame for the "dev" split and preview its first rows.
bible_df = get_bible_df("dev")
bible_df.head()


Unnamed: 0,letter_id,edition,n_footnote,n_sentence,xml_footnote,xml_sentence,text_footnote,text_sentence,len_footnote,pos_footnote,label
0,10047,10,2,2,"<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:...","<s xmlns=""http://www.tei-c.org/ns/1.0"" n=""2"" x...","Gemeint ist 1Petr 3, 19f (vgl. unten Z. 23).","Quęris ex me indocto plane homine, charissime ...",9,11,misc
1,10047,10,5,8,"<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:...","<s xmlns=""http://www.tei-c.org/ns/1.0"" n=""8"" x...","Vgl. Röm 5, 14.",Accipitur ergo predicatio hic tropicos pro abs...,4,9,short
2,10047,10,6,9,"<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:...","<s xmlns=""http://www.tei-c.org/ns/1.0"" n=""9"" x...","Vgl. 1Kor 15, 54f.","Christus praedicavit inferis, hoc est, Christu...",4,29,short
3,10053,1,17,10,"<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:...","<s xmlns=""http://www.tei-c.org/ns/1.0"" n=""10"" ...","Vgl. 1 Kor 15, 19.","Ist nun ghein ander läben mee dann nun das, da...",5,18,short
4,10053,1,20,15,"<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:...","<s xmlns=""http://www.tei-c.org/ns/1.0"" n=""15"" ...",Vgl. Gen 1-2.,Den gott muͦstu dir fürstellen als den einigen...,3,12,short
