In [1]:
!huggingface-cli login

n

    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Preparations

### Installs with pip

In [3]:
%pip install accelerate



In [4]:
%pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [5]:
%pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


### Imports

In [6]:
from transformers import StoppingCriteria, StoppingCriteriaList
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer

In [7]:
# mounting drive
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [8]:
%cd drive/MyDrive/BullingerDigitalLMFootnotes/git_repo


/content/drive/MyDrive/BullingerDigitalLMFootnotes/git_repo


### functions

In [9]:
# functions to run over the prompts

import os, csv
from tqdm import tqdm
import jsonlines
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

def count_prompt_tokens(model_id, prompt_type, split):
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  data_path = "../data"
  folder_path = os.path.join(data_path, f"prompts/{prompt_type}/{split}")
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  token_len_list = []
  long_letters_set = set()
  for filename in tqdm(os.listdir(folder_path)):
    filepath = os.path.join(folder_path, filename)
    with jsonlines.open(filepath) as infile:
      messages = [line for line in infile]

    # get the input ids
    input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
    )
    if input_ids.shape[1] > 20000:
      letter_id = filename.split("_")[0]
      long_letters_set.add(letter_id)
    token_len_list.append(input_ids.shape[1])
  return token_len_list, long_letters_set

# Generating with a chat model
def generate_chat(messages:list, model, tokenizer):
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # generate the attention mask, got a warning that it might be needed, though I am not sure if that is necessary
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=model.device)

    if "llama" in model.config._name_or_path:
      terminators = [
          tokenizer.eos_token_id,
          tokenizer.convert_tokens_to_ids("<|eot_id|>")
      ]


      outputs = model.generate(
          input_ids,
          attention_mask=attention_mask,
          max_new_tokens=256,
          eos_token_id=terminators,
          do_sample=True,
          temperature=0.6,
          top_p=0.9,
          pad_token_id=tokenizer.eos_token_id
      )
    else:  # for the Qwen model
        outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=256,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id
    )



    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)


def run_llama_over_prompts(prompt_type, split, long_letters_set=set(), batch_size=0, testrun=False):
  """run over the set specified. Saves in a csv file under model_responses
  Note that if there is already a file with that name it will only add the
  ones that are not generated yet"""

  data_path = "../data"
  folder_path = os.path.join(data_path, f"prompts/{prompt_type}/{split}")
  model_name = model_id.split("/")[-1]
  outfile_path = os.path.join(data_path, f"model_responses/llama/{model_name}_{prompt_type}_{split}.csv")
  if testrun:
    outfile_path = outfile_path.replace(".csv", "_testrun.csv")

  finished = []  # tuples of letter_id and n_footnote that are already done
  if os.path.exists(outfile_path):
    with open(outfile_path, "r", encoding="utf-8") as infile:
      reader = csv.reader(infile)
      next(reader)  # skip header
      try:
        finished = [(row[0], row[1]) for row in reader]
      except IndexError:
        print("faulty csv file found, rewriting")
        finished = []

  if not finished:  # if the file was non existent or faulty... rewrite
    with open(outfile_path, "w", encoding="utf-8") as outfile:
      outfile.write(f"letter_id,n_footnote,generated_footnote\n")

  unfinished = []
  for filename in os.listdir(folder_path):
    letter_id = filename.split("_")[0]
    n_footnote = filename.split("_")[1].split(".")[0]
    if (letter_id, n_footnote) in finished:
      continue
    # ignore the long letters for if specified
    if letter_id in long_letters_set:
      continue
    unfinished.append((filename, letter_id, n_footnote))

  if batch_size == 0:
    for filename, letter_id, n_footnote in tqdm(unfinished):
      filepath = os.path.join(folder_path, filename)
      with jsonlines.open(filepath) as infile:
        messages = [line for line in infile]
        try:
          generated_footnote = generate_chat(messages, model, tokenizer)
        except RuntimeError as e:
          if 'CUDA out of memory' in str(e):
            print(f"letter {letter_id} causes out of memory error")
            long_letters_set.add(letter_id)
            torch.cuda.empty_cache()
            continue
          else:
              # Raise other exceptions
              raise e
        torch.cuda.empty_cache()
      with open(outfile_path, "a", encoding="utf-8") as outfile:
        writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL, escapechar="\\")
        writer.writerow([letter_id, n_footnote, generated_footnote])

  else:
    messages_list = []
    letter_id_list = []
    n_footnote_list = []
    for filename, letter_id, n_footnote in tqdm(unfinished):
      filepath = os.path.join(folder_path, filename)
      with jsonlines.open(filepath) as infile:
        messages = [line for line in infile]
      messages_list.append(messages)
      if len(messages_list) == batch_size:
        generated_footnotes = generate_chat_batch(messages_list, model, tokenizer)
        torch.cuda.empty_cache()
        for generated_footnote, letter_id, n_footnote in zip(generated_footnotes, letter_id_list, n_footnote_list):
          with open(outfile_path, "a", encoding="utf-8") as outfile:
            writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL, escapechar="\\")
            writer.writerow([letter_id, n_footnote, generated_footnote])
        messages_list = []
        letter_id_list = []
        n_footnote_list = []



# Informal tests

### Texts for testing:

In [None]:
# Small input, one FN is there, a second one is to be generated:
text1 = """<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="file10224" type="Brief" source="HBBW-3" n="193">
	<teiHeader xml:lang="de">
		<fileDesc>
			<titleStmt>
				<title subtype="file">Konrad Geßner, Johannes Fries / Basel an Heinrich Bullinger, 25. Februar [1533]</title>
			</titleStmt>
			<publicationStmt>
				<authority>Universität Zürich</authority>
				</publicationStmt>
			<sourceDesc>
				</sourceDesc>
		</fileDesc>
		</teiHeader>
	<text xml:lang="la">
		<body>
			<div xml:id="div1" corresp="regest1">
				<p>
					<s n="1" xml:lang="la" type="auto">Optimo et integerrimo viro M. Henrico Bullingero, mecaenati charissimo.</s>
				</p>
				<p>
					<s n="2" xml:lang="la" type="auto">S.</s>
					<s n="3" xml:lang="la" type="auto">Impediunt nos ab itinere<note xml:id="fn3" type="footnote" n="3"><persName ref="p1283" cert="high">Geßner</persName> und <persName ref="p1214" cert="high">Fries</persName> befanden sich auf dem Weg nach <placeName ref="l59" cert="high">Bourges</placeName>.</note> nives, pluvia et ventorum vis.</s>
					<s n="4" xml:lang="la" type="auto"><placeName ref="l28" cert="high">Basileae</placeName> apud <persName ref="p8418" cert="high">Myconium</persName><note xml:id="fn4" type="footnote" n="4">"""

# Same Letter, but now the first FN is to be generated. Meaning the model has no direct example of a FN in the edition
text2 = """<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="file10224" type="Brief" source="HBBW-3" n="193">
	<teiHeader xml:lang="de">
		<fileDesc>
			<titleStmt>
				<title subtype="file">Konrad Geßner, Johannes Fries / Basel an Heinrich Bullinger, 25. Februar [1533]</title>
			</titleStmt>
			<publicationStmt>
				<authority>Universität Zürich</authority>
				</publicationStmt>
			<sourceDesc>
				</sourceDesc>
		</fileDesc>
		</teiHeader>
	<text xml:lang="la">
		<body>
			<div xml:id="div1" corresp="regest1">
				<p>
					<s n="1" xml:lang="la" type="auto">Optimo et integerrimo viro M. Henrico Bullingero, mecaenati charissimo.</s>
				</p>
				<p>
					<s n="2" xml:lang="la" type="auto">S.</s>
					<s n="3" xml:lang="la" type="auto">Impediunt nos ab itinere<note xml:id="fn3" type="footnote" n="3">"""

# A long letter, the model has to generate the very last FN
text3 =  """<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="file12260" type="Brief" source="HBBW-15" n="2213">
	<teiHeader xml:lang="de">
		<fileDesc>
			<titleStmt>
				<title subtype="file">Matthias Erb / Reichenweier an Heinrich Bullinger, 11. August [1545]</title>
			</titleStmt>
			<publicationStmt>
				<authority>Universität Zürich</authority>
				</publicationStmt>
			<sourceDesc>
				<bibl type="scan">Zürich StA</bibl>
				</sourceDesc>
		</fileDesc>
		</teiHeader>
	<text xml:lang="la">
		<body>
			<div xml:id="div1" corresp="regest1">
				<p>
					<s n="1" xml:lang="la" type="auto">Gratia domini tecum.</s>
					<s n="2" xml:lang="la" type="auto">Quum decima augusti<note xml:id="fn2" type="footnote" n="2">An einem Montag, dem Vortag (s. Z. 17f) des Datums dieses Briefes.</note> e suggestu descendissem me actutum a templo ad aulam conferens (vocatus scilicet a cancellario<note xml:id="fn3" type="footnote" n="3"><persName ref="p8103" cert="high">Sigismund Stier</persName>.</note> ob quędam negotia ecclesię componenda), venit hic tuus legatus<note xml:id="fn4" type="footnote" n="4"><persName ref="p18219" cert="high">Markus Wüst</persName>; s. oben <ref target="file12256">Nr. 2209</ref>, Anm. b.</note> hora ferme nona et, dum in angiportu sisto gradum, servum<note xml:id="fn5" type="footnote" n="5">Unbekannt.</note> ad limina de <persName ref="p1010" cert="high">Erbio</persName> rogat, me vero oculis ostiario significante ne proderet, statim digito demonstrat dicens: „Hic est.“</s>
					<s n="3" xml:lang="la" type="auto">Is ergo ad me veniens percunctatur, an mihi sit nomen <persName ref="p1010" cert="high">Erbio</persName>.</s>
					<s n="4" xml:lang="la" type="auto">Aio.</s>
					<s n="5" xml:lang="la" type="auto">Statimque litteras<note xml:id="fn6" type="footnote" n="6">Oben <ref target="file12256">Nr. 2209</ref>.</note> depromens dicebat a te missas.</s>
					<s n="6" xml:lang="la" type="auto">Acceptas legi simul obstupescens tuam, qui alias occupatissimus es, sedulitatem in describendis ex autographo copiis<note xml:id="fn7" type="footnote" n="7">Vgl. oben <ref target="file12256">Nr. 2209</ref>, 15-19.</note>.</s>
					<s n="7" xml:lang="la" type="auto">Stans ergo attonitus et propemodum exanimis iussi adolescentem expectare, dum redeam, atque subinde inter tractatus et varias dissidentium quęstiones et tacite et indesinenter volvo ac revolvo intime, quę scripseras.</s>
					<s n="8" xml:lang="la" type="auto">Absolutus a strepitu aulico eo cum adolescente pransum interque epulandum de singulis rebus percunctor, statim scilicet a mensa tibi responsurus.</s>
					<s n="9" xml:lang="la" type="auto">Dum id meditor, alius adest nuntius<note xml:id="fn8" type="footnote" n="8">Unbekannt.</note> litteras ferens a principe<note xml:id="fn9" type="footnote" n="9"><persName ref="p8098" cert="high">Graf Georg von Württemberg-Mömpelgard</persName>, der sich laut Z. 16 damals in <placeName ref="l569" cert="high">Worms</placeName> aufhielt. Seine Teilnahme am <placeName ref="l569" cert="high">Wormser</placeName> Reichstag ist in RTA JR XVIII-2 nicht belegt.</note>, in quibus multa mihi committuntur negotia principemque a comitiis<note xml:id="fn10" type="footnote" n="10">Der Reichstag in <placeName ref="l569" cert="high">Worms</placeName>.</note> propediem rediturum pollicentes.</s>
					<s n="10" xml:lang="la" type="auto">Hinc eo die, tametsi adolescens crebro urgeret abituriens, nil scripsi omnia in posterum, hoc est in hodiernum, reiiciens diem.</s>
					<s n="11" xml:lang="la" type="auto">Haec de pueri tui diligentia.</s>
				</p>
				<p>
					<s n="12" xml:lang="la" type="auto">Quod vero ad negotium adtinet, paucis haec accipe.</s>
					<s n="13" xml:lang="la" type="auto">Fateor me iam annum quinquagesimum egressum<note xml:id="fn11" type="footnote" n="11"><persName ref="p1010" cert="high">Erb</persName> war 1494 geboren.</note>, et qui nunc in ea aetate sum, quę sedatior esse cupit, sepicule curas aulicas, quibus distringor,<pb type="scan" next="257v."/> depręcatum, sed parum promovisse; imo, quo plus me expedire cupio, eo amplius perstringor dispendio et vitae et rerum.</s>
					<s n="14" xml:lang="la" type="auto">Sic deo optimo maximo visum est.</s>
				</p>
				<p>
					<s n="15" xml:lang="la" type="auto">Quod ad vitam, ut omnia manifestius depingam, subit tarda senectus,<note xml:id="fn12" type="footnote" n="12"><hi>Vergil,</hi> Aeneis, 9, 610.</note> quę difficilis incipit esse, sola non suffitiens laboribus, dum indies creberrime exantlatur.</s>
					<s n="16" xml:lang="la" type="auto">Nullum habeo symmistam, ut nosti, opinor.</s>
					<s n="17" xml:lang="la" type="auto">Plures iussu principis vocavi; sed horrent id, quod ego quęror.</s>
					<s n="18" xml:lang="la" type="auto">Metuunt enim sibi ab ingeniis cervinis<note xml:id="fn13" type="footnote" n="13">ingenium cervinum: ein ängstliches Wesen; s. <bibl>Adagia</bibl>, 2, 7, 36 (<bibl>ASD</bibl> II/4 110, <ref target="file11679">Nr. 1636</ref>).</note>, nec ego quenquam nisi volentem ad me trahere volo, etc.<pb type="edition" next="439"/></s>
					<s n="19" xml:lang="la" type="auto">Quod vero ad res meas adtinet, male collocat ille, quem ex nomine nosti, stipendia.</s>
					<s n="20" xml:lang="la" type="auto">Ex patrimonio intra quinquennium<note xml:id="fn16" type="footnote" n="16">D.h. seit 1539, was mit dem Amtsantritt <persName ref="p17986" cert="high">Henmans</persName> übereinstimmt.</note> ducentos fere aureos non in meam, sed magis ecclesię necessitatem exposui, quod dies domini revelabit.</s>
					<s n="21" xml:lang="la" type="auto">Nec quęror, optime frater, de perditis rebus, sed de Harpiis furacibus, qui recte monentes non exaudiunt et ecclesię avaritię notam inurunt; quod non adeo magni referret, nisi metuendum de excidio foret.</s>
					<s n="22" xml:lang="la" type="auto">Quamdiu rapiendum, placet Christus; dum defecerit ecclesię fiscus, non placebit ultra Christus!</s>
					<s n="23" xml:lang="la" type="auto">Hactenus 3 fuimus in isto oppido, ubi quisque designatum habuit<pb type="edition" next="440"/> stipendium.</s>
					<s n="24" xml:lang="la" type="auto">Ego nunc solus omnibus curis pręfectus aegre a satrapis<note xml:id="fn18" type="footnote" n="18">Zu verstehen: von den Beamten, u.a. vom neuen Schaffner.</note> meum recipio stipendium, et si egerem, vix triobulo sublevarent.</s>
					<s n="25" xml:lang="la" type="auto">Sunt pręterea et alię rationes, quibus te nolo nunc gravare scribendo, etc.</s>
					<s n="26" xml:lang="la" type="auto">Atque inter tot turbines nondum video satis iustam occasionem desertionis huius ecclesię.</s>
				</p>
				<p>
					<s n="27" xml:lang="de" type="auto"><pb type="scan" next="257a,r."/>Summa.</s>
					<s n="28" xml:lang="la" type="auto">Ita haereo inter sacrum et saxum<note xml:id="fn19" type="footnote" n="19">Siehe Adagia, 1, 1, 15 (<bibl>ASD</bibl> II/1 128f, Nr. 15).</note>, ut nunc sine discrimine ecclesięque Christi maxima iactura, ut facile intelligis, discedere nequeam.</s>
				</p>
				<p>
					<s n="29" xml:lang="la" type="auto">Atque eę sunt potissimę rationes.</s>
					<s n="30" xml:lang="la" type="auto">Ecclesias nostras iam a sexto anno<note xml:id="fn20" type="footnote" n="20">D.h. seit 1538, als Erbin Reichenweier tätig wurde.</note> utcunque in ordinem, ut potuimus - quando, ut volueramus, non licuit - redegimus.</s>
				</p>
				<p>
					<s n="31" xml:lang="la" type="auto">Annitendum ergo, ut cępta perficiantur.</s>
					<s n="32" xml:lang="la" type="auto">Ad hoc ecclesia illa nobis contigua, Montbelgardiana, cum suo principe<note xml:id="fn21" type="footnote" n="21"><persName ref="p1546" cert="high">Graf Christoph von Württemberg</persName>.</note> adhuc vacillat; de qua totus liber foret perscribendus.</s>
					<s n="33" xml:lang="la" type="auto">Pręterea scholę quattuor in isto parviusculo agro sunt constitutę et quinta in rure instituenda, stipendia quoque studiosis quibusdam ante paucos menses pollicita,<note xml:id="fn23" type="footnote" n="23">Erst 1555 kam es zur Einrichtung eines solchen Stipendiums durch Graf Georg; s. <hi>Adam,</hi> aao, S. 306.</note> sed haec non omnia firmata et rata, proponenda etiamnum ecclesię quędam decreta, quibus ad disciplinam et sanctos mores utcunque redigantur, idque statim redeunte principe.</s>
					<s n="34" xml:lang="la" type="auto">Illo enim absente haec omnia conscripta sunt.</s>
					<s n="35" xml:lang="la" type="auto">Dum igitur nunc mutavero locum omnia sub cultro relinquens<note xml:id="fn24" type="footnote" n="24">omnia sub cultro relinquens: alles der Gefahr aussetzend; vgl. <bibl>Adagia</bibl>, 2, 10, 83 (<bibl>ASD</bibl> II/4 328, <ref target="file12030">Nr. 1983</ref>).</note>, facile intelligis, quid periculi inmineat.</s>
					<s n="36" xml:lang="la" type="auto">In primis severus princeps, a quo omnia pendent, facile vel mutaretur vel irritaretur, ut solet hoc hominum genus; id quod nimirum magis in ecclesię quam in meum incomodum cęderet.</s>
					<s n="37" xml:lang="la" type="auto">Viden inter coelum et terram me suspensum palpitare neque posse hac ratione vel vocatione, quę tamen et sancta et iusta, liberari?</s>
					<s n="38" xml:lang="la" type="auto">Occasio mihi expectanda comodior, quę me liberet sine ecclesię iactura.</s>
					<s n="39" xml:lang="la" type="auto">Nam si iam abirem, ecclesia viduata absque pastore inter cunctatores et cessatores fluctuaret, etc.</s>
				</p>
				<p>
					<s n="40" xml:lang="la" type="auto"><pb type="scan" next="257a,v."/>Ago vero et habeo tibi gratias inmortales, humanissime Bullingere, quod admodum ingenti cura meo nomine laboraveris, atque hoc impensius te amo, quod charitatis offitium non fucate, sed dextre candideque in me collocaris, imo significaris in hac sancta vocatione.</s>
					<s n="41" xml:lang="la" type="auto">Respondebit olim procul dubio deus optimus maximus tuis votis, quo <placeName ref="l1976" cert="high">Augustani</placeName> nanciscantur virum aliquem sanctum, ecclesię dignum ministrum,<note xml:id="fn25" type="footnote" n="25"><persName ref="p8018" cert="high">Johannes Haller</persName> wurde schließlich aus Zurich nach <placeName ref="l1976" cert="high">Augsburg</placeName> gesandt; s. oben <ref target="file12211">Nr. 2164</ref>, Anm. 10.</note> atque ego etiam tuis assiduis<pb type="edition" next="441"/> praecibus vel liberer publica occasione vel sub onere non defitiam.</s>
					<s n="42" xml:lang="la" type="auto">Sed te etiamnum per Iesum Christum, ne deinceps plus mihi vel tribuas, immodicus scilicet virtutum mearum, quibus penitus careo, pręco, vel magis sentias de me, quam res est.</s>
					<s n="43" xml:lang="la" type="auto">Optime enim mihi ipsi conscius sum curtę suppellectilis,<note xml:id="fn26" type="footnote" n="26"><hi>Persius,</hi> Saturae, 4, 52.</note> quam tu, ni fallor, maiori atque ego, nescio quorum iuditio persuasus, aestimas.</s>
				</p>
				<p>
					<s n="44" xml:lang="la" type="auto">Porro, quod nunc temporis importunitas negat, alias benignus deus, ecclesiarum verus et unicus pastor<note xml:id="fn27" type="footnote" n="27">Vgl. <cit type="bible"><ref>Hebr 13, 20</ref></cit>; <cit type="bible"><ref>1Petr 2, 25</ref></cit>; <cit type="bible"><ref>5, 4.</ref></cit></note>, solita clementia sancta resarciet oportunitate.</s>
					<s n="45" xml:lang="la" type="auto">Sancto quoque magistratui <placeName ref="l1976" cert="high">Augustano</placeName>, inprimis vero erudito viro domino <persName ref="p8354" cert="high">Georgio Leto</persName><note xml:id="fn28" type="footnote" n="28"><persName ref="p8354" cert="high">Georg Frölich</persName>.</note>, ingentes gratias meo nomine agas, obsecro, quod me, ineptum homuntionem,<note xml:id="fn29" type="footnote" n="29">Vgl. <cit type="bible"><ref>1Kor 15,9</ref></cit>; <cit type="bible"><ref>Eph 3, 8</ref></cit>.</note> adeo egregia dignati sint vocatione.</s>
					<s n="46" xml:lang="la" type="auto">Ego quoque deinceps eius ecclesię maiorem habebo - si potis fuero - ubique rationem, optoque illis stabilem pacem et concordiam talesque antistites, qui pietate, eruditione, prudentia, vigilantia et fidelitate ita pręmineant, quo Christus in ea republica ita mittat radices, ut nullis satanę et impiorum imposturis deiiciatur.</s>
					<s n="47" xml:lang="la" type="auto">Monebis ergo illos, ne me deinceps expectent; ignoro enim meam liberationem.</s>
					<s n="48" xml:lang="la" type="auto">Servet te Christus.</s>
				</p>
				<p>
					<note type="attachment">[Beilage:]</note>
					<s n="49" xml:lang="la" type="auto"><pb type="scan" next="258r."/>Nova: Absolutionem conmitiorum<note xml:id="fn30" type="footnote" n="30">Der Reichstag zu <placeName ref="l569" cert="high">Worms</placeName>.</note> non ignoras, in quibus nil certi definitum, nisi quod istis hybernis mensibus adversarii nobis pacem condixerunt; futura aestate nobis providendum erit<note xml:id="fn31" type="footnote" n="31">Ein Gerücht.</note>.</s>
					<s n="50" xml:lang="la" type="auto">Acta comitiorum nondum sunt publicata.</s>
					<s n="51" xml:lang="la" type="auto">Aiunt quidam caesari<note xml:id="fn33" type="footnote" n="33"><persName ref="p18538" cert="high">Karl V.</persName></note> omnia esse commissa, ut dicat sententiam etiam in religionis negotio certis tamen conditionibus; quodsi verum fuerit, ovem dicemus lupo<note xml:id="fn34" type="footnote" n="34">Vgl. <cit type="bible"><ref>Joh 10, 12</ref></cit>.</note> commissam.</s>
				</p>
				<p>
					<s n="52" xml:lang="la" type="auto"><persName ref="p18477" cert="high">Ferdinandus</persName> priori septimana<note xml:id="fn35" type="footnote" n="35">Zu verstehen: In der ersten Augustwoche. - <persName ref="p18477" cert="high">Ferdinand I.</persName> war am 31. Juli 1545 aus <placeName ref="l569" cert="high">Worms</placeName> abgereist; s. unten <ref target="file12261">Nr. 2214</ref>, 28 und Anm. 20.</note> a comitiis magno strepitu solvens Bohemiam petit cum <persName ref="p7688" cert="high">Nausea</persName><note xml:id="fn36" type="footnote" n="36"><persName ref="p7688" cert="high">Friedrich Nausea</persName>, Bischof von <placeName ref="l546" cert="high">Wien</placeName>.</note> suo et quibusdam monachis.</s>
					<s n="53" xml:lang="la" type="auto">Sequenti septimana<pb type="edition" next="442"/> caesar abiturus est, sed quo, ignoratur<note xml:id="fn37" type="footnote" n="37"><persName ref="p18538" cert="high">Karl V.</persName> hatte <placeName ref="l569" cert="high">Worms</placeName> am 7. August in Richtung <placeName ref="l267" cert="high">Köln</placeName> verlassen; s. oben <ref target="file12261">Nr. 2214</ref>, 31 und Anm. 23f.</note>.</s>
					<s n="54" xml:lang="la" type="auto">Erant alia dicta comitia ad <placeName ref="l410" cert="high">Reginospurgum</placeName> ad mensem octobrem,<note xml:id="fn38" type="footnote" n="38">Der geplante Reichstag in <placeName ref="l410" cert="high">Regensburg</placeName>, der erst Anfang 1546 begann; s. oben <ref target="file12258">Nr. 2211</ref>, Anm. 24.</note>, sed aiunt quidam irritata<note xml:id="fn39" type="footnote" n="39">Hier im Sinne von "vergeblich"; s. <hi><persName ref="p8320" cert="high">Dasypodius</persName>,</hi> Dic. 199.</note>, etc.</s>
				</p>
				<p>
					<s n="55" xml:lang="la" type="auto">Urgent plures nacionale concilium, idque <placeName ref="322" type="auto_name">Metis</placeName><note xml:id="fn40" type="footnote" n="40"><placeName ref="l322" cert="high">Metz</placeName>.</note> celebrari postulant.</s>
					<s n="56" xml:lang="la" type="auto">Musher<note xml:id="fn41" type="footnote" n="41">Vom französischen "Monsieur".</note> de Gyß<note xml:id="fn42" type="footnote" n="42"><persName ref="p18450" cert="high">Claude I. von Lothringen</persName>, Herzog von Guise.</note>, defuncti ducis Lotharingię frater<note xml:id="fn43" type="footnote" n="43">Der am 12. Juni 1545 verstorbene <persName ref="p7637" cert="high">François I. von Lothringen</persName> war nicht Claudes Bruder, sondern sein Neffe. Bruder von Claude war der Vater des Verstorbenen, nämlich <persName ref="p18431" cert="high">Anton II. von Lothringen</persName> (gest. Juni 1544).</note>, patruum suum conscripto milite et auxilio regis<note xml:id="fn45" type="footnote" n="45"><persName ref="p7637" cert="high">König Franz I.</persName></note> Gallorum conatur e ducatu pellere.</s>
					<s n="57" xml:lang="la" type="auto"><persName ref="p18538" cert="high">Caesar</persName> vero iuniori<note xml:id="fn46" type="footnote" n="46"><persName ref="p18536" cert="high">Karl III. von Lothringen</persName>.</note>, qui patri<note xml:id="fn47" type="footnote" n="47"><persName ref="p7637" cert="high">François I. von Lothringen</persName>.</note> successit, subsidium et <placeName ref="l29" cert="high">Hispanum</placeName> militem pollicitus est.</s>
					<s n="58" xml:lang="la" type="auto">Quodsi res non fuerit composita, putant novum bellum inter caesarem et Gallum oriturum.</s>
					<s n="59" xml:lang="la" type="auto">Feruntur alia quedam nova, sed forte non fide digna: mundus mendatiis scatet<note xml:id="fn48" type="footnote" n="48">Vgl. <bibl>TPMA</bibl> XIII 46, 125f: Die Welt ist falscher Zungen voll.</note>.</s>
					<s n="60" xml:lang="la" type="auto">Haec omnia raptim.</s>
				</p>
				<p>
					<s n="61" xml:lang="la" type="auto">Valebis igitur, vir humanissime, cum ecclesia sancta et familia.</s>
					<s n="62" xml:lang="la" type="auto">Salutabis vicissim tuos, in primis meum <persName ref="p8412" cert="high">Megandrum</persName>, <persName ref="p8437" cert="high">Erasmum</persName>,<note xml:id="fn49" type="footnote" n="49">"""

In [None]:
# whole letter for the instruction prompt
letter = """
<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="file10224" type="Brief" source="HBBW-3" n="193">
<teiHeader xml:lang="de">
<fileDesc>
<titleStmt>
<title subtype="file">Konrad Geßner, Johannes Fries / Basel an Heinrich Bullinger, 25. Februar [1533]</title>
</titleStmt>
<publicationStmt>
<authority>Universität Zürich</authority>
</publicationStmt>
<sourceDesc> </sourceDesc>
</fileDesc>
</teiHeader>
<text xml:lang="la">
<body>
<div xml:id="div1" corresp="regest1">
<p>
<s n="1" xml:lang="la" type="auto">Optimo et integerrimo viro M. Henrico Bullingero, mecaenati charissimo.</s>
</p>
<p>
<s n="2" xml:lang="la" type="auto">S.</s>
<s n="3" xml:lang="la" type="auto">
Impediunt nos ab itinere
<note xml:id="fn3" type="footnote" n="3">
<persName ref="p1283" cert="high">Geßner</persName>
und
<persName ref="p1214" cert="high">Fries</persName>
befanden sich auf dem Weg nach
<placeName ref="l59" cert="high">Bourges</placeName>
.
</note>
nives, pluvia et ventorum vis.
</s>
<s n="4" xml:lang="la" type="auto">
<placeName ref="l28" cert="high">Basileae</placeName>
apud
<persName ref="p8418" cert="high">Myconium</persName>
<note xml:id="fn4" type="footnote" n="4">
<persName ref="p1214" cert="high">Fries</persName>
und
<persName ref="p1283" cert="high">Geßner</persName>
waren in einem Brief
<persName ref="p8127" cert="high">Pellikans</persName>
vom 19. Februar 1533 an
<persName ref="p8418" cert="high">Myconius</persName>
empfohlen worden (Zürich StA, E II 358, 98).
</note>
sine sumtu moramur sudum coelum et tempestatem mitiorem expectantes.
</s>
<s n="5" xml:lang="la" type="auto">Nivibus obrutae sunt viae omnes, montes praesertim, per quos nulla itinera nunc patent.</s>
<s n="6" xml:lang="la" type="auto">
Maxime tamen omnium nos detinet, quod Gallos et alios quosdam itineris comites
<note xml:id="fn5" type="footnote" n="5">Unbekannt.</note>
facturos brevi hic invenimus.
</s>
<s n="7" xml:lang="la" type="auto">Tuam humanitatem rogamus literas nobis a senatu poscat Tigurinos nos esse et a Tigurino senatu propter studia ablegatos.</s>
<s n="8" xml:lang="la" type="auto">Ita enim docti plerique consuluerunt, quo nobis tutioribus esse liceat.</s>
<s n="9" xml:lang="la" type="auto">Ne nos negligas etiam atque etiam oramus poscimusque.</s>
<s n="10" xml:lang="la" type="auto">Si dederis operam, facile impetrabis.</s>
<s n="11" xml:lang="la" type="auto">
Literis nos acceptis
<note xml:id="fn6" type="footnote" n="6">
Nach Aussage von
<persName ref="p8398" cert="high">Konrad Klauser</persName>
haben
<persName ref="p1283" cert="high">Geßner</persName>
und
<persName ref="p1214" cert="high">Fries</persName>
die gewünschten Empfehlungsschreiben offenbar erhalten (s. unten
<ref target="file10226">[Nr. 195]</ref>
S. 77, 5). Auffindbar sind sie jedoch nicht mehr.
</note>
quamprimum cum comitibus maturabimus iter.
</s>
</p>
<p>
<s n="12" xml:lang="la" type="auto">Vale et nos tibi commendatos habe.</s>
</p>
<p>
<s n="13" xml:lang="la" type="auto">
<placeName ref="l28" cert="high">Basileae</placeName>
in aedibus
<persName ref="p8418" cert="high">Myconii</persName>
, februarii 25.
</s>
</p>
<p>
<s n="14" xml:lang="la" type="auto">
<persName ref="p1214" cert="high">Ioannes Frisius</persName>
et
<persName ref="p1283" cert="high">C. Gesnerus</persName>
tui toti.
</s>
</p>
</div>
</body>
</text>
</TEI>"""

## Run completion model, LLama 3 8B
Prompting the model with the letter up until the start tag of the FN
and stopping the generation when ```</note>``` is generated.

In [None]:
# Functions to generate

class StoppingCriteriaSub(StoppingCriteria):
    """Subclass of StoppingCriteria that allows stopping based on a specific word
    disregarding the exact tokenization. A list of words can be passed to the stops argument.
    Note that it might not work well for longer words"""
    def __init__(self, stops = [], encounters=1):
        super().__init__()
        self.stops = stops

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        for stop in self.stops:
            text = tokenizer.decode(input_ids[0])
            # Todo: find a nice way to watch the generation
            # print(text.split()[-1])
            # need to use the text, as the same text can be tokenized differently depending on context...
            # no guarantee that it ends with the text, as end token may be ">." or something...
            if stop in text[-10:]:
                return True
        return False

stop_words = ["</note>"]
stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words)])


def generate(prompt:str, model, tokenizer):
    input_ids = tokenizer(prompt, return_tensors="pt")['input_ids'].to(model.device)
    # terminators = tokenizer.encode("</note>", add_special_tokens=False)
    # print(terminators)
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            do_sample=True,
            # eos_token_id=terminators,
            temperature=0.7,
            max_new_tokens=500,
            repetition_penalty=1.05,
            pad_token_id=tokenizer.eos_token_id,
            stopping_criteria=stopping_criteria  # stop when "</note>" is generated
            )

    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)

In [None]:
# load the model


model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Quantization, as shown here: https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf?usp=sharing#scrollTo=VPD7QS_DR-mw
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-9e54e1c424c3>", line 15, in <cell line: 15>
    model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
    return model_class.from_pretrained(
  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 3715, in from_pretrained
    resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
  File "/usr/local/lib/python3.10/dist-packages/transformers/utils/hub.py", line 1079, in get_checkpoint_shard_files
    cached_filename = cached_file(
  File "/usr/local/lib/python3.10/dist-packages/transformers/utils/hub.py", line 402, in cached_file
    resolved_file = hf_hub_

TypeError: object of type 'NoneType' has no len()

#### Test 1
Testing the completion of an FN that is not the first one. So the model has an
example of what we want...

Original FN: <br>
```xml
<persName ref="p1214" cert="high">Fries</persName>
und
<persName ref="p1283" cert="high">Geßner</persName>
waren in einem Brief
<persName ref="p8127" cert="high">Pellikans</persName>
vom 19. Februar 1533 an
<persName ref="p8418" cert="high">Myconius</persName>
empfohlen worden (Zürich StA, E II 358, 98).
</note>
```

In [None]:
for i in range(10):
  print(generate(text1, model, tokenizer))
torch.cuda.empty_cache()

#### Test 2
Same letter, but the model has to generate the first FN, i.e. it has no example of what a FN in the edition looks like in the prompt.

Original FN: <br>
```xml
<persName ref="p1283" cert="high">Geßner</persName>
und
<persName ref="p1214" cert="high">Fries</persName>
befanden sich auf dem Weg nach
<placeName ref="l59" cert="high">Bourges</placeName>
.
</note>
```


In [None]:
for i in range(10):
  print(generate(text2, model, tokenizer))
torch.cuda.empty_cache()

#### test 3

Test mit einem langen Brief: <br>
- Es geht ein bisschen länger, und braucht c.a. 12 GB RAM (T4 hat bis zu 15)

Original FN: <br>
```xml
<note xml:id="fn49" type="footnote" n="49">
<persName ref="p8437" cert="high">Erasmus Schmid</persName>
.
</note>
```

In [None]:
for i in range(10):
  print(generate(text3, model, tokenizer))
torch.cuda.empty_cache()

In [None]:
# del model
torch.cuda.empty_cache()

## Run instruction model LLama3 8B

In [None]:
# Generating with a chat model
def generate_chat(messages:list, model, tokenizer):
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # generate the attention mask, got a warning that it might be needed, though I am not sure if that is necessary
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=model.device)

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)

In [None]:
# load the model


model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Quantization, as shown here: https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf?usp=sharing#scrollTo=VPD7QS_DR-mw
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

In [None]:


ICL_SYSTEM_PROMPT = "You are a research assistant for a historian, specialized on the European reformation working on an edition of the correspondence of Heinrich Bullinger. Given a letter in TEI format, your task is to complete the missing footnote"

# without an example it is actually rather difficult to get the model to reply in the correct format... idk. I think there is a way to "force output format?" Constrained Decoding...
SYSTEM_PROMPT = """
You are a research assistant for a historian specialized on the European reformation, working on an edition of the correspondence of Heinrich Bullinger.
Given a letter in TEI format, your task is to complete the missing Footnote. That is complete until you reach the </note> tag.
"""
# footnote for the icl
footnote_text2 = """<note xml:id="fn3" type="footnote" n="3"><persName ref="p1283" cert="high">Geßner</persName> und <persName ref="p1214" cert="high">Fries</persName> befanden sich auf dem Weg nach <placeName ref="l59" cert="high">Bourges</placeName>.</note>"""

def instruction_prompt(text):
  """Generate a prompt carefully explaining the task, no examples needed (hopefully...)"""
  messages = [
      {"role": "system", "content": SYSTEM_PROMPT},
      {"role": "researcher", "content": text},
  ]
  return messages

def icl_prompt(text):
  """Generate a prompt that gives the example from text2"""
  messages = [
      {"role": "system", "content": ICL_SYSTEM_PROMPT},
      {"role": "historian", "content": text2},
      {"role": "assistant", "content": footnote_text2},
      {"role": "historian", "content": text}
  ]
  return messages




#### test 1
Formulating the completion task in Chat template.

Original FN: <br>
```xml
<persName ref="p1214" cert="high">Fries</persName>
und
<persName ref="p1283" cert="high">Geßner</persName>
waren in einem Brief
<persName ref="p8127" cert="high">Pellikans</persName>
vom 19. Februar 1533 an
<persName ref="p8418" cert="high">Myconius</persName>
empfohlen worden (Zürich StA, E II 358, 98).
</note>
```
FN from the ICL:<br>
```xml
<note xml:id="fn49" type="footnote" n="49">
<persName ref="p8437" cert="high">Erasmus Schmid</persName>
.
</note>
```

In [None]:

for i in range(10):
  print(generate_chat(icl_prompt(text1), model, tokenizer))
torch.cuda.empty_cache()

In [None]:
torch.cuda.empty_cache()

Zero shot gives wacky results... tinkered a bit with it, but to no avail...

In [None]:
for i in range(10):
  print(generate_chat(instruction_prompt(text1), model, tokenizer))
torch.cuda.empty_cache()

In [None]:
torch.cuda.empty_cache()

When the example is from another letter, the model is very confused, or the prompt is too long for the model?

In [None]:
for i in range(10):
  print(generate_chat(icl_prompt(text3), model, tokenizer))
torch.cuda.empty_cache()

#### test 2
Feeding the whole letter w/o the FN in question
Need to feed an example letter, otherwise the output will be very random. Given the context, it is probably too much?

In [None]:
import re

def footnote_regex(n):
  matching_string = (fr"( ?<note [^>]*? type=\"footnote\" n=\"{n}\">)" # matching group 1: the opening tag
                  r"(.*?(?=<\/note>))"  # matching group 2 everything up until the closing tag (positive lookup!! no matching group!!)
                  r"(<\/note>)"  # matching group 3: the endtag
  )
  return matching_string

def remove_footnote_content(text, n):
  """remove all content from the footnote n="n" """
  return re.sub(footnote_regex(n), r"\1\3", text)

def get_footnote_content(text, n):
  """get the content of a FN"""
  return re.search(footnote_regex(n), text).group(2)

ICL_SYSTEM_PROMPT = "You are a research assistant for a historian, specialized on the European reformation working on an edition of the correspondence of Heinrich Bullinger. Given a letter in TEI format, your task is to complete the missing footnote"

def HISTORIAN_PROMPT(text, n):
  return f"Bitte schlage mir einen Text für Fussnote n={n} in folgendem Dokument vor:\n\n{text}"

def instruct_prompt_fill_in(letter, n, example=(11130, 7)):
  """
  param text: Letter TEI
  param n: Footnote to generate
  example: letter_id and FN to give as an example, 12342 is from the dev-set..."""

  # find a good example letter, to put as icl prompt...
  # get the letter
  example_letter_id, example_n = example
  with open(f"../data/human/{example_letter_id}.xml") as infile:
    example_letter = infile.read()

  messages = [
      {"role": "system", "content": ICL_SYSTEM_PROMPT},
      {"role": "historian", "content": HISTORIAN_PROMPT(remove_footnote_content(example_letter, example_n), example_n)},
      {"role": "assistant", "content": get_footnote_content(example_letter, example_n)},
      {"role": "historian", "content": HISTORIAN_PROMPT(letter, n)}
  ]

  return messages





In [None]:
instruct_prompt_fill_in(letter, 3)

In [None]:
# Use the DFs I made:
# From the DF get a Letter and an FN, remove the FN by matching the xml_string
# apply the chat template
import pandas as pd
import re



# test letter
letter_id = 10224

footnote_df = pd.read_csv("../data/footnote_df.csv")

# get the letter
with open(f"../data/human/{letter_id}.xml") as infile:
  letter = infile.read()

In [None]:
letter


It results in suggestions to FN3 from the example letter. Is the prompt too long for the model?

In [None]:
for i in range(10):
  print(generate_chat(instruct_prompt_fill_in(letter, 3), model, tokenizer))
  print("##################################################################################")


In [None]:
torch.cuda.empty_cache()

#### test 3
removing all FNs

In [None]:
# footnotes of that letter
footnote_df_letter = footnote_df[footnote_df["letter_id"] == letter_id]

letter_no_fns = letter

# removing all FNs
for footnote in footnote_df_letter.iloc:  # Maybe we'll have to take care of the labels here...
  letter_no_fns = remove_footnote_content(letter_no_fns, footnote["n_footnote"])

letter_no_fns

In [None]:
def instruct_prompt_no_fns(letter, n, example_n=None):
  """
  param text: Letter TEI
  param n: Footnote to generate
  example: letter_id and FN to give as an example, 12342 is from the dev-set..."""

    # footnotes of that letter
  footnote_df_letter = footnote_df[footnote_df["letter_id"] == letter_id]

  letter_no_fns = letter

  # removing all FNs
  for footnote in footnote_df_letter.iloc:  # Maybe we'll have to take care of the labels here...
    letter_no_fns = remove_footnote_content(letter_no_fns, footnote["n_footnote"])


  messages = [
      {"role": "system", "content": ICL_SYSTEM_PROMPT},
  ]

  # add example if specified
  if example_n:
    messages.append({"role": "historian", "content": HISTORIAN_PROMPT(letter_no_fns, example_n)},)
    messages.append({"role": "assistant", "content": get_footnote_content(letter, example_n)})

  # add prompt for generation
  messages.append({"role": "historian", "content": HISTORIAN_PROMPT(letter_no_fns, n)})

  return messages

In [None]:
# no examples (zero-shot)

for i in range(10):
  print(generate_chat(instruct_prompt_no_fns(letter, 3), model, tokenizer))
  print("##################################################################################")

In [None]:
for i in range(10):
  print(generate_chat(instruct_prompt_no_fns(letter, 4, example_n=3), model, tokenizer))
  print("##################################################################################")

In [None]:
torch.cuda.empty_cache()

long letter

In [None]:
letter_id = 12260
footnote_df = pd.read_csv("../data/footnote_df.csv")
# get the letter
with open(f"../data/downsized_letters/{letter_id}.xml") as infile:
  letter = infile.read()

In [None]:
print(generate_chat(instruct_prompt_no_fns(letter, 43, example_n=9), model, tokenizer))

## more specific prompt

In [None]:

system_prompt = "Du bist Historiker und hast dich auf die Reformation spezialisiert. Gerade arbeitest du daran Briefe von Heinrich Bullinger zu edieren. Vervollständige die inhaltlichen Fussnoten."
query_whole_letter = """Schlage einen Text für Fussnote 11 vor, die folgende Frage beantwortet: Welche Informationen sind \u00fcber Johannes Buchser relevant f\u00fcr den Kontext des Satzes?
<TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="file11298" type="Brief" source="HBBW-9" n="1255">
<teiHeader xml:lang="de">
<fileDesc>
<titleStmt>
<title subtype="file">Matthias Erb / Reichenweier an Heinrich Bullinger, 17. April [1539]</title>
</titleStmt>
<publicationStmt>
<authority>Universität Zürich</authority>
</publicationStmt>
<sourceDesc>
<bibl type="scan">Zürich StA</bibl>
</sourceDesc>
</fileDesc>
</teiHeader>
<text xml:lang="la">
<body>
<div xml:id="div1" corresp="regest1">
<p>
<s n="1" xml:lang="la" type="auto">Gratia domini tecum.</s>
</p>
<p>
<s n="2" xml:lang="la" type="auto">Quid a te mihi iucundius offerri potuisset quam illa tua consolatrix epistola?</s>
<s n="3" xml:lang="la" type="auto">
Nam eius adhortatione
<note xml:id="fn2" type="footnote" n="2">
Vgl. oben
<ref target="file11287">Nr. 1244</ref>
, 10-18.
</note>
(vera dico, optime Bullingere) non modo prudentior, sed et vigilantior redditus tollero levius tum satanę tum impiorum calumnias.
</s>
<s n="4" xml:lang="la" type="auto">Scio enim, quam multi simus episcopi, at pauci oculati.</s>
</p>
<p>
<s n="5" xml:lang="la" type="auto">
Hinc verissimum est, quod scribis
<note xml:id="fn3" type="footnote" n="3">Vgl. ebd. 19-30.</note>
nostra negligentia aut, si mavis, oscitantia plures posse perire, diligentia vero non paucos servari etc.
</s>
<s n="6" xml:lang="la" type="auto">
Tu roga dominum pro me, ut ecclesiolam huius agri
<note xml:id="fn4" type="footnote" n="4">
Die Kirche bestand aus den Gemeinden der Herrschaften
<placeName ref="l221" cert="high">Horburg</placeName>
und
<placeName ref="l411" cert="high">Reichenweier</placeName>
, denen
<persName ref="p1010" cert="high">Erb</persName>
als Superintendent vorstand; vgl. Johann
<hi>Adam,</hi>
Evangelische Kirchengeschichte der elsässischen Territorien bis zur französischen Revolution, Straßburg 1928, S. 297.
</note>
variis quassatam procellis per ministros suos (qui fere duodecim sumus) solidam ac constantem effitiat perque spiritum suum roborans ad portum fatiat enatare quietum, siquidem satan furit adhuc in papistis.
</s>
<s n="7" xml:lang="la" type="auto">Falsi fratres nobis imponunt, magistratus querit sua, anabaptistę ecclesię corpus diffindentes in diversum rapiunt, [p]hilautia etiam sanctis imponunt.</s>
<s n="8" xml:lang="la" type="auto">Et quid plura?</s>
<s n="9" xml:lang="la" type="auto">
Tenuis certe in nobis spes ac fere deplorata esset, nisi contra spem in spe
<note xml:id="fn5" type="footnote" n="5">
<cit type="bible">
<ref>Röm 4, 18</ref>
</cit>
.
</note>
<foreign xml:lang="el">θεὸς ἀπὸ µηχανῆς</foreign>
<note xml:id="fn6" type="footnote" n="6">
<bibl>Adagia</bibl>
, 1, 1, 68 (
<bibl>LB</bibl>
II 52f).
</note>
nobis affulgeret.
</s>
<s n="10" xml:lang="la" type="auto">
Novarum nil, nisi quod
<persName ref="p8374" cert="high">Hedio</persName>
meus scribit
<note xml:id="fn7" type="footnote" n="7">
Der entsprechende Brief
<persName ref="p8374" cert="high">Hedios</persName>
ist nicht erhalten.
</note>
protestantes adhuc expectare caesaris veredarium
<note xml:id="fn8" type="footnote" n="8">
Gemeint ist der Orator Kaiser Karis V.,
<persName ref="p7852" cert="high">Johann von Weeze</persName>
; zum Stand der Verhandlungen in
<placeName ref="l157" cert="high">Frankfurt</placeName>
vgl. oben
<ref target="file11293">Nr. 1250</ref>
, 52-63.
</note>
speratque litem ad annum protelandam; interim tamen consultabitur de concilio nationali etc.
<pb type="edition" next="111"/>
</s>
</p>
<p>
<s n="11" xml:lang="la" type="auto">
<persName ref="p19956" cert="high">Marcus</persName>
<note xml:id="fn9" type="footnote" n="9">
Der bereits oben
<ref target="file11287">Nr. 1244</ref>
, 7-9 erwähnte Markus ist nicht weiter bekannt.
</note>
rediit alacer et ludicer etc.
</s>
<s n="12" xml:lang="la" type="auto">
<persName ref="p8364" cert="high">Beatus</persName>
ille, sororius olim
<persName ref="p261" cert="high">Ioannis Surani</persName>
<note xml:id="fn11" type="footnote" n="11"></note>
, annuente comite
<persName ref="p7492" cert="high">Wilhelmo a Furstenberg</persName>
ad
<placeName ref="l1720" cert="high">Gengenbachum</placeName>
vocabitur.
</s>
<s n="13" xml:lang="la" type="auto">Certe oppidulum est elegans ac vulgus verbo dei instructum, ilhic et annona copiose provenit.</s>
<s n="14" xml:lang="la" type="auto">
Est autem sub ditione imperii et comiti obpignoratum
<note xml:id="fn12" type="footnote" n="12">
<persName ref="p7492" cert="high">Wilhelm von Fürstenberg</persName>
verwaltete
<placeName ref="l1720" cert="high">Gengenbach</placeName>
als Pfandherr; vgl. Johannes Volker
<hi>Wagner,</hi>
Graf
<persName ref="p7492" cert="high">Wilhelm von Fürstenberg</persName>
, 1491-1549, und die politischgeistigen Mächte seiner Zeit, Stuttgart 1966. - Pariser Historische Studien, 4, S. 185f.
</note>
.
</s>
<s n="15" xml:lang="la" type="auto">Hinc tantum agit patronum.</s>
</p>
<p>
<s n="16" xml:lang="la" type="auto">
Bene vale et
<persName ref="p1753" cert="high">Leonem</persName>
cum
<persName ref="p8412" cert="high">Megandro</persName>
meo terque quaterque salutabis.
</s>
<s n="17" xml:lang="la" type="auto">Boni consule meam balbutiem.</s>
</p>
<p>
<s n="18" xml:lang="la" type="auto">Ex Richenvilla, 17. die aprilis.</s>
</p>
<p>
<s n="19" xml:lang="la" type="auto">
Tuissimus ex animo
<persName ref="p1010" cert="high">Matthias Erbius</persName>
.
</s>
</p>
<p>
<note type="address">[Adresse auf der Rückseite:]</note>
<s n="20" xml:lang="la" type="auto">Viro in primis et pio et docto d. Henricho Bullingero, celebratissimae civitatis Tigurinę a contionibus, fratri charissimo.</s>
</p>
</div>
</body>
</text>
</TEI>
Gebe nur den Text an, wie er in der Fussnote erscheinen könnte, mitsamt TEI Markup"""

query_just_sentence = """Schlage einen Text für Fussnote 11 vor, die folgende Frage beantwortet: Welche Informationen sind \u00fcber Johannes Buchser relevant f\u00fcr den Kontext des Satzes?
<s n="12" xml:lang="la" type="auto">
<persName ref="p8364" cert="high">Beatus</persName>
ille, sororius olim
<persName ref="p261" cert="high">Ioannis Surani</persName>
<note xml:id="fn11" type="footnote" n="11"></note>
, annuente comite
<persName ref="p7492" cert="high">Wilhelmo a Furstenberg</persName>
ad
<placeName ref="l1720" cert="high">Gengenbachum</placeName>
vocabitur.
</s>
Gebe nur den Text an, wie er in der Fussnote erscheinen könnte, mitsamt TEI Markup"""

query_2 = """Füge in Fussnote 5 Informationen ein: Welche Quelle liefert Informationen zum Inhalt des von Frechtus gesendeten Autographs?
<s n="4" xml:lang="la" type="auto">
Verum est, statim post abitum
<persName ref="p467" cert="high">Buceri</persName>
ad
<persName ref="p2095" cert="high">Melanctonem</persName>
<note xml:id="fn4" type="footnote" n="4"></note>
<persName ref="p1055" cert="high">Frechtus</persName>
, ille plane meus
<placeName ref="l513" cert="high">Ulmensis</placeName>
ecclesie antistes, vir iuxta pius et eruditus, ad me ceterosque symmystas
<persName ref="p467" cert="high">Buceri</persName>
autographum misit, quod et tibi bona fide communicare volui
<note xml:id="fn5" type="footnote" n="5"></note>
, non ut aliis ostendas, sed apud te contineas habeasque, quo negotium hoc foelicius vehere poteris.
</s>
Gib als Antwort nur die Fussnote und nichts anderes. Wiederhole nicht den ganzen Text"""


# Generating with a chat model
def generate_chat(messages:list, model, tokenizer):
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # generate the attention mask, got a warning that it might be needed, though I am not sure if that is necessary
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=model.device)

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)

In [None]:
# load the model


model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Quantization, as shown here: https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf?usp=sharing#scrollTo=VPD7QS_DR-mw
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [None]:
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": query_just_sentence}]
for _ in range(10):
  print(generate_chat(messages, model, tokenizer))
  print("##################################################")

<note xml:id="fn11" type="footnote" n="11">
  In diesem Zusammenhang ist Johannes Buchser als reformierter Theologe und Mitstreiter von Johannes Suranus relevant. Er spielte eine wichtige Rolle bei der Einführung der Reformation in der Region um Gengenbach.
</note>
##################################################
<note xml:id="fn11" type="footnote" n="11">
  In diesem Zusammenhang ist Johannes Buchser (auch Johannes Suranus) als ein enger Vertrauter von Beatus von Gengenbach und als Berater des Komturs Wilhelm von Fürstenberg relevant. Über Buchser ist bekannt, dass er ein bedeutender Theologe und Kirchenhistoriker war, der sich mit den Ideen der Reformation auseinandersetzte. Seine Kontakte zu Beatus und Wilhelm von Fürstenberg unterstreichen die Bedeutung des Gengenbacher Klosters als Zentrum der frühreformatorischen Bewegung.
</note>
##################################################
<note xml:id="fn11" type="footnote" n="11">
    In diesem Zusammenhang ist Johannes Buchser (auch 

In [None]:
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": query_whole_letter}]


for _ in range(10):
  print(generate_chat(messages, model, tokenizer))
  print("##################################################################################")

<note xml:id="fn11" type="footnote" n="11">
Der sororius war ein Ritter, der in der Nähe von Gengenbach ansässig war. Vgl. Johann
<hi>Adam,</hi>
Evangelische Kirchengeschichte der elsässischen Territorien bis zur französischen Revolution, Straßburg 1928, S. 296.
</note>
##################################################################################
<note xml:id="fn11" type="footnote" n="11">
<bibl>Der bereits oben erwähnte Markus ist nicht weiter bekannt.</bibl>
</note>
##################################################################################
<note xml:id="fn11" type="footnote" n="11">
Johannes Buchser, ein Freund und Unterstützer von Bullinger, ist hier erwähnt. Er war ein bedeutender Theologe und Reformator der elsässischen Kirche.
</note>
##################################################################################
<note xml:id="fn11" type="footnote" n="11">
Für Johannes Buchser ist nichts bekannt, das für den Kontext des Satzes relevant wäre.
</note>
##############

In [None]:
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": query_2}]
for _ in range(10):
  print(generate_chat(messages, model, tokenizer))
  print("##################################################################################")

<note xml:id="fn5" type="footnote" n="5">Quelle: Brief des Frechtus an Bullinger, in: Heinrich Bullinger, Epistolae theologorum et aliorum virorum illustrium, Zürich 1566, S. 234-235.
##################################################################################
<note xml:id="fn5" type="footnote" n="5">Die Informationen zum Inhalt des von Frechtus gesendeten Autographs finden sich in einem Brief des Ulmer Superintendenten Georg Stöckel an Martin Bucer, der im Jahr 1535 geschrieben wurde. In diesem Brief wird auf das Autograph angespielt, das Frechtus an Bullinger geschickt hatte.
##################################################################################
<note xml:id="fn5" type="footnote" n="5">
Quelle: Brief des Frechtus, der sich an Bullinger richtet, lautet: "Briefe von Frechtus an Bullinger, 1548, UB Zürich, Ms. I 14, fol. 115-116" (Bullinger, Heinrich: Briefwechsel. Hg. v. P. Zimmermann. Bd. 5. Zürich 1942, S. 123-124).
</note>
##########################################

### RAG

## Evaluating the Generation
In terms of ranking BLEU and ROUGE agree more or less, the two German BERT models seem to agree between each other.


| **Tests with Markup**             | BLEU  | ROUGE | roberta-base | xlm-roberta-base | wechsel-german | german-cased |
|------------------|-------|-------|--------------|------------------|----------------|--------------|
| Completion (1 example)          | **0.17** | **0.24**  | **0.85**      | **0.85**          | **0.82**       | **0.71**       |
| Completion (no example)            | 0.00  | 0.01  | 0.70         | 0.69             | 0.66           | 0.50         |
| Completion (long letter)            | 0.16  | 0.15  | 0.84         | 0.83             | 0.79           | **0.71**         |
| Instruction (1-shot, complete)| 0.11  | 0.15  | 0.82         | 0.83             | 0.81           | 0.70         |
| Instruction (1-shot, add)| 0.01  | 0.16  | 0.82         | 0.82             | 0.77           | 0.62         |
|Instruction (1-shot, long letter)| -     | -     | -            | -                | -              | -            |


| Tests without Markup             | BLEU  | ROUGE | roberta-base | xlm-roberta-base | wechsel-german | german-cased |
|------------------|-------|-------|--------------|------------------|----------------|--------------|
| Completion (1 example)            | 0.00  | **0.10**  | 0.82         | 0.84             | **0.76**       | 0.57         |
| Completion (no example)            | 0.00  | 0.01  | 0.71         | 0.72             | 0.67           | 0.50         |
| Completion (long letter)            | 0.00  | 0.10  | 0.80         | 0.83             | 0.77           | 0.58         |
| instruction (1-shot, complete)| 0.00  | 0.07  | 0.81         | 0.82             | 0.75           | 0.56         |
| instruction (1-shot, add) | **0.00**  | **0.13**  | **0.83**      | **0.85**          | **0.76**       | **0.58**       |
| instruction (1-shot, long letter) -     | -     | -            | -                | -              | -            |


Without Markup:


### installs, imports and functions

In [None]:
!pip install evaluate
!pip install transformers
!pip install bert-score
!pip install rouge_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting pyarrow>=15.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m34.7 MB/s

In [None]:
import re
from evaluate import load
import evaluate
from html import unescape
import bert_score
from tqdm import tqdm

bleu = load("bleu")
rouge = load("rouge")
bertscore = load("bertscore")

def remove_outer_note_tag(xml_str):
  """remove the outer note tag, since in the footnote_xml in the df it is still contained"""
  match_obj = re.match(r"<note [^>]*?>(.*?)</note>", xml_str, re.DOTALL)
  if match_obj: return match_obj.group(1)
  else: return xml_str

def normalize_ws(text):
  """normalize whitespace"""
  return re.sub(r"\s+", " ", text).strip()

def remove_markup(sent):
  """Remove all markup from a sentence"""
  return unescape(re.sub(rf"<.*?>", "", sent))

###########
# Calculating BERT scores
###########
# Default of what layer to use for the representation
# The original module defines the default as
# "the number of layers tuned on WMT16 correlation data."
# I have to add some models, that are not in the original
# Maybe a todo: Test if the evaluation changes significantly based on this...
bert_score.utils.model2layers["google-bert/bert-base-german-cased"] = 9
bert_score.utils.model2layers["benjamin/roberta-base-wechsel-german"] = 9

def bertscore_f1_average(preds, refs, model_type):
  """calculate bertscore f1 average
  args:
    model_type: str, name of the model to use for the scoring"""

  results = bertscore.compute(predictions=preds, references=refs, model_type=model_type)
  f1 = results["f1"]
  return sum(f1) / len(f1)

def semantic_scores(predictions, references):
  """calculate semantic scores with BERT"""
  # calculate f1 average with different Bert models
  # the large models seem not to make much of a difference, but take more time
    # ->> but check it anyway sometime
  # The German models are more critical though
  # models = ["roberta-base", "xlm-roberta-base", "benjamin/roberta-base-wechsel-german",
    #         "google-bert/bert-base-german-cased"]
  models = ["benjamin/roberta-base-wechsel-german"]
  scores = {}
  # print(f"calculating Bert scores for {len(models)} models")
  for model in models:
    scores[model] = bertscore_f1_average(predictions, references, model)
  return scores

def all_metrics(predictions, references):

  all_results = {
      "with_markup": {},
      "without_markup": {}
  }
  # Calculate BLEU score
  bleu_results = bleu.compute(predictions=predictions, references=references)
  all_results["with_markup"]["bleu"] = bleu_results["bleu"]

  # Calculate ROUGE scores
  rouge_results = rouge.compute(predictions=predictions, references=references)
  all_results["with_markup"]["rouge"] = rouge_results["rouge1"]

  # Calculate Semantic score with BERT
  bertscore_results = semantic_scores(predictions=predictions, references=references)
  all_results["with_markup"]["bertscore"] = bertscore_results
  # print("\n".join([f"{model}: {score}" for model, score in bertscore_results.items()]))

  # removing markup
  predictions = [remove_markup(pred) for pred in predictions]
  references = [remove_markup(ref) for ref in references]

  # Calculate BLEU score
  bleu_results = bleu.compute(predictions=predictions, references=references)
  all_results["without_markup"]["bleu"] = bleu_results["bleu"]

  # Calculate ROUGE scores
  rouge_results = rouge.compute(predictions=predictions, references=references)
  all_results["without_markup"]["rouge"] = rouge_results["rouge1"]

  # Calculate Semantic score with BERT
  bertscore_results = semantic_scores(predictions=predictions, references=references)
  all_results["without_markup"]["bertscore"] = bertscore_results
  # print("\n".join([f"{model}: {score}" for model, score in bertscore_results.items()]))

  return all_results

def evaluate_batch(preds, refs, batch_size=64, processes=8):
  results =  {
            "bleu":[],
            "rouge":[],
            "bertscore":[]
          }
  print("calculating BLEU and ROUGE...")
  pool = multiprocessing.Pool(processes=8)
  bleu_rouge_results = list(tqdm(pool.imap(compute_bleu_rouge, zip(preds, refs)), total=len(preds)))
  pool.close()
  pool.join()
  for bleu, rouge in bleu_rouge_results:
    results["bleu"].append(bleu)
    results["rouge"].append(rouge)

  print(f"calculating semantic scores with BERT, batch_size = {batch_size}...")
  results["bertscore"].append(compute_bertscore(preds, refs, batch_size=batch_size)["f1"])
  return results

def evaluate_with_and_without(preds, refs, batch_size=64, processes=8):
  all_results = {
        "with_markup": {},
        "without_markup": {}
        }
  print("\n###########evaluating with mark-up########\n")
  all_results["with_markup"] = evaluate_batch(preds, refs, batch_size=batch_size, processes=processes)
  print("\n###########evaluating without mark-up########\n")
  preds = [remove_markup(pred) for pred in preds]
  refs = [remove_markup(ref) for ref in refs]
  all_results["without_markup"] = evaluate_batch(preds, refs, batch_size=batch_size, processes=processes)



def compute_bleu_rouge(pred_ref_pair):
  pred, ref = pred_ref_pair
  result_bleu = bleu.compute(predictions=[pred], references=[ref])["bleu"]
  result_rouge = rouge.compute(predictions=[pred], references=[ref])["rouge1"]
  return (result_bleu, result_rouge)


def compute_bertscore(predictions, references, batch_size=2):

  results = {"precision": [], "recall": [], "f1": []}
  with tqdm(total=len(predictions)) as pbar:
      for i in range(0, len(predictions), batch_size):
          batch_preds = predictions[i : i + batch_size]
          batch_refs = references[i : i + batch_size]
          batch_results = bertscore.compute(
              predictions=batch_preds, references=batch_refs, lang="de"
          )
          results["precision"].extend(batch_results["precision"])
          results["recall"].extend(batch_results["recall"])
          results["f1"].extend(batch_results["f1"])
          pbar.update(len(batch_preds))
  return results



In [None]:
a = [1,2,3]
b = ["a", "b", "c"]
next(zip(a,b))

(1, 'a')

### Evaluating output from completion model

In [None]:
def format_completion_model_output(x):
  """get a list of the generated fns without the end-tag"""
  generated_fns = x.split("\n")
  generated_fns = [re.sub(r"</note>.*$", "", fn).strip() for fn in generated_fns]  # remove end tag
  return generated_fns

#### test 1
Completion model has to write the second FN in the text.

In [None]:


original_fn = """<persName ref="p1214" cert="high">Fries</persName>
und
<persName ref="p1283" cert="high">Geßner</persName>
waren in einem Brief
<persName ref="p8127" cert="high">Pellikans</persName>
vom 19. Februar 1533 an
<persName ref="p8418" cert="high">Myconius</persName>
empfohlen worden (Zürich StA, E II 358, 98).
</note>"""

# Copied from experiment one
x = """In der Druckversion von 1838 ist Myconium in Myconii. In der Druckversion von 1904 ist Myconium in Myconio.</note></
Die Schreibweise ist ungewöhnlich. Die Schreibung des Namens mit "m" ist in der Literatur nicht bekannt. Myconium war ein Basler Drucker.</note>
<persName ref="p8418" cert="high">Myconius</persName> (um 1495–1544) war Professor der Theologie in <placeName ref="l59" cert="high">Bourges</placeName>. Er hatte eine reformatorische Richtung vertreten und gehörte zu den Unterzeichnern des <term ref="l34">Bourges-Eides</term> von 1534. Auf ihn geht die erste lateinische Übersetzung der <ref target="l27">Heidelberger Katechismus</ref> zurück. <persName ref="p1283" cert="high">Geßner</persName> und <persName ref="p1214" cert="high">Fries</persName> wurden durch ihn mit <ref target="l27">Bullinger</ref> in Verbindung gebracht.</note>
Zitat aus <bibl ref="b192">Bullingers Briefwechsel</bibl>, Bd. 1, S. 69.</note></
<persName ref="p8418" cert="high">Myconium</persName> ist der Basler Drucker und Buchhändler <persName ref="p8419" cert="high">Sebastian Myconius</persName>.</note>
Die Druckerei des <persName ref="p8418" cert="high">Myconius</persName></note>,
Siehe <hi>regest2</hi></note>
Vermutlich ist der <persName ref="p8418" cert="high">Myconius</persName> <persName ref="p1191" cert="high">Fries</persName>, dessen Sohn <persName ref="p1192" cert="high">Johann Jakob</persName> im Jahr 1576 die Reformation in <placeName ref="l28" cert="high">Basel</placeName> wieder eingeführt hat.</note></
 <persName ref="p8418" cert="high">Myconius</persName> war ein Freund von <persName ref="p1283" cert="high">Geßner</persName> und <persName ref="p1214" cert="high">Fries</persName>. </note>,
Siehe die Antwort des <persName ref="p8418" cert="high">Myconium</persName> vom 27. Februar 1533, <ref target="file10339"/>.</note>,"""

generated_fns = format_completion_model_output(x)

# Normalize WS:
predictions = [normalize_ws(fn) for fn in generated_fns]
references = [normalize_ws(original_fn) for _ in range(len(generated_fns))]


In [None]:
results_completion_test1 = all_metrics(predictions, references)

calculating Bert scores for 4 models


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-german and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating Bert scores for 4 models


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-german and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import json
print(json.dumps(results_completion_test1, indent=4))

{
    "with_markup": {
        "bleu": 0.17184261471618462,
        "rouge": 0.24375082681093463,
        "bertscore": {
            "roberta-base": 0.8453013896942139,
            "xlm-roberta-base": 0.8534922420978546,
            "benjamin/roberta-base-wechsel-german": 0.8225707352161408,
            "google-bert/bert-base-german-cased": 0.7088321954011917
        }
    },
    "without_markup": {
        "bleu": 0.0,
        "rouge": 0.10409088694766444,
        "bertscore": {
            "roberta-base": 0.817693042755127,
            "xlm-roberta-base": 0.8369762599468231,
            "benjamin/roberta-base-wechsel-german": 0.7648057699203491,
            "google-bert/bert-base-german-cased": 0.5699180662631989
        }
    }
}


#### test 2
Completion model has to write the first FN in the text

In [None]:
original_fn = """<persName ref="p1283" cert="high">Geßner</persName>
und
<persName ref="p1214" cert="high">Fries</persName>
befanden sich auf dem Weg nach
<placeName ref="l59" cert="high">Bourges</placeName>
.
</note>"""

x = """Eigene Schrift.</note>
Lassen wir den Weg frei.</note>
Vermutlich wegen der von Bullinger verhängten Schreibverbot für die Anhänger des Zürcher Täufertums (vgl. HBBW-3 190). </note>
P. 4. (Bullinger).</note>

[Bullinger 1926, p. 26]</note>
Glossar: Impedire = in irremediale Schwierigkeiten geraten.</note>
Cf. <ptr target="file10223#div2" type="copy"/> (n. 3).</note>
Bullinger, Epist. 26, 4.</note>
*<hi rend="sup">†</hi></note>
[A. 20.]</note><"""

generated_fns = format_completion_model_output(x)

# Normalize WS:
predictions = [normalize_ws(fn) for fn in generated_fns]
references = [normalize_ws(original_fn) for _ in range(len(generated_fns))]

In [None]:
results_completion_test2 = all_metrics(predictions, references)

calculating Bert scores for 4 models


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-german and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating Bert scores for 4 models


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-german and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
print(json.dumps(results_completion_test2, indent=4))

{
    "with_markup": {
        "bleu": 0.0,
        "rouge": 0.005194805194805196,
        "bertscore": {
            "roberta-base": 0.6956130537119779,
            "xlm-roberta-base": 0.6917953166094694,
            "benjamin/roberta-base-wechsel-german": 0.658250483599576,
            "google-bert/bert-base-german-cased": 0.5005842366001823
        }
    },
    "without_markup": {
        "bleu": 0.0,
        "rouge": 0.011363636363636366,
        "bertscore": {
            "roberta-base": 0.7076948501847007,
            "xlm-roberta-base": 0.721705436706543,
            "benjamin/roberta-base-wechsel-german": 0.6661924503066323,
            "google-bert/bert-base-german-cased": 0.5023831996050748
        }
    }
}


#### test 3
longer text

In [None]:
original_fn = """<note xml:id="fn49" type="footnote" n="49">
<persName ref="p8437" cert="high">Erasmus Schmid</persName>
.
</note>"""

x = """Vgl. <bibl>Adagia</bibl>, 4, 1, 84 (<bibl>ASD</bibl> II/5 157, <ref target="file12185">Nr. 2031</ref>). - <persName ref="p8412" cert="high">Megander</persName> und <persName ref="p8437" cert="high">Erasmus</persName> sind offenbar identisch.</note>
Zu verstehen: <persName ref="p8437" cert="high">Erasmus</persName> von <placeName ref="l1398" cert="high">Burg</placeName>. - <persName ref="p8412" cert="high">Johannes Megander</persName>.</note>
D.h. <persName ref="p8437" cert="high">Johann Eschmann</persName>.</note>
Vgl. oben <ref target="file12256">Nr. 2209</ref>, Anm. 14.</note>
Vgl. <bibl>TPMA</bibl> XII 75, 96, 99f: Megander, Erasmus.</note>
Vgl. <bibl>Adagia</bibl>, 1, 4, 4 (<bibl>ASD</bibl> II/1 158, <ref target="file12195">Nr. 1939</ref>).</note>
Erasmus Megander.</note>
Vgl. <bibl>Adagia</bibl>, 1, 3, 2 (<bibl>ASD</bibl> I/1 141, <ref target="file11509">Nr. 1337</ref>).</note>
Vgl. <bibl>Adagia</bibl>, 1, 8, 76 (<bibl>ASD</bibl> II/1 238f, Nr. 176).</note>
Vgl. oben <ref target="file12256">Nr. 2209</ref>, 3.</note>"""

generated_fns = format_completion_model_output(x)

# Normalize WS:
predictions = [normalize_ws(fn) for fn in generated_fns]
references = [normalize_ws(original_fn) for _ in range(len(generated_fns))]

In [None]:
results_completion_test3 = all_metrics(predictions, references)

calculating Bert scores for 4 models


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-german and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating Bert scores for 4 models


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-german and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
print(json.dumps(results_completion_test3, indent=4))

{
    "with_markup": {
        "bleu": 0.15678695449484145,
        "rouge": 0.14784825311141103,
        "bertscore": {
            "roberta-base": 0.8399091482162475,
            "xlm-roberta-base": 0.825313127040863,
            "benjamin/roberta-base-wechsel-german": 0.7936817109584808,
            "google-bert/bert-base-german-cased": 0.7072402536869049
        }
    },
    "without_markup": {
        "bleu": 0.0,
        "rouge": 0.10052631578947369,
        "bertscore": {
            "roberta-base": 0.8041177749633789,
            "xlm-roberta-base": 0.826343297958374,
            "benjamin/roberta-base-wechsel-german": 0.767687177658081,
            "google-bert/bert-base-german-cased": 0.5779593199491501
        }
    }
}


### Evaluating output from the instruction model

In [None]:

# getting the FN content, when the model spits out a regex
# reposted here from cell in the run completion model section
def footnote_regex(n):
  matching_string = (fr"( ?<note [^>]*? type=\"footnote\" n=\"{n}\">)" # matching group 1: the opening tag
                  r"(.*?(?=<\/note>))"  # matching group 2 everything up until the closing tag (positive lookup!! no matching group!!)
                  r"(<\/note>)"  # matching group 3: the endtag
  )
  return matching_string

def get_footnote_content(text, n):
  """get the content of a FN"""
  return re.search(footnote_regex(n), text).group(2)

#### test 1


In [None]:
original_fn = """<persName ref="p1214" cert="high">Fries</persName>
und
<persName ref="p1283" cert="high">Geßner</persName>
waren in einem Brief
<persName ref="p8127" cert="high">Pellikans</persName>
vom 19. Februar 1533 an
<persName ref="p8418" cert="high">Myconius</persName>
empfohlen worden (Zürich StA, E II 358, 98).
</note>"""

# generation 6 is missing the end-tag, had to add it manually
# another disadvantage of the chat model...

x = """<note xml:id="fn4" type="footnote" n="4">v. <bibl>Basel, 1533</bibl>, <bibl>Basel, 1533</bibl>, <bibl>Basel, 1533</bibl>, <bibl>Basel, 1533</bibl>. <ref target="#bibl1" corresp="bibl1">[1]</ref></note>
<note xml:id="fn4" type="footnote" n="4">vgl. <bibl><author ref="p8418" cert="high">Myconius</author>, <title ref="l28" cert="high">Baselische Chronik</title>, <date ref="l28" cert="high">1533</date>, <page ref="l28" cert="high">A2v</page>.</bibl></note>
<note xml:id="fn4" type="footnote" n="4"><ref target="#l28" corresp="l28">Basel</ref> am <ref target="#p8418" corresp="p8418">Mühlenbrunnen</ref>.</note>
<note xml:id="fn4" type="footnote" n="4"><ref target="#l28"><bibl>Basileae</bibl></ref> <ref target="#p8418"><bibl>Myconium</bibl></ref> <date value="1533-02-25" cert="high">25. Februar [1533]</date></note>
<note xml:id="fn4" type="footnote" n="4">v. <bibl ref="Bullinger, Heinrich, 1504-1575. Briefwechsel, Bd. 3. Basel, 1849, S. 128</bibl>.</note>
<note xml:id="fn4" type="footnote" n="4">v. <bibl ref="l28">Basel</bibl>, <bibl ref="p8418">Myconius</bibl>, <bibl ref="l59">Bourges</bibl>, <bibl ref="p1283">Geßner</bibl>, <bibl ref="p1214">Fries</bibl>, <bibl ref="p8418">Myconius</bibl>, <bibl ref="l59">Bourges</bibl>, <bibl ref="p1283">Geßner</bibl>, <bibl ref="p1214">Fries</bibl>.</note>
<note xml:id="fn4" type="footnote" n="4"><ref type="biblio" target="#bibl-Myconium">Myconius, <date cert="high">1533</date></ref></note>
<note xml:id="fn4" type="footnote" n="4">v. <ref target="#l28" corresp="ref1">Basel</ref>. Myconium ist die lateinische Form des Namens "Mikon" oder "Mikön", eines Basler Patriziers und Kaufmanns, der als Gastgeber für Gäste aus dem reformierten Lager bekannt war. Vgl. <ref target="#l59" corresp="ref2">Bourges</ref>, <ref target="#l28" corresp="ref1">Basel</ref>, <ref target="#p8418" corresp="ref3">Mikon</ref>.</note>
<note xml:id="fn4" type="footnote" n="4">v. <bibl ref="bibl1" cert="high">HBBW-2</bibl>, p. <num>123</num>.</note>
<note xml:id="fn4" type="footnote" n="4">v. <bibl><title xml:lang="la">Histoire des réformateurs</title>, <volume>III</volume>, p. <page>123</page></bibl>.</note>"""
generated_fns = x.split("\n")
generated_fns = [get_footnote_content(fn, 4) for fn in generated_fns]

# Normalize WS:
predictions = [normalize_ws(fn) for fn in generated_fns]
references = [normalize_ws(original_fn) for _ in range(len(generated_fns))]

In [None]:
results_instruction_test1 = all_metrics(predictions, references)

calculating Bert scores for 4 models


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-german and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating Bert scores for 4 models


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-german and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
print(json.dumps(results_instruction_test1, indent=4))

{
    "with_markup": {
        "bleu": 0.10948907569623573,
        "rouge": 0.15014701902120842,
        "bertscore": {
            "roberta-base": 0.8243515908718109,
            "xlm-roberta-base": 0.8312181890010834,
            "benjamin/roberta-base-wechsel-german": 0.8067393779754639,
            "google-bert/bert-base-german-cased": 0.70190349817276
        }
    },
    "without_markup": {
        "bleu": 0.0,
        "rouge": 0.07217816635563395,
        "bertscore": {
            "roberta-base": 0.8085728049278259,
            "xlm-roberta-base": 0.815597665309906,
            "benjamin/roberta-base-wechsel-german": 0.7471520781517029,
            "google-bert/bert-base-german-cased": 0.5561822593212128
        }
    }
}


In [None]:
x = """Der Brief ist von Konrad Geßner und Johannes Fries aus Basel an Heinrich Bullinger, wo sie sich bei Myconius aufhielten.
Das Gasthaus "Myconium" in Basel.
Der Gastgeber des Briefes, <persName ref="p8418" cert="high">Myconius</persName>, war ein Basler Theologe und Freund von Bullinger.
Das Haus des <persName ref="p8418" cert="high">Myconius</persName> in Basel.
Hieronymus Myconi, ein Basler Patrizier, war Gastgeber der Schreiber in seinem Haus.
Das Haus von Myconius in Basel.
Das Haus von Myconius in Basel.
Das Gasthaus "Myconii" in Basel, wo sich die Schreiber des Briefes aufhielten.
Das Haus von <persName ref="p8418" cert="high">Myconius</persName> in <placeName ref="l28" cert="high">Basel</placeName>.
Das Haus des <persName ref="p8418" cert="high">Myconius</persName> in <placeName ref="l28" cert="high">Basel</placeName>."""

generated_fns = x.split("\n")

# Normalize WS:
predictions = [normalize_ws(fn) for fn in generated_fns]
references = [normalize_ws(original_fn) for _ in range(len(generated_fns))]


In [None]:
results_instruction_test2 = all_metrics(predictions, references)

calculating Bert scores for 4 models


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-german and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating Bert scores for 4 models


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-german and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
print(json.dumps(results_instruction_test2, indent=4))

{
    "with_markup": {
        "bleu": 0.011294071231157719,
        "rouge": 0.1593456177418238,
        "bertscore": {
            "roberta-base": 0.8188350081443787,
            "xlm-roberta-base": 0.8185410082340241,
            "benjamin/roberta-base-wechsel-german": 0.7746267974376678,
            "google-bert/bert-base-german-cased": 0.6185996234416962
        }
    },
    "without_markup": {
        "bleu": 0.0,
        "rouge": 0.12525525525525524,
        "bertscore": {
            "roberta-base": 0.833623731136322,
            "xlm-roberta-base": 0.8462861835956573,
            "benjamin/roberta-base-wechsel-german": 0.7649896144866943,
            "google-bert/bert-base-german-cased": 0.579486471414566
        }
    }
}


# Run all prompts
**Note:** There was a problem with the createion of the QA prompt and I have to rerun it...
 - [ ] Instruct Base model
  - [x] Instruct Add
    - [x] example
    - [x] dev
  - [ ] Instruct QA
    - [x] example
    - [ ] dev
 - [ ] Instruct Add model
  - [x] Instruct Add
    - [x] example
    - [x] dev
  - [ ] Instruct QA
    - [x] example
    - [ ] dev
 - [ ] Instruct QA model
  - [ ] Instruct Add
    - [x] example
    - [ ] dev
  - [ ] Instruct QA
    - [x] example
    - [ ] dev

### Attempts to increase efficiency
Mostly failed. I think the main bottleneck is the decoding strategy of the model. In Fine-Tuning we don't actually have to worry about that, but here we do.

In [None]:
# Generating with a batch chat model, This does not really help
def generate_chat_batch(messages_list: list[list], model, tokenizer):
    # Apply chat template and tokenize the messages for each input in the batch
    if tokenizer.pad_token_id is None:
      tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as padding if pad_token is not set
    tokenizer.padding_side = "left"
    input_ids = tokenizer.apply_chat_template(
          messages_list,
          add_generation_prompt=True,
          padding=True,
          return_tensors="pt",
          return_attention_mask=True
        ).to(model.device)

    with torch.no_grad():
      outputs = model.generate(
          input_ids,
          eos_token_id=terminators,
          do_sample=True,
          temperature=0.6,
          max_new_tokens=256,
          top_p=0.9,
          pad_token_id=tokenizer.eos_token_id
      )
    responses = [outputs[i][input_ids.shape[-1]:] for i in range(outputs.size()[0])]
    return tokenizer.batch_decode(responses, skip_special_tokens=True)

In [None]:
messages_list =[
    [
        {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Who won the world series in 2020? makin this a very long output, to find where this max token thing is coming from... why hello äöäüüöäüö"}
      ],
    [
        {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Who won the world series in 2019?"}
    ],
]
print(generate_chat_batch(messages_list, model, tokenizer))

KeyboardInterrupt: 

In [None]:
print(generate_chat(messages_list[1], model, tokenizer))

torch.Size([95])
The Washington Nationals won the 2019 World Series. They defeated the Houston Astros in Game 7 with a score of 6-2. This was the Nationals' first World Series title in their franchise history.


In [None]:
# test the generate_batch function
if tokenizer.pad_token_id is None:
  tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as padding if pad_token is not set
tokenizer.padding_side = "left"
messages_list =[
    [
        {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Who won the world series in 2020? "}
      ],
    [
        {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Who won the world series in 2019?"}
    ],
]

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]


input_ids = tokenizer.apply_chat_template(
      messages_list,
      add_generation_prompt=True,
      padding=True,
      return_tensors="pt",
    ).to(model.device)

print(input_ids.size())

with torch.no_grad():
  outputs = model.generate(
      input_ids,
      eos_token_id=terminators,
      do_sample=False,
      temperature=0.6,
      max_new_tokens=256,
      top_p=0.9,
      pad_token_id=tokenizer.eos_token_id
  )
responses = tokenizer.batch_decode(outputs[:, input_ids.shape[1]:], skip_special_tokens=True)

for response in responses:
  print(response)


torch.Size([2, 51])




The Los Angeles Dodgers won the 2020 World Series. They defeated the Tampa Bay Rays in the series 4 games to 2.
The Washington Nationals won the 2019 World Series. They defeated the Houston Astros in Game 7 with a score of 6-2, winning the series 4 games to 3.


In [None]:
for messages in messages_list:
  print(generate_chat(messages, model, tokenizer))

The Los Angeles Dodgers won the 2020 World Series. They defeated the Tampa Bay Rays in the series 4 games to 2.
The Washington Nationals won the 2019 World Series. They defeated the Houston Astros in Game 7 with a score of 6-2 on October 30, 2019.


In [None]:
torch.cuda.empty_cache()

# Run base model

In [11]:
# load the model


model_id = "Boffl/llama3.1-8B-instruct-EA"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Quantization, as shown here: https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf?usp=sharing#scrollTo=VPD7QS_DR-mw
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/910 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [12]:
run_llama_over_prompts("instruct_qa", "example")

100%|██████████| 112/112 [23:16<00:00, 12.47s/it]


In [None]:
from google.colab import runtime
runtime.unassign()

# Run big model?

In [None]:
# load the model



model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

device = xm.xla_device()
model = AutoModelForCausalLM.from_pretrained(model_id)
# Enable mixed precision (bfloat16) for TPU execution






NameError: name 'AutoTokenizer' is not defined

In [None]:
model.half()
model.to(device)


RuntimeError: Bad StatusOr access: RESOURCE_EXHAUSTED: Error allocating device buffer: Attempting to allocate 32.00M. That was not possible. There are 3.23M free.; (0x0x0_HBM0)

In [None]:
tokenizer.pad_token_id

151643

In [None]:
# example lorem ipsum conversation
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "lorem ipsum dolor sit amet"}
      ]

# Generating with a chat model
def generate_chat(messages:list, model, tokenizer):
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    print(input_ids)

    # generate the attention mask, got a warning that it might be needed, though I am not sure if that is necessary
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=model.device)

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    print(terminators)
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=256,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id
    )
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)
print(generate_chat(messages, model, tokenizer))

tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,    385,   1826,  26342,  23655,
           2444,  27212, 151645,    198, 151644,  77091,    198]],
       device='cuda:0')
[151645, None]
I'm sorry, but "lorem ipsum dolor sit amet" appears to be placeholder text and doesn't provide any specific information. Could you please clarify your question or request? I'd be happy to help if you have a specific topic or question in mind.


In [None]:
"Qwen" in model_id

True

# Qwen2.5?

In [None]:
# load the model


model_id = "Qwen/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Quantization, as shown here: https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf?usp=sharing#scrollTo=VPD7QS_DR-mw
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

In [None]:
run_llama_over_prompts("instruct_qa", "example")

100%|██████████| 112/112 [26:00<00:00, 13.93s/it]


# Evaluate Batch

In [None]:
import pandas as pd
from tqdm import tqdm
import multiprocessing


human_footnotes_df = pd.read_csv("../data/footnote_downsized_df.csv")
human_footnotes_df["xml_footnote"] = human_footnotes_df["xml_footnote"].apply(remove_outer_note_tag)

def flatten_dict(d, parent_key='', sep='-'):
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)



def evaluate_batch_old(machine_df, human_df):
    merged_df = machine_df.merge(human_df, on=["letter_id", "n_footnote"])

    predictions = list(merged_df["generated_footnote"])
    references = list(merged_df["xml_footnote"])
    return all_metrics(predictions, references)

def evaluate_individually(machine_df, human_df, total=None):
    merged_df = machine_df.merge(human_df, on=["letter_id", "n_footnote"])
    merged_df = merged_df[["letter_id", "n_footnote", "generated_footnote", "xml_footnote", "text_footnote"]]
    result_list = []
    # go through all rows, calculate the scores indifidually and add them as a column
    for index, row in tqdm(merged_df.iterrows(), total=total):
        predictions = [row["generated_footnote"]]
        references = [row["xml_footnote"]]
        results = all_metrics(predictions, references)
        # flatten results
        flat_results = flatten_dict(results)
        result_list.append(flat_results)
        if index > 50:
          break
    result_df = pd.DataFrame(result_list)
    merged_df = pd.concat([merged_df, result_df], axis=1)
    return merged_df

def evaluate_row(pred_ref_tuple):
    predictions = [pred_ref_tuple[0]]
    references = [pred_ref_tuple[1]]
    results = all_metrics(predictions, references)
    return flatten_dict(results)



def evaluate_individually_pooling(machine_df, human_df, total=None):
    merged_df = machine_df.merge(human_df, on=["letter_id", "n_footnote"])
    merged_df = merged_df[["letter_id", "n_footnote", "generated_footnote", "xml_footnote", "text_footnote"]]
    pred_ref_tuples = list(zip(merged_df["generated_footnote"], merged_df["xml_footnote"]))
    pool = multiprocessing.Pool(processes=8)
    results = list(tqdm(pool.imap(evaluate_row, pred_ref_tuples), total=total))
    pool.close()
    pool.join()
    result_df = pd.DataFrame(results)
    merged_df = pd.concat([merged_df, result_df], axis=1)
    return merged_df


def qualitative_comparison(machine_df1, machine_df2, human_df):
    merged_df = machine_df1.merge(machine_df2, on=["letter_id", "n_footnote"])
    merged_df = merged_df.merge(human_df, on=["letter_id", "n_footnote"])
    merged_df = merged_df[["letter_id", "n_footnote", "generated_footnote_x", "generated_footnote_y", "xml_footnote", "text_footnote"]]
    return merged_df

pd.set_option('display.max_colwidth', None)  # show whole text in the cells


In [None]:
generated_footnotes_df = pd.read_csv("../data/model_responses/llama/Meta-Llama-3.1-8B-Instruct_instruct_add_test.csv")

In [None]:
generated_footnotes_df = pd.read_csv("../data/model_responses/llama/Meta-Llama-3.1-8B-Instruct_instruct_add_test.csv")
# results = evaluate_batch(generated_footnotes_df, human_footnotes_df)

In [None]:
print(results)

{'with_markup': {'bleu': 0.02780840509054571, 'rouge': 0.07936249627644365, 'bertscore': {'benjamin/roberta-base-wechsel-german': 0.734339047634103}}, 'without_markup': {'bleu': 0.01218268380954241, 'rouge': 0.07034087064155778, 'bertscore': {'benjamin/roberta-base-wechsel-german': 0.7535662721318437}}}


In [None]:
individual_eval_df = evaluate_individually(generated_footnotes_df, human_footnotes_df, total=3239)


  0%|          | 0/3239 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/856k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/514k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.47M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/712 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-german and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 51/3239 [00:51<53:37,  1.01s/it]


In [None]:
!cat /proc/cpuinfo | grep "model name" | wc -l

8


In [None]:
import bert_score
from tqdm import tqdm

def compute_bertscore(predictions, references, batch_size=2):

    results = {"precision": [], "recall": [], "f1": []}
    with tqdm(total=len(predictions)) as pbar:
        for i in range(0, len(predictions), batch_size):
            batch_preds = predictions[i : i + batch_size]
            batch_refs = references[i : i + batch_size]
            batch_results = bertscore.compute(
                predictions=batch_preds, references=batch_refs, lang="de"
            )
            results["precision"].extend(batch_results["precision"])
            results["recall"].extend(batch_results["recall"])
            results["f1"].extend(batch_results["f1"])
            pbar.update(len(batch_preds))
    return results



In [None]:
    merged_df = generated_footnotes_df.merge(human_footnotes_df, on=["letter_id", "n_footnote"])
    merged_df = merged_df[["letter_id", "n_footnote", "generated_footnote", "xml_footnote", "text_footnote"]]
    preds = list(merged_df["generated_footnote"])
    refs = list(merged_df["xml_footnote"])

In [None]:


results = compute_bertscore(predictions=preds, references=refs, batch_size=64)

100%|██████████| 2873/2873 [00:39<00:00, 72.51it/s]


In [None]:
results = evaluate_batch(preds, refs, processes=8)

calculating BLEU and ROUGE...


100%|██████████| 10/10 [00:02<00:00,  3.89it/s]


calculating semantic scores with BERT, batch_size = 64...


100%|██████████| 10/10 [00:02<00:00,  4.46it/s]


In [None]:
preds = preds[:10]
refs = refs[:10]

In [None]:
for pred, ref in zip(preds, refs):

  print(bleu.compute(predictions=[pred], references=[ref])["bleu"], end=", ")
  print(rouge.compute(predictions=[pred], references=[ref])["rouge1"], end=", ")
  print(bertscore.compute(predictions=[pred], references=[ref], lang="de")["f1"])

0.0, 0.058823529411764705, [0.6235059499740601]
0.0, 0.0, [0.5267145037651062]
0.37037253582330265, 0.3125, [0.7921568155288696]
0.0, 0.0909090909090909, [0.5593177080154419]
0.0, 0.0606060606060606, [0.6036304235458374]
0.12021577610863723, 0.37037037037037035, [0.7743406891822815]
0.0, 0.06896551724137931, [0.6138825416564941]
0.0, 0.046511627906976744, [0.5720358490943909]
0.0, 0.058823529411764705, [0.5315734148025513]
0.0, 0.0, [0.5640946626663208]


In [None]:
results

{'bleu': [0.0,
  0.0,
  0.37037253582330265,
  0.0,
  0.0,
  0.12021577610863723,
  0.0,
  0.0,
  0.0,
  0.0],
 'rouge': [0.058823529411764705,
  0.0,
  0.3125,
  0.0909090909090909,
  0.0606060606060606,
  0.37037037037037035,
  0.06896551724137931,
  0.046511627906976744,
  0.058823529411764705,
  0.0],
 'bertscore': [[0.6235059499740601,
   0.5267144441604614,
   0.7921569347381592,
   0.5593176484107971,
   0.603630542755127,
   0.7743407487869263,
   0.6138826012611389,
   0.5720357894897461,
   0.531573474407196,
   0.5640945434570312]]}

In [None]:
predictions = ["hello there, how are you?", "general kenobi"]
references = ["hello there, how are you?", "general kenobi"]
compute_bleu_rouge((predictions[0], references[0]))

(1.0, 1.0)

In [None]:
bleu_results = bleu.compute(predictions=preds, references=refs)

In [None]:
bleu_results

{'bleu': 0.07867577388274566,
 'precisions': [0.23999688391613838,
  0.10085153275896613,
  0.06517927136196724,
  0.049672674715184495],
 'brevity_penalty': 0.836203989418787,
 'length_ratio': 0.8482608228773448,
 'translation_length': 102693,
 'reference_length': 121063}

In [None]:
individual_eval_df = evaluate_individually_pooling(generated_footnotes_df, human_footnotes_df, total=3239)



config.json:   0%|          | 0.00/712 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-german and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-german and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at benjamin/roberta-base-wechsel-german and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from 

KeyboardInterrupt: 

  File "/usr/local/lib/python3.10/dist-packages/transformers/models/roberta/modeling_roberta.py", line 832, in forward
    encoder_outputs = self.encoder(
  File "/usr/local/lib/python3.10/dist-packages/bert_score/utils.py", line 351, in bert_encode
    out = model(x, attention_mask=attention_mask, output_hidden_states=all_layers)
KeyboardInterrupt
  File "/usr/local/lib/python3.10/dist-packages/bert_score/utils.py", line 351, in bert_encode
    out = model(x, attention_mask=attention_mask, output_hidden_states=all_layers)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/l

In [None]:
individual_eval_df.nlargest(5, "without_markup-bertscore-benjamin/roberta-base-wechsel-german")

In [None]:
import re
def remove_outer_note_tag(xml_str):
  """remove the outer note tag, since in the footnote_xml in the df it is still contained"""
  match_obj = re.match(r"<note [^>]*?>(.*?)</note>", xml_str, re.DOTALL)
  if match_obj: return match_obj.group(1)
  else: return xml_str

In [None]:
from tqdm import tqdm
text = """<note xmlns="http://www.tei-c.org/ns/1.0" xml:id="fn10" type="footnote" n="10">1mm von einandren gan: Beim Auseinandergehen (nach der Sitzung).</note>"""
texts = [text for _ in range(3000)]
removed = []
for text in tqdm(texts):
  removed.append(remove_outer_note_tag(text))

100%|██████████| 3000/3000 [00:00<00:00, 95377.84it/s]


In [None]:
individual_eval_df

In [None]:
individual_eval_df.to_csv("../data/evaluation/Meta-Llama-3.1-8B-Instruct_instruct_add_test.csv")

Unnamed: 0,letter_id,n_footnote,generated_footnote,xml_footnote,text_footnote,with_markup-bleu,with_markup-rouge,with_markup-bertscore-benjamin/roberta-base-wechsel-german,without_markup-bleu,without_markup-rouge,without_markup-bertscore-benjamin/roberta-base-wechsel-german
0,12891,10,"Die drei Briefe, die ich empfangen habe, sind von dem König von Frankreich, dem König von England und dem König von Spanien.","<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn10"" type=""footnote"" n=""10"">1mm von einandren gan: Beim Auseinandergehen (nach der Sitzung).</note>",1mm von einandren gan: Beim Auseinandergehen (nach der Sitzung).,0.000000,0.038462,0.685625,0.0,0.058824,0.727468
1,11456,3,Zu den Ammianischen Geschichten.,"<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn3"" type=""footnote"" n=""3""><persName ref=""p8011"" cert=""high"">Gwalthers</persName> Brief an <persName ref=""p167"" cert=""high"">Johann Jakob Ammann</persName> ist nicht erhalten.</note>",Gwalthers Brief an Johann Jakob Ammann ist nicht erhalten.,0.000000,0.000000,0.704371,0.0,0.000000,0.794163
2,10729,20,"1000 gl. verrösslet und nütt geschaffet, nämlich die Kosten für die Belagerung von <placeName ref=""l167"" cert=""high"">Jenff</placeName>.","<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn20"" type=""footnote"" n=""20""><placeName ref=""l1280"" cert=""high"">Thonon (Chablais)</placeName>.</note>",Thonon (Chablais).,0.160109,0.240000,0.812025,0.0,0.000000,0.706709
3,12453,21,Von der Saxonischen und Lünenburgischen Gesandtschaft ist nichts zu hören.,"<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn21"" type=""footnote"" n=""21""><persName ref=""p18509"" cert=""high"">Herzog August von Sachsen</persName>.</note>",Herzog August von Sachsen.,0.000000,0.050000,0.675613,0.0,0.125000,0.730926
4,12436,3,"Crediderim ergo non inconsultum fore, si ea hora tractarem historicum aliquid ex veteri testamento, vel Genesim vel quid aliud tale, s. <ref target=""file12377"">Nr. 2330</ref>, Anm. 14.","<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn3"" type=""footnote"" n=""3"">Gemeint ist: historicum librum.</note>",Gemeint ist: historicum librum.,0.000000,0.039216,0.758431,0.0,0.068966,0.761406
...,...,...,...,...,...,...,...,...,...,...,...
1432,11456,26,"Vgl. <ref target=""file11449"">Nr. 1406</ref>.","<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn26"" type=""footnote"" n=""26"">Vgl. <bibl>Rockwell</bibl> 35.</note>",Vgl. Rockwell 35.,,,,,,
1433,12835,47,"""Angelberg"" ist ein Berg bei <placeName ref=""l16"" cert=""high"">Italia</placeName>, auf dem die <placeName ref=""l16"" cert=""high"">Itali</placeName> einen befestigten Posten hatten.","<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn47"" type=""footnote"" n=""47"">Angelberg (heute Ortsteil von <placeName ref=""l2387"" cert=""high"">Tussenhausen</placeName>, Lkr. Unterallgäu, Bayern).</note>","Angelberg (heute Ortsteil von Tussenhausen, Lkr. Unterallgäu, Bayern).",,,,,,
1434,12891,49,"Die drei Briefe, die ich empfangen habe, sind von dem Kaiser, von dem Herzog von Württemberg und von dem Lanzknecht Widenhuͦber.","<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn49"" type=""footnote"" n=""49"">Es handelt sich um Briefe. die Bullinger <persName ref=""p8055"" cert=""high"">Blarer</persName> zum Lesen gesandt hatte; s. <ref target=""file12890"">Nr. 2841</ref>,58f.</note>","Es handelt sich um Briefe. die Bullinger Blarer zum Lesen gesandt hatte; s. Nr. 2841,58f.",,,,,,
1435,11614,19,"Petitum de commendaticiis, quod in colloquio familiarissimo fuit, ut a te ipse petita essent, quae in animo clauso tibi inesse manifesta sunt.","<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn19"" type=""footnote"" n=""19""><persName ref=""p8011"" cert=""high"">Gwalther</persName> hatte <persName ref=""p8418"" cert=""high"">Myconius</persName> am 17. September gebeten, ihm seine in <placeName ref=""l28"" cert=""high"">Basel</placeName> zurückgelassenen Bücher zustellen zu lassen, da <persName ref=""p2811"" cert=""high"">Christoph [Rotacher]</persName> dies trotz mehrmaliger Bitte noch nicht getan habe; s. St. Gallen Kantonsbibliothek (Vadiana), Ms 34 (VBS V), 64.</note>","Gwalther hatte Myconius am 17. September gebeten, ihm seine in Basel zurückgelassenen Bücher zustellen zu lassen, da Christoph [Rotacher] dies trotz mehrmaliger Bitte noch nicht getan habe; s. St. Gallen Kantonsbibliothek (Vadiana), Ms 34 (VBS V), 64.",,,,,,


In [None]:
# todo: write a function to add the scores into the df
qual_comp_df = qualitative_comparison(generated_footnotes_df, generated_footnotes_df, human_footnotes_df)
qual_comp_df.head(10)

Unnamed: 0,letter_id,n_footnote,generated_footnote_x,generated_footnote_y,xml_footnote,text_footnote
0,12891,10,"Die drei Briefe, die ich empfangen habe, sind von dem König von Frankreich, dem König von England und dem König von Spanien.","Die drei Briefe, die ich empfangen habe, sind von dem König von Frankreich, dem König von England und dem König von Spanien.","<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn10"" type=""footnote"" n=""10"">1mm von einandren gan: Beim Auseinandergehen (nach der Sitzung).</note>",1mm von einandren gan: Beim Auseinandergehen (nach der Sitzung).
1,11456,3,Zu den Ammianischen Geschichten.,Zu den Ammianischen Geschichten.,"<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn3"" type=""footnote"" n=""3""><persName ref=""p8011"" cert=""high"">Gwalthers</persName> Brief an <persName ref=""p167"" cert=""high"">Johann Jakob Ammann</persName> ist nicht erhalten.</note>",Gwalthers Brief an Johann Jakob Ammann ist nicht erhalten.
2,10729,20,"1000 gl. verrösslet und nütt geschaffet, nämlich die Kosten für die Belagerung von <placeName ref=""l167"" cert=""high"">Jenff</placeName>.","1000 gl. verrösslet und nütt geschaffet, nämlich die Kosten für die Belagerung von <placeName ref=""l167"" cert=""high"">Jenff</placeName>.","<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn20"" type=""footnote"" n=""20""><placeName ref=""l1280"" cert=""high"">Thonon (Chablais)</placeName>.</note>",Thonon (Chablais).
3,12453,21,Von der Saxonischen und Lünenburgischen Gesandtschaft ist nichts zu hören.,Von der Saxonischen und Lünenburgischen Gesandtschaft ist nichts zu hören.,"<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn21"" type=""footnote"" n=""21""><persName ref=""p18509"" cert=""high"">Herzog August von Sachsen</persName>.</note>",Herzog August von Sachsen.
4,12436,3,"Crediderim ergo non inconsultum fore, si ea hora tractarem historicum aliquid ex veteri testamento, vel Genesim vel quid aliud tale, s. <ref target=""file12377"">Nr. 2330</ref>, Anm. 14.","Crediderim ergo non inconsultum fore, si ea hora tractarem historicum aliquid ex veteri testamento, vel Genesim vel quid aliud tale, s. <ref target=""file12377"">Nr. 2330</ref>, Anm. 14.","<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn3"" type=""footnote"" n=""3"">Gemeint ist: historicum librum.</note>",Gemeint ist: historicum librum.
5,12743,61,"Sebastian von der Weitmühl, ein bedeutender böhmischer Adeliger, der sich als Oberster der königlichen Truppen in Böhmen befand.","Sebastian von der Weitmühl, ein bedeutender böhmischer Adeliger, der sich als Oberster der königlichen Truppen in Böhmen befand.","<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn61"" type=""footnote"" n=""61"">Sebastian von der Weitmühl.</note>",Sebastian von der Weitmühl.
6,12835,51,"propter senium (senectutem), s. <ref target=""file12801"">Anm. 2</ref>.","propter senium (senectutem), s. <ref target=""file12801"">Anm. 2</ref>.","<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn51"" type=""footnote"" n=""51"">Er war damals etwa 60 Jahre alt und starb am 7. Oktober 1547; s. <hi>Schnvder, Ratslisten</hi> 586.</note>","Er war damals etwa 60 Jahre alt und starb am 7. Oktober 1547; s. Schnvder, Ratslisten 586."
7,12500,49,"Der Text für Fußnote n=49 könnte wie folgt lauten:\n\n„Der geschwurmbs“ bezieht sich auf die vielen Adligen und Bischöfe, die an dem Reichstag in Regensburg teilnehmen werden.","Der Text für Fußnote n=49 könnte wie folgt lauten:\n\n„Der geschwurmbs“ bezieht sich auf die vielen Adligen und Bischöfe, die an dem Reichstag in Regensburg teilnehmen werden.","<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn49"" type=""footnote"" n=""49"">[Pfaffen]gewimmel; s. <bibl>HBBW</bibl> XV 638 und Anm. 47.</note>",[Pfaffen]gewimmel; s. HBBW XV 638 und Anm. 47.
8,12770,18,Der Brief wurde also ganz früh am Morgen geschrieben; vgl. auch unten Z. 128.,Der Brief wurde also ganz früh am Morgen geschrieben; vgl. auch unten Z. 128.,"<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn18"" type=""footnote"" n=""18"">Der <placeName ref=""l1976"" cert=""high"">Augsburger</placeName> Bischof <persName ref=""p7840"" cert=""high"">Otto Truchsess von Waldburg</persName>.</note>",Der Augsburger Bischof Otto Truchsess von Waldburg.
9,12500,54,Nicht erhalten. Die Fussnote n=54 ist nicht vorhanden.,Nicht erhalten. Die Fussnote n=54 ist nicht vorhanden.,"<note xmlns=""http://www.tei-c.org/ns/1.0"" xml:id=""fn54"" type=""footnote"" n=""54"">Siehe <bibl>HBBW</bibl> XVI 29.</note>",Siehe HBBW XVI 29.


## finding long letters

In [None]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
token_len_list, long_letters_set = count_prompt_tokens(model_id, "instruct_add", "test")

100%|██████████| 3239/3239 [03:23<00:00, 15.95it/s]


In [None]:
long_letters_set # {'10949', '11503', '11755' übersetzung (dt Erstfassung), '11826' (mit Entwurf)}

{'10949', '11503', '11755', '11826'}

In [None]:
#

In [None]:
token_len_list, long_letters_set = count_prompt_tokens(model_id, "instruct_add", "train")

100%|██████████| 25037/25037 [29:11<00:00, 14.30it/s]


In [None]:
long_letters_set  #

{'10013',
 '10041',
 '10050',
 '10067',
 '10099',
 '10135',
 '10140',
 '10518',
 '10538',
 '10661',
 '10986',
 '11096',
 '11148',
 '11591',
 '11593',
 '11689',
 '11820',
 '11877',
 '12080',
 '12389',
 '12448',
 '12545',
 '12659',
 '12710',
 '12748',
 '12890',
 '13088'}

{'10013',
 '10041',
 '10050',
 '10067',
 '10099',
 '10135',
 '10140',
 '10518',
 '10538',
 '10661',
 '10986',
 '11096',
 '11148',
 '11591',
 '11593',
 '11689',
 '11820',
 '11877',
 '12080',
 '12389',
 '12448',
 '12545',
 '12659',
 '12710',
 '12748',
 '12890',
 '13088'}

In [None]:
token_len_list, long_letters_set = count_prompt_tokens(model_id, "instruct_add", "dev")

0it [00:00, ?it/s]


In [None]:
long_letters_set

set()