# Dataset generation for Image-to-Text from Text-to-Text examples
- This notebook briefly covers how the synthetic slides have been generated.
- The synthetic slides feature:
	- PDFs with over 4000 pages
	- Each PDF uses a unique LaTeX Beamer theme
	- This acts as a form of data augmentation: the labels stay the same, while the input image looks different for every theme even though it contains the same content
- Brief overview of how it works:
	- LaTeX Beamer templates have been adapted to include instructions for the templating engine Jinja2
	- Jinja2 reads these templates, executes the embedded instructions and produces a new .tex file
	- The resulting .tex file is then compiled into a PDF
- TODO: Maybe include a listing of a LaTeX template with Jinja instructions as an example and briefly explain how the instructions work

In [8]:
import jinja2
import os
import pandas as pd
from ankipandas import Collection
import swifter

In [2]:
# Set up the Jinja environment with LaTeX-friendly delimiters, so that template
# instructions look like LaTeX macros (\BLOCK{...}, \VAR{...}, \#{...}) instead of
# Jinja's default {% ... %} / {{ ... }} syntax.
# The delimiter strings are raw strings: '\B', '\V' and '\#' are not valid Python
# escape sequences and trigger a SyntaxWarning on recent Python versions when
# written as normal string literals. The resulting values are unchanged.
latex_jinja_env = jinja2.Environment(
	block_start_string = r'\BLOCK{',
	block_end_string = '}',
	variable_start_string = r'\VAR{',
	variable_end_string = '}',
	comment_start_string = r'\#{',
	comment_end_string = '}',
	line_statement_prefix = '%%%&',
	line_comment_prefix = '%',
	trim_blocks = True,
	autoescape = False, # output is LaTeX, not HTML, so HTML escaping must stay off
	# NOTE(review): machine-specific absolute path; templates are looked up relative to this directory
	loader = jinja2.FileSystemLoader("/Users/I516998/Documents/uni - mannheim/ankinator/ankinator-flashcard-models/ressources/image-to-text-templates/"))

In [3]:
# Open the Anki collection; no path is given, so ankipandas searches for the database itself
col = Collection()
# Display the notes table of the collection (one row per note id)
col.notes

[32mINFO: Searching for database. This might take some time. You can speed this up by specifying a search path or directly entering the path to your database.[0m


Unnamed: 0_level_0,nguid,nmod,nusn,ntags,nflds,nmodel
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1627919721277,UXs8P,1627919797,-1,[],"[Gesamtkosten, Kfix+Kvar+At+Zt]",Standard
1627919797459,KsV;M,1627919913,-1,[],"[Kalkulatorische Abschreibungen, <img src=""img...",Standard
1627919913959,_JhDD,1627919958,-1,[],"[Kalkulatorische Zinskosten, <img src=""img6198...",Standard
1627919958698,"O,353",1627919986,-1,[],"[Gewinnvergleichsrechnung, G=U-K]",Standard
1627919986532,{bHQQ,1627920027,-1,[],"[Rentabilitätsvergleichsrechnung, <img src=""im...",Standard
...,...,...,...,...,...,...
1677937216242,C4J?7,1677938427,-1,[],[Nennen sie die 2 Strategien der Beurteilung v...,Standard
1677938427446,w!Jw8,1677939124,-1,[],"[Berufskrankheit, Liegt vor, wenn eine versich...",Standard
1677939124208,U:G#5,1677939220,-1,[],"[Aufgaben der Berufsgenossenschsften, Präventi...",Standard
1677939220992,-u^ML,1677939386,-1,[],"[Definition Arbeitsunfall, Wenn eine versicher...",Standard


In [4]:
# Look at the raw field list of a single note (third row of the notes table)
col.notes["nflds"].iloc[2]

['Kalkulatorische Zinskosten', '<img src="img6198499762462923278.jpg">']

In [4]:
# Build a DataFrame that contains only the note field lists (column nflds) of the Anki notes table
df = pd.DataFrame(col.notes.nflds) # extract anki data from anki database
df

Unnamed: 0_level_0,nflds
nid,Unnamed: 1_level_1
1627919721277,"[Gesamtkosten, Kfix+Kvar+At+Zt]"
1627919797459,"[Kalkulatorische Abschreibungen, <img src=""img..."
1627919913959,"[Kalkulatorische Zinskosten, <img src=""img6198..."
1627919958698,"[Gewinnvergleichsrechnung, G=U-K]"
1627919986532,"[Rentabilitätsvergleichsrechnung, <img src=""im..."
...,...
1677937216242,[Nennen sie die 2 Strategien der Beurteilung v...
1677938427446,"[Berufskrankheit, Liegt vor, wenn eine versich..."
1677939124208,"[Aufgaben der Berufsgenossenschsften, Präventi..."
1677939220992,"[Definition Arbeitsunfall, Wenn eine versicher..."


In [5]:
from typing import List, Union
import regex as re
from bs4 import BeautifulSoup

# function to clean the answers of the flashcards
def process_answers(x: List[str]) -> Union[List[str], None]:
	"""Extract the plain-text answer lines from the field list of a note.

	The second field (``x[1]``) is treated as the answer. If it contains ``<br>``
	line breaks it is split at them so that every line can be rendered as its own
	bullet point on a slide; otherwise it is returned as a single entry. HTML
	markup is stripped with BeautifulSoup.

	Args:
		x: Field list of a note; ``x[1]`` is the answer HTML.

	Returns:
		A list with the cleaned, non-empty answer lines, or ``None`` if there is no
		answer field or no text is left after cleaning (e.g. image-only answers).
	"""
	if len(x) < 2: # note without an answer field, nothing to clean
		return None

	raw_answer = x[1]
	# Plain substring test: a regex with '.*' does not cross newline characters and
	# would miss '<br>' tags in answers that span several lines
	if '<br>' in raw_answer:
		answers = []
		for fragment in raw_answer.split('<br>'):
			if re.match(r'<img src=".*">', fragment): # fragments that start with an image are not text answers
				continue
			text = BeautifulSoup(fragment, features="lxml").text # parse every fragment only once
			if text != "": # skip fragments that are empty after removing the styling
				answers.append(text)
		return answers if answers else None

	text = BeautifulSoup(raw_answer, features="lxml").text # clean data that doesn't include line breaks
	return [text] if text != "" else None # empty answers only contained images


In [37]:
import cv2
from typing import Union

def process_images(x: List[str]) -> Union[List[str], None]:
	"""Build LaTeX ``\\includegraphics`` instructions for the images of a note.

	The second field (``x[1]``) is searched for ``<img>`` tags that reference
	``.jpg``/``.png`` files. Every referenced file that exists in the
	``collection.media`` folder next to the collection database is turned into one
	``\\includegraphics`` instruction.

	Side effect: images that are taller than wide are rotated by 90 degrees and
	written back to the same file in the media folder, i.e. the media file itself
	is modified.

	Args:
		x: Field list of a note; ``x[1]`` is the answer HTML.

	Returns:
		A list of LaTeX include instructions, or ``None`` if the note has no
		usable image.
	"""
	if len(x) < 2: # note without an answer field
		return None
	if not re.match(r'.*<img src=".*[.](jpg|png)">', x[1]): # check if an image is present
		return None

	col_media_path: str = os.path.join(col.path.parent.as_posix(), "collection.media") # load media collection from anki
	soup = BeautifulSoup(x[1], features="lxml") # explicit parser, consistent with process_answers

	includes: List[str] = []
	for img_tag in soup.find_all("img"): # extract all image tags from anki html
		src = img_tag.get("src")
		if not src or not re.match(r'.*[.](jpg|png)', src): # tag without a usable image reference
			continue
		full_image_path = os.path.join(col_media_path, src) # get image path
		if not os.path.exists(full_image_path): # sometimes anki has missing references
			continue
		image = cv2.imread(full_image_path) # read in image
		if image is None: # cv2.imread returns None instead of raising if the file can't be decoded
			continue
		if image.shape[0] > image.shape[1]: # vertical image: convert it to horizontal to reduce latex page overflow errors
			# Write the rotated image back to the file it was read from; using the path of the
			# first <img> tag here would overwrite the wrong file for notes with several images
			cv2.imwrite(full_image_path, cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE))

		# append latex instructions on how to include the image to result
		includes.append(r'\includegraphics[width=0.9\textwidth,height=0.9\textheight,keepaspectratio]{' + full_image_path + '}')

	return includes if includes else None

In [38]:
# Apply functions to the actual data
df["answer"] = df.nflds.swifter.apply(lambda x: process_answers(x))
df["images"] = df.nflds.swifter.apply(lambda x: process_images(x))
df

Pandas Apply:   0%|          | 0/1654 [00:00<?, ?it/s]



Pandas Apply:   0%|          | 0/1654 [00:00<?, ?it/s]

Unnamed: 0_level_0,nflds,answer,images
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1627919721277,"[Gesamtkosten, Kfix+Kvar+At+Zt]",[Kfix+Kvar+At+Zt],
1627919797459,"[Kalkulatorische Abschreibungen, <img src=""img...",,"[\includegraphics[width=0.9\textwidth,height=0..."
1627919913959,"[Kalkulatorische Zinskosten, <img src=""img6198...",,"[\includegraphics[width=0.9\textwidth,height=0..."
1627919958698,"[Gewinnvergleichsrechnung, G=U-K]",[G=U-K],
1627920027341,"[Amortisationsrechnung (statisch), <img src=""i...",,"[\includegraphics[width=0.9\textwidth,height=0..."
...,...,...,...
1677937216242,[Nennen sie die 2 Strategien der Beurteilung v...,"[Präventiv, Korrektiv]",
1677938427446,"[Berufskrankheit, Liegt vor, wenn eine versich...","[Liegt vor, wenn eine versicherte Person wühre...",
1677939124208,"[Aufgaben der Berufsgenossenschsften, Präventi...","[Prävention, Rehabilitation, Entschädigung ]",
1677939220992,"[Definition Arbeitsunfall, Wenn eine versicher...",[Wenn eine versicherte Person sich bei einer v...,


In [15]:
# Notes for which neither a text answer nor an image could be extracted
df[df.answer.isna() & df.images.isna()]

Unnamed: 0_level_0,nflds,answer,images
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1627919986532,"[Rentabilitätsvergleichsrechnung, <img src=""im...",,
1663752845592,"[What is Power?, <img src=""Bildschirmfoto 2022...",,
1664197142201,[What is the Trait approach + the Great Man Th...,,
1666439292330,[What is a Spark DataSet/DataFrame?<br><ul><li...,,
1669627005248,[Subsumption Tree Input Domain Characterizatio...,,


In [39]:
# Remove the notes that can't be processed because of missing references etc.
has_no_content = df.answer.isnull() & df.images.isnull()
df.drop(index=df.loc[has_no_content].index, inplace=True)
df

Unnamed: 0_level_0,nflds,answer,images
nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1627919721277,"[Gesamtkosten, Kfix+Kvar+At+Zt]",[Kfix+Kvar+At+Zt],
1627919797459,"[Kalkulatorische Abschreibungen, <img src=""img...",,"[\includegraphics[width=0.9\textwidth,height=0..."
1627919913959,"[Kalkulatorische Zinskosten, <img src=""img6198...",,"[\includegraphics[width=0.9\textwidth,height=0..."
1627919958698,"[Gewinnvergleichsrechnung, G=U-K]",[G=U-K],
1627920027341,"[Amortisationsrechnung (statisch), <img src=""i...",,"[\includegraphics[width=0.9\textwidth,height=0..."
...,...,...,...
1677937216242,[Nennen sie die 2 Strategien der Beurteilung v...,"[Präventiv, Korrektiv]",
1677938427446,"[Berufskrankheit, Liegt vor, wenn eine versich...","[Liegt vor, wenn eine versicherte Person wühre...",
1677939124208,"[Aufgaben der Berufsgenossenschsften, Präventi...","[Prävention, Rehabilitation, Entschädigung ]",
1677939220992,"[Definition Arbeitsunfall, Wenn eine versicher...",[Wenn eine versicherte Person sich bei einer v...,


In [57]:
# Expand every field list into its own columns and name the first two of them
field_columns = df.nflds.swifter.apply(pd.Series)
field_columns.rename(columns={0: "Question", 1: "Answer"})

Pandas Apply:   0%|          | 0/1654 [00:00<?, ?it/s]

Unnamed: 0_level_0,Question,Answer
nid,Unnamed: 1_level_1,Unnamed: 2_level_1
1627919721277,Gesamtkosten,Kfix+Kvar+At+Zt
1627919797459,Kalkulatorische Abschreibungen,"<img src=""img1963991816782229408.jpg"">"
1627919913959,Kalkulatorische Zinskosten,"<img src=""img6198499762462923278.jpg"">"
1627919958698,Gewinnvergleichsrechnung,G=U-K
1627920027341,Amortisationsrechnung (statisch),"<img src=""img8608825578963105513.jpg"">"
...,...,...
1677937216242,Nennen sie die 2 Strategien der Beurteilung vo...,Präventiv<br>Korrektiv
1677938427446,Berufskrankheit,"Liegt vor, wenn eine versicherte Person wühren..."
1677939124208,Aufgaben der Berufsgenossenschsften,Prävention<br>Rehabilitation<br>Entschädigung
1677939220992,Definition Arbeitsunfall,Wenn eine versicherte Person sich bei einer ve...


In [50]:
template = latex_jinja_env.get_template('English-LaTeX-beamer-template-EMC_IMT_FEIT_OVGU_Stimulate/main.tex') # load latex template with jinja instructions
output_path = "../../../ressources/image-to-text-templates/English-LaTeX-beamer-template-EMC_IMT_FEIT_OVGU_Stimulate/test.tex" # define output path
render = template.render(data=df.iloc[50:70]) # Render new latex file from latex template with jinja (only rows 50-69 as a sample)
# Write the file explicitly as UTF-8: the flashcards contain non-ASCII characters (e.g. German
# umlauts) and the platform default encoding is not guaranteed to be able to represent them
with open(output_path, "w", encoding="utf-8") as f:
	f.write(render) # write latex code to file

In [51]:
import os
import subprocess

cwd = "../../../ressources/image-to-text-templates/English-LaTeX-beamer-template-EMC_IMT_FEIT_OVGU_Stimulate/"
# pdflatex runs inside the template directory, so the .tex file has to be addressed relative
# to that directory and not relative to the directory of this notebook
tex_file = os.path.relpath(output_path, start=cwd)
# compile produced latex file
# The command is passed as an argument list without a shell, so paths that contain spaces
# don't need any quoting and can't break the command
compile_result = subprocess.run(
	[
		"pdflatex",
		"-file-line-error",
		"-interaction=nonstopmode",
		"-synctex=1",
		"-output-format=pdf",
		"-output-directory=/Users/I516998/Documents/uni - mannheim/ankinator/ankinator-flashcard-models/out",
		tex_file,
	],
	cwd=cwd,
	check=False, # LaTeX errors are reported through the return code shown below instead of an exception
)
compile_result.returncode # 0 means that pdflatex finished without errors

This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex)
 restricted \write18 enabled.
entering extended mode

(../../../ressources/image-to-text-templates/English-LaTeX-beamer-template-EMC_
IMT_FEIT_OVGU_Stimulate/test.tex
LaTeX2e <2022-11-01> patch level 1
L3 programming layer <2023-02-22>
(/usr/local/texlive/2023/texmf-dist/tex/latex/beamer/beamer.cls
Document Class: beamer 2023/02/20 v3.69 A class for typesetting presentations
(/usr/local/texlive/2023/texmf-dist/tex/latex/beamer/beamerbasemodes.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/etoolbox/etoolbox.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/beamer/beamerbasedecode.sty))
(/usr/local/texlive/2023/texmf-dist/tex/generic/iftex/iftex.sty)
(/usr/local/texlive/2023/texmf-dist/tex/latex/beamer/beamerbaseoptions.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics/keyval.sty))
(/usr/local/texlive/2023/texmf-dist/tex/latex/geometry/geometry.sty
(/usr/local/texlive/2023/texmf-dist/

0

- The process above covers the generation of the synthetic slides. This is not the final code: the notebook has been converted into an easy-to-use class that can be executed in the background and automatically produces the slides from multiple templates. The class can be found here: src/datageneration/pdf_slide_generator.py. It is readable and documented, so you can take a look at it to find additional valuable information beyond what is included in this notebook.