<a href="https://colab.research.google.com/github/EricaAndreose/enoam_doc/blob/main/TEIfy_doc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from datetime import datetime
from bs4 import BeautifulSoup

Si importano le seguenti librerie:

**datetime**: fornisce classi per manipolare date e orari in modo semplice e potente.

**BeautifulSoup**: utilizzata per analizzare documenti HTML e XML, permettendo di estrarre informazioni e navigare nella struttura del documento.

In [None]:
# FIX AND ALIGN PARAGRAPHS TO TEI
def TEIps(soup):
    """Align every ``<p>`` element in *soup* with TEI, in place.

    For each paragraph the HTML ``id`` is copied to ``xml:id`` and
    ``data-counter`` to ``n``; the HTML-only attributes ``id``, ``class`` and
    ``data-counter`` are then removed.

    Paragraphs are handled independently: one that lacks ``id`` or
    ``data-counter`` simply does not receive the matching TEI attribute, and
    the remaining paragraphs are still converted.  (A blanket ``except`` used
    to stop the whole conversion, silently, at the first such paragraph.)

    Args:
        soup: Parsed document whose paragraphs are rewritten; it must offer
            ``find_all`` the way a BeautifulSoup object does.

    Returns:
        None.
    """
    for p in soup.find_all('p'):
        # Copy the identifiers that exist before dropping the HTML ones.
        if p.has_attr('id'):
            p['xml:id'] = p['id']
        if p.has_attr('data-counter'):
            p['n'] = p['data-counter']
        for attr in ('id', 'class', 'data-counter'):
            if p.has_attr(attr):
                del p[attr]


`TEIps` ha lo scopo di trasformare i tag `p` di un documento HTML in modo che siano allineati con il formato TEI.



In [None]:
# FIX AND ALIGN CURATOR NOTES TO TEI
def TEIcuratornotes(soup):
    """Turn the curator notes list into a TEI ``<noteGrp>``, in place.

    The ``<ol id="curatorNotes">`` element becomes ``<noteGrp>`` (its ``id``
    moves to ``xml:id``) and every ``<li>`` inside it becomes
    ``<note type="footnote">`` with ``id`` moved to ``xml:id`` and ``about``
    copied to ``corresp``.  The RDFa attributes ``about``, ``typeof``,
    ``property`` and ``resource`` are removed from the notes.

    Nothing happens when the document has no curator notes list.  A note
    lacking ``id`` or ``about`` is still converted (without the matching TEI
    attribute) instead of aborting the rest of the conversion.

    Args:
        soup: Parsed document to modify; it must offer ``find`` the way a
            BeautifulSoup object does.

    Returns:
        None.
    """
    curatornotes = soup.find('ol', id='curatorNotes')
    if curatornotes is None:
        return

    for li in curatornotes.find_all('li'):
        li.name = 'note'
        if li.has_attr('id'):
            li['xml:id'] = li['id']
        if li.has_attr('about'):
            li['corresp'] = li['about']
        li['type'] = 'footnote'
        for attr in ('id', 'about', 'typeof', 'property', 'resource'):
            if li.has_attr(attr):
                del li[attr]

    # The list was located through its id, so the attribute is present.
    curatornotes.name = 'noteGrp'
    curatornotes['xml:id'] = curatornotes['id']
    del curatornotes['id']

Questa funzione ha lo scopo di trasformare una lista ordinata di note del curatore in un gruppo di note conforme al formato TEI.

In [None]:
# FIX AND ALIGN MORO NOTES TO TEI
def TEImoronotes(soup):
    """Turn the Moro notes list into a TEI ``<noteGrp>``, in place.

    The ``<ol id="moroNotes">`` element becomes ``<noteGrp>``: its ``id``
    moves to ``xml:id`` and its ``type`` and ``data-alert`` attributes are
    removed.  Every ``<li>`` inside it becomes ``<note type="footnote">`` with
    ``id`` moved to ``xml:id`` and ``about`` copied to ``corresp``; the RDFa
    attributes ``about``, ``typeof``, ``property`` and ``resource`` are
    removed from the notes.

    Nothing happens when the document has no Moro notes list.  A note lacking
    ``id`` or ``about`` is still converted (without the matching TEI
    attribute) instead of aborting the rest of the conversion.

    Args:
        soup: Parsed document to modify; it must offer ``find`` the way a
            BeautifulSoup object does.

    Returns:
        None.
    """
    moronotes = soup.find('ol', id='moroNotes')
    if moronotes is None:
        return

    for li in moronotes.find_all('li'):
        li.name = 'note'
        if li.has_attr('id'):
            li['xml:id'] = li['id']
        if li.has_attr('about'):
            li['corresp'] = li['about']
        li['type'] = 'footnote'
        for attr in ('id', 'about', 'typeof', 'property', 'resource'):
            if li.has_attr(attr):
                del li[attr]

    # The list was located through its id, so the attribute is present.
    moronotes.name = 'noteGrp'
    moronotes['xml:id'] = moronotes['id']
    for attr in ('id', 'type', 'data-alert'):
        if moronotes.has_attr(attr):
            del moronotes[attr]

Lo stesso processo viene eseguito per le note di Moro.

In [None]:
# CONVERT <em> INTO <hi rend="italic">
def TEIitalics(soup):
    """Rewrite every ``<em>`` element as ``<hi rend="italic">``, in place.

    An ``<em>`` whose text is empty once surrounding whitespace is stripped
    is removed from the tree instead of being converted.

    Args:
        soup: Parsed document to modify; it must offer ``find_all`` the way a
            BeautifulSoup object does.

    Returns:
        None.
    """
    for em in soup.find_all('em'):
        # An element nested in an empty <em> that was just removed has been
        # destroyed together with it; there is nothing left to convert.
        if getattr(em, 'decomposed', False):
            continue
        if not em.get_text(strip=True):
            em.decompose()
        else:
            em.name = 'hi'
            em['rend'] = 'italic'


Qui tutte le enfasi del testo (tag `em`) vengono convertite in `hi` con l'attributo `rend="italic"`, come previsto dal formato TEI; i tag `em` privi di testo vengono rimossi.

In [None]:
# CONVERT <a> INTO <ref>
def TEIas(soup):
    """Rewrite every ``<a>`` element as a TEI ``<ref>``, in place.

    ``id`` is moved to ``xml:id`` and ``href`` is moved to ``target``; either
    step is skipped when the attribute is absent.  An anchor without ``href``
    therefore becomes a bare ``<ref>`` and no longer stops the conversion of
    the links that follow it.

    Args:
        soup: Parsed document to modify; it must offer ``find_all`` the way a
            BeautifulSoup object does.

    Returns:
        None.
    """
    for a in soup.find_all('a'):
        a.name = 'ref'
        if a.has_attr('id'):
            a['xml:id'] = a['id']
            del a['id']
        if a.has_attr('href'):
            a['target'] = a['href']
            del a['href']

Qui i link inseriti nel testo con il tag `a href` vengono convertiti in `ref` per la formattazione TEI.

In [None]:
# CONVERT <sup> INTO <hi rend="sup">
def TEIsups(soup):
    """Rewrite every ``<sup>`` element as ``<hi rend="sup">``, in place.

    A ``<sup>`` whose text is empty once surrounding whitespace is stripped
    is removed from the tree instead of being converted.

    Args:
        soup: Parsed document to modify; it must offer ``find_all`` the way a
            BeautifulSoup object does.

    Returns:
        None.
    """
    for sup in soup.find_all('sup'):
        # An element nested in an empty <sup> that was just removed has been
        # destroyed together with it; there is nothing left to convert.
        if getattr(sup, 'decomposed', False):
            continue
        if not sup.get_text(strip=True):
            sup.decompose()
        else:
            sup.name = 'hi'
            sup['rend'] = 'sup'

La stessa cosa viene fatta per i tag `sup`.

In [None]:
# CONVERT <strong> INTO <hi rend="bold">
def TEIstrongs(soup):
    """Rewrite every ``<strong>`` element as ``<hi rend="bold">``, in place.

    A ``<strong>`` whose text is empty once surrounding whitespace is
    stripped is removed from the tree instead of being converted.

    Args:
        soup: Parsed document to modify; it must offer ``find_all`` the way a
            BeautifulSoup object does.

    Returns:
        None.
    """
    for strong in soup.find_all('strong'):
        # An element nested in an empty <strong> that was just removed has
        # been destroyed together with it; there is nothing left to convert.
        if getattr(strong, 'decomposed', False):
            continue
        if not strong.get_text(strip=True):
            strong.decompose()
        else:
            strong.name = 'hi'
            strong['rend'] = 'bold'

La stessa cosa viene fatta per i tag `strong`, convertiti in `hi` con `rend="bold"`.

In [None]:
# REMOVE EMPTY TAGS
def cleaner_empty(soup):
    """Remove childless elements found inside ``<sourceDesc>``.

    Every element under ``soup.sourceDesc`` is inspected once and removed
    when its ``contents`` list is empty at that moment.  Elements outside
    ``<sourceDesc>`` are left alone, and a document without ``<sourceDesc>``
    is returned unchanged.

    Args:
        soup: Parsed TEI document, modified in place.

    Returns:
        The result of ``soup.prettify()`` after the clean-up.
    """
    source_desc = soup.sourceDesc
    if source_desc is not None:
        for tag in source_desc.find_all():
            if not tag.contents:
                tag.decompose()
    return soup.prettify()


Qui si rimuovono i tag privi di contenuto rimasti all'interno di `sourceDesc`.

In [None]:
# CONVERT MENTIONED ENTITIES
def convert_entities(soup, old_soup, typeof):
    """Convert the entities of one RDFa type, and their mentions, to TEI.

    Every element of *old_soup* whose ``typeof`` equals *typeof* yields one
    entry (``person``, ``org``, ``place`` or ``bibl``) in a new list element
    (``listPerson``, ``listOrg``, ``listPlace`` or ``listBibl``) appended to
    ``soup.sourceDesc``.  The entry gets ``xml:id`` (last path segment of the
    entity's ``about`` URI) and ``corresp`` (the URI itself).  The elements of
    *old_soup* sharing that ``about`` value then contribute, according to
    their ``property``:

    * ``rdfs:label`` -> text of the entry's name element (skipped for
      ``bibl`` entries, which have no name element);
    * ``owl:sameAs`` -> the entry's ``sameAs`` attribute;
    * ``dcterms:bibliographicCitation`` -> the entry's own text.

    In *soup*, elements referencing the entity through ``dcterms:references``
    are renamed to the matching name element (left unrenamed for
    ``fabio:Expression``), and those referencing it through
    ``biro:references`` are renamed ``bibl``.  Both receive
    ``ref="#<xml:id>"`` and lose their RDFa/HTML attributes.

    Args:
        soup: TEI document being built; modified in place.
        old_soup: Parsed source HTML holding the entity metadata.
        typeof: One of ``'foaf:Person'``, ``'foaf:Organization'``,
            ``'dcterms:Location'`` or ``'fabio:Expression'``.

    Raises:
        ValueError: If *typeof* is not one of the supported types.
        KeyError: If an entity has no ``about`` attribute, or a metadata
            element lacks the ``content``/``resource`` attribute it is
            expected to carry.
    """
    # typeof -> (list element, entry element, name element).  The name
    # element is used both inside the entry and for inline mentions; it is
    # None for bibliography entries.
    tei_names = {
        'foaf:Person': ('listPerson', 'person', 'persName'),
        'foaf:Organization': ('listOrg', 'org', 'orgName'),
        'dcterms:Location': ('listPlace', 'place', 'placeName'),
        'fabio:Expression': ('listBibl', 'bibl', None),
    }
    if typeof not in tei_names:
        raise ValueError(f'unsupported entity type: {typeof!r}')
    list_name, entry_name, name_tag = tei_names[typeof]

    lst = soup.new_tag(list_name)
    if typeof == 'fabio:Expression':
        lst['xml:id'] = 'bibliographic-references'

    # Attributes that only make sense in the source HTML.
    html_only_attrs = ('about', 'class', 'id', 'property', 'resource', 'typeof')

    for entity in old_soup.find_all(attrs={'typeof': typeof}):
        uri = entity['about']
        xml_id = uri.split('/')[-1]
        ref = f'#{xml_id}'

        tag = soup.new_tag(entry_name)
        name = None
        if name_tag is not None:
            name = soup.new_tag(name_tag)
            tag.append(name)

        # Collect everything before mutating: the attributes used for the
        # lookups are removed from the matched elements further down.
        mentions = soup.find_all(attrs={'property': 'dcterms:references', 'resource': uri})
        metas = old_soup.find_all(attrs={'about': uri})
        bibrefs = soup.find_all(attrs={'property': 'biro:references', 'resource': uri})

        tag['xml:id'] = xml_id
        tag['corresp'] = uri
        for meta in metas:
            if not meta.has_attr('property'):
                continue
            prop = meta['property']
            if prop == 'rdfs:label':
                if name is not None:
                    name.string = meta['content']
            elif prop == 'owl:sameAs':
                tag['sameAs'] = meta['resource']
            elif prop == 'dcterms:bibliographicCitation':
                tag.string = meta['content']

        for mention in mentions:
            if name_tag is not None:
                mention.name = name_tag
            mention['ref'] = ref
            for attr in html_only_attrs:
                if mention.has_attr(attr):
                    del mention[attr]

        for bibref in bibrefs:
            bibref.name = 'bibl'
            bibref['ref'] = ref
            for attr in html_only_attrs:
                if bibref.has_attr(attr):
                    del bibref[attr]

        lst.append(tag)

    # The list is added even when no entity of this type was found.
    soup.sourceDesc.append(lst)

In [None]:
# INSERT METADATA
def add_metadata(soup, old_soup, metadata):
    """Fill the TEI header of *soup* with the document metadata, in place.

    Writes title, author, curator (``<principal>``), identifier
    (``<idno>``), abstract and revision status, adds a ``<change>`` element
    dated today, and fills the ``target`` of each ``<catRef>`` whose
    ``scheme`` is one of the Moro vocabularies (types, subjects, roles) with
    a space-separated list of URIs.  ``<catRef>`` elements with any other
    scheme are left untouched.

    Args:
        soup: TEI document being built; modified in place.
        old_soup: Parsed source HTML; its element with
            ``name="dcterms.identifier"`` supplies the identifier.
        metadata: Mapping read with the keys ``title``, ``author``,
            ``curator``, ``abstract``, ``docstatus``, ``doctypeList``,
            ``doctopicList`` and ``roleList``.

    Raises:
        KeyError: If *metadata* lacks one of the keys above (the three list
            keys are only read when the matching ``<catRef>`` exists).

    NOTE(review): the PROVENANCE and DATE cells that follow this one are
    indented and use ``soup``/``metadata``, so they look like the rest of
    this function body - keep them directly after it.
    """
    # TITLE
    soup.title.insert(0, metadata['title'])

    # AUTHOR
    soup.author.insert(0, metadata['author'])

    # CURATOR
    soup.principal.insert(0, metadata['curator'])

    # IDENTIFIER
    identifier = old_soup.find(attrs={'name': 'dcterms.identifier'})
    soup.idno.insert(0, identifier['content'])

    # ABSTRACT
    soup.abstract.insert(0, metadata['abstract'])

    # STATUS
    soup.revisionDesc['status'] += metadata['docstatus']
    change = soup.new_tag('change')
    change.string = datetime.now().strftime('%Y-%m-%d')
    soup.revisionDesc.insert(0, change)

    # DOCUMENT TYPE, DOCUMENT SUBJECT, AUTHOR ROLE
    types_scheme = 'https://www.w3id.org/moro/voc/types/'
    subjects_scheme = 'https://www.w3id.org/moro/voc/subjects/'
    roles_scheme = 'https://www.w3id.org/moro/voc/roles/'
    for catref in soup.find_all('catRef'):
        scheme = catref['scheme'] if catref.has_attr('scheme') else None
        if scheme == types_scheme:
            terms = [doctype.replace('.', '')[3:] for doctype in metadata['doctypeList']]
        elif scheme == subjects_scheme:
            terms = [subject.replace('doctopic.', 'subject') for subject in metadata['doctopicList']]
        elif scheme == roles_scheme:
            terms = [role.replace('.', '') for role in metadata['roleList']]
        else:
            # Not one of the Moro vocabularies: keep its target as it is.
            continue
        # Joining avoids the trailing space that previously had to be cut
        # off, which also clipped targets that had not been extended.
        existing = catref['target'] if catref.has_attr('target') else ''
        catref['target'] = existing + ' '.join(scheme + term for term in terms)

In [None]:
# PROVENANCE
    # Continuation of the add_metadata body: this cell is indented to sit
    # inside that function and uses its ``soup`` and ``metadata``.
    # Provenance is optional.  One <bibl> is appended to the first <listBibl>
    # of <sourceDesc> for every entry of metadata['provenance']; a missing or
    # empty entry, or a document without that list, is skipped explicitly
    # rather than through a blanket ``except``.
    # NOTE(review): assumes ``metadata`` is a dict (uses ``.get``) - confirm.
    provenance = metadata.get('provenance') or []
    source_desc = soup.sourceDesc
    source_list = source_desc.listBibl if source_desc is not None else None
    if source_list is not None:
        for prov in provenance:
            bibl = soup.new_tag('bibl')
            bibl.string = prov
            source_list.append(bibl)

In [None]:
# DATE AND SPATIAL COVERAGE
    # Continuation of the add_metadata body: this cell is indented to sit
    # inside that function and uses its ``soup`` and ``metadata``.
    # Event place and event date are both optional and independent: each one
    # that is present and non-empty becomes a child of a single <xenoData>
    # element appended to <teiHeader>.  They are read with ``.get`` so that a
    # missing 'eventPlace' key no longer discards a valid 'eventDate'.
    # NOTE(review): assumes ``metadata`` is a dict (uses ``.get``) - confirm.
    eventplace = metadata.get('eventPlace')
    eventdate = metadata.get('eventDate')
    if eventplace or eventdate:
        xenodata = soup.new_tag('xenoData')

        if eventplace:
            coverage = soup.new_tag('dcterms:spatial')
            coverage.string = eventplace
            xenodata.append(coverage)

        if eventdate:
            if '-' in eventdate:
                # Drop one trailing '--' or '-', then zero-pad the
                # one-character components: '1978-5-' -> '1978-05'.
                if eventdate.endswith('--'):
                    eventdate = eventdate[:-2]
                elif eventdate.endswith('-'):
                    eventdate = eventdate[:-1]
                eventdate = '-'.join(
                    '0' + part if len(part) == 1 else part
                    for part in eventdate.split('-')
                )

            coverage = soup.new_tag('dcterms:date')
            coverage.string = eventdate
            xenodata.append(coverage)
        soup.teiHeader.append(xenodata)

In [None]:
# CONVERT QUOTES
def convert_quotes(soup):
    """Rename every ``doco:TextChunk`` element of *soup* to ``<quote>``.

    The elements are selected by ``typeof="doco:TextChunk"``; after the
    rename their ``about``, ``class``, ``id`` and ``typeof`` attributes are
    deleted.  The document is modified in place and nothing is returned.
    """
    html_only_attrs = ('about', 'class', 'id', 'typeof')
    for chunk in soup.find_all(attrs={'typeof': 'doco:TextChunk'}):
        chunk.name = 'quote'
        for attr in html_only_attrs:
            del chunk[attr]

In [None]:
# MAIN FUNCTION FOR CONVERTING HTML INTO TEI-XML
def teify(html_path, metadata):
    """Convert an annotated HTML file into a TEI-XML document.

    The HTML file is parsed, a TEI skeleton is created and filled with
    *metadata*, the HTML ``<body>`` content is moved into the TEI ``<body>``,
    headings are mapped to ``<head>`` and the remaining HTML markup is
    rewritten by the conversion functions defined earlier in this file.

    Args:
        html_path: Path of the UTF-8 encoded HTML file to convert.
        metadata: Metadata mapping handed to ``add_metadata``.

    Returns:
        The TEI document serialised with ``str()``.
    """
    # OPEN FILE AND CREATE SOUP
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(html_path, encoding='utf-8') as fp:
        old_soup = BeautifulSoup(fp, 'html.parser')

    # CREATE NEW TEI SOUP
    # The skeleton is written as adjacent literals, one group of elements
    # per line.  The line break inside <correction> is spelled '\n' so that
    # the single-quoted literal no longer has to span two source lines.
    tei = (
        '<?xml version="1.0" encoding="UTF-8"?>'
        '<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:dcterms="http://purl.org/dc/terms/">'
        '<teiHeader><fileDesc>'
        '<titleStmt><title></title><author></author><principal></principal></titleStmt>'
        '<publicationStmt><publisher>Università di Bologna</publisher><pubPlace>Bologna</pubPlace><date>2021</date><idno type="DOI"></idno>'
        '<availability><licence target="http://creativecommons.org/licenses/by-nc/4.0/">Creative Commons Attribuzione - Non commerciale 4.0 Internazionale</licence></availability></publicationStmt>'
        '<sourceDesc><listBibl xml:id="sources"></listBibl></sourceDesc></fileDesc>'
        '<encodingDesc><projectDesc>Il documento è stato annotato per essere indicizzato all\'interno dell\'Edizione Nazionale delle Opere di Aldo Moro.</projectDesc>'
        '<editorialDecl><correction status="high" method="silent">Gli unici interventi redazionali, peraltro marginali, sono eseguiti per correggere evidenti refusi nel processo di stampa, le sviste banali, le omissioni di punteggiatura e piccole, lapalissiane correzioni e integrazioni a lacune di testo che si rendono necessarie. Quando indispensabile, una preposizione o una congiunzione mancante si inserisce tra parentesi quadre. Gli ulteriori interventi nel testo vanno segnalati con il segno grafico convenzionale delle parentesi quadre, applicato raramente anche alla punteggiatura, solamente quando rischia di compromettere la corretta comprensione del testo. \n'
        'In tutti gli altri casi, occorre segnalare ugualmente con il segno grafico consueto del «sic» tra parentesi quadre, per evocare un intervento dei curatori.</correction>'
        '<normalization>Si modificano gli accenti gravi, che sono stati resi in acuti, anche perché corrispondono ai limiti degli impianti tipografici utilizzati all’epoca.</normalization></editorialDecl></encodingDesc>'
        '<profileDesc><abstract></abstract><textClass>'
        '<catRef scheme="https://www.w3id.org/moro/voc/types/" target=""/>'
        '<catRef scheme="https://www.w3id.org/moro/voc/subjects/" target=""/>'
        '<catRef scheme="https://www.w3id.org/moro/voc/roles/" target=""/>'
        '</textClass></profileDesc>'
        '<revisionDesc status=""></revisionDesc></teiHeader>'
        '<text><body><head></head></body></text></TEI>'
    )
    soup = BeautifulSoup(tei, 'xml')

    # INSERT METADATA IN TEI SOUP
    add_metadata(soup, old_soup, metadata)

    # POPULATE NEW BODY
    # The HTML <body> is placed after the TEI <head> and then unwrapped, so
    # that only its children remain inside the TEI <body>.
    if old_soup.find('body'):
        soup.body.insert(1, old_soup.body)
        soup.body.body.unwrap()

    # CONVERT <h1>, <h2>, ... INTO <head>
    # The first <h1> supplies the text of the TEI <head> and is removed;
    # every <h2> is renamed <head> where it stands.
    h1 = soup.find('h1')
    if h1:
        soup.head.string = h1.get_text(strip=True)
        h1.decompose()
    for h in soup.find_all('h2'):
        h.name = 'head'

    # LAUNCH OTHER FUNCTIONS
    convert_entities(soup, old_soup, 'foaf:Person')
    convert_entities(soup, old_soup, 'foaf:Organization')
    convert_entities(soup, old_soup, 'dcterms:Location')
    convert_entities(soup, old_soup, 'fabio:Expression')
    convert_quotes(soup)
    TEIps(soup)
    TEIitalics(soup)
    TEIstrongs(soup)
    TEIas(soup)
    TEIsups(soup)
    TEIcuratornotes(soup)
    TEImoronotes(soup)
    # Called for its clean-up effect only; the prettified string it returns
    # is not used.
    cleaner_empty(soup)

    # RETURN TEI AS STRING
    return str(soup)