In [1]:
import os
import re
import json

import markdown
from bs4 import BeautifulSoup
from django_tenants.utils import schema_context

from django.apps import apps
from django.db import connection
from django.contrib.contenttypes.models import ContentType

In [2]:

REFERENCE_HREF = re.compile(r'(?:(?:http|https)://dalme.org)?/project/bibliography/#([A-Z0-9]{8})')
BIBLIO_IDS = []


In [3]:
def get_class_name(obj):
    return str(obj.__class__)[8:-2].split('.')[-1]

def get_fields_by_type(model, field_types, as_map=False):
    # Wagtail subclasses Django fields without redefining the method
    # or declaring the __name__ attribute
    # so for certain types we need to use the field's class instead
    # of calling field.get_internal_type()
    filter_by_class = ['RichTextField']

    if not isinstance(field_types, list):
        field_types = [field_types]

    target_types = [t for t in field_types if t not in filter_by_class]
    target_classes = [t for t in field_types if t in filter_by_class]

    fields = []
    for field in model._meta.get_fields():  # noqa: SLF001
        try:
            field_cls_name = get_class_name(field)
            if field.get_internal_type() in target_types or field_cls_name in target_classes:
                fields.append((field.name, field.get_internal_type()))
        except AttributeError:  # it's a GenericForeignKey
            continue
    return fields if as_map else [i[0] for i in fields]


In [4]:

def markdown_to_html(content):
    # <span data-footnote=\"For Bons-Enfants, houses of poor grammar-school students attested across northern France and francophone Flanders (as Douai),
    # Joan M. Reitzel, &quot;[The Medieval House of Bons-Enfants](http://dalme.org/project/bibliography/#ZUMTQ5B9),&quot; _Viator_ 11 (1980): 179-207.\"
    # data-note_id=\"1fe36ff5-2e03-4890-82b1-71090d6333d7\">✱</span>
    soup = BeautifulSoup(markdown.markdown(content), features='lxml')
    references = soup.find_all('a', href=REFERENCE_HREF)
    if references:
        soup = convert_references(soup)

    return str(soup.body.findChildren(recursive=False))

def convert_references(soup):
    for ref in soup.find_all('a', href=REFERENCE_HREF):
        # format: <a data-biblio="54" data-id="R476GUY2" data-reference="(Telliez, 2011)" linktype="reference">consumption</a>
        match = REFERENCE_HREF.fullmatch(ref['href'])
        if match:
            ref['data-biblio'] = biblio_page_id()
            ref_id = match.group(1)
            BIBLIO_IDS.append(ref_id)
            ref['data-id'] = ref_id
            # ref['data-reference'] = TODO: figure out a way to get the citation!
            ref['linktype'] = 'reference'
            del ref['href']

    return soup

def fix_entities(text, page):
    soup = BeautifulSoup(text, features='lxml')
    # references
    references = soup.find_all('a', href=REFERENCE_HREF)
    if references:
        soup = convert_references(soup)

    # footnotes
    footnotes = soup.find_all('span', attrs={'data-footnote': True})
    if footnotes:
        for fn in footnotes:
            # format: <a data-footnote="c847f9da-3780-4085-9426-73e7a0228b3d" linktype="footnote">✱</a>
            fn_content = markdown_to_html(fn['data-footnote'])
            with schema_context('dalme'):
                from public.models import Footnote

                new_footnote = Footnote.objects.create(page=page, text=fn_content)

            fn.name = 'a'
            fn['data-footnote'] = new_footnote.id
            fn['linktype'] = 'footnote'
            del fn['data-note_id']

    return str(soup.body.findChildren(recursive=False))

def process_content_field(field_value, field_type, obj):
    if field_type == 'JSONField':
        if isinstance(field_value, dict):
            content = json.loads(field_value['body'])
            for block in content:
                if isinstance(block, dict) and block.get('type') == 'text':  # noqa: SIM102
                    if block.get('value') and isinstance(block.get('value'), list):
                        block['value'] = fix_entities(block['value'], obj)

            field_value['body'] = json.dumps(content)
            
        else:
            for block in field_value:
                if block.block_type == 'text':
                    block.value = block.to_python(
                        fix_entities(block.get_prep_value(), obj)
                    )

    else:
        field_value = fix_entities(field_value, obj)

    return field_value

def biblio_page_id():
    with schema_context('dalme'):
        from public.models import Bibliography

        return Bibliography.objects.first().page_ptr_id


In [5]:
targets = {
            'wagtailcore': ['revision'],
            'public': [
                # 'bibliography',
                'collection',
                'collections',
                'corpus',
                'essay',
                'featuredinventory',
                'featuredobject',
                'features',
                'flat',
            ],
        }

for app_label, models in targets.items():
    for model_name in models:
        qualified_name = f'{app_label}_{model_name}'
        print(f'Processing {qualified_name}')
        with schema_context('dalme'):
            model = apps.get_model(app_label=app_label, model_name=model_name)
            target_fields = get_fields_by_type(model, ['JSONField', 'RichTextField'], as_map=True)

            for instance in model.objects.all():
                updated_fields = []
                for field_name, field_type in target_fields:
                    field_value = getattr(instance, field_name, None)
                    if field_value:
                        updated_fields.append(field_name)
                        setattr(
                            instance, 
                            field_name, 
                            process_content_field(field_value, field_type, instance),
                        )

                instance.save(update_fields=updated_fields)
                
print(BIBLIO_IDS)

Processing wagtailcore_revision
Processing public_collection


AttributeError: 'StreamChild' object has no attribute 'to_python'

In [10]:
with schema_context('dalme'):
    model = apps.get_model(app_label='public', model_name='Collection')
    target = model.objects.first()
    payload = target.body.raw_data
    
    changes = []
    
    for idx, block in enumerate(payload):
        if block['type'] == 'text':
            block['value'] = fix_entities(block['value'], target)
            changes.append((idx, block))
    
    if changes:
        print(len(changes))
        for change in changes:
            payload.insert(*change)
    
    target.body = payload
    target.save()
        
        

#     for block in payload:
#         if block.block_type == 'text':
#             block.value = block.block.to_python(fix_entities(block.block.get_prep_value(block.value), target))
#             print(block.get_prep_value())
#             print(block.block.raw_data)
    
#     target.body = payload
#     target.save()


    
    

24


TypeError: the JSON object must be str, bytes or bytearray, not RawDataView

In [8]:
print(changes[1])

(2, {'id': '6c96ccc3-06cf-4441-8dd9-5670fe310f43', 'type': 'text', 'value': '[<p data-block-key="z0knd">Since the early Middle Ages, cathedral communities and religious houses maintained ecclesiastical inventories, that is, itemized lists of the objects, books, vestments, and accrued “stuff” stored in the institution’s treasury or sacristy. According to <a data-biblio="54" data-id="6NUFKTEG" linktype="reference">de Mely</a>, the earliest surviving ecclesiastical inventory is that from the abbey of St.-Riquier in Picardy.<a data-footnote="3fa07183-8bc2-4d76-838e-e4a714d40f64" linktype="footnote">✱</a></p>, <p data-block-key="cdgln">The inventories vary in what they record, but most list or briefly detail the relics, books, liturgical objects, and vestments collected and preserved, and often recorded the names of associated donors as the objects passed from the secular world into the sacred. Indeed, since the church is an institution that does not die, objects in ecclesiastical collectio

In [4]:
with schema_context('dalme'):
    from public.models import Collection

    c_page = Collection.objects.first()
    body = c_page.body.get_prep_value()
    for block in body:
        if block.get('type') == 'bibliography':
            block['value'] = block['value']['collection']
    
    biblio_page.body = body
#     biblio_page.body = biblio_page.body.block.to_python(body)

#     biblio_page.save(update_fields=['body'])

NameError: name 'Bibliography' is not defined

In [9]:
with schema_context('dalme'):
    model = apps.get_model(app_label='wagtailcore', model_name='revision')
    target = model.objects.first()
    print(get_class_name(target.content['body']))
#     payload = target.body
#     for block in payload:
#         if block.block_type == 'text':
#             block.value = block.block.to_python(fix_entities(block.block.get_prep_value(block.value), target))
    
#     target.body = payload
#     target.save()


str


In [19]:
with schema_context('dalme'):
    from wagtail.models import Page, Revision
    from public.extensions.team.models import TeamRole, TeamMember
    
    
    ROLES = {
        'Project Team': TeamRole.objects.create(role='Core Team'),
        'Contributors': TeamRole.objects.create(role='Contributor'),
        'Advisory Board': TeamRole.objects.create(role='Advisory Board'),
    }

    people_page = Page.objects.get(title='People').specific
    current_role = None
    
    for block in people_page.body:
        if block.block_type == 'subsection':
            block_value = block.block.get_prep_value(block.value)
            current_role = ROLES.get(block_value.get('subsection'))
        
        elif block.block_type == 'person':
            block_value = block.block.get_prep_value(block.value)
            tm = TeamMember.objects.create(
                name=block_value.get('name'),
                title=block_value.get('job'),
                affiliation=block_value.get('institution'),
                url=block_value.get('url'),
                photo_id=block_value.get('photo'),
            )
            tm.roles.add(current_role)


            




In [41]:
with schema_context('dalme'):
    model = apps.get_model(app_label='public', model_name='Essay')
    target = model.objects.first()
    print(get_class_name(target.body))
#     payload = target.body
#     for block in payload:
#         if block.block_type == 'text':
#             block.value = block.block.to_python(fix_entities(block.block.get_prep_value(block.value), target))
    
#     target.body = payload
#     target.save()


StreamValue
