Converting from a dump of rendered HTML that was in Chameleon's Django CMS database

In [1]:
import hashlib
import json
import string

import parsel

In [2]:
with open('./dump.json', 'r', encoding='utf-8') as f:
    dump = json.load(f)

In [3]:
type(dump), len(dump)

(list, 136)

In [4]:
type(dump[0])

dict

In [5]:
dump[0].keys()

dict_keys(['cmsplugin_ptr_id', 'body'])

In [6]:
', '.join(
    str(x) 
    for x 
    in sorted(len(d['body']) for d in dump)
)

'0, 0, 19, 22, 22, 31, 53, 56, 58, 72, 155, 156, 164, 164, 169, 169, 172, 173, 174, 240, 264, 286, 286, 295, 296, 301, 319, 324, 326, 342, 367, 367, 375, 375, 399, 409, 420, 420, 452, 453, 453, 486, 540, 549, 563, 563, 576, 580, 639, 639, 641, 657, 904, 956, 985, 993, 1074, 1111, 1111, 1161, 1192, 1193, 1619, 1728, 1920, 1920, 2033, 2062, 2076, 2181, 2185, 2185, 2299, 2307, 2336, 2386, 2480, 2515, 2669, 2990, 2996, 3085, 3175, 3175, 3212, 3253, 3287, 3746, 3746, 5250, 5283, 5341, 5462, 5646, 5751, 6327, 6403, 6403, 6433, 6454, 6821, 7143, 7213, 7219, 7365, 7389, 8894, 9021, 9110, 9154, 9218, 9432, 9553, 9586, 10664, 11069, 11702, 11777, 14960, 15820, 15851, 17813, 22932, 24345, 26584, 26674, 26880, 26975, 32532, 32666, 37036, 37717, 43075, 44965, 70808, 70859'

In [7]:
stop_words = {'on', 'and', 'the', 'for', 'a', 'an', 'to'}
def normalize(title):
    words = title.split()
    return '-'.join(''.join(c for c in w.lower() if c in string.ascii_letters) for w in words if w not in stop_words)

In [8]:
def infer_name(body):
    if not body.strip():
        return None

    sel = parsel.Selector(body)
    for hlevel in range(1, 4+1):
        header = sel.xpath('//h{:d}//text()'.format(hlevel)).extract_first()
        if header:
            break
    else:
        return None

    return normalize(header.strip())

In [9]:
docs = {}
seen = set()
for doc in dump:
    if len(doc['body']) < 1000:
        continue
    hash_ = hashlib.md5(doc['body'].encode('utf-8')).hexdigest()
    if hash_ in seen:
        continue
    seen.add(hash_)
    
    name = infer_name(doc['body'])
    if name in docs:
        for n in range(2, 20):
            other_name = '{}-{}'.format(name, n)
            if other_name not in docs:
                name = other_name
                break
        else:
            raise RuntimeError('too many things named "{}"'.format(name))
    
    docs[name] = doc['body']
len(docs)

74

In [10]:
docs.keys()

dict_keys(['nsfcloud-workshop-experimental-support-cloud-computing', 'about', 'openstack-kvm-user-guide', 'hotel-user-guide', 'openstack-load-balancer-as-service-user-guide', 'default-chameleon-environment', 'image-management', 'activating-futuregrid-project', 'openstack-load-balancer-as-service-user-guide-2', 'getting-started', 'using-resource-discovery-api', 'using-resource-discovery-api-2', 'talks-about-chameleon-project', 'a-configurable-experimental-environment-largescale-cloud-research', 'pi-eligibility', 'bare-metal-user-guide', 'project-policies', 'chameleon-documentation', 'about-2', 'standard-cloud-units', 'appliances', 'centos--appliance', 'project-policies-2', 'centos--kvm-sriov-appliance', 'centos--kvm-sriov-appliance-2', 'ironic-user-guide', 'centos--docker-appliance', 'centos--appliance-2', 'centos--docker-appliance-2', 'openstack-kvm-user-guide-2', 'nsfcloud-workshop-experimental-support-cloud-computing-2', 'appliances-2', 'bare-metal-user-guide-2', 'configure-interact-

In [11]:
import os
os.environ.setdefault('PYPANDOC_PANDOC', '/usr/local/bin/pandoc')

import pypandoc

In [12]:
for doc, html in docs.items():
    rst = pypandoc.convert_text(html, 'rst', 'html')
    with open('{}.rst'.format(doc), 'w', encoding='utf-8') as f:
        f.write(rst)