# Genizah Medical Data

Some of the metadata descriptions for the [Cairo Genizah](https://cudl.lib.cam.ac.uk/collections/genizah/) fragments are medical in nature, for example [T-S Ar.43.324](https://cudl.lib.cam.ac.uk/view/MS-TS-AR-00043-00324/1).

We'd like to analyse the descriptions of these fragments to see what we can learn about medicine.

This repository's `medical-data` dir contains `genizah-tei.tar.lz`, which is a collection of all of the Genizah TEI metadata. (This file is generated by [bundle-genizah-tei.sh](../medical-data/bundle-genizah-tei.sh).)

In [1]:
import re
import sys
import tarfile
import warnings

from lxml import etree
import pandas as pd

In [2]:
# Suppress warnings about messy metadata
class GenizahDataWarning(UserWarning):
    '''Warning category raised for irregular Genizah TEI metadata.'''


# Silence the whole category so messy records do not clutter the output.
warnings.filterwarnings(action='ignore', category=GenizahDataWarning)

In [3]:
# Open the TEI bundle once; medical_elements() reads its entries from this
# module-level handle, so it is intentionally left open.
bundle = tarfile.open('../medical-data/genizah-tei.tar.lz')

Define some functions to work with the TEI metadata.

Unfortunately, the metadata uses namespaces in some files and not in others, so we need to match TEI elements with no namespace as well as those in the TEI namespace.

In [7]:
# Prefix-to-URI map handed to lxml when evaluating namespaced XPath expressions.
namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
# Subject URI that a keywords <ref target="..."> must carry for a record to
# be treated as a Genizah item (see is_genizah_item).
genizah_subject = 'http://id.loc.gov/authorities/subjects/sh85018717.html'

# Absolute XPath prefixes shared by the accessor functions in this cell.
filedesc = '/tei:TEI/tei:teiHeader/tei:fileDesc'
ms_desc = f'{filedesc}/tei:sourceDesc/tei:msDesc'


def _strip_tei_ns(expr):
    return re.sub(r'\btei:', '', expr)


def _xpath_optional_ns(el, expr):
    '''Evaluate an xpath expression with and without the tei namespace.

    The namespaced form is tried first. If its result is falsy (empty
    node list, empty string, False) the same expression is re-evaluated
    with every ``tei:`` prefix stripped, and that result is returned.
    '''
    namespaced_result = el.xpath(expr, namespaces=namespaces)
    if namespaced_result:
        return namespaced_result
    return el.xpath(_strip_tei_ns(expr))


def is_genizah_item(root_el):
    '''Return whether the record's keywords reference the Genizah subject.

    True when any ``tei:ref`` below ``tei:textClass/tei:keywords`` has a
    ``target`` attribute equal to ``genizah_subject``.
    '''
    keywords_path = ('/tei:TEI/tei:teiHeader/tei:profileDesc/'
                     'tei:textClass/tei:keywords')
    subject_ref = f'tei:ref[@target="{genizah_subject}"]'
    return _xpath_optional_ns(
        root_el, f'boolean({keywords_path}//{subject_ref})')


def is_medical_item(root_el):
    '''Return whether the record's title contains "medical" (any case).'''
    lowered_title = get_title(root_el).lower()
    return 'medical' in lowered_title


def get_title(root_el):
    '''Return the whitespace-normalised title of the first msItem.

    The result is an empty string when no such title is found.
    '''
    title_path = f'{ms_desc}/tei:msContents/tei:msItem[1]/tei:title'
    return _xpath_optional_ns(root_el, f'normalize-space({title_path})')


def get_summary(root_el):
    '''Return the whitespace-normalised msContents summary text.

    The result is an empty string when the record has no summary.
    '''
    summary_path = f'{ms_desc}/tei:msContents/tei:summary'
    return _xpath_optional_ns(root_el, f'normalize-space({summary_path})')


def get_date_range(root_el):
    '''Return the (notBefore, notAfter) origin date range of a record.

    Only the first ``tei:date`` found under ``tei:history/tei:origin`` is
    consulted, and the attribute values are returned as the raw strings
    stored in the TEI.

    Returns ``None`` when the record has no origin date element. If the
    element lacks ``notBefore`` or ``notAfter``, the missing bound is
    returned as ``None`` and a GenizahDataWarning is issued, rather than
    raising ``KeyError`` and aborting the whole load over one messy record.
    '''
    dates = _xpath_optional_ns(
        root_el, f'{ms_desc}/tei:history/tei:origin/tei:date[1]')
    if not dates:
        return None

    date = dates[0]
    start = date.attrib.get('notBefore')
    end = date.attrib.get('notAfter')
    if start is None or end is None:
        warnings.warn(
            f'tei:date is missing notBefore and/or notAfter: '
            f'{dict(date.attrib)}',
            GenizahDataWarning)
    return (start, end)

def get_material_type(root_el):
    '''Return the supportDesc ``material`` attribute, or None if absent/empty.'''
    material_attr = (f'{ms_desc}'
                     '/tei:physDesc/tei:objectDesc/tei:supportDesc/@material')
    material = _xpath_optional_ns(
        root_el, f'normalize-space({material_attr})')
    return material if material else None
    
def get_fragment_size(root_el):
    '''Return the fragment's (width, height) in centimetres as floats.

    Only ``tei:dimensions`` elements with ``unit="cm"`` and exactly two
    children, a ``tei:height`` and a ``tei:width``, are considered.

    Width and height are looked up by element name, so the result does not
    depend on the order in which the two elements appear in the document.

    Returns ``None`` when no usable dimensions are present. When the values
    are ambiguous (more or fewer than one text value per element) or are
    not numeric, a GenizahDataWarning is issued and ``None`` is returned.
    '''
    dimensions_expr = (
        f'{ms_desc}/tei:physDesc/tei:objectDesc/'
        'tei:supportDesc/tei:extent/'
        'tei:dimensions[@unit="cm"][count(*) = 2][tei:height][tei:width]')
    widths = _xpath_optional_ns(
        root_el, f'{dimensions_expr}/tei:width/child::text()')
    heights = _xpath_optional_ns(
        root_el, f'{dimensions_expr}/tei:height/child::text()')

    if not (widths or heights):
        return None

    try:
        # Unpacking raises ValueError unless there is exactly one value each.
        (width,), (height,) = widths, heights
        return (float(width), float(height))
    except ValueError:
        warnings.warn(
            f'non-numeric value for tei:width or tei:height column: '
            f'{widths + heights}',
            GenizahDataWarning)
        return None


def get_layout(root_el):
    '''Return (columns, lines) for the record's first layout, or None.

    The column count comes from the ``columns`` attribute of the first
    ``tei:layout`` that has one; the line count is the number preceding the
    word "lines" in that element's leading text.

    None is returned when there is no such layout, when ``columns`` is not
    an integer (a GenizahDataWarning is issued), or when the text contains
    no "<number> lines" phrase.
    '''
    matches = _xpath_optional_ns(
        root_el, f'{ms_desc}/tei:physDesc/tei:objectDesc/tei:layoutDesc/tei:layout[@columns]')
    if not matches:
        return None

    first_layout = matches[0]
    raw_columns = first_layout.attrib['columns']
    try:
        column_count = int(raw_columns)
    except ValueError:
        warnings.warn(f'non-integer value for tei:layout columns attribute: {raw_columns}',
                      GenizahDataWarning)
        return None

    line_match = re.search(r'\b(\d+) lines\b', first_layout.text or '')
    if line_match is None:
        return None
    return (column_count, int(line_match.group(1)))
    
    
def filename(path):
    '''Return the last component of *path* without its file extension.

    Everything up to the final ``/`` is dropped, together with a trailing
    ``.ext`` made of lowercase ASCII letters. Paths that do not end in such
    an extension are returned unchanged.
    '''
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    return re.sub(r'^(?:.*/)?([^/]+)\.[a-z]+$', r'\1', path)


def parse_xml(name, file):
    '''Parse *file* as XML and return the resulting lxml tree.

    Some of the TEI files have invalid id attribute values. For those a
    GenizahDataWarning naming *name* is issued and None is returned; any
    other XML syntax error is re-raised.
    '''
    try:
        tree = etree.parse(file)
    except etree.XMLSyntaxError as err:
        reason = str(err)
        is_bad_id = 'xml:id' in reason and 'is not an NCName' in reason
        if not is_bad_id:
            raise
        warnings.warn(f"Ignoring XML file with invalid id attribute: {name}",
                      GenizahDataWarning)
        return None
    return tree


def get_data(path, root_el):
    '''Build one flat record (dict) describing a TEI document.

    The classmark is derived from *path*; every other field is read from
    *root_el*. Date, size and layout fields are None when the respective
    accessor returns nothing.
    '''
    missing_pair = (None, None)
    date_start, date_end = get_date_range(root_el) or missing_pair
    width, height = get_fragment_size(root_el) or missing_pair
    columns, lines = get_layout(root_el) or missing_pair

    return dict(
        classmark=filename(path),
        title=get_title(root_el),
        summary=get_summary(root_el),
        material=get_material_type(root_el),
        date_start=date_start,
        date_end=date_end,
        width=width,
        height=height,
        columns=columns,
        lines=lines,
    )

def medical_elements():
    '''Lazily yield (name, tree) for each medical Genizah record in the bundle.

    Each archive member is parsed with parse_xml and kept only when it is
    both a Genizah item and a medical item. Members that cannot be read as
    files and documents that parse_xml rejects are skipped.
    '''
    for entry in bundle:
        fileobj = bundle.extractfile(entry)
        # extractfile() returns None for members that are neither regular
        # files nor links (e.g. directories); there is nothing to parse.
        if fileobj is None:
            continue

        tree = parse_xml(entry.name, fileobj)
        # Compare against None explicitly rather than relying on the
        # truthiness of an lxml object.
        if tree is None:
            continue

        if is_genizah_item(tree) and is_medical_item(tree):
            yield entry.name, tree

Load descriptions of medical fragments and store them in a pandas data frame.

In [8]:
data = pd.DataFrame.from_records(
    (get_data(path, root) for path, root in medical_elements()),
    index='classmark')
data.head()

Unnamed: 0_level_0,columns,date_end,date_start,height,lines,material,summary,title,width
classmark,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MS-OR-01080-00001-00063,1.0,1899-12-31,0500-01-01,21.2,21.0,paper,"Pharmacopoeia, containing diagrams and symbols...",Medical,14.3
MS-OR-01080-00001-00072,1.0,1899-12-31,0500-01-01,36.4,22.0,vellum,"Discussion of various medical treatments, regi...",Medical,16.8
MS-OR-01080-00001-00081,1.0,1899-12-31,0500-01-01,25.4,12.0,paper,"Medical work on the composition of the body, c...",Medical,16.8
MS-OR-01080-00001-00087,1.0,1233-12-31,1213-01-01,,5.0,paper,Recto: a short medical recipe. Verso: a respon...,Medical,
MS-OR-01080-00002-00070,1.0,1199-12-31,1100-01-01,31.5,35.0,paper,Autograph draft of a medical work by Moses Mai...,Medical,22.8


Create a JSON dataset from our Genizah medical metadata.

In [9]:
with open('../medical-data/genizah-medical.json', 'w', encoding='utf-8') as f:
    data.to_json(f, orient='index')