In [3]:
from bs4 import BeautifulSoup
# Print the BeautifulSoup class docstring for quick reference.
print(BeautifulSoup.__doc__)


    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    


In [121]:
import glob, os
#function 

def get_fields_from_bs(bs_object, field_dict):
    """Extract the text of every element matching each field's CSS selector.

    Args:
        bs_object: a parsed document exposing ``select(selector)``, which
            returns an iterable of elements that each have a ``.text``
            attribute (for example a BeautifulSoup object).
        field_dict: mapping of output field name -> CSS selector string
            handed to ``bs_object.select``.

    Returns:
        dict mapping every field name to a list with the ``.text`` of each
        matched element (an empty list when nothing matches), or to ``''``
        when the selector could not be evaluated for that field.
    """
    row = {}
    for field_name, selector in field_dict.items():
        try:
            field_data = [element.text for element in bs_object.select(selector)]
        except Exception:
            # Best effort: a selector that cannot be evaluated yields an empty
            # value instead of aborting the whole row. Only ``Exception`` is
            # caught so Ctrl-C / SystemExit still interrupt a long run.
            field_data = ''
        row[field_name] = field_data
    return row

# Example call. NOTE: a soup object named `bs` must already exist when this
# cell runs; in this notebook it is only created in later cells (e.g. the
# "example code" cell that parses the EAD file), so run one of those first.
get_fields_from_bs(bs, {'collection_title': 'archdesc[\'level\'=\'collection\'] > did > unittitle'})
# Other selectors that were tried:
# 'subject_headings':'controlaccess > *'
# "repository":"origination['label'='creator'] > *"

{'collection_title': ['American Left Ephemera Collection']}

In [106]:
from pymarc import MARCReader, record_to_xml

def get_bs_from_xml(_dir, source_type):
    """Parse every source file in a folder into BeautifulSoup objects.

    Args:
        _dir: folder holding the source files (with or without a trailing
            path separator).
        source_type: 'ead' or 'mods' to read ``*.xml`` files, or 'marc' to
            read binary ``*.mrc`` files.

    Returns:
        A list of BeautifulSoup objects: one per XML file, or one per MARC
        record (each record is converted to MARCXML first). The list is empty
        when the folder contains no matching files.

    Raises:
        ValueError: if ``source_type`` is not 'ead', 'mods' or 'marc'.
    """
    if source_type == 'marc':
        pattern = '*.mrc'
    elif source_type in ('mods', 'ead'):
        pattern = '*.xml'
    else:
        raise ValueError(
            "source_type must be 'ead', 'mods' or 'marc', got %r" % (source_type,)
        )

    bs_objects = []
    # Sorted so the order of the returned objects is reproducible between runs.
    for filename in sorted(glob.glob(os.path.join(_dir, pattern))):
        if source_type == 'marc':
            with open(filename, 'rb') as fh:
                for record in MARCReader(fh):
                    if record is None:
                        # Recent pymarc versions yield None for a record they
                        # could not parse; skip it rather than crash.
                        continue
                    xml = record_to_xml(record).decode("utf-8")
                    bs_objects.append(BeautifulSoup(xml))
        else:
            with open(filename) as f:
                bs_objects.append(BeautifulSoup(f.read()))
    # Return only after every file has been processed.
    return bs_objects
    

In [124]:
def base_layer_maker(location, collection_type, collection_subtype):
    """Build the base-layer field data for one source collection.

    Creates ``base-layers/<location>`` when it does not exist yet. For
    'archive' collections it reads the EAD file(s) in
    ``source-data/<location>/ead/`` and the MODS files in
    ``source-data/<location>/mods/`` and extracts the configured fields.

    Args:
        location: a folder name only that can be found in source-data, such
            as 'american-left-ephemera'.
        collection_type: controlled vocabulary: 'archive', 'serial' or
            'monograph'.
        collection_subtype: 'digital' or 'print'. Not used by the current
            implementation.

    Returns:
        For 'archive': a dict mapping each collection field name to the list
        of strings extracted for it (or '' when the selector could not be
        evaluated). When several EAD files are present their values are
        concatenated into one flat list per field.
        For 'serial' and 'monograph': True (placeholders, no data is read).

    Raises:
        ValueError: if ``collection_type`` is not in the controlled vocabulary.
    """
    valid_types = ('archive', 'serial', 'monograph')
    if collection_type not in valid_types:
        # Checked before touching the file system so a typo leaves no stray folder.
        raise ValueError(
            "collection_type must be one of %s, got %r" % (valid_types, collection_type)
        )

    #create subdirectory in base-layers for that location
    newdir = "base-layers/" + location
    # makedirs also creates a missing 'base-layers' parent and is a no-op
    # when the folder already exists.
    os.makedirs(newdir, exist_ok=True)

    if collection_type == 'archive':

        #set collection_dir
        collection_dir = "source-data/%s/ead/" % location
        #retrieve relevant BeautifulSoup object(s) as list
        collection_data = get_bs_from_xml(collection_dir, 'ead')

        #set item_dir
        item_dir = "source-data/%s/mods/" % location
        #retrieve relevant BeautifulSoup object(s) as list
        item_data = get_bs_from_xml(item_dir, 'mods')

        #make datastores of base fields you want
        collection_fields = {'identifier': 'eadid',
                             'finding_aid_title':'titleproper',
                             'acquisition_number':'num',
                             'finding_aid_creator': 'author',
                             'repository':'repository > corpname',
                             'publisher': 'publisher',
                             'date_of_publication':'publicationstmt>date',
                             'date_of_creation': 'profiledesc > creation > date',
                             'collection_title': 'archdesc[\'level\'=\'collection\'] > did > unittitle',
                             # one or many
                             'extent': 'physdesc > extent',
                             'temporal_coverage': 'archdesc[\'level\'=\'collection\'] > did > unitdate',
                             # one or many, 1 child per
                             'collection_creator': 'origination[\'label\'=\'creator\'] > *',
                             'conditions_governing_use': 'userestrict > p',
                             # one or many, one p per
                             'related material': 'relatedmaterial > p',
                             # one or many
                             'collection_scope_and_content': 'archdesc > scopecontent > p',
                             # has em tags, one or many)
                             'biography_or_history': 'bioghist > p',
                             'preferred_citation': 'prefercite > p',
                             'subject_headings': 'controlaccess > *',
                            }

        ### get c01 - c09, walk down series, subseries, and otherlevel
                             # get did> unitid , did> unittitle
                             # scopecontent > p (one or many)
                             # series_type is level attribute
                             #'series_titles':,
                             #series_numbers':,
                             #'series_types':,
                             #'series_scope_and_content':,

        collection_output_rows = {}
        for soup in collection_data or []:
            row = get_fields_from_bs(soup, collection_fields)
            for key, value in row.items():
                existing = collection_output_rows.get(key)
                if isinstance(existing, list) and isinstance(value, list):
                    # Keep one flat list of strings per field across files
                    # instead of nesting later files' lists inside the first.
                    existing.extend(value)
                elif isinstance(existing, list):
                    # Later file produced no usable value ('') - keep what we have.
                    continue
                else:
                    # First value for this field, or replacing an empty '' placeholder.
                    collection_output_rows[key] = list(value) if isinstance(value, list) else value

        # NOTE(review): the 'identifier' entry is a find()-style argument list,
        # while get_fields_from_bs passes each value to select(); confirm the
        # intended selectors before relying on the item rows.
        item_fields = {'title':'mods:title', 'identifier': ['mods:identifier', {'type':'pitt'}]}

        # One entry per MODS file for every field. Not returned yet; kept for
        # the planned items.csv export.
        item_output_rows = {}
        for soup in item_data or []:
            row = get_fields_from_bs(soup, item_fields)
            for key, value in row.items():
                item_output_rows.setdefault(key, []).append(value)

        #TODO: convert output_rows to pandas dataframe and use to_csv()
        #TODO: write row to yml file
        return collection_output_rows

    # 'serial' and 'monograph' are placeholders for now.
    return True
            
#function call example
# For 'archive' the function returns the dict of collection-level fields.
result = base_layer_maker('american-left-ephemera', 'archive', 'digital')

In [125]:
result

{'identifier': ['US-PPiU-ais200711'],
 'finding_aid_title': ['Guide to the American Left Ephemera Collection, 1875-2015\n                    AIS.2007.11\n'],
 'acquisition_number': ['AIS.2007.11'],
 'finding_aid_creator': ['Finding aid prepared by Lindsay Bedford and Patrick Trembeth with assistance provided by Dr. Richard Oestreicher.'],
 'repository': ['ULS Archives & Special Collections'],
 'publisher': ['ULS Archives & Special Collections'],
 'date_of_publication': '',
 'date_of_creation': ['2017-08-29T07:46-0400'],
 'collection_title': ['American Left Ephemera Collection'],
 'extent': ['22.75 linear feet', '(37 boxes)', '\n', '\n3'],
 'temporal_coverage': ['1875-2015'],
 'collection_creator': ['Oestreicher, Richard Jules, 1947-'],
 'conditions_governing_use': ['The University of Pittsburgh holds the property rights to the material in this collection, but the copyright may still be held by the original creator/author. Researchers are therefore advised to follow the regulations set 

In [31]:
# NOTE(review): this cell appears to predate the current base_layer_maker,
# which returns a dict of fields; `result[1]` presumably indexed a list of
# soup objects returned by an earlier version - confirm before re-running.
#result[0].find('mods:title')
fields = {'title':'mods:title', 'identifier': ['mods:identifier', {'type':'pitt'}]}
get_fields_from_bs(result[1], fields)

"Delegate, Special Convention Communist Party, U.S.A., July 4-7, 1968"
31735061659946


{'title': '"Delegate, Special Convention Communist Party, U.S.A., July 4-7, 1968"',
 'identifier': '31735061659946'}

In [24]:
#TO DO
        
#feed it a location, and a kind of collections data ... DONE
#it writes files, returns True at the end ... DONE
#where to get the info changes, columns change ... DONE

#metadata.yml
## for archival ... ead, mods to yaml
## for a monograph collection, every marc in the folder (write an empty yaml with one field for "description")
## for archival collections, put series.csv yaml? 
## for series? Need to finish base layer

#csv items
## for archival, items.csv
#### columns: 
## for monographs, items.csv
#### columns:
## for serials, items.csv
#### columns:

#csv columns are the base data layer items for that kind ... See Base Data Layer Config on Google Drive
#finish and push

In [98]:
#example code
# Read one EAD finding aid and one MODS item record and parse each into a
# soup object. `bs` and `bs_mods` are used by other cells in this notebook.
_dir = "source-data/american-left-ephemera/ead/pitt_US-PPiU-ais200711_EAD.xml"
with open(_dir) as f:
    ead = f.read()
    
# No parser is named here; the displayed `bs_mods` output shows the document
# wrapped in <html><body> with lower-cased tag names.
bs = BeautifulSoup(ead)


# `_dir` is reused for the MODS file path.
_dir = "source-data/american-left-ephemera/mods/pitt_31735051654956_MODS.xml"
with open(_dir) as f:
    mods = f.read()
bs_mods = BeautifulSoup(mods)

In [99]:
bs_mods

<html><body><mods:mods xmlns="http://www.loc.gov/mods/v3" xmlns:mods="http://www.loc.gov/mods/v3" xmlns:ns2="http://www.w3.org/1999/xlink" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemalocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-3.xsd http://www.cdlib.org/inside/diglib/copyrightMD http://www.cdlib.org/groups/rmg/docs/copyrightMD.xsd">
<mods:titleinfo>
<mods:title>"Resistance At Penn" Flyer,</mods:title>
</mods:titleinfo>
<mods:name>
<mods:namepart>University of Pittsburgh</mods:namepart>
<mods:role>
<mods:roleterm type="text">depositor</mods:roleterm>
</mods:role>
</mods:name>
<mods:typeofresource>text</mods:typeofresource>
<mods:genre>archival document</mods:genre>
<mods:origininfo>
<mods:dateother type="display">April 11 1967</mods:dateother>
<mods:datecreated encoding="iso8601" keydate="yes">1967-04-11</mods:datecreated>
<mods:dateother type="sort">1967-04-11T00:00:00</mods:dateother>
<

In [60]:
# Pull the title element and the text of the Pitt identifier from the MODS
# soup. `bs_mods` is the soup built from the MODS example file; the name
# `mods_bs` used previously is not defined anywhere in this notebook.
title = bs_mods.find('mods:title')
identifier = bs_mods.find('mods:identifier', {'type':'pitt'}).text

#<mods:identifier type="pitt">31735051654956</mods:identifier>

'31735051654956'

In [23]:
ead[:1000]

'<mods:mods xmlns:ns2="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:mods="http://www.loc.gov/mods/v3" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.loc.gov/mods/v3" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-3.xsd http://www.cdlib.org/inside/diglib/copyrightMD http://www.cdlib.org/groups/rmg/docs/copyrightMD.xsd">\n  <mods:titleInfo>\n    <mods:title>"Resistance At Penn" Flyer,</mods:title>\n  </mods:titleInfo>\n  <mods:name>\n    <mods:namePart>University of Pittsburgh</mods:namePart>\n    <mods:role>\n      <mods:roleTerm type="text">depositor</mods:roleTerm>\n    </mods:role>\n  </mods:name>\n  <mods:typeOfResource>text</mods:typeOfResource>\n  <mods:genre>archival document</mods:genre>\n  <mods:originInfo>\n    <mods:dateOther type="display">April 11 1967</mods:dateOther>\n    <mods:dateCreated keyDate="yes" encoding="iso8601">1967-04-11</mods:dateCreated>\n    <mods:dateOthe

In [35]:
# The container note is a MODS element, so query the MODS soup (`bs_mods`);
# `ead_bs` is not defined anywhere in this notebook.
container = bs_mods.find('mods:note', {'type':'container'})

In [41]:
# Break the container note text on commas, then drop the "Box " / "Folder "
# labels from the first and second pieces respectively.
container_str = container.text
container_parts = container_str.split(",")
box = container_parts[0].replace("Box ", "")
folder = container_parts[1].replace("Folder ", "")

In [112]:
#title author split (experiment)
import os
_mods_dir = "source-data/american-left-ephemera/mods/"
results = []
for dirName, subdirList, fileList in os.walk(_mods_dir):
    for f in fileList:
        with open(dirName+f) as mf:
            xml = mf.read()
            bs = BeautifulSoup(xml, 'xml')
            title_author = bs.find('title')
            try:
                container = bs.find('note', {'type':'container'})
                container_str = container.text
                box = container_str.split(",")[0].replace("Box ", "")
                folder = container_str.split(",")[1].replace("Folder ", "")
            except:
                print(f)
            try: 
                results.append(title_author.text)
            except:
                print(dirName+f)
            

pitt_31735065151247_MODS.xml
pitt_31735067913180_MODS.xml
pitt_31735068355142_MODS.xml
pitt_31735068354921_MODS.xml
pitt_31735067911333_MODS.xml
pitt_31735065150975_MODS.xml
pitt_31735068355134_MODS.xml
pitt_31735065151056_MODS.xml
pitt_31735067913164_MODS.xml
pitt_31735065150397_MODS.xml
pitt_31735065150892_MODS.xml
pitt_31735065151205_MODS.xml
pitt_31735067911499_MODS.xml
pitt_31735065150157_MODS.xml
pitt_31735068354822_MODS.xml
pitt_31735065150330_MODS.xml
pitt_31735068354962_MODS.xml
pitt_31735065150017_MODS.xml
pitt_196504.1.2.002_MODS.xml
pitt_196504.1.2.003_MODS.xml
pitt_31735068355100_MODS.xml
pitt_31735065150207_MODS.xml
pitt_31735067913248_MODS.xml
pitt_31735065151015_MODS.xml
pitt_31735068354855_MODS.xml
pitt_31735067911325_MODS.xml
pitt_31735067913222_MODS.xml
pitt_31735067911416_MODS.xml
pitt_31735067911556_MODS.xml
pitt_31735065151148_MODS.xml
pitt_31735067913172_MODS.xml
pitt_31735067913255_MODS.xml
pitt_31735068353410_MODS.xml
pitt_31735067911606_MODS.xml
pitt_317350683

In [104]:
results[0:10]

['"Democracy Should Begin At Home"',
 '"Delegate, Special Convention Communist Party, U.S.A., July 4-7, 1968"',
 'Tenement Children Protest, New York City,',
 'Mine-Mill union',
 '"What Are You Doing About Your Undeclared War?" Flyer and Event Details,',
 'Student Peace Union Recruitment Brochure',
 'What is the New Deal?, by Earl Browder',
 '"The Real Huey P. Long", By Sender Garlin,',
 'Letter from Dr. Martin Luther King, Jr.,',
 '"The Socialization of Money", By E.F. Mylius']

In [98]:
import re

# Split every "Title, by Author" string into [title, author]. The separator
# is matched case-insensitively (", by ", ", By ", ", BY ", ...) and only the
# first occurrence is used, so each row always has exactly two fields; titles
# without the separator get an empty author.
split = []
for r in results:
    two_fields = re.split(r", by ", r, maxsplit=1, flags=re.IGNORECASE)
    if len(two_fields) == 1:
        two_fields.append('')
    split.append(two_fields)

In [105]:
[u[0] for u in split[0:200]]

['"Democracy Should Begin At Home"',
 '"Delegate, Special Convention Communist Party, U.S.A., July 4-7, 1968"',
 'Tenement Children Protest, New York City,',
 'Mine-Mill union',
 '"What Are You Doing About Your Undeclared War?" Flyer and Event Details,',
 'Student Peace Union Recruitment Brochure',
 'What is the New Deal?',
 '"The Real Huey P. Long"',
 'Letter from Dr. Martin Luther King, Jr.,',
 '"The Socialization of Money"',
 'Wrecking the Labor Banks',
 'War and the 4th International,',
 '"Capitalism, Socialism, Communism?- A Debate"',
 'Drum: Wildcat Strike,',
 'The International Socialist Review,',
 'Mine-Mill union',
 '"World-Wide Unemployment", 20,000,000 Unemployed',
 'Women of Yesterday and Today',
 'The Position of Negro Women',
 'May Day,',
 '"Children Under Capitalism"',
 '"In Flanders Field..."',
 '"The Red Baiting Racket and How it Works"',
 '"Smash Hitler\'s Spring Offensive Now!"',
 'Coming! W.E.B. DuBois',
 'Free Billy Smith',
 '"Five Years of Hitler"',
 'A Statement 

In [51]:
from pymarc import MARCReader, record_to_xml
# Peek at the serials MARC file: convert the first record to MARCXML and
# print it.
with open('source-data/african-and-african-diasporic-serials/marc_binary/african-and-african-diasporic-serials.mrc', 'rb') as fh:
    reader = MARCReader(fh)
    # Created but never filled in this cell.
    bs_objects = []
    for record in reader:
        print((record_to_xml(record).decode("utf-8")))
        # Only the first record is needed for inspection.
        break

<record><leader>02532cas a2200565Ii 4500</leader><controlfield tag="001">7964458</controlfield><controlfield tag="005">20151002073725.0</controlfield><controlfield tag="006">m     q  d        </controlfield><controlfield tag="007">co cga||||||||</controlfield><controlfield tag="008">141013c20109999cu gr   q b   1    0spa d</controlfield><datafield ind1=" " ind2=" " tag="020"><subfield code="a">9789592840102</subfield><subfield code="q">2010</subfield></datafield><datafield ind1=" " ind2=" " tag="020"><subfield code="a">9789592840126</subfield><subfield code="q">2011</subfield></datafield><datafield ind1=" " ind2=" " tag="035"><subfield code="a">(OCoLC)ocn899081806</subfield></datafield><datafield ind1=" " ind2=" " tag="035"><subfield code="a">7964458</subfield></datafield><datafield ind1=" " ind2=" " tag="040"><subfield code="a">TJC</subfield><subfield code="b">eng</subfield><subfield code="e">rda</subfield><subfield code="c">TJC</subfield><subfield code="d">TJC</subfield><subfield cod

In [139]:
import re 
def parse_container(container_desc, container_type):
    """Return the number that follows a container label in a description.

    The container type (e.g. 'folder') is used as a literal, case-insensitive
    needle; the digits directly after it (optionally separated by whitespace)
    are returned.

    Example: parse_container("Box 1, folder 4", "folder") -> '4'

    Args:
        container_desc: natural-language container description.
        container_type: container label to look for, e.g. 'box' or 'folder'.

    Returns:
        The matched digits as a string, or "" when the label is not followed
        by a number or either argument is not a string.
    """
    try:
        # re.escape keeps labels containing regex metacharacters literal.
        needle = re.escape(container_type.lower()) + r'\s*(\d+)'
        matches = re.search(needle, container_desc.lower())
    except (AttributeError, TypeError):
        # Non-string input (e.g. None for a missing note): nothing to parse.
        return ""
    return matches.group(1) if matches else ""

parse_container("Box 1 folder 4 drawer 11", 'drAwer')

'11'