In [2059]:
# Import statements 
import sys
sys.path.append("/home/dsu/python_libs/open-tamil")

import csv
import tamil
import pandas as pd
from pymarc import XMLWriter, Record, Field
from transliterate import azhagi, jaffna, combinational, UOM, ISO, itrans, algorithm

In [2060]:
# Set up the ISO transliteration table and load the csv file to be read.
# (No MARC writer is created here; generate_xml opens one per record.)
ISO_table = ISO.ReverseTransliteration.table

# Alternative sample inputs: exactly one read_csv line should be active.
#csv_data = pd.read_csv('simple-example.csv')
#csv_data = pd.read_csv('edgecase-example.csv')
#csv_data = pd.read_csv('tamil-example.csv')
csv_data = pd.read_csv('complex-example.csv')

# Module-level lists tracking, per record, the tags that have Tamil content.
# trans_feild_list is filled by get_trans_subfield_array (880 fields) and
# def_field_list by get_subfield_array (regular fields); a tag's position in
# the list is used as its linkage number. Both are cleared by generate_xml
# before each record. (The "feild" spelling is kept because other code
# refers to the name as written.)
trans_feild_list = []
def_field_list = []

In [2061]:
def get_code(title: str) -> str:
    """Return the subfield letter code found in a csv column title.

    The code is the single character that immediately follows the first
    '$' in ``title`` (e.g. ``"245 $a"`` -> ``"a"``).

    Returns ``'!'`` as a "no code given" marker when the title contains
    no '$', or when '$' is the last character so no letter follows it.
    """

    # partition splits on the first '$' only; `found` is empty when there
    # is no '$' and `rest` is empty when nothing follows it.
    _, found, rest = title.partition('$')
    if not found or not rest:
        return '!'
    return rest[0]

In [2062]:
def get_tag(title: str) -> str:
    """Extract the record tag from a csv column title.

    The string 'nan' (a stringified missing cell) means there is no
    title, in which case the literal string 'None' is returned.
    Otherwise the tag is everything before the first space; if the
    title has no space, it is everything before the first slash; if it
    has neither, the whole title is the tag.
    """

    # A missing title has no tag
    if title == 'nan':
        return "None"

    # A space, when present, always decides where the tag ends
    space_at = title.find(' ')
    if space_at != -1:
        return title[:space_at]

    # With no space, fall back to cutting at the first slash
    slash_at = title.find('/')
    if slash_at != -1:
        return title[:slash_at]

    # Neither separator: the title is the tag
    return title

In [2063]:
def get_transliterated(content: str) -> str:
    """ Transliterates the content string and
    returns the result (intended to follow
    ISO 15919, per the ISO table in use). """

    # Delegate to open-tamil's direct transliteration algorithm,
    # driven by the module-level ISO_table
    transliterate = algorithm.Direct.transliterate
    return transliterate(ISO_table, content)

In [2065]:
def get_subfield_array(content: str, category: str, title: str, key: str) -> list:
    """ Returns an array with subfield codes
    and data, given the column category, title,
    key and cell content. Tamil content will be
    transliterated.

    Returns an empty list when the cell has no content (it is the
    string 'nan' or contains '---').

    For a column whose category contains "(Tamil)" the result is
    ["6", "880-<n>", <code>, <transliterated content>], where <n> is the
    1-based position of ``key`` in the module-level ``def_field_list``
    (the key is appended on first sight, so this function mutates that
    list). For any other column the result is [<code>, content].
    """

    # No content: no subfields
    if ('---' in content) or (content == 'nan'):
        return []

    # Get subfield code
    sub_code = get_code(title)

    # Non-Tamil columns carry their content through unchanged
    if "(Tamil)" not in category:
        return [sub_code, content]

    if key == 'None':
        # No tag to register; keep the historical fallback of using the
        # current number of registered tags.
        trans_count = len(def_field_list)
    else:
        if key not in def_field_list:
            def_field_list.append(key)
        # Use the tag's own position, not the list length: a tag seen
        # earlier must keep its number even after other tags were added.
        trans_count = def_field_list.index(key) + 1

    # NOTE(review): MARC 21 describes the $6 occurrence number as two
    # digits ("880-01"); the unpadded form is kept here so it stays in
    # step with the numbers written on the 880 side - confirm and pad
    # both sides together if required.
    return ["6", "880-" + str(trans_count), sub_code, get_transliterated(content)]

In [2066]:
def get_trans_subfield_array(content: str, category: str, title: str, key: str) -> list:
    """ Returns an array with subfield codes
    and data, given the column category, title,
    key, and cell content for a Tamil column.

    Returns an empty list when the cell has no content (it is the
    string 'nan' or contains '---').

    For a column whose category contains "(Tamil)" the result is
    ["6", "<key>-<n>/Taml", <code>, content] with the content left in
    its original script; <n> is the 1-based position of ``key`` in the
    module-level ``trans_feild_list`` (the key is appended on first
    sight, so this function mutates that list). For any other column
    the result is [<code>, content].
    """

    # No content: no subfields
    if ('---' in content) or (content == 'nan'):
        return []

    # Get subfield code
    sub_code = get_code(title)

    # Non-Tamil columns need no linkage subfield
    if "(Tamil)" not in category:
        return [sub_code, content]

    if key == 'None':
        # No tag to register; keep the historical fallback of using the
        # current number of registered tags.
        trans_count = len(trans_feild_list)
    else:
        if key not in trans_feild_list:
            trans_feild_list.append(key)
        # Use the tag's own position, not the list length: a tag seen
        # earlier must keep its number even after other tags were added.
        trans_count = trans_feild_list.index(key) + 1

    # Linkage subfield first, then the untransliterated Tamil content
    return ["6", key + "-" + str(trans_count) + "/Taml", sub_code, content]

In [2067]:
def get_trans_dict(row) -> dict:
    """Returns a dictionary mapping record tags
    to the subfield arrays, for the non-transliterated
    880 fields.

    ``row`` is one row of the module-level ``csv_data`` (as yielded by
    ``DataFrame.iterrows``). Only columns whose header contains
    "(Tamil)" are considered. Keys have the form "880-<n>", where <n>
    is the 1-based position of the column's tag in
    ``trans_feild_list``; columns sharing a tag are merged into one
    entry. Calling this mutates ``trans_feild_list`` through
    ``get_trans_subfield_array``.
    """

    # Create dictionary
    trans_dict = {}

    # DataFrame.items() is the supported spelling; iteritems() was
    # removed in pandas 2.0.
    for category, col in csv_data.items():
        # Only Tamil columns produce 880 fields
        if "(Tamil)" not in category:
            continue

        # The first row of the sheet holds the title (tag and subfield
        # code) of each column; read it by position.
        title = str(col.iloc[0])
        content = str(row[category])
        key = get_tag(title)

        # Create subfield array (also registers the tag on first sight)
        sub_array = get_trans_subfield_array(content, category, title, key)

        # Skip columns with no tag or no content
        if key == 'None' or not sub_array:
            continue

        # Number the 880 field by the tag's own position so that a tag
        # repeated after other tags is merged into its own 880 entry
        # rather than into the most recently created one.
        if key in trans_feild_list:
            trans_count = trans_feild_list.index(key) + 1
        else:
            trans_count = len(trans_feild_list)
        dict_key = '880-' + str(trans_count)

        if dict_key in trans_dict:
            # Entry exists: append code and data only, dropping the
            # duplicate "6" linkage pair at the front of sub_array.
            trans_dict[dict_key] = trans_dict[dict_key] + sub_array[2:]
        else:
            trans_dict[dict_key] = sub_array

    return trans_dict

In [2068]:
def get_record_dict(row) -> dict:
    """Returns a dictionary mapping record tags
    to the subfield arrays.

    ``row`` is one row of the module-level ``csv_data`` (as yielded by
    ``DataFrame.iterrows``). Keys are tags normalised to three-digit
    strings (e.g. "020", "245"); values are flat lists alternating
    subfield code and data. Columns sharing a tag are merged into one
    entry, and tags for which no column has content are omitted.
    Calling this mutates ``def_field_list`` through
    ``get_subfield_array``.

    Raises ValueError if a column's tag is not numeric.
    """

    # Create dictionary
    record_dict = {}

    # DataFrame.items() is the supported spelling; iteritems() was
    # removed in pandas 2.0.
    for category, col in csv_data.items():

        # The first row of the sheet holds the title (tag and subfield
        # code) of each column; read it by position.
        title = str(col.iloc[0])
        content = str(row[category])
        raw_key = get_tag(title)

        # Always build the subfield array, even for columns skipped
        # below: for Tamil columns it registers the tag in
        # def_field_list as a side effect.
        sub_array = get_subfield_array(content, category, title, raw_key)

        # Skip columns with no tag, and columns with no content so that
        # a tag with only empty cells does not become an empty field.
        if raw_key == 'None' or not sub_array:
            continue

        # Tags may arrive as "245", "245.0" or "020"; normalise through
        # float/int and pad back to three digits so leading zeros
        # survive.
        key = str(int(float(raw_key))).zfill(3)

        existing = record_dict.get(key)
        if existing is None:
            # First content for this tag
            record_dict[key] = sub_array
        elif "6" in existing and len(sub_array) == 4:
            # Entry already has a "6" linkage subfield and this is
            # another Tamil (4-element) array: append code and data only.
            record_dict[key] = existing + sub_array[2:]
        else:
            record_dict[key] = existing + sub_array

    return record_dict

In [2069]:
def generate_xml(outputname: str) -> None:
    """Generates MARC xml files from the csv file,
    using the pymarc library.

    One file is written per data row of the module-level ``csv_data``,
    named "<row index>-<outputname>.xml" in the current working
    directory. The row with index 0 is treated as the title row and is
    not written. Every field is created with indicators '0' and '1'.
    A confirmation line is printed after each file has been written.
    """

    # Loop through rows (records) of csv
    for i, row in csv_data.iterrows():
        # Skip first row (title row)
        if i == 0:
            continue

        # Build the output file name once; it is used for both the
        # file and the confirmation message.
        filename = str(i) + '-' + outputname + '.xml'

        # Clear out transliteration tracking arrays so linkage
        # numbering restarts for every record
        def_field_list.clear()
        trans_feild_list.clear()

        # Create new record and get dictionaries
        record = Record()
        record_dict = get_record_dict(row)
        trans_dict = get_trans_dict(row)

        # Add the regular fields
        for col_tag, subfields in record_dict.items():
            record.add_field(
                Field(
                    tag = col_tag,
                    indicators = ['0','1'],
                    subfields = subfields
                )
            )

        # Add the 880 fields
        for subfields in trans_dict.values():
            record.add_field(
                Field(
                    tag = '880',
                    indicators = ['0','1'],
                    subfields = subfields
                )
            )

        # Open the file only once the record is fully built, and close
        # the writer even if writing fails so the handle is not leaked.
        writer = XMLWriter(open(filename, 'wb'))
        try:
            writer.write(record)
        finally:
            writer.close()

        # Report success only after the record has actually been written
        print("[MARC Record saved to file " + filename + "]")

In [2070]:
# Run the conversion on the loaded csv file; each record is written to
# "<row index>-OUTPUT.xml" in the current working directory.
generate_xml("OUTPUT")

[MARC Record saved to file 1-OUTPUT.xml]
[MARC Record saved to file 2-OUTPUT.xml]
[MARC Record saved to file 3-OUTPUT.xml]
[MARC Record saved to file 4-OUTPUT.xml]
[MARC Record saved to file 5-OUTPUT.xml]
[MARC Record saved to file 6-OUTPUT.xml]
[MARC Record saved to file 7-OUTPUT.xml]
[MARC Record saved to file 8-OUTPUT.xml]
[MARC Record saved to file 9-OUTPUT.xml]
