In [505]:
# Import statements 
import sys
sys.path.append("/home/dsu/python_libs/open-tamil")

import csv
import tamil
import pandas as pd
from pymarc import XMLWriter, Record, Field
from transliterate import azhagi, jaffna, combinational, UOM, ISO, itrans, algorithm

In [506]:
# Set up the ISO table, the MARC XML writer, and the CSV data to be read.
# Transliteration table taken from the ISO module's ReverseTransliteration
# class; it is passed to the transliteration call in get_transliterated.
ISO_table = ISO.ReverseTransliteration.table
# XML writer over 'output.xml' (opened for binary writing); generate_xml
# writes every record through it and then closes it.
writer = XMLWriter(open('output.xml','wb'))
# Input spreadsheet. Row 0 is read as the per-column title row by
# get_record_dict and is skipped as a record by generate_xml.
csv_data = pd.read_csv('full_example.csv')

In [507]:
def get_code(title: str) -> str:
    """Return the subfield letter code named in a CSV column title.

    The code is the single character that immediately follows the first
    '$' in ``title``.

    Args:
        title: Column title text.

    Returns:
        The character after the first '$', or 'a' when the title has no
        '$' or when the '$' is the last character of the title.
    """
    # Everything after the first '$'; empty when there is no '$' at all
    # or when nothing follows it.
    _, _, after_dollar = title.partition('$')

    # Assume 'a' when no code letter can be read from the title
    if not after_dollar:
        return 'a'
    return after_dollar[0]

In [508]:
def get_tag(title: str) -> str:
    """Extract the record tag from a CSV column title.

    The tag is the text before the first space. When the title has no
    space, it is the text before the first slash, or the whole title if
    there is no slash either. A title equal to the string 'nan' (what
    str() gives for a missing cell) yields the string "None".
    """

    # A missing title has no tag
    if title == 'nan':
        return "None"

    # A space, when present, always decides where the tag ends
    if ' ' in title:
        tag, _, _ = title.partition(' ')
        return tag

    # No space: cut at the first slash; partition hands back the whole
    # title unchanged when there is no slash
    tag, _, _ = title.partition('/')
    return tag

In [509]:
def get_transliterated(content: str) -> str:
    """ Transliterates the content string
    using the module-level ISO table and
    returns the result. """

    # Hand the text to the open-tamil direct transliteration algorithm
    transliterated = algorithm.Direct.transliterate(ISO_table, content)
    return transliterated

In [510]:
def get_subfield_array(content: str, category: str, title: str) -> list:
    """ Builds the flat subfield list for one cell:
    [code, content], followed by [code + "t",
    transliterated content] when the column category
    contains "(Tamil)". An empty list is returned when
    the cell is 'nan' or contains '---'. """

    # Placeholder or missing cells contribute no subfields
    if content == 'nan' or '---' in content:
        return []

    # Subfield code comes from the column title
    code = get_code(title)
    pairs = [code, content]

    # Tamil columns also carry a transliterated copy under code + "t"
    if "(Tamil)" in category:
        pairs += [code + "t", get_transliterated(content)]

    return pairs

In [511]:
def get_record_dict(row) -> dict:
    """Return a dictionary mapping tags to flat subfield lists for one row.

    Every column of the module-level ``csv_data`` frame is visited. The
    column's title is read from row 0 of ``csv_data``; its tag comes from
    ``get_tag`` and its subfields from ``get_subfield_array``. Columns
    that resolve to the same tag have their subfields concatenated in
    column order. Columns without a title are skipped.

    Args:
        row: One row of ``csv_data``, indexable by column label (as
            yielded by ``csv_data.iterrows()``).

    Returns:
        dict mapping each tag string to its list of alternating subfield
        codes and values. A tag whose cells are all empty is still
        present, mapped to an empty list.
    """

    record_dict = {}

    # DataFrame.items() is the supported spelling; iteritems() was
    # deprecated in pandas 1.5 and removed in pandas 2.0.
    for category, col in csv_data.items():
        # Row 0 of every column holds that column's title
        title = str(col[0])
        key = get_tag(title)

        # Untitled columns have no tag, so skip them before doing any
        # subfield (and transliteration) work
        if key == 'None':
            continue

        content = str(row[category])
        sub_array = get_subfield_array(content, category, title)

        # Create the tag's list on first sight, then append in column order
        record_dict.setdefault(key, []).extend(sub_array)

    return record_dict

In [512]:
def generate_xml() -> None:
    """Generate the MARC XML file from the CSV data using pymarc.

    Each row of the module-level ``csv_data`` frame except the row
    labelled 0 (the title row) becomes one record. Every tag returned by
    ``get_record_dict`` becomes a field with indicators '0' and '1', and
    the record is written through the module-level ``writer``.

    The writer is closed when the function ends, including when a row
    fails part-way through, so the output file is always finalised and
    the file handle is always released. Because the writer is closed,
    the function cannot be run a second time with the same writer.
    """

    try:
        # Loop through rows (records) of the csv
        for i, row in csv_data.iterrows():
            # Row 0 holds the column titles, not a record
            if i == 0:
                continue

            record = Record()
            print("[Getting data from CSV...]")
            record_dict = get_record_dict(row)

            # One field per tag, carrying that tag's flat subfield list
            print("[Creating subfields...]")
            for col_tag, subfields in record_dict.items():
                record.add_field(
                    Field(
                        tag = col_tag,
                        indicators = ['0','1'],
                        subfields = subfields
                    )
                )

            # Write record to XML
            writer.write(record)
    finally:
        # Always close, even after an error, so the file is not left open
        writer.close()

In [513]:
# Run the conversion: writes one record per CSV row after the title row
# to the XML writer and then closes the writer.
generate_xml()

[Getting data from CSV...]
[Creating subfields...]
[Getting data from CSV...]
[Creating subfields...]
[Getting data from CSV...]
[Creating subfields...]
[Getting data from CSV...]
[Creating subfields...]
[Getting data from CSV...]
[Creating subfields...]
[Getting data from CSV...]
[Creating subfields...]
[Getting data from CSV...]
[Creating subfields...]
