## Converting TagTog annotations to usable data

Imports

In [1]:
import os
import io
import re
import codecs
import pathlib
from   shutil import copy
from   bs4    import BeautifulSoup

Functions

In [2]:
# Collects every file with a given extension found under a folder (recursively)
# Pass an extension to override the default of .json
def listAllFiles(folder, extension = '.json'):
    """Return (directory, file name) tuples for matching files under ``folder``.

    Args:
        folder: Root directory handed to ``os.walk``.
        extension: Suffix a file name must end with to be included.

    Returns:
        A list of ``(dirpath, filename)`` tuples, in the order ``os.walk``
        yields them.
    """
    return [
        (currentDir, name)
        for currentDir, _subDirs, names in os.walk(folder)
        for name in names
        if name.endswith(extension)
    ]

In [3]:
# Matches JSON annotation files to their HTML file equivalents (or vice versa)
# Must be passed a list of tuples for each type. Tuples must be in (path, file) format
def matchFiles(filesLeft, filesRight):
    """Pair each left file with the right file(s) carrying the counterpart name.

    The direction of the match is decided by the extension of the FIRST entry
    in ``filesLeft``: a ``.html`` list is matched by turning ``.plain.html``
    into ``.ann.json``; a ``.json`` list by turning ``.ann.json`` into
    ``.plain.html``. Only file names are compared, never directories.

    Args:
        filesLeft: List of ``(path, file)`` tuples of one type.
        filesRight: List of ``(path, file)`` tuples of the other type.

    Returns:
        A list of ``((leftPath, leftName), (rightPath, rightName))`` tuples,
        ordered by left entry and then by right entry. Every right entry whose
        name matches is included. An empty list is returned when either input
        is empty or the left type is neither ``.html`` nor ``.json``.
    """
    # Nothing can be matched against an empty list (and there is no first
    # entry to read the file type from)
    if not filesLeft or not filesRight:
        return []

    # Extract the file type of the left list from its first entry
    firstPath, firstFile = filesLeft[0]
    leftType = os.path.splitext(os.path.join(firstPath, firstFile))[1]

    # Literal suffix swap per left type. Plain string replacement is used
    # because the dots are real dots, not regex wildcards.
    suffixSwap = {
        '.html': ('.plain.html', '.ann.json'),
        '.json': ('.ann.json', '.plain.html'),
    }
    if leftType not in suffixSwap:
        return []
    oldSuffix, newSuffix = suffixSwap[leftType]

    # Index the right list by file name so each left file is a single lookup
    # instead of a full scan; lists keep the original right-hand order
    rightByName = {}
    for rightPath, rightName in filesRight:
        rightByName.setdefault(rightName, []).append((rightPath, rightName))

    # Match files to files
    returnItem = []
    for leftPath, leftName in filesLeft:
        expectedName = leftName.replace(oldSuffix, newSuffix)
        for rightEntry in rightByName.get(expectedName, ()):
            returnItem.append(((leftPath, leftName), rightEntry))

    # Return
    return returnItem

In [4]:
# Takes a list of matched JSON and HTML files and writes them to a container folder
# files parameter takes the output of matchFiles()
def toDestination(files, destination):
    """Write every matched pair of files below ``destination``.

    For each pair, the second component of the left file's directory path
    (i.e. the first folder below the source root) is recreated under
    ``destination`` and both files are handed to ``writeToDest``.

    Args:
        files: Iterable of ``((leftPath, leftName), (rightPath, rightName))``
            tuples, as produced by ``matchFiles``.
        destination: Root output directory.
    """
    # Iterate through each file tuple
    for (leftPath, leftName), (rightPath, rightName) in files:

        # Get folder: split on either separator so paths built with '/' work
        # too, and fall back to the destination root when the path has only
        # one component instead of raising IndexError
        pathParts = re.split(r'[\\/]', leftPath)
        topFolder = pathParts[1] if len(pathParts) > 1 else ''
        destFolder = os.path.join(destination, topFolder)

        # Create the directory, including a destination root that does not
        # exist yet; an existing directory is left untouched
        os.makedirs(destFolder, exist_ok=True)

        # Write to new folder
        writeToDest(leftPath, leftName, destFolder)
        writeToDest(rightPath, rightName, destFolder)

In [10]:
# Writes files to their container folder
def writeToDest(inPath, inFile, destination):
    """Write one source file into its own sub-folder of ``destination``.

    The sub-folder is named after the cleaned file name (see ``dropGarbage``)
    with its ``.ann.json`` / ``.plain.html`` suffix removed. JSON files are
    copied as they are; HTML files are reduced to their text and written as a
    ``.txt`` file.

    Args:
        inPath: Directory containing the source file.
        inFile: Name of the source file.
        destination: Directory in which the per-file folder is created.

    Raises:
        ValueError: If the file is neither a ``.json`` nor a ``.html`` file.
    """
    # Drop the garbage off the file name
    outName = dropGarbage(inFile)

    # Extract file type
    sourcePath = os.path.join(inPath, inFile)
    fileType = os.path.splitext(sourcePath)[1]

    # Suffix stripped from the cleaned name to get the folder name
    suffixByType = {'.json': '.ann.json', '.html': '.plain.html'}
    if fileType not in suffixByType:
        # Fail with a clear message rather than an unbound-variable error
        raise ValueError('Unsupported file type for ' + sourcePath)

    # Set destination path based on file type
    destFolder = os.path.join(destination, outName.replace(suffixByType[fileType], ''))

    # Create directory if it doesn't exist
    os.makedirs(destFolder, exist_ok=True)

    # Write JSON files
    if fileType == '.json':
        copy(sourcePath, os.path.join(destFolder, outName))

    # Write HTML to TEXT files
    else:

        # Extract the txt from the html
        txt = extractTextFromHTML(sourcePath)
        outFileName = os.path.join(destFolder, outName.replace('.plain.html', '.txt'))

        # Write it to file
        writeTxtFile(txt, outFileName)

In [11]:
# Drops all characters up to and including the first '-' in the TagTog file name
def dropGarbage(fileName):
    """Return ``fileName`` without its prefix up to the first ``'-'``.

    A single ``'_'`` directly after that dash is dropped as well. A name
    without any dash is returned unchanged, and a name ending in the dash
    yields an empty string.

    Args:
        fileName: File name to clean.

    Returns:
        The remainder of the name after the first dash (and one optional
        underscore), or ``fileName`` itself when it contains no dash.
    """
    _prefix, dash, remainder = fileName.partition('-')

    # No dash: there is no prefix to strip
    if not dash:
        return fileName

    # Skip one underscore that immediately follows the dash
    if remainder.startswith('_'):
        return remainder[1:]

    return remainder


In [12]:
# Extracts the text content of a TagTog HTML file
def extractTextFromHTML(inFile):
    """Parse the HTML file at ``inFile`` and return its text.

    Args:
        inFile: Path of the HTML file; it is read as UTF-8.

    Returns:
        The result of ``get_text()`` on the parsed document.
    """
    # Parse while the file is open
    with open(inFile, encoding="utf-8") as htmlHandle:
        parsedDocument = BeautifulSoup(htmlHandle, "html.parser")

    # Hand back the text of the parsed document
    return parsedDocument.get_text()

In [13]:
# Writes text to a file
def writeTxtFile(txt, file):
    """Write ``txt`` to the path ``file`` as UTF-8, replacing existing content.

    Args:
        txt: Text to write.
        file: Path of the file to create or overwrite.
    """
    # The context manager closes the handle even when the write fails
    with io.open(file, encoding='utf-8', mode='w') as f:
        f.write(txt)

Script

In [14]:
# Source and destination roots
root_src_dir  = 'ManuallyAnnotatedData\\'
root_dest_dir = 'ExtractedAnnotatedData\\'

# Collect every JSON file under the source root, leaving out the legend files
JSONfiles = [
    entry
    for entry in listAllFiles(root_src_dir)
    if entry[1] != 'annotations-legend.json'
]

# Collect every HTML file under the source root
HTMLfiles = listAllFiles(root_src_dir, '.html')

# Pair the JSON files with their HTML files by name
matchedList = matchFiles(JSONfiles, HTMLfiles)

# Write the pairs out under the destination root
toDestination(matchedList, root_dest_dir)

end.