# manuscriptFiles

> This module has commands to handle the manuscript file structure and relates the annotation application to files saved in the manuscripts folder

In [None]:
#| default_exp manuscriptFiles

In [None]:
#| hide
from nbdev.showdoc import *

In [1]:
#| export
import os
import io
import cv2
import re
import numpy as np
import base64
from PIL import Image

## createManuscriptDirectory

The ```createManuscriptDirectory``` function creates a new manuscript directory from scratch, including:
- a manuscript image folder
- an annotation states folder
- a metadata file
- a folder for exported transcriptions
- a folder for imported transcriptions

This function relies on two methods: ```directoryNameClean``` and ```dictToList```. These are simple calls for simple purposes, but just complicated enough to warrant their own functions for future optimizations.

In [2]:
#| export
def createManuscriptDirectory(metadata:dict):
    """Create the folder skeleton and metadata file for a new manuscript.

    The manuscript root is created inside the ``manuscripts`` folder of the
    current working directory and is named after ``metadata['Work']`` as
    cleaned by ``directoryNameClean``. Inside the root this function creates:

    - ``images``, ``exportTranscripts`` and ``importTranscripts`` folders
    - a ``states`` folder containing ``lines`` and ``bboxes`` sub-folders
    - a ``<title>.cfg`` file with one ``key:value`` line per metadata entry

    Args:
        metadata: Manuscript metadata. Must contain the key ``'Work'``.

    Returns:
        The path of the new manuscript root directory.

    Raises:
        KeyError: If ``metadata`` has no ``'Work'`` entry.
        FileNotFoundError: If the ``manuscripts`` folder does not exist.
        FileExistsError: If a directory with the same cleaned title exists.
    """
    # Every path is built explicitly instead of walking around with os.chdir:
    # the working directory is process-wide state, and a failing mkdir used to
    # leave the whole application stranded inside 'manuscripts'.
    allManuscriptsDirectory = os.path.realpath('manuscripts')

    # Creating the root directory for a new manuscript
    title = directoryNameClean(metadata['Work'])
    manuscriptDirectory = os.path.join(allManuscriptsDirectory, title)
    os.mkdir(manuscriptDirectory)

    # Creating the sub-directories ('states' must exist before its children)
    statesDirectory = os.path.join(manuscriptDirectory, 'states')
    subDirectories = [
        os.path.join(manuscriptDirectory, 'images'),
        statesDirectory,
        os.path.join(manuscriptDirectory, 'exportTranscripts'),
        os.path.join(manuscriptDirectory, 'importTranscripts'),
        os.path.join(statesDirectory, 'lines'),
        os.path.join(statesDirectory, 'bboxes'),
    ]
    for subDirectory in subDirectories:
        os.mkdir(subDirectory)

    # Writing the metadata as a config file; the context manager guarantees
    # the file is flushed and closed before the function returns
    configPath = os.path.join(manuscriptDirectory, title + '.cfg')
    with open(configPath, 'w') as f:
        for data in dictToList(metadata):
            f.write(data + '\n')

    return manuscriptDirectory

In [24]:
# Smoke test with real manuscript metadata. Running this cell creates the
# manuscript folder on disk and displays the returned root directory.
Stav53 = dict(
    Work='Stavronikita Monastery Greek handwritten document Collection no.53',
    Author='',
    Language='Greek',
    Country='Greece',
    City='Mount Athos',
    Institution='Stavronikita Monastery',
    Centuries='14th Century',
)

createManuscriptDirectory(Stav53)

'/home/dc/glyptodon/glyptodon/manuscripts/stvrnktmnstrygrkcllctnn53'

## dictToList

The ```dictToList``` function creates a printable list out of a dictionary that keeps keys separate from the items stored to them and creates a list that can be printed to a file, specifically the manuscript ```title.cfg``` file.

In [12]:
#| export
def dictToList(thisdict:dict):
    """Turn a dictionary into a list of printable ``key:value`` strings.

    Keys and values are converted with ``str`` and joined by a single colon,
    in the dictionary's iteration order. This is the line format written to a
    manuscript's ``.cfg`` file.

    Args:
        thisdict: The dictionary to convert.

    Returns:
        A list with one ``'key:value'`` string per dictionary entry.
    """
    # items() walks keys and values together, so no parallel lists or index
    # bookkeeping are needed
    return [str(key) + ':' + str(value) for key, value in thisdict.items()]

In [None]:
# Quick check of dictToList: every entry should print as "key:value"

thisdict = dict(brand="Ford", model="Mustang", year=1964)

printable = dictToList(thisdict)

for entry in printable:
    print(entry)

brand:Ford
model:Mustang
year:1964


## directoryNameClean

The ```directoryNameClean``` function removes the characters from a string that are not allowed characters for a directory name and shortens the name. It is called in other manuscript file methods and is indirectly used to generate a folder key for the select manuscripts. It cuts down the length of the title to no more than 26 characters to avoid creating excessively long directory titles and surpassing the 260 character limit. While it is unlikely for the path limit to be reached, caution is good. It proceeds in this order:
- Keep only the words that start with a capital letter or that contain non-alphabetic characters (such as numbers); drop every other word
- Remove illegal characters
- If name length is greater than 26:
  - Remove vowels
  - Remove trailing characters

In [4]:
#| export
def directoryNameClean(string):
    """Turn a free-text title into a short, lower-case directory name.

    The steps are applied in this order:

    1. Keep only the words that start with an upper-case character or that
       are not purely alphabetic (e.g. ``'no.53'``) and join them without
       spaces.
    2. Remove every character listed in ``illegalChars``. Periods are removed
       as a stylistic choice (file extensions are found at periods).
    3. If the name is still longer than 26 characters, remove the lower-case
       vowels ``a e i o u``.
    4. Cut the name down to at most 26 characters and lower-case it.

    Args:
        string: The title to clean.

    Returns:
        The cleaned name; it may be empty if no word survives step 1.
    """
    maxLength = 26

    # Keep capitalised words and words containing non-letters
    keptWords = [
        word for word in string.split()
        if word[0].isupper() or not word.isalpha()
    ]
    string = ''.join(keptWords)

    # str.translate deletes every occurrence in a single pass. The previous
    # index-based loop shrank the string while walking it, so it skipped
    # consecutive illegal characters and could raise IndexError.
    illegalChars = ['\\','#','%','&','{','}','<','>','*','?','/',' ','$','!',"'",'"',':','@','+','`','|','=','.']
    string = string.translate(str.maketrans('', '', ''.join(illegalChars)))

    # Only overly long names lose their (lower-case) vowels
    if len(string) > maxLength:
        string = string.translate(str.maketrans('', '', 'aeiou'))

    return string[:maxLength].lower()

In [None]:
# Real-world check: clean an actual manuscript title into a directory name
newstring = 'Stavronikita Monastery Greek handwritten document Collection no.53'
cleanedName = directoryNameClean(newstring)

print(cleanedName)

stvrnktmnstrygrkcllctnn.53


In [None]:
# Exercises directoryNameClean on every illegal character, each followed by 'aaa'.
# Historical note: ```re.sub``` was tried first but didn't work for whatever reason
# (something about nothing being at 0, despite that not being an expected behavior;
# possibly difficulty iterating over multiple chars). It also didn't take backslashes at all.

illegalChars = ['\\','#','%','&','{','}','<','>','*','?','/',' ','$','!',"'",'"',':','@','+','`','|','=']
newstring = ''.join(char + 'aaa' for char in illegalChars)

print(newstring)

print(directoryNameClean(newstring))

\aaa#aaa%aaa&aaa{aaa}aaa<aaa>aaa*aaa?aaa/aaa aaa$aaa!aaa'aaa"aaa:aaa@aaa+aaa`aaa|aaa=aaa
a


## saveImage [Convert to Dash ```Upload```]

The ```saveImages``` function takes base64-encoded image content from an upload component and saves each image as a file in the ```images``` folder of the relevant manuscript. It borrows code from [Ulrich Stern's answer](https://stackoverflow.com/a/54162776/1628638) and applies that code to every uploaded image. It takes the arguments:
- ```contents```: the uploaded image data, either a single base64 data string or a list of them
- ```filenames```: the file name (or list of file names) matching ```contents```
- ```targetDirectory```: a path to some manuscript directory for image saving

The value passed into the ```contents``` argument **MUST** come from an uploader holding image data (for example the ```contents``` property of a Dash ```Upload``` component). Anything else cannot be decoded into an image.

In [None]:
#| export
def saveImages(contents, filenames, targetDirectory):
    """Decode uploaded images and write them into a manuscript's images folder.

    Args:
        contents: A data string of the form ``...;base64,<payload>`` or a
            list of such strings (e.g. the ``contents`` property of a Dash
            ``dcc.Upload``). ``None`` means nothing was uploaded and is a
            no-op.
        filenames: The file name, or list of file names, matching
            ``contents``.
        targetDirectory: Path to a manuscript directory that contains an
            ``images`` folder.

    Raises:
        FileNotFoundError: If ``targetDirectory`` has no ``images`` folder.
        OSError: If OpenCV reports that an image could not be written.
    """
    if contents is None:
        return

    # A single upload arrives as bare values; normalise to lists
    if not isinstance(contents, list):
        contents = [contents]
        filenames = [filenames]

    # Files are written through explicit paths rather than os.chdir, so a
    # failing decode can no longer leave the process in the wrong directory
    imagesDirectory = os.path.join(targetDirectory, "images")
    if not os.path.isdir(imagesDirectory):
        raise FileNotFoundError(imagesDirectory)

    for content, filename in zip(contents, filenames):
        string64 = content.encode("utf8").split(b";base64,")[1]
        imdata = base64.b64decode(string64)
        # Forcing RGB makes grayscale and palette images usable too;
        # cvtColor rejects their single-channel arrays
        pilImage = Image.open(io.BytesIO(imdata)).convert("RGB")
        # PIL delivers RGB while cv2.imwrite expects BGR channel order
        cv2Image = cv2.cvtColor(np.array(pilImage), cv2.COLOR_RGB2BGR)
        # NOTE(review): the "test" prefix looks like a debugging leftover, but
        # it is kept so that saved file names do not change - confirm intent
        imagePath = os.path.join(imagesDirectory, "test" + filename)
        if not cv2.imwrite(imagePath, cv2Image):
            raise OSError("could not write image: " + imagePath)

In [None]:
# testing for saveImages
from dash import Dash, State, Input, Output, callback, dcc, html
import base64
from PIL import Image

app = Dash(__name__)
app.layout = html.Div(
    [
        dcc.Upload(html.Button("upload"), id="test-upload", multiple = True),
        html.Pre(id="annotations-data-pre"),
    ]
)

@callback(
    Output("annotations-data-pre", "children"),
    Input("test-upload","contents"),
    State("test-upload","filename"),
    prevent_initial_call=True,
)
def simplePrint(contents, filenames):
    # Decodes every uploaded image and writes it next to the notebook as
    # "test<filename>", then echoes the raw upload contents into the page.
    # The body previously referred to the undefined names `filename` and
    # `content`, which raised NameError on the first upload.
    print(filenames)
    for content, filename in zip(contents, filenames):
        string64 = content.encode("utf8").split(b";base64,")[1]

        imdata = base64.b64decode(string64)
        pilImage = Image.open(io.BytesIO(imdata))
        cv2Image = cv2.cvtColor(np.array(pilImage), cv2.COLOR_BGR2RGB)
        cv2.imwrite("test" + filename, cv2Image)
    return contents

if __name__ == "__main__":
    app.run(debug=True)

## saveTranscript

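Like ```saveImages```, the ```saveTranscripts``` function takes in:
- ```contents```: the uploaded transcript data, either a single base64 data string or a list of them
- ```filenames```: the file name (or list of file names) matching ```contents```
- ```targetDirectory```: a path to a manuscript directory; the files are saved in its ```importTranscripts``` folder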

In [19]:
#| export
def saveTranscripts(contents, filenames, targetDirectory):
    """Decode uploaded transcripts and save them to ``importTranscripts``.

    Args:
        contents: A data string of the form ``...;base64,<payload>`` or a
            list of such strings, where each payload is base64-encoded UTF-8
            text. ``None`` means nothing was uploaded and is a no-op.
        filenames: The file name, or list of file names, matching
            ``contents``.
        targetDirectory: Path to a manuscript directory that contains an
            ``importTranscripts`` folder.

    Raises:
        FileNotFoundError: If the ``importTranscripts`` folder is missing.
        IndexError: If a content string has no ``;base64,`` marker.
        UnicodeDecodeError: If a payload is not valid UTF-8.
    """
    if contents is None:
        return

    # A single upload arrives as bare values; normalise to lists
    if not isinstance(contents, list):
        contents = [contents]
        filenames = [filenames]

    # Files are written through explicit paths rather than os.chdir, so a
    # failing decode can no longer leave the process in the wrong directory
    transcriptsDirectory = os.path.join(targetDirectory, "importTranscripts")

    for content, filename in zip(contents, filenames):
        string64 = content.encode("utf8").split(b";base64,")[1]
        message = base64.b64decode(string64).decode('utf-8')
        # The text was decoded as UTF-8, so it is written back as UTF-8
        # instead of the platform default; `with` closes (and flushes) the file
        with open(os.path.join(transcriptsDirectory, filename), 'w', encoding='utf-8') as f:
            f.write(message)

Just a cell

In [18]:
# Manual probe for saveTranscript: shows what the upload component hands to a callback
import base64
from dash import Dash, State, Input, Output, callback, dcc, html

app = Dash(__name__)
app.layout = html.Div(
    children=[
        dcc.Upload(html.Button("upload"), id="test-upload", multiple=True),
        html.Button("test", id="query"),
        html.Pre(id="annotations-data-pre"),
    ]
)


@callback(
    Output("annotations-data-pre", "children"),
    Input("query", "n_clicks"),
    State("test-upload", "contents"),
    State("test-upload", "filename"),
    prevent_initial_call=True,
)
def simplePrint(clicks, contents, filenames):
    # Print the Python type of each piece of upload state, then echo the
    # raw contents into the page
    for uploadState in (contents, filenames):
        print(type(uploadState))
    return contents


if __name__ == "__main__":
    app.run(debug=True)

<class 'NoneType'>
<class 'NoneType'>


## currentManuscripts

We also want a function that can search through the current directories in manuscripts and find all the manuscripts in the directory to return the metadata of the manuscripts. To that end, we have the ```currentManuscripts``` function.

In [15]:
#| export

def currentManuscripts():
    """Collect the metadata of every manuscript stored under ``manuscripts``.

    Looks inside the ``manuscripts`` folder of the current working directory
    and, for each sub-directory, reads every ``.cfg`` file found directly in
    it. Each config line is expected to look like ``key:value``; only the
    first colon separates key from value, so values may contain colons.
    Lines without a colon are skipped.

    Returns:
        A list of ``(directory, metadata)`` tuples, one per ``.cfg`` file,
        where ``directory`` is the full path of the manuscript directory and
        ``metadata`` is a dict of the key/value strings read from the file.

    Raises:
        FileNotFoundError: If the ``manuscripts`` folder does not exist.
    """
    # If this is run on any computer, it will have a unique file structure,
    # so everything is resolved from the current working directory
    manuscriptDirectory = os.path.join(os.getcwd(), 'manuscripts')

    manuscriptMetadata = []
    for path in os.listdir(manuscriptDirectory):
        # Full paths are needed to reach the files inside each directory
        directory = os.path.join(manuscriptDirectory, path)

        # Stray files in 'manuscripts' cannot be listed like a directory
        if not os.path.isdir(directory):
            continue

        for file in os.listdir(directory):
            if not file.endswith('.cfg'):
                continue

            metadata = {}
            with open(os.path.join(directory, file), 'r') as f:
                for line in f:
                    # rstrip removes only the newline, so a final line with
                    # no trailing newline keeps its last character
                    key, separator, value = line.rstrip('\n').partition(':')
                    if not separator:
                        continue
                    metadata[key] = value

            manuscriptMetadata.append((directory, metadata))

    return manuscriptMetadata

In [5]:
#| hide
# List every stored manuscript together with the century recorded for it
test = currentManuscripts()

print(len(test))

for manuscriptPath, manuscriptInfo in test:
    print(manuscriptPath)
    print(manuscriptInfo["Centuries"])

1
/home/dc/glyptodon/nbs/manuscripts/stvrnktmnstrygrkcllctnn.53
14th Century


## zipManuscript

Being able to compress and download a digital manuscript with its transcription is a necessary output of ```glyptodon```, so this function provides the backend for that. It takes in a manuscript directory, a list of possible options for directories that need to be zipped and exported, and a name that is inputted by the user for the folder. It returns the pathway to the zipped folder.

It uses the ```zipfile``` library, something only used in this method. Because of that, the import is left inside the method. From the available compression algorithms, this function uses the [```ZIP_DEFLATED```](https://en.wikipedia.org/wiki/Deflate) option because it is lossless.

To dial in the best settings for compression, the 9 different levels of compression were tested. After testing the different compression levels of ```ZIP_DEFLATED``` on the images and xml files in [this dataset](https://zenodo.org/record/5595669), the greatest relative decrease in directory size achieved was a roughly 2 percent decrease in size using compression level 3. This was achieved in 6.663 seconds, the second fastest compression time. It seems clear that compression level 3 is the best choice for this use case.

The whole set of results is here:

| Compression Level | Seconds | MB    |
|-------------------|---------|-------|
| No Compression    | 0.000   | 247.1 |
| Level 1           | 6.638   | 241.9 |
| Level 2           | 6.724   | 241.8 |
| Level 3           | 6.663   | 241.8 |
| Level 4           | 7.385   | 242.8 |
| Level 5           | 7.433   | 242.8 |
| Level 6           | 7.164   | 242.8 |
| Level 7           | 7.262   | 242.8 |
| Level 8           | 7.363   | 242.8 |
| Level 9           | 7.337   | 242.8 |

In [15]:
#| export
def zipManuscript(directoryOptions: list, manuscriptDirectory, name: str):
    """Zip selected folders of a manuscript into ``<name>.zip``.

    Any ``.zip`` file already sitting in ``manuscriptDirectory`` is deleted
    first. Then every entry found directly inside each selected folder is
    added to a new archive. For the ``states`` folder only the contents of
    its ``bboxes`` and ``lines`` sub-folders are added.

    Args:
        directoryOptions: Names of the folders to include (e.g. ``'images'``,
            ``'importTranscripts'``, ``'states'``). Matching against the
            folder names on disk is case-insensitive.
        manuscriptDirectory: Path to the manuscript directory.
        name: Archive name without the ``.zip`` extension.

    Returns:
        The path of the created archive inside ``manuscriptDirectory``.
    """
    import zipfile

    lowerOptions = {option.lower() for option in directoryOptions}

    files = []
    for path in os.listdir(manuscriptDirectory):
        fullPath = os.path.join(manuscriptDirectory, path)

        # this deletes any currently zipped folder
        if path.endswith(".zip"):
            os.remove(fullPath)
            continue

        # Compare in lower case on both sides: the options are lower-cased,
        # so mixed-case folders such as 'importTranscripts' never matched
        if path.lower() not in lowerOptions:
            continue

        if path == "states":
            # only the annotation state sub-folders are exported
            sourceDirectories = [
                os.path.join(fullPath, statesPath)
                for statesPath in os.listdir(fullPath)
                if statesPath in ("bboxes", "lines")
            ]
        else:
            sourceDirectories = [fullPath]

        # this collects all the files inside option folders
        for sourceDirectory in sourceDirectories:
            for file in os.listdir(sourceDirectory):
                files.append(os.path.join(sourceDirectory, file))

    # One archive handle for all files, closed deterministically by `with`.
    # Building the archive path explicitly avoids changing the process-wide
    # working directory.
    zipPath = os.path.join(manuscriptDirectory, name + ".zip")
    with zipfile.ZipFile(
        file=zipPath,
        mode="w",
        compression=zipfile.ZIP_DEFLATED,
        compresslevel=3,
    ) as archive:
        for file in files:
            # NOTE(review): no arcname is given, so entries keep the path
            # they were collected under (as before) - confirm whether paths
            # relative to the manuscript directory are preferred
            archive.write(file)

    return zipPath

In [41]:
#| hide
# Zip the same two folders twice under different archive names; the second
# call deletes the archive produced by the first before writing its own
for archiveName in ('test1', 'test2'):
    print(zipManuscript(['images','importTranscripts'], os.path.join(os.getcwd(), 'manuscripts/stvrnktmnstrygrkcllctnn.53'), archiveName))

'/home/dc/glyptodon/nbs/manuscripts/stvrnktmnstrygrkcllctnn.53/testy.zip'

## updateMetadata

This function is designed to update the metadata in a file given a dictionary of information being passed into it and a path to the relevant directory. This dictionary should be based on calls from ```Text``` widgets in the Information stage. This was designed using [this answer](https://stackoverflow.com/questions/3964681/find-all-files-in-a-directory-with-extension-txt-in-python/3964690#3964690) from stackoverflow.

In [41]:
#| export
def updateMetadata(directory, information):
    """Overwrite the metadata file(s) of a manuscript with new information.

    Every file in ``directory`` whose name ends in ``.cfg`` is rewritten with
    one ``key:value`` line per entry of ``information`` (formatted by
    ``dictToList``). Nothing happens if the directory has no ``.cfg`` file.

    Args:
        directory: Path to the manuscript directory.
        information: Dictionary holding the complete new metadata.
    """
    for file in os.listdir(directory):
        if file.endswith('.cfg'):
            # The file is opened by full path instead of via os.chdir, and
            # `with` makes sure it is flushed and closed even on errors
            with open(os.path.join(directory, file), 'w') as f:
                for data in dictToList(information):
                    f.write(data + '\n')

In [43]:
#| hide

# Rewrite the stored metadata of the example manuscript, now with an author
directory = '/home/dc/glyptodon/glyptodon/manuscripts/stvrnktmnstrygrkcllctnn.53'
information = dict(
    Work='Stavronikita Monastery Greek handwritten document Collection no.53',
    Author='Anonymous',
    Language='Greek',
    Country='Greece',
    City='Mount Athos',
    Institution='Stavronikita Monastery',
)

updateMetadata(directory, information)

## manuscriptImages

This function is designed to get the relative paths of the selected manuscript's images into an ordered ```list```. This is to link up to the dropdown selection menu in the annotation tab. Like the save functions, this one takes in the ```targetDirectory``` as the directory of a manuscript. It returns paths relative to ```os.getcwd()``` for the ```cv2``` library to import images. Without relative pathing listed in this manner, more complex pathing needs to be done with ```cv2```. This avoids that difficulty.

In [38]:
#| export
def manuscriptImages(targetDirectory):
    """Return the sorted image paths of a manuscript, relative to the cwd.

    Args:
        targetDirectory: Path to a manuscript directory that contains an
            ``images`` folder. It may be absolute or already relative to the
            current working directory.

    Returns:
        A list of paths, relative to ``os.getcwd()``, to every file in the
        manuscript's ``images`` folder whose name does not start with a
        period. The list is sorted by file name.
    """
    # relpath does proper path arithmetic. The earlier re.sub approach
    # treated the working directory as a regular expression (so characters
    # like '+' or '(' broke the match) and blindly cut off the first
    # character, which corrupted paths that were already relative.
    relativeToManuscript = os.path.relpath(targetDirectory, os.getcwd())
    relativeToImages = os.path.join(relativeToManuscript, 'images')

    # Hidden files (names starting with '.') are not manuscript images
    files = sorted(
        file for file in os.listdir(relativeToImages)
        if not file.startswith('.')
    )

    # The list position doubles as the index used by the dropdown
    return [os.path.join(relativeToImages, file) for file in files]

In [39]:
#| hide
# Print the relative image paths of every stored manuscript
test = currentManuscripts()
print(test)
images = [manuscriptImages(entry[0]) for entry in test]

for imagePaths in images:
    for imagePath in imagePaths:
        print(imagePath)

[('/home/dc/glyptodon/nbs/manuscripts/stvrnktmnstrygrkcllctnn.53', {'Work': 'Stavronikita Monastery Greek handwritten document Collection no.53', 'Author': '', 'Language': 'Greek', 'Country': 'Greece', 'City': 'Mount Athos', 'Institution': 'Stavronikita Monastery', 'Centuries': '14th Century'})]
/home/dc/glyptodon/nbs
/manuscripts/stvrnktmnstrygrkcllctnn.53
manuscripts/stvrnktmnstrygrkcllctnn.53/images
manuscripts/stvrnktmnstrygrkcllctnn.53/images/15_01_0053_0006_f_3r_res.png
manuscripts/stvrnktmnstrygrkcllctnn.53/images/15_01_0053_0007_f_3v_res.png
manuscripts/stvrnktmnstrygrkcllctnn.53/images/15_01_0053_0008_f_4r_res.png
manuscripts/stvrnktmnstrygrkcllctnn.53/images/15_01_0053_0009_f_4v_res.png
manuscripts/stvrnktmnstrygrkcllctnn.53/images/15_01_0053_0010_f_5r_res.png
manuscripts/stvrnktmnstrygrkcllctnn.53/images/15_01_0053_0011_f_5v_res.png
manuscripts/stvrnktmnstrygrkcllctnn.53/images/15_01_0053_0012_f_6r_res.png
manuscripts/stvrnktmnstrygrkcllctnn.53/images/15_01_0053_0013_f_6v_re

In [None]:
#| hide
# Export the cells marked for export from this notebook
import nbdev

nbdev.nbdev_export()