# manuscriptFiles

> This module has commands to handle the manuscript file structure and relates the annotation application to files saved in the manuscripts folder

In [1]:
#| default_exp core

In [2]:
#| hide
from nbdev.showdoc import *

In [3]:
#| export
import os
import io
import cv2
import numpy as np

The ```createManuscriptDirectory``` function creates a new manuscript directory from scratch, including:
- a manuscript image folder
- an annotation states folder
- a metadata file
- a folder for exported transcriptions
- a folder for imported transcriptions

This function relies on two methods: ```directoryNameClean``` and ```dictToList```. These are simple calls for simple purposes, but just complicated enough to warrant their own functions for future optimizations.

In [19]:
#| export
def createManuscriptDirectory(metadata:dict):
    # This function creates a directory and metadata file for a new manuscript and returns the new manuscript root directory
    
    # Establishing the baseDirectory the web app is running in
    baseDirectory = os.getcwd()
    os.chdir('manuscripts')
    allManuscriptsDirectory = os.getcwd()

    # Creating the root directory for a new manuscript
    title = directoryNameClean(metadata['Work'])
    manuscriptDirectory = os.path.join(allManuscriptsDirectory,title)
    os.mkdir(manuscriptDirectory)
    
    # Creating directories
    imagesDirectory = os.path.join(manuscriptDirectory,'images')
    os.mkdir(imagesDirectory)
    statesDirectory = os.path.join(manuscriptDirectory,'states')
    os.mkdir(statesDirectory)
    exportTranscriptDirectory = os.path.join(manuscriptDirectory,'exportTranscripts')
    os.mkdir(exportTranscriptDirectory)
    importTranscriptDirectory = os.path.join(manuscriptDirectory,'importTranscripts')
    os.mkdir(importTranscriptDirectory)
    
    # Creating metadata file as config file
    os.chdir(manuscriptDirectory)
    f = open(title + '.cfg', 'w')
    
    # Writes relevant metadata to file
    printable = dictToList(metadata)
    for data in printable:
        f.write(data + '\n')
    
    os.chdir(baseDirectory)
    
    return manuscriptDirectory

In [67]:
Euclid = {
    'Work': 'Data',
    'Author': 'Euclid',
    'Language': 'Greek',
    'Country': 'Egpyt',
    'City': 'Alexandria',
    'Institution': 'The Great Library',
    'Centuries': '3rd century BC'
}

createManuscriptDirectory(Euclid)

Plato = {
    'Work': 'Republic',
    'Author': 'Plato',
    'Language': 'Greek',
    'Country': 'Greece',
    'City': 'Athens',
    'Institution': 'The Academy',
    'Centuries': '4th century BC'
}

createManuscriptDirectory(Plato)

Aristotle = {
    'Work': 'Nicomachean Ethics',
    'Author': 'Aristotle',
    'Language': 'Greek',
    'Country': 'Greece',
    'City': 'Athens',
    'Institution': 'The Academy',
    'Centuries': '4th century BC'
}

createManuscriptDirectory(Aristotle)

'C:\\Users\\David\\web-app-template\\manuscripts\\nicomacheanethics'

The ```dictToList``` function creates a printable list out of a dictionary that keeps keys separate from the items stored to them and creates a list that can be printed to a file, specifically the manuscript ```title.cfg``` file.

In [6]:
#| export
def dictToList(thisdict:dict):
    # This function turns a dictionary into a list of printable strings
    keys = []
    for key in thisdict.keys():
        keys.append(key)

    values = []
    for value in thisdict.values():
        values.append(value)
    
    printable = []
    for i in range(len(keys)):
        printable.append(str(keys[i]) + ':' + str(values[i]))
    
    return printable

In [7]:
# Testing dictToList

thisdict = {
  "brand": "Ford",
  "model": "Mustang",
  "year": 1964
}

printable = dictToList(thisdict)

for i in range(len(printable)):
    print(printable[i])

brand:Ford
model:Mustang
year:1964


The ```directoryNameClean``` function removes the characters from a string that are not allowed characters for a directory name. It is called in other manuscript file methods and is indirectly used to generate a folder key for the select manuscripts. It also cuts down the length of the title to no more than 26 characters to avoid creating excessively long directory titles and surpassing the 260 character limit. While it is unlikely for the path limit to be reached, caution is good.

In [8]:
#| export
def directoryNameClean(string):
    # This function removes any of the illegal characters for directories
    illegalChars = ['\\','#','%','&','{','}','<','>','*','?','/',' ','$','!',"'",'"',':','@','+','`','|','=']
    
    for char in illegalChars:
        for i in range(len(string)):
            if string[i] == char:
                string = string[:i] + string[i+1:]
                break
    
    if len(string) > 26:
        string = string[0:26]
    
    return string.lower()

In [9]:
# Testing the directoryNameClean function. ```re.sub``` didn't work for whatever reason, something about nothing being at 0, despite that not being an expected behavior. Difficulty iterating over multiple chars? Also didn't take backslashes at all.

illegalChars = ['\\','#','%','&','{','}','<','>','*','?','/',' ','$','!',"'",'"',':','@','+','`','|','=']
newstring=''

for char in illegalChars:
    newstring = newstring + char + 'aaa'
    
print(newstring)

print(len(directoryNameClean(newstring)))

\aaaa#aaaa%aaaa&aaaa{aaaa}aaaa<aaaa>aaaa*aaaa?aaaa/aaaa aaaa$aaaa!aaaa'aaaa"aaaa:aaaa@aaaa+aaaa`aaaa|aaaa=aaaa
26


The ```saveImage``` function takes image content from the upload widget value dictionary and saves them as files to the relevant manuscripts folder. Borrows code from [Ulrich Stern's answer](https://stackoverflow.com/a/54162776/1628638) and iterates over the elements of the dictionary with that code. It takes the arguments:
- ```files```: a dictionary with image data for several images keyed in memory
- ```targetDirectory```: a path to some manuscript directory for image saving

Passed into the ```files``` argument **MUST** be a ```<FileUpload widget>.value``` key from an uploader with image data. This contains the memory locations for the images.

In [33]:
#| export
def saveImages(files:dict, targetDirectory):
    # This function saves content from memory into storage using the keys in the passed files dict (from a FileUpload widget)
    # This 
    baseDirectory = os.getcwd()
    os.chdir(os.path.join(targetDirectory, 'images'))
    
    # Not sure what exactly this does, this is what I borrowed
    for i in range(len(files)):
        # I think this takes the content from the memory marker and reads it into Python readable code
        img_stream = io.BytesIO(files[i]['content'])
        # I have no clue why Numpy is needed here (probably some ungodly matrices and formats), but this turns the data into a cv2 readable image
        img = cv2.imdecode(np.frombuffer(img_stream.read(), np.uint8), 1)
        # This writes an image to some target directory
        cv2.imwrite(files[i]['name'], img)
        
    os.chdir(baseDirectory)

In [23]:
# testing for saveImages
import ipywidgets as widgets

uploader = widgets.FileUpload(
    accept = '',  # Accepted file extension e.g. '.txt', '.pdf', 'image/*', 'image/*,.pdf'
    multiple = True,  # True to accept multiple files upload else False
    description = 'Upload Manuscript Images',
    layout = widgets.Layout(height='auto', width='auto')
)
uploader

FileUpload(value=(), description='Upload Manuscript Images', layout=Layout(height='auto', width='auto'), multi…

In [68]:
target = os.path.join(os.path.join(os.getcwd(), 'manuscripts'), 'elements')

saveImages(uploader.value, target)

In [69]:
uploader.value

({'name': 'ASU MAP.png',
  'type': 'image/png',
  'size': 1239112,
  'content': <memory at 0x00000288A023B700>,
  'last_modified': datetime.datetime(2021, 8, 18, 17, 51, 4, 79000, tzinfo=datetime.timezone.utc)},
 {'name': 'ASU MAP_LI (2).jpg',
  'type': 'image/jpeg',
  'size': 3556528,
  'content': <memory at 0x00000288A023B7C0>,
  'last_modified': datetime.datetime(2021, 8, 18, 17, 55, 27, 146000, tzinfo=datetime.timezone.utc)},
 {'name': 'ASU MAP_LI (3).jpg',
  'type': 'image/jpeg',
  'size': 3563565,
  'content': <memory at 0x00000288A023B280>,
  'last_modified': datetime.datetime(2021, 8, 18, 17, 57, 15, 758000, tzinfo=datetime.timezone.utc)},
 {'name': 'Documentation.PNG',
  'type': 'image/png',
  'size': 282634,
  'content': <memory at 0x00000288A023B880>,
  'last_modified': datetime.datetime(2021, 4, 6, 2, 8, 15, 361000, tzinfo=datetime.timezone.utc)})

We also want a function that can search through the current directories in manuscripts and find all the manuscripts in the directory to return the metadata of the manuscripts. To that end, we have the ```currentManuscripts``` function.

In [31]:
#| export

def currentManuscripts():
    # If this is run on any computer, it will have a unique structure
    baseDirectory = os.getcwd()
    manuscriptDirectory = os.path.join(baseDirectory,'manuscripts')
    
    # This is necessary to keep directories accessible. Without os.path.join, we can't keep a full directory name and access files inside specific directories
    directories = []
    for path in os.listdir(manuscriptDirectory):
        directories.append(os.path.join(manuscriptDirectory,path))
    
    # This is necessary to store metadata from .cfg files
    manuscriptMetadata = []
    
    # This is necessary to search each directory in the manuscripts folder
    for directory in directories:
        # This looks through each file in a given directory
        for file in os.listdir(directory):
            # This opens config files and reads metadata from them
            if file.endswith('.cfg'):
                fileDirectory = os.path.join(directory,file)
                f = open(fileDirectory, 'r')
                metadata = {}

                for line in f:
                    key, value = line.split(':')
                    metadata[key] = value[:-1]

                manuscriptMetadata.append([directory, metadata])
    
    os.chdir(baseDirectory)
    return manuscriptMetadata

In [70]:
test = currentManuscripts()

print(len(test))

for metadata in test:
    print(metadata[1])

6
{'Work': 'Data', 'Author': 'Euclid', 'Language': 'Greek', 'Country': 'Egpyt', 'City': 'Alexandria', 'Institution': 'The Great Library', 'Centuries': '3rd century BC'}
{'Work': 'Elements', 'Author': 'Euclid', 'Language': 'Greek', 'Country': 'Egpyt', 'City': 'Alexandria', 'Institution': 'The Great Library', 'Centuries': '3rd century BC'}
{'Work': 'Nicomachean Ethics', 'Author': 'Aristotle', 'Language': 'Greek', 'Country': 'Greece', 'City': 'Athens', 'Institution': 'The Academy', 'Centuries': '4th century BC'}
{'Work': 'Physics', 'Author': 'Aristotle', 'Language': 'Greek', 'Country': 'Greece', 'City': 'Athens', 'Institution': 'The Academy', 'Centuries': '4th century BC'}
{'Work': 'Republic', 'Author': 'Plato', 'Language': 'Greek', 'Country': 'Greece', 'City': 'Athens', 'Institution': 'The Academy', 'Centuries': '4th century BC'}
{'Work': 'Symposium', 'Author': 'Plato', 'Language': 'Greek', 'Country': 'Greece', 'City': 'Athens', 'Institution': 'The Academy', 'Centuries': '4th century BC'

If we ever want to export transcriptions from inside this document, having an automatic zip function would be incredibly useful. To that end, here is some testing on a file zipping function.

In [62]:
# Reading in file directories from zipTest folder to zip them

baseDirectory = os.getcwd()
zipDirectory = 'zipTest'

files = []
paths = []
for path in os.listdir(zipDirectory):
    if path == 'images':
        paths.append(os.path.join(zipDirectory, path))
    elif path == 'exportTranscripts':
        paths.append(os.path.join(zipDirectory, path))

for path in paths:
    for file in os.listdir(path):
        files.append(os.path.join(path, file))
        
print(files)
# not sure if this is needed, but keeping it for best practice
os.chdir(baseDirectory)

['zipTest\\images\\ASU MAP.png', 'zipTest\\images\\ASU MAP_LI (2).jpg', 'zipTest\\images\\ASU MAP_LI (3).jpg', 'zipTest\\images\\Documentation.PNG']


In [66]:
# Trying different zip strategies
import zipfile

current = os.getcwd()

# base folder size is 8641 KB

# This iterates over a list of files
for file in files:
    # This writes a file to a particular zip folder
    zipfile.ZipFile(file = 'zipTest.zip', mode = 'a', compression = zipfile.ZIP_DEFLATED,).write(file)

# no compresslevel corresponds to 8156 KB

In [65]:
## Testing compression levels
for file in files:
    # This writes a file to a particular zip folder
    zipfile.ZipFile(file = 'zipTest1.zip', mode = 'a', compression = zipfile.ZIP_DEFLATED, compresslevel = 1).write(file)

# result: 8153 KB
    
for file in files:
    # This writes a file to a particular zip folder
    zipfile.ZipFile(file = 'zipTest2.zip', mode = 'a', compression = zipfile.ZIP_DEFLATED, compresslevel = 2).write(file)

# result: 8151 KB
    
for file in files:
    # This writes a file to a particular zip folder
    zipfile.ZipFile(file = 'zipTest3.zip', mode = 'a', compression = zipfile.ZIP_DEFLATED, compresslevel = 3).write(file)

# result: 8150 KB

# results for levels 4-9: 8156 KB

for file in files:
    # This writes a file to a particular zip folder
    zipfile.ZipFile(file = 'zipTest4.zip', mode = 'a', compression = zipfile.ZIP_DEFLATED, compresslevel = 4).write(file)
    
for file in files:
    # This writes a file to a particular zip folder
    zipfile.ZipFile(file = 'zipTest5.zip', mode = 'a', compression = zipfile.ZIP_DEFLATED, compresslevel = 5).write(file)

for file in files:
    # This writes a file to a particular zip folder
    zipfile.ZipFile(file = 'zipTest6.zip', mode = 'a', compression = zipfile.ZIP_DEFLATED, compresslevel = 6).write(file)

for file in files:
    # This writes a file to a particular zip folder
    zipfile.ZipFile(file = 'zipTest7.zip', mode = 'a', compression = zipfile.ZIP_DEFLATED, compresslevel = 7).write(file)
    
for file in files:
    # This writes a file to a particular zip folder
    zipfile.ZipFile(file = 'zipTest8.zip', mode = 'a', compression = zipfile.ZIP_DEFLATED, compresslevel = 8).write(file)

for file in files:
    # This writes a file to a particular zip folder
    zipfile.ZipFile(file = 'zipTest9.zip', mode = 'a', compression = zipfile.ZIP_DEFLATED, compresslevel = 9).write(file)

  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)


In [5]:
#| hide
import nbdev; nbdev.nbdev_export()