In [None]:
#Shundlikht
##Takes a Shund.org search result (CSV) and returns a fulltext transcription in target language

In [None]:
#Usage:
##Make local directory, named after target work
##Export Shund.org search results (CSV) and place in new folder
##In "Globals", below:
###Set Google Application Credentials
###Set workDir to new folder pathname
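##In "translate_document", below:
###Choose target language ("target_language_code")
##In the translate/transcribe cell, below:
###Set the GCP project ID passed to translate_document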
##"Run all"

In [None]:
#Libraries
import requests
import os
import sys
import time
import pandas as pd
from pathlib import Path
from pypdf import PdfReader
from google.cloud import translate_v3beta1 as translate


In [None]:
#Globals
##google auth
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="..." #local .json credentials filepath

##target working directory
workDir = "..." #working directory

#identify NLI installment URLs
##every CSV under workDir is read in sorted order; `urls` ends up holding the URLs of the last one read
csvPaths = sorted(Path(workDir).rglob('*.csv'))
if not csvPaths:
    #fail here with a clear message rather than with a NameError on `urls` in the download cell
    raise FileNotFoundError("No Shund.org CSV export found under " + workDir)
for path in csvPaths:
    tableIn = pd.read_csv(path)
    print("Working from ", path)
    urls = tableIn["installment_url_pdf"].dropna() #Series of urls from work csv (rows without a pdf url dropped)
print("\n")

#extract work name
##text after "works/" in workDir; falls back to the folder's own name when workDir has no "works/" segment
workName = workDir.split("works/")
workName = workName[1] if len(workName) > 1 else Path(workDir).name
workName = workName.rstrip("/") #a trailing slash would otherwise end up inside every generated filename
print("Work name: " , workName)

#make sub-directories + BoW
##exist_ok=True keeps this cell re-runnable once the folders already exist
installments = (workDir + "/installments/")
os.makedirs(installments, exist_ok=True)

transcriptions = (workDir + "/transcriptions/")
os.makedirs(transcriptions, exist_ok=True)

plaintext = (workDir + "/plaintext/")
os.makedirs(plaintext, exist_ok=True)

##create (or empty) the bag-of-words file; later cells reopen it in append mode
BoW = open(workDir + "/" + workName + '_bagOfWords.txt', "wb")
BoW.close()


In [None]:
#download installment pdfs from work CSV
#Specify 'User-Agent' to circumvent GET restrictions, as per https://stackoverflow.com/questions/38489386/how-to-fix-403-forbidden-errors-when-calling-apis-using-python-requests
headers = {'User-Agent': "..."} #User-Agent string sent with every request
#Series.items() yields (index label, url) pairs; Series.iteritems() was removed in pandas 2.0
for rowIndex, url in urls.items():
    order = (rowIndex + 1) #installment number = index label of the url + 1
    print("Installment #",str(order), "located for target work @:", url)
    print("Downloading...")
    print("\n")
    myfile = requests.get(url, headers=headers)
    myfile.raise_for_status() #stop on 4xx/5xx instead of saving an error page as a .pdf
    #`with` closes the pdf on disk before the next download starts
    with open(installments + workName + "-installment_" + str(order) + ".pdf" , 'wb') as pdfOut:
        pdfOut.write(myfile.content)
    myfile.close()
print("Installments successfully downloaded from NLI")


In [None]:
#Configure transcribe/translate function
##from: https://cloud.google.com/translate/docs/advanced/translate-documents
def translate_document(
        project_id: str,
        file_path: str,
        out_path: str,
        target_language_code: str = "yi",
        location: str = "us-central1",
) -> "translate.TranslateDocumentResponse":
    """Send a local PDF to Cloud Translation and save the returned PDF.

    The file at ``file_path`` is read into memory, submitted as
    ``application/pdf``, and the first byte stream of the response is
    written to ``out_path``. The detected source language code is printed.

    Args:
        project_id: The GCP project ID.
        file_path: The path to the file to be translated.
        out_path: Local output filepath for the translated PDF.
        target_language_code: Language code requested from the service.
            Defaults to "yi", the value this function previously hard-coded.
        location: Cloud Translation location used to build the request
            parent. Defaults to "us-central1".

    Returns:
        The response object returned by ``client.translate_document``.
    """

    client = translate.TranslationServiceClient()
    parent = f"projects/{project_id}/locations/{location}"

    # Supported file types: https://cloud.google.com/translate/docs/supported-formats
    with open(file_path, "rb") as document:
        document_content = document.read()

    document_input_config = {
        "content": document_content,
        "mime_type": "application/pdf",
    }

    response = client.translate_document(
        request={
            "parent": parent,
            # source_language_code is left out of the request; the language the
            # service detected is reported by the print below.
            #"source_language_code": "yi",
            "target_language_code": target_language_code,
            "document_input_config": document_input_config,
        }
    )

    # If not provided in the TranslationRequest, the translated file will only be returned through a byte-stream
    # and its output mime type will be the same as the input file's mime type
    with open(out_path, 'wb') as f:
        f.write(response.document_translation.byte_stream_outputs[0])

    print(f"Response: Detected Language Code - {response.document_translation.detected_language_code}")

    return response


In [None]:
#Translate/transcribe pdfs with Google Translate
##each pdf under the installments folder is sent through translate_document, one at a time;
##the elapsed wall-clock time for the whole batch is printed at the end
start = time.time()
for path in sorted(Path(installments).rglob('*.pdf')):
    absolute = f"{path.parent}/{path.name}" #input pdf
    transcribed = f"{transcriptions}{workName}{path.stem}_transcribed.pdf" #output pdf
    print(f"working on: {path.name}")
    print("\n")
    translate_document("shund-386513", absolute, transcribed)
end = time.time()
print(f"{end - start} seconds elapsed on GCT")
    

In [None]:
#Extract plaintext from transcribed/translated pdfs
from pypdf._text_extraction import set_custom_rtl #NOTE(review): imported but never called in this notebook
rtl_dir: bool = False #NOTE(review): not read anywhere in this notebook; "rtl" = right-to-left

for path in sorted(Path(transcriptions).rglob('*.pdf')):
    name = path.stem
    reader = PdfReader(path)
    print("Extracting text from: " + str(path.name))
    print("\n")
    #join every page so a multi-page installment is not cut off after its first page
    pageText = "\n".join(page.extract_text() for page in reader.pages)
    #explicit utf-8 so non-Latin output (e.g. Yiddish) is written the same way on every platform
    with open(os.path.join(plaintext, name + ".txt"), "w", encoding="utf-8") as txt:
        txt.write(name) #file stem first, directly followed by the extracted text
        txt.write(pageText)
        txt.write("\n")
print("Plaintext transcriptions created.")


In [None]:
#Append bag-of-words document with headers and transcriptions
###Note: Bag-of-Words function only works for English (EN)
###NOTE(review): the non-English cell below appends to this same file, so running both cells writes the text twice
with open(workDir + "/" + workName + '_bagOfWords.txt', "a", encoding="utf-8") as BoW:
    BoW.write(workName) #write header in document
    BoW.write("\n")

    #write transcriptions to bag-of-words
    ##shorter filenames sort first so installment_2 comes before installment_10
    for path in sorted(Path(plaintext).rglob('*.txt'), key=lambda p: (len(p.name), p.name)):
        with open(path, "r", encoding="utf-8") as contents:
            text = contents.read()
        #keep only what follows the first "Date: " marker
        text = text.split("Date: ", 1)
        if len(text) < 2:
            #no marker in this transcription: report and skip it instead of raising IndexError
            print(path.name + " skipped: no 'Date: ' marker found")
            print("\n")
            continue
        text = text[1]
        text = text.replace("Machine Translated by Google","")
        BoW.write(text)
        BoW.write("\n")
        print(path.name + " added to bag-of-words")
        print("\n")
print("\n")


In [None]:
####Generate Bag-of-Words txt output from set of transcriptions.
##Use for non-English transcript set
##NOTE(review): the English cell above appends to this same file, so running both cells writes the text twice

#declarations
with open(workDir + "/" + workName + '_bagOfWords.txt', "a", encoding="utf-8") as BoW:
    BoW.write(workName) #write header in document
    BoW.write("\n")

    #append bag-of-words with headers and transcriptions
    ##shorter filenames sort first so installment_2 comes before installment_10
    for path in sorted(Path(plaintext).rglob('*.txt'), key=lambda p: (len(p.name), p.name)):
        with open(path, "r", encoding="utf-8") as txtIn:
            contents = txtIn.read()
        contents = contents.replace("Machine Translated by Google","")
        contents = contents.replace(workName, "") #removes every occurrence of the work name, including the one in the filename header
        BoW.write(contents)
        print(path.name + " added to bag-of-words")
        BoW.write("\n")

print("\n")
print("have a nice day")

In [None]:
#TO DO
##annotate filepaths with language code extension, if target language differs from source
##Add spelling correction

#Caveats
##runs locally
##translation may have errors (in addition to transcriptions)
##GCT (Google) is black-boxed 

In [None]:
#Matt Cook - 2023
##mncook.net