In [None]:
###pdf parser
###PyMuPDF: https://pymupdf.readthedocs.io/en/latest/
import sys 
import fitz
fname = "xxx"  # path of the PDF whose text is extracted

# Extract the plain text of every page into "<fname>.txt", separating pages
# with a form feed. Both the PDF and the output file are managed by `with`
# so they are closed even if extraction fails part-way through the document
# (the original never closed the PDF at all).
with fitz.open(fname) as doc, open(fname + ".txt", "wb") as out:
    for page in doc:  # iterate the document pages
        text = page.get_text().encode("utf8")  # page text, encoded as UTF-8 bytes
        out.write(text)  # write text of page
        out.write(b"\x0c")  # write page delimiter (form feed 0x0C)

In [None]:
#Takes local image set and returns absolute filepaths (.txt) for Handprint input + lookup table (.csv)
##Matt Cook - 2021
import pandas as pd
import os
import sys
from pathlib import Path

#inputs
target ="..." #target image directory

#outputs
tableOut = "...csv"# transcription batch csv

#one record per image; the dataframe is built once after the loop because
#DataFrame.append was removed in pandas 2.0 and re-copied the frame on every row
records = []

#walk target directory recursively and add every .jpg to pathsOut + records
with open("...txt", "w") as pathsOut: # for Handprint; closed even if the loop fails
    for path in sorted(Path(target).rglob('*.jpg')):
        #image path as seen from `target` (only absolute when `target` itself is absolute)
        absolute = (str(path.parent) + "/" + path.name)
        pathsOut.write(str(absolute)) #write paths to pathsOut
        pathsOut.write("\n")
        records.append({'FILENAME':path.stem,'IMG-PATH':absolute})
        print("image " + path.stem + " located " + "at " + absolute) #console out

#dataframe
df = pd.DataFrame(records)

#append FILENAME and IMG-PATH values to the lookup table; the header row is only
#written when the table is new or empty, so re-running the cell no longer
#injects a second header line into the middle of the csv
table_path = Path(tableOut)
write_header = (not table_path.exists()) or table_path.stat().st_size == 0
df.to_csv(table_path, mode='a', index=False, header=write_header)

print("\n")
print("lookup table created for collection")


In [None]:
#Run Handprint using paths from local file
##Mike Hucka designed and implemented Handprint beginning in mid-2018.
##installation instructions at https://github.com/caltechlibrary/handprint

##generate Microsoft results
# Shell escape: invokes the Handprint command-line tool with the "microsoft"
# service, taking its inputs from the list given to --from-file and its
# destination from --output.
# NOTE(review): -@ presumably sends Handprint's debug/trace log to the given
# file, --no-grid presumably skips the summary grid image and --extended
# presumably requests the extra per-image result files -- confirm against the
# Handprint documentation for the installed version.
!handprint --service microsoft -@".../HTR_log.txt" --from-file "...txt" --no-grid --extended --output "..."


In [None]:
import os
import sys
import re
from pathlib import Path
from textblob import TextBlob

#input/output
paths = '...txt'#plain text list of urls or filenames (pre-existing)
target = '...MSFT' #HP outputs (pre-existing)
textOut = '...txt'#Bag-of-Words output

#build the bag-of-words: every line of every transcription under `target` is
#spell-corrected with TextBlob and appended to textOut; a newline is written
#after each transcription file
with open(textOut, "w") as BoW:
    for path in sorted(Path(target).rglob('*.txt')):
        #`with` closes each transcription; the original leaked one handle per file
        with open(path, "r") as contents:
            for line in contents:
                #correct() is expensive, so run it once and reuse the result
                #for both the console echo and the output file
                corrected = str(TextBlob(line).correct())
                print("\n")
                print(corrected)
                BoW.write(corrected)
        BoW.write("\n")
print("\n")
print("have a nice day")
        

In [None]:
###Fuzzy Search Bag-of-Words
##Matt Cook - 2021

from fuzzy_search.fuzzy_phrase_searcher import FuzzyPhraseSearcher
from fuzzy_search.fuzzy_phrase_model import PhraseModel
import json
import re
from pathlib import Path

#declarations
text = ".../Baptismal Records/Lag 1838-1869/Lag 1838-1869_BOW.txt"
target = '.../Baptismal Records/Lag 1838-1869/Lag 1838-1869_MSFT' #HP outputs (pre-existing)
variants = []

#threshold configuration (Sarah to customize)
config = {
    #these thresholds work when there are few OCR errors
    'char_match_threshold': 0.5,
    'ngram_threshold': 0.5,
    'skipgram_threshold': 0.3,
    'levenshtein_threshold': 0.5,
    'include_variants': False, # for phrases that have variant phrasings
    'filter_distractors': False, # avoid matching with similar but different phrases
    "ignorecase": False, # Is upper/lowercase a meaningful signal?
    "use_word_boundaries": False,# should matches follow word boundaries?
    "max_length_variance": 3, # matching string can be longer/shorter than phrase
}

#output text; `with` guarantees the report is flushed and closed even if the
#search or one of the record reads fails
with open(".../Baptismal Records/Lag 1838-1869/Lag 1838-1869_passages.txt", "w") as passages:
    #user input
    inputString = input("Fuzzy search document for keyword: ")
    passages.write("Fuzzy search document for keyword: " + str(inputString))
    passages.write("\n")

    # initialize a new searcher instance with the config
    fuzzy_searcher = FuzzyPhraseSearcher(config)
    phrase_model = PhraseModel(phrases=[inputString])
    fuzzy_searcher.index_phrase_model(phrase_model)

    #identify matches in the text using fuzzy search package;
    #each match contributes the exact string that was found in the bag-of-words
    with open(text, "r") as BoW:
        for match in fuzzy_searcher.find_matches(BoW.read()):
            variants.append(match.json()['string'])
    print("Variants detected in bag-of-words include: ")
    passages.write("Variants detected in bag-of-words include: ")
    passages.write("\n\n")
    print("\n")
    print(variants)
    passages.write(str(variants))
    passages.write("\n\n")
    print("\n")

    print("variants appear on the following records:")
    print("\n")

    for path in sorted(Path(target).rglob('*.txt')):
        header = path.stem.split('.')
        #read and close each record immediately (the original leaked the handle)
        with open(path, "r") as record:
            contents = record.read()
        #report a record once if it contains any variant; the original wrote it
        #once per matching entry of `variants`, which can hold repeated strings,
        #so the same transcription was duplicated in the output
        if any(variant in contents for variant in variants):
            passages.write(header[0] + "\n" + contents)
            passages.write("\n\n")
            print("Record: " + header[0] + "\n" + "Transcription: " + contents)
            print("\n")

print("matched records saved to disk")
print("have a nice day")
        


In [None]:
###Named entity recognition and frequency visualization of bag-of-transcriptions
##Modified from "named-entity-recognition" repo by Mary Chester-Kadwell (https://github.com/mchesterkadwell/named-entity-recognition/blob/main/LICENSE)
#Entity types: https://github.com/mchesterkadwell/named-entity-recognition/blob/main/2-named-entity-recognition-of-henslow-data.ipynb
##Matt Cook - November 2021

import spacy
import it_core_news_sm
from spacy import displacy
from pathlib import Path
import matplotlib.pyplot as plt
from collections import Counter

#declarations
nlp = it_core_news_sm.load()
text_file = Path('data', '...txt')
entOut = open("...", "w")
    
###named entity recognition
with open(text_file, encoding="utf-8") as file:
    iliad = file.read()
document = nlp(iliad)
document.text
entities = []
for entity in document.ents:
    if entity.label_ == "GPE" or "NORP": 
        entities.append(entity.text)
        print(entity.text)
entOut.write(str(entities))
displacy.render(document, style="ent")
        
#print high-frenquency entities
word_freq = Counter(entity)
common_words = word_freq.most_common(50)
print(common_words)

#Display the plot in the notebook with interactive controls and save plot to disk
%matplotlib notebook
words = [word for word,_ in common_words]
freqs = [count for _,count in common_words]
plt.title("Named Entities")
plt.xlabel("Entity type")
plt.ylabel("# of appearances")
plt.xticks(range(len(words)), [str(s) for s in words], rotation=90)
plt.grid(b=True, which='major', color='#333333', linestyle='--', alpha=0.5)
plt.gcf().subplots_adjust(bottom=0.35)
plt.plot(freqs)
plt.show()
plt.savefig('....png', bbox_inches="tight")

#close files
file.close()
entOut.close()

print("have a nice day")

In [None]:
###Topic Modeling (LDA) Longhand inputs
###Cook 2021
##portions of code from https://radimrehurek.com/gensim/auto_examples
import os
import sys
import gensim
from gensim import corpora
import pprint
from collections import defaultdict
from gensim import models
from gensim.models import LdaModel
#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#declarations
#read the whole input text and close the file right away; the original rebound
#`doc` to the text and then called doc.close() on that string, which raised
#AttributeError at the end of the cell and never closed the real file handle
with open("xxx", "r") as source:
    doc = source.read()
documents = [doc]

#lower-case, split on whitespace and drop a small list of English stop words
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]
#print(texts)

#create dictionary
dictionary = corpora.Dictionary(texts)
print(dictionary)

#create corpus
corpus = [dictionary.doc2bow(tokens) for tokens in texts]
print(corpus)

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
random_state = 42  # fixed seed so a re-run of the cell reproduces the same topics

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state
)

top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

#use the already-imported pprint module instead of re-importing the function
#mid-cell, which shadowed the module name
pprint.pprint(top_topics)