In [None]:
#Extracts named entities from Innodata transcriptions (.csv or .txt) with spaCy, writes them to one file per transcription, and builds a lookup table (CSV) of the entities found in each file
##Matt Cook - 2023


In [None]:
#Declarations + I/O
import pandas as pd
import os
import spacy
import sys
from pathlib import Path
import string

#I/O locations
target = "xxx/Volumes/Trade Statistics/Transcriptions" #input: Innodata transcriptions
namedEntities = "xxx/Volumes/Trade Statistics/NER" #working directory for NER output
tableOut = "xxx/Volumes/Trade Statistics/NER/NER.csv" #output: entities CSV
entities = [] #accumulator used by the extraction cells

#NLP config
model = "en_core_web_lg"
nlp = spacy.load(model)
nlp.max_length = 100_000_000 #maximum number of characters accepted per document
stopwords = nlp.Defaults.stop_words

#entity labels that are kept; entities with any other label are discarded
labels = [
    "PRODUCT",
    "EVENT",
    "FAC",
    "LOC",
    "NORP",
    "GPE",
    "LAW",
    "PERSON",
    "ORG",
    "LANGUAGE",
]

#custom stopwords: entity texts that are dropped from the results
#NOTE(review): 'Äî' looks like a mis-decoded dash character - confirm against the source transcriptions
ignore = [
    'Tls',
    'Hk. Tts',
    'Hk.Tts.m.c.c',
    'Tts',
    '&c',
    'Hk.Tts',
    'Äî',
]


In [None]:
#identify and export named entities for all .txt files in target directory
#one output file (<stem>_txt.txt) is written per transcription that yields at least one entity
for path in sorted(Path(target + "/txt").rglob('*.txt')):
    #skip transcriptions of 7 bytes or fewer (treated as empty)
    if os.path.getsize(path) <= 7:
        continue

    #utf-8-sig drops a leading byte-order mark if the file has one
    with open(path, "r", encoding='utf-8-sig') as transcription:
        document = nlp(transcription.read())

    #keep entities with a wanted label that do not start with a digit and are not custom stopwords
    #a fresh list per file avoids depending on (or leaking into) the notebook-level `entities` variable
    found = [
        entity.text
        for entity in document.ents
        if entity.label_ in labels
        and not entity.text[0].isdigit()
        and entity.text not in ignore
    ]

    #computed for every file so the report below can never show another file's path
    nerPath = namedEntities + "/txt/" + str(path.stem) + "_txt.txt"

    if found:
        #write once per file; explicit utf-8 so the lookup-table cell (which reads utf-8-sig) decodes it correctly
        with open(nerPath, 'w', encoding='utf-8') as file:
            file.write(str(found))
        print("The following entities have been added to " + nerPath + ":")
        print("entities: " + str(found))
    else:
        #nothing is written for a transcription without entities
        print("No entities found in " + str(path) + "; nothing written")
    print("\n")

print("have a nice day")


In [None]:
#identify and export named entities for all .csv files in target directory
#one output file (<stem>_csv.txt) is written per transcription that yields at least one entity
for path in sorted(Path(target + "/csv").rglob('*.csv')):
    #skip transcriptions of 7 bytes or fewer (treated as empty)
    if os.path.getsize(path) <= 7:
        continue

    #the CSV is read as plain text; utf-8-sig drops a leading byte-order mark if the file has one
    with open(path, "r", encoding='utf-8-sig') as transcription:
        document = nlp(transcription.read())

    #keep entities with a wanted label that do not start with a digit, contain no double quote,
    #and are not custom stopwords
    #a fresh list per file avoids depending on (or leaking into) the notebook-level `entities` variable
    found = [
        entity.text
        for entity in document.ents
        if entity.label_ in labels
        and not entity.text[0].isdigit()
        and '"' not in entity.text
        and entity.text not in ignore
    ]

    #computed for every file so the report below can never show another file's path
    nerPath = namedEntities + "/csv/" + str(path.stem) + "_csv.txt"

    if found:
        #write once per file; explicit utf-8 so the lookup-table cell (which reads utf-8-sig) decodes it correctly
        with open(nerPath, 'w', encoding='utf-8') as file:
            file.write(str(found))
        print("The following entities have been added to " + nerPath + ":")
        print("entities: " + str(found))
    else:
        #nothing is written for a transcription without entities
        print("No entities found in " + str(path) + "; nothing written")
    print("\n")

print("have a nice day")


In [None]:
#create dataframe with one row per NER output file: FILENAME (file stem) and ENTITIES (cleaned text)
#rows are collected in a list and the DataFrame is built once after the loop
#(DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call)
records = []

#read every .txt file found under the NER working directory
for path in sorted(Path(namedEntities).rglob('*.txt')):
    with open(path, "r", encoding='utf-8-sig') as text:
        entityText = text.read()
    #files written by the extraction cells hold the str() of a Python list;
    #strip the surrounding brackets and the quotes to leave a comma-separated string
    entityText = entityText.strip("][")
    entityText = entityText.replace("'","")
    entityText = entityText.replace("Äî","")
    entityText = entityText.replace("\n","")
    print(path.name + ": " + str(entityText) + "\n")
    #a separate name is used so the notebook-level `entities` list is not overwritten with a string
    records.append({'FILENAME':path.stem,'ENTITIES':entityText})

df = pd.DataFrame(records, columns=['FILENAME','ENTITIES'])

#append the rows to the lookup table; the header is written only when the file is new or empty,
#so re-running this cell does not insert a second header row in the middle of the CSV
writeHeader = (not os.path.exists(tableOut)) or os.path.getsize(tableOut) == 0
df.to_csv(tableOut, mode='a', index=False, header=writeHeader)

print("\n")
print("lookup table created for named entity collection")
print("have a nice day")


In [None]:
##strip specific list characters? (e.g. "[]")
###numerals? (regex pattern = r'[0-9]') or .isalpha()
##customize stopwords? (e.g. "T1s")
#Relevant entity labels? (e.g. "PERSON")
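#dataframe append: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0 (use pd.concat, or build the frame from a list of records)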