## Run spaCy NER with custom vocabularies on plain text transcripts
See `custom` folder for info on creating custom vocabularies

Install packages and model if not already installed

In [None]:
#installs the jsonlines package
!pip install jsonlines
#installs spacy
!pip install -U spacy
#installs model
!python -m spacy download en_core_web_lg
#RESTART THE RUNTIME AFTER THIS FINISHES!

In [None]:
import spacy
import jsonlines
import os
from spacy import displacy
import csv
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

### Define functions for CSV and HTML display output

In [0]:
def outputCsv(doc, filestem):
    """Outputs csv file of detected entities, their types, and character offsets.

    One row is written per item in ``doc.ents`` under the header
    ``text,type,start_char,end_char``. The file is written to
    ``'custom/' + filestem + '_spacy_custom.csv'``; the target directory
    must already exist.

    Args:
        doc: object exposing an iterable ``ents`` whose items have ``text``,
            ``label_``, ``start_char`` and ``end_char`` attributes
            (e.g. a processed spaCy document).
        filestem: string inserted between ``custom/`` and
            ``_spacy_custom.csv`` to build the output path.
    """
    fieldnames = ['text', 'type', 'start_char', 'end_char']
    # Collect the rows before touching the file system.
    entities = [
        {
            'text': ent.text,
            'type': ent.label_,
            'start_char': ent.start_char,
            'end_char': ent.end_char,
        }
        for ent in doc.ents
    ]
    zname = 'custom/' + filestem + '_spacy_custom.csv'
    # newline='' lets the csv module control line endings (otherwise rows end
    # in \r\r\n on Windows); an explicit UTF-8 encoding keeps non-ASCII entity
    # text from failing under a non-UTF-8 default locale. The with-block
    # closes the file even if writing raises.
    with open(zname, 'w', newline='', encoding='utf-8') as z:
        writer = csv.DictWriter(z, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(entities)

def outputHtml(doc, filestem):
    """Outputs html file of DisplaCy visualization.

    Renders ``doc`` with ``displacy.render(doc, style="ent", page=True)`` and
    writes the returned markup to ``'custom/' + filestem + '_spacy.html'``;
    the target directory must already exist.

    Args:
        doc: document passed unchanged to ``displacy.render``.
        filestem: string inserted between ``custom/`` and ``_spacy.html``
            to build the output path.
    """
    html = displacy.render(doc, style="ent", page=True)
    zname = 'custom/' + filestem + '_spacy.html'
    # The with-block closes the file even if the write fails; explicit UTF-8
    # keeps non-ASCII transcript text from failing under a non-UTF-8 locale.
    with open(zname, 'w', encoding='utf-8') as z:
        z.write(html)

### Open and read in the text file

In [None]:
#change the following filename for the source transcript you want to use
textfilename = '../input-transcripts/ground-truth/astin-patten_gt_transcript.txt'
#read the whole transcript; the with-block closes the file handle once the text is in memory
#NOTE(review): assumes the transcript is UTF-8 encoded -- adjust `encoding` if it is not
with open(textfilename, 'r', encoding='utf-8') as textfile:
    text = textfile.read()
#directory that contains the source transcript
folderpath = os.path.dirname(textfilename)


### Create an EntityRuler instance and read in the patterns file (JSONL)
More info on EntityRuler: https://spacy.io/usage/rule-based-matching#entityruler

In [0]:
#load the pretrained pipeline that the custom entity ruler is added to
nlp = spacy.load('en_core_web_lg')

#custom vocabs for IU live in the appropriate patterns.jsonl file
patternfile = 'custom/custom-vocabularies/iu_archives/patterns.jsonl'

#The install cell upgrades spaCy to the latest release, and spaCy 3 changed how
#components are added: nlp.add_pipe() takes the registered factory name and
#returns the component, and no longer accepts a component object.
if int(spacy.__version__.split('.')[0]) >= 3:
    #spaCy 3+: create the ruler through the pipeline, then load the patterns into it
    ruler = nlp.add_pipe('entity_ruler', config={'validate': True, 'overwrite_ents': True})
    ruler.from_disk(patternfile)
else:
    #spaCy 2: build the ruler, load the patterns, then append it to the pipeline
    ruler = EntityRuler(nlp, validate=True, overwrite_ents=True)
    ruler.from_disk(patternfile)
    nlp.add_pipe(ruler)

### Run the NER, display the displaCy visualization in this notebook, and write the results to CSV and HTML

In [4]:
#run the NER
doc = nlp(text)

#optional--show visualization below
displacy.render(doc, style="ent", jupyter=True)

#Use only the transcript's base name (no directories, no extension) as the stem.
#The output functions prepend 'custom/', so keeping the '../input-transcripts/...'
#part of the path would send the files outside the custom folder.
filestem = os.path.splitext(os.path.basename(textfilename))[0]

#output csv to custom/<filestem>_spacy_custom.csv (rename/move it to where you want to keep it)
outputCsv(doc, filestem)

#write Displacy results to custom/<filestem>_spacy.html (comment out the next line to skip)
outputHtml(doc, filestem)