## Scripts to reshape output from named entity recognition (NER)

In [1]:
import re
import csv
import json

### Remove bracketed words from plain text ground truth transcripts

In [6]:
# Strip bracketed annotations (e.g. "[laughter]" or "[inaudible, crosstalk]")
# from a plain-text transcript and save the cleaned text for NLP processing.
filename = 'input-transcripts/aws-transcribe/astin-patten_aws_transcript.txt'
# Base name of the input file without its 4-character extension (".txt").
filestem = filename.split('/')[-1][0:-4]
with open(filename, 'r') as f:
    text = f.read()
# Raw string so that "\[" and "\s" reach the regex engine unchanged instead of
# being treated as (invalid) Python string escapes. Only brackets containing
# letters, commas and whitespace are removed.
output = re.sub(r'\[[a-zA-Z,\s]*\]', '', text)
zname = 'Transcripts/' + filestem + '_for_NLP.txt'
# Context manager guarantees the output file is flushed and closed even if the
# write fails.
with open(zname, 'w') as z:
    z.write(output)

### Convert IBM Watson JSON to CSV

In [7]:
def parse_ents(data):
    """Flatten the 'entities' list of an IBM Watson NER result.

    Args:
        data: Parsed JSON response (dict) containing an 'entities' list whose
            items have 'type', 'text' and 'relevance' keys and, optionally, a
            'disambiguation' dict.

    Returns:
        A list of dicts with keys 'ibm_type' (always 'entity'), 'type',
        'text', 'relevance' and, when the entity's disambiguation carries a
        'subtype' list, 'subtypes' (the subtypes joined with commas).

    Raises:
        KeyError: If 'entities' or one of the required entity keys is missing.
    """
    entities = []
    for e in data['entities']:
        entity = {
            'ibm_type': 'entity',
            'type': e['type'],
            'text': e['text'],
            'relevance': e['relevance'],
        }
        # A disambiguation block may be present without a 'subtype' list;
        # treat that the same as having no disambiguation instead of raising.
        disambiguation = e.get('disambiguation') or {}
        if 'subtype' in disambiguation:
            entity['subtypes'] = ','.join(disambiguation['subtype'])
        entities.append(entity)
    return entities

def parse_concepts(data):
    """Flatten the 'concepts' list of an IBM Watson result.

    Returns one dict per concept with keys 'ibm_type' (always 'concept'),
    'text' and 'relevance', in the order the concepts appear in *data*.
    """
    return [
        {
            'ibm_type': 'concept',
            'text': item['text'],
            'relevance': item['relevance'],
        }
        for item in data['concepts']
    ]

def parse_keywords(data):
    """Flatten the 'keywords' list of an IBM Watson result.

    Returns one dict per keyword with keys 'ibm_type' (always 'keyword'),
    'text' and 'relevance', in the order the keywords appear in *data*.
    """
    return [
        {
            'ibm_type': 'keyword',
            'text': item['text'],
            'relevance': item['relevance'],
        }
        for item in data['keywords']
    ]

def write_to_csv(filename, entities):
    """Write IBM Watson terms to *filename* as CSV with a header row.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
        entities: Iterable of dicts whose keys are a subset of 'ibm_type',
            'type', 'text', 'relevance' and 'subtypes'. Missing keys are
            written as empty cells.

    Raises:
        ValueError: If a dict contains a key that is not one of the columns.
    """
    fieldnames = ['ibm_type', 'type', 'text', 'relevance', 'subtypes']
    # newline='' lets the csv module control line endings; without it an extra
    # blank line appears after every row on platforms that translate "\n".
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(entities)

def get_all_terms(data):
    """Collect entities, concepts and keywords from one IBM Watson result.

    Runs parse_ents, parse_concepts and parse_keywords on *data*, in that
    order, and returns their results concatenated into a single list.
    NOTE: parse_ents is redefined further down in this file for other
    services, so the IBM Watson version must be the one currently in scope.
    """
    combined = []
    for parser in (parse_ents, parse_concepts, parse_keywords):
        combined.extend(parser(data))
    return combined

In [9]:
# Edit the file paths for the specific transcript input files you want to convert.
women_and_aids = 'ibm-watson/women-and-aids/kaldi-input/women-and-aids_kaldi-input_ner_ibm.json'
# Fixed two typos in this path ("ibm-watsom" and "_igm.json") so that it
# follows the same directory and suffix pattern as the other two inputs.
student_admin = 'ibm-watson/student-admin-forum/kaldi-input/student-admin-forum_kaldi-input_ner_ibm.json'
astin_patten = 'ibm-watson/astin-patten/kaldi-input/astin-patten_kaldi-input_ner_ibm.json'

jsonfiles = [women_and_aids, student_admin, astin_patten]
for j in jsonfiles:
    # Drop the 5-character ".json" suffix; the CSV is written next to the input.
    filestem = j[0:-5]
    # Use a context manager so the input file handle is closed promptly.
    with open(j, 'r') as f:
        data = json.load(f)
    ents = get_all_terms(data)
    write_to_csv(filestem + '.csv', ents)

### Convert Google Natural Language API JSON to CSV

In [2]:
def parse_ents(data):
    """Flatten the 'entities' list of a Google Natural Language API result.

    Each returned dict has the keys 'type', 'text' (the entity name),
    'salience', 'num_mentions', 'proper_or_common' and 'mentions' (the text
    of every mention joined with ', ').
    """
    rows = []
    for item in data['entities']:
        rows.append({
            'type': item['type'],
            'text': item['name'],
            'salience': item['salience'],
            'num_mentions': len(item['mentions']),
            # An entity counts as a proper noun as soon as at least one of
            # its mentions is typed "PROPER"; otherwise it is common.
            'proper_or_common': (
                'PROPER'
                if 'PROPER' in [mention['type'] for mention in item['mentions']]
                else 'COMMON'
            ),
            'mentions': ', '.join(
                [mention['text']['content'] for mention in item['mentions']]
            ),
        })
    return rows

def write_to_csv(filename, entities):
    """Write Google Natural Language entities to *filename* as CSV.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
        entities: Iterable of dicts whose keys are a subset of 'type', 'text',
            'proper_or_common', 'salience', 'num_mentions' and 'mentions'.
            Missing keys are written as empty cells.

    Raises:
        ValueError: If a dict contains a key that is not one of the columns.
    """
    fieldnames = ['type', 'text', 'proper_or_common', 'salience', 'num_mentions', 'mentions']
    # newline='' lets the csv module control line endings; without it an extra
    # blank line appears after every row on platforms that translate "\n".
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(entities)

In [4]:
# Edit the file paths for the specific transcript input files you want to convert.
women_and_aids = 'google-nlp/women-and-aids/ground-truth-input/women-and-aids_gt-input_ner_google.json'
# NOTE(review): this directory is "student-admin" while the other services use
# "student-admin-forum" -- confirm it matches the folder on disk.
student_admin = 'google-nlp/student-admin/ground-truth-input/student-admin-forum_gt-input_ner_google.json'
astin_patten = 'google-nlp/astin-patten/ground-truth-input/astin-patten_gt-input_ner_google.json'
jsonfiles = [women_and_aids, student_admin, astin_patten]
for j in jsonfiles:
    # Drop the 5-character ".json" suffix; the CSV is written next to the input.
    filestem = j[0:-5]
    # Use a context manager so the input file handle is closed promptly.
    with open(j, 'r') as f:
        data = json.load(f)
    ents = parse_ents(data)
    write_to_csv(filestem + '.csv', ents)



### AWS Comprehend to CSV

In [2]:
def parse_ents(data):
    """Flatten the 'Entities' list of an AWS Comprehend result.

    Each returned dict maps the lower-case column names 'type', 'text',
    'score', 'start' and 'end' to the entity's 'Type', 'Text', 'Score',
    'BeginOffset' and 'EndOffset' values respectively.
    """
    # (output column, key in the source entity), in output order.
    column_sources = (
        ('type', 'Type'),
        ('text', 'Text'),
        ('score', 'Score'),
        ('start', 'BeginOffset'),
        ('end', 'EndOffset'),
    )
    return [
        {column: item[source] for column, source in column_sources}
        for item in data['Entities']
    ]

def write_to_csv(filename, entities):
    """Write AWS Comprehend entities to *filename* as CSV with a header row.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
        entities: Iterable of dicts whose keys are a subset of 'type', 'text',
            'score', 'start' and 'end'. Missing keys are written as empty
            cells.

    Raises:
        ValueError: If a dict contains a key that is not one of the columns.
    """
    fieldnames = ['type', 'text', 'score', 'start', 'end']
    # newline='' lets the csv module control line endings; without it an extra
    # blank line appears after every row on platforms that translate "\n".
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(entities)
               

In [None]:
# Edit the file paths for the specific transcript input files you want to convert.
women_and_aids = 'aws-comprehend/women-and-aids/ground-truth-input/women-and-aids_gt-input_ner_aws.json'
student_admin = 'aws-comprehend/student-admin-forum/ground-truth-input/student-admin-forum_gt-input_ner_aws.json'
astin_patten = 'aws-comprehend/astin-patten/ground-truth-input/astin-patten_gt-input_ner_aws.json'
jsonfiles = [women_and_aids, student_admin, astin_patten]
for j in jsonfiles:
    # Drop the 5-character ".json" suffix; the CSV is written next to the input.
    filestem = j[0:-5]
    # Use a context manager so the input file handle is closed promptly.
    with open(j, 'r') as f:
        data = json.load(f)
    ents = parse_ents(data)
    write_to_csv(filestem + '.csv', ents)

In [None]:
# Convert every path currently held in `jsonfiles` (set by an earlier cell)
# using whichever parse_ents / write_to_csv definitions are in scope.
for j in jsonfiles:
    # Drop the 5-character ".json" suffix; the CSV is written next to the input.
    filestem = j[0:-5]
    # Use a context manager so the input file handle is closed promptly.
    with open(j, 'r') as f:
        data = json.load(f)
    ents = parse_ents(data)
    write_to_csv(filestem + '.csv', ents)

### Stanford CoreNLP to CSV

In [54]:
def parse_ents(data):
    """Extract named entities from Stanford NER "word/TAG" output.

    Contiguous tokens that carry the same non-"O" tag are merged into a single
    entity (e.g. "United/LOCATION States/LOCATION" becomes "United States").
    A token whose tag differs from the previous token's tag starts a new
    entity, so adjacent entities of different types are both kept.

    Args:
        data: Either a string or an iterable of lines (such as an open text
            file) containing whitespace-separated "word/TAG" tokens.

    Returns:
        A list of dicts with keys 'text' and 'type', in order of appearance.
    """
    if isinstance(data, str):
        data = [data]
    entities = []
    previous_tag = 'O'
    # Joining with a space and splitting on any whitespace keeps the last
    # token of one line from fusing with the first token of the next.
    for token in ' '.join(data).split():
        # Split on the LAST slash so words that themselves contain "/"
        # (e.g. "AC/DC/ORGANIZATION") keep their full text.
        word, slash, tag = token.rpartition('/')
        if not slash or not tag:
            # Not a tagged token; ignore it without breaking an entity run.
            continue
        if tag != 'O':
            if tag == previous_tag:
                # Same type as the immediately preceding token: extend it.
                entities[-1]['text'] += ' ' + word
            else:
                entities.append({'text': word, 'type': tag})
        previous_tag = tag
    return entities

def write_to_csv(filename, entities):
    """Write Stanford CoreNLP entities to *filename* as CSV with a header row.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
        entities: Iterable of dicts whose keys are a subset of 'type' and
            'text'. Missing keys are written as empty cells.

    Raises:
        ValueError: If a dict contains a key that is not one of the columns.
    """
    fieldnames = ['type', 'text']
    # newline='' lets the csv module control line endings; without it an extra
    # blank line appears after every row on platforms that translate "\n".
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(entities)

In [None]:
# Edit the file paths for the specific transcript input files you want to convert.
women_and_aids = 'stanford/women-and-aids/ground-truth-input/women-and-aids_gt-input_ner_stanford.txt'
student_admin = 'stanford/student-admin-forum/ground-truth-input/student-admin-forum_gt-input_ner_stanford.txt'
# Fixed a copy-paste error: the file name previously repeated the
# "women-and-aids" stem inside the astin-patten directory.
astin_patten = 'stanford/astin-patten/ground-truth-input/astin-patten_gt-input_ner_stanford.txt'
txtfiles = [women_and_aids, student_admin, astin_patten]
for t in txtfiles:
    # Drop the 4-character ".txt" suffix; the CSV is written next to the input.
    filestem = t[0:-4]
    # Keep the file open only while it is being parsed, then close it.
    with open(t, 'r') as data:
        ents = parse_ents(data)
    write_to_csv(filestem + '.csv', ents)