### Improving spaCy NER models... 
## But Surprise, Backtracking to Superior Transformers

### Jonathan Juarez
### Date: 2/29/2024

My objective here is to improve the quality of NER models using spaCy. I will first try using out-of-box models, using a smaller and bigger based labeling models and compare their results.

In [223]:
# Import necessary libraries
import spacy
import os
import random
from text_parser import TextParser
import pandas as pd

In [296]:
from pathlib import Path
import os

current_dir = Path(os.path.dirname(os.path.realpath("__file__")))
project_root = current_dir.parent.parent

# Define the path to the text_files directory
text_files_dir = project_root / "text_files"

In [224]:
# Load and preprocess the data
cases = os.listdir(text_files_dir)
cases = [case for case in cases if case.endswith(".txt")]
random.seed(43)
cases = random.sample(cases, 5)  # Randomly select 10 cases for processing
text_parser = TextParser(text_files_dir, nlp_task=None)

texts = []
for case in cases:
    # Extract the report number from the filename (assuming the format '1234567.txt')
    report_number = case.split('.')[0]
    
    with open(os.text_files_dir.join(text_files_dir, case), 'r') as file:
        text = file.read()
        case_text = text_parser.preprocess(text, return_as_list=False, remove_numbers=False, stem=False)
    
    # Store the tuple (report_number, case_text)
    texts.append((report_number, case_text))

Initializing parsers for None


In [10]:
def extract_unique_entities_with_labels(texts, entity_types=None, model="en_core_web_sm", batch_size=1000, counts=False):
    """
    Extracts unique entities and their labels from a list of texts using SpaCy, with an option to include counts.

    Parameters:
    texts (list of str): The texts from which to extract entities.
    entity_types (list of str or str, optional): The types of entities to extract (e.g., 'PERSON', 'ORG').
                                                 If None, all types are extracted.
    model (str, optional): The SpaCy model to use for entity extraction. Defaults to 'en_core_web_sm'.
    batch_size (int, optional): The number of texts to process at a time. Defaults to 1000.
    counts (bool, optional): If True, includes the count of each unique entity. Defaults to False.

    Returns:
    list: A list of tuples, each containing an entity and its label, and optionally its count.
    """
    nlp = spacy.load(model)
    
    if isinstance(entity_types, str):
        entity_types = [entity_types]
    
    entity_counts = {}

    for doc in nlp.pipe(texts, batch_size=batch_size):
        for ent in doc.ents:
            if entity_types is None or ent.label_ in entity_types:
                key = (ent.text, ent.label_)
                entity_counts[key] = entity_counts.get(key, 0) + 1

    # Convert the dictionary to a list of tuples
    if counts:
        return [(entity, label, count) for (entity, label), count in entity_counts.items()]
    else:
        return [(entity, label) for (entity, label) in entity_counts.keys()]


In [43]:
nlp_sm = spacy.load("en_core_web_sm")
nlp_lg = spacy.load("en_core_web_lg")
print(nlp_sm.get_pipe("ner").labels)
print(nlp_lg.get_pipe("ner").labels)
# add entities of interest for analysis
entity_types = ['PERSON', 'ORG', 'LOC', 'DATE', 'TIME',]

('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')
('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')


In [44]:
# Select the first report's text and its number
report_number, report_text = texts[0]

# Extract entities from the first report's text
extracted_entities = extract_unique_entities_with_labels([report_text], counts=True)

# Create a DataFrame from the extracted entities
df_ner_small = pd.DataFrame(extracted_entities, columns=['Entity', 'Label', 'Count'])

# Display the DataFrame
print(f"Entities extracted from report {report_number} with en_core_web_sm")
display(df_ner_small.loc[df_ner_small['Label']=='PERSON'])

Entities extracted from report 1086919 with en_core_web_sm


Unnamed: 0,Entity,Label,Count
13,xxxx2013,PERSON,2
14,xxxx1981,PERSON,1
64,marshfield,PERSON,2
85,basement9,PERSON,1
131,avenue20,PERSON,1
134,door21,PERSON,1
137,v hinton,PERSON,1
163,kelly,PERSON,1
176,search22,PERSON,1


In [45]:
# Select the second report's text and its number
report_number, report_text = texts[1]

# Extract entities from the first report's text
extracted_entities = extract_unique_entities_with_labels([report_text], counts=True)

# Create a DataFrame from the extracted entities
df_ner_small = pd.DataFrame(extracted_entities, columns=['Entity', 'Label', 'Count'])

# Display the DataFrame
print(f"Entities extracted from report {report_number} with en_core_web_sm")
display(df_ner_small.loc[df_ner_small['Label']=='LOC'])

Entities extracted from report 2019-1092559 with en_core_web_sm


Unnamed: 0,Entity,Label,Count
5,west madison,LOC,5


In [150]:
# Extract entities from the first report's text using en_core_web_lg
# python3 -m spacy download en_core_web_lg
report_number, report_text = texts[1]

extracted_entities = extract_unique_entities_with_labels([report_text], model='en_core_web_lg', counts=True)

# Create a DataFrame from the extracted entities
df_ner_lg = pd.DataFrame(extracted_entities, columns=['Entity', 'Label', 'Count'])

# Display the DataFrame
print(f"Entities extracted from report {report_number} with en_core_web_lg")
display(df_ner_lg.loc[df_ner_lg['Label']=='PERSON'])

Entities extracted from report 2019-1092559 with en_core_web_lg


Unnamed: 0,Entity,Label,Count
27,dollar3,PERSON,1
43,le ft10,PERSON,1
44,bwc,PERSON,1
68,andrea kersten,PERSON,2


In [64]:
# Select the second report's text and its number
report_number, report_text = texts[1]

extracted_entities = extract_unique_entities_with_labels([report_text], model='en_core_web_lg', counts=True)

# Create a DataFrame from the extracted entities
df_ner_lg = pd.DataFrame(extracted_entities, columns=['Entity', 'Label', 'Count'])

# Display the DataFrame
print(f"Entities extracted from report {report_number} with en_core_web_lg")
df_ner_lg.loc[df_ner_lg['Label']=='PERSON']

Entities extracted from report 2019-1092559 with en_core_web_lg


Unnamed: 0,Entity,Label,Count
27,dollar3,PERSON,1
43,le ft10,PERSON,1
44,bwc,PERSON,1
68,andrea kersten,PERSON,2


I notice that these models are not that great at properly identifying person labels. They are great at identifying dates and times, but address names are consistently misclassified as persons, random groupings of words such as "recover g" and "fl oor" are labeled as persons, etc. Identifying "LOC" (locations) is difficult for these models. Because of this I was able to create a list of labeled persons and locations, based off the data gathered by the non-profit [Invisible Institute](https://github.com/invinst). I will cross-reference the correct labels with the models and try to fine-tune based of these results.

In [51]:
combined_path = "../combined_entities.csv"
true_labels_df = pd.read_csv(combined_path)

Here I create a get_best_label function, which is designed to perform fuzzy matching between an extracted entity and a list of potential candidates (choices) to find the best matching labeled entity from the labeled dataset (true_labels_df in this case). 

I implement a similarity score threshold to discard low-confidence matches. fuzzywuzzy provides a score indicating the similarity between the query string and each possible match, and we can use this score to decide whether to accept or reject the match.

In [61]:
from fuzzywuzzy import process

def get_best_label(extracted_entity, true_labels_df, entity_column, label_column, threshold=95):
    """
    Finds the best matching label for an extracted entity using fuzzy matching with a threshold.

    Parameters:
    extracted_entity (str): The entity extracted by the NER model.
    true_labels_df (pd.DataFrame): The DataFrame containing the true labels.
    entity_column (str): The column name in true_labels_df that contains the entity names.
    label_column (str): The column name in true_labels_df that contains the entity labels.
    threshold (int): The minimum similarity score to accept a match.

    Returns:
    str: The best matching label or None if no match exceeds the threshold.
    """
    # Normalize the entity names for matching
    choices = true_labels_df[entity_column].str.lower().tolist()
    normalized_entity = extracted_entity.lower().strip()
    
    # Find the best match for the extracted entity
    best_match, score = process.extractOne(normalized_entity, choices)

    # Check if the best match score exceeds the threshold
    if score >= threshold:
        return true_labels_df.loc[true_labels_df[entity_column].str.lower() == best_match, label_column].iloc[0]
    else:
        return None  # or you could return a default label or indicate uncertainty


In [63]:
extracted_entity = 'xxxx marshfield ave'
best_label = get_best_label(extracted_entity, true_labels_df, 'Entity', 'Label')
print(f"Extracted Entity: {extracted_entity} | Best Match Label: {best_label}")

Extracted Entity: xxxx marshfield ave | Best Match Label: None


In [67]:
# Assuming get_best_label function and true_labels_df are already defined

# Apply the get_best_label function to cross-reference and potentially correct the labels
df_ner_lg['Corrected Label'] = df_ner_lg.apply(
    lambda row: get_best_label(row['Entity'], true_labels_df, 'Entity', 'Label', threshold=85) if row['Label'] == 'PERSON' else row['Label'],
    axis=1
)

# Filter to view corrections (if any) for PERSON labels
corrected_persons = df_ner_lg.loc[df_ner_lg['Label'] == 'PERSON']

# Display the DataFrame with potentially corrected labels
print(f"Entities extracted from report {report_number} with potential corrections:")
display(corrected_persons)


Entities extracted from report 2021-0002232 with potential corrections:


Unnamed: 0,Entity,Label,Count,Corrected Label
7,9208,PERSON,1,
22,oscar,PERSON,1,PERSON
23,matthew haynam,PERSON,2,PERSON
26,oscar ponce,PERSON,3,PERSON
31,rules1,PERSON,1,
34,orders2,PERSON,1,
45,julian marin,PERSON,1,PERSON
47,bwcs,PERSON,1,
50,kno wn,PERSON,1,
57,oscar ponce4,PERSON,1,PERSON


For NER training in SpaCy, the context in which each entity appears is crucial. The training data should include the full text and the start and end positions of each entity within that text. Since combined_entities.csv only contains isolated entities without the surrounding context, we won't be able to directly convert this list into the proper SpaCy training format.

I move into creating a data augmentation task, the script of this is found in data_augmentation.py. I then load the augmented dataset with random context relevant to each entity. Sentences were randomly generated using ChatGPT-4 and were injected with a randomly selected entity.

In [99]:
import importlib
import template  # or your specific file name without .py
importlib.reload(template)

<module 'template' from '/Users/jonathanjuarez/Documents/Advanced ML/NLP-Police-Complaints/src/exploratory/template.py'>

In [100]:
from template import templates, validation_templates, testing_templates
import pandas as pd
import random

# Assume true_labels_df is your DataFrame loaded from a CSV or other source
entities_df = true_labels_df

# Shuffle your DataFrame to ensure random distribution
entities_df = entities_df.sample(frac=1).reset_index(drop=True)

# Initialize sets
TRAIN_DATA = []
VALIDATION_DATA = []
TEST_DATA = []

# Define split points based on desired ratios
train_split = int(0.7 * len(entities_df))
validation_split = int(0.85 * len(entities_df))  # 70% for train + 15% for validation

# Allocate data to each set using the corresponding templates
for i, row in entities_df.iterrows():
    entity = row['Entity']
    label = row['Label']
    
    # Select the template set based on the current index
    if i < train_split:
        template_set = templates
    elif i < validation_split:
        template_set = validation_templates
    else:
        template_set = testing_templates

    # Choose a random template from the selected set
    template = random.choice(template_set[label])
    sentence = template.replace(f'[{label}]', entity)
    start, end = sentence.find(entity), sentence.find(entity) + len(entity)
    
    # Create the data point and append it to the appropriate dataset
    data_point = (sentence, {'entities': [(start, end, label)]})
    if i < train_split:
        TRAIN_DATA.append(data_point)
    elif i < validation_split:
        VALIDATION_DATA.append(data_point)
    else:
        TEST_DATA.append(data_point)

In [134]:
TRAIN_DATA[0]

('John Connelly was mentioned in the previous deposition.',
 {'entities': [(0, 13, 'PERSON')]})

After splitting the augmented data into their seperate training, validation and testing sets I can then run the training script below.

In [105]:
import spacy
from spacy.training import Example

# Load a pre-existing SpaCy model or create a blank one
nlp = spacy.load("en_core_web_lg")  # For a new model or spacy.load('en_core_web_lg') to update an existing one
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

# Add labels
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Disable other pipes during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.resume_training()
    for itn in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.5, losses=losses, sgd=optimizer)
        print(f"Losses at iteration {itn}: {losses}")




Losses at iteration 0: {'ner': 1061.1003570554574}
Losses at iteration 1: {'ner': 445.94922605181176}
Losses at iteration 2: {'ner': 433.6636161576546}
Losses at iteration 3: {'ner': 371.85459191260975}
Losses at iteration 4: {'ner': 406.3931996861936}
Losses at iteration 5: {'ner': 381.6451515367644}
Losses at iteration 6: {'ner': 320.27752029588044}
Losses at iteration 7: {'ner': 296.9262994444416}
Losses at iteration 8: {'ner': 378.02317155452727}
Losses at iteration 9: {'ner': 243.54589263553652}


In [106]:
# Save the trained model to disk
model_path = '~/Advanced ML/NLP-Police-Complaints/spacy_ner_checkpoint'
nlp.to_disk(model_path)
print(f"Model saved to spacy_ner_checkpoint")

Model saved to spacy_ner_checkpoint


In [135]:
nlp_eval = spacy.load(model_path)
scorer = Scorer(nlp_eval)

# Print available methods and attributes
print(dir(scorer))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'cfg', 'nlp', 'score', 'score_cats', 'score_deps', 'score_links', 'score_spans', 'score_token_attr', 'score_token_attr_per_feat', 'score_tokenization']


In [141]:
print(spacy.__version__)

3.7.2


In [None]:
from spacy.scorer import Scorer
from spacy.training import Example

def evaluate(ner_model, dataset):
    examples = []
    for text, annot in dataset:
        doc = ner_model.make_doc(text)
        example = Example.from_dict(doc, annot)
        example.predicted = ner_model(text)
        examples.append(example)
    
    scorer = Scorer()
    scores = scorer.score(examples)
    return scores

In [None]:
validation_scores = evaluate(nlp_eval, VALIDATION_DATA)
simplified_scores = {
    'Overall Entity Precision': validation_scores['ents_p'],
    'Overall Entity Recall': validation_scores['ents_r'],
    'Overall Entity F1-score': validation_scores['ents_f'],
    'Person Precision': validation_scores['ents_per_type']['PERSON']['p'],
    'Person Recall': validation_scores['ents_per_type']['PERSON']['r'],
    'Person F1-score': validation_scores['ents_per_type']['PERSON']['f'],
    'Location Precision': validation_scores['ents_per_type']['LOC']['p'],
    'Location Recall': validation_scores['ents_per_type']['LOC']['r'],
    'Location F1-score': validation_scores['ents_per_type']['LOC']['f'],
}

In [147]:
print(simplified_scores)

{'Overall Entity Precision': 0.9956316617158832, 'Overall Entity Recall': 0.996850944716585, 'Overall Entity F1-score': 0.9962409301512369, 'Person Precision': 1.0, 'Person Recall': 0.9992542878448919, 'Person F1-score': 0.999627004848937, 'Location Precision': 0.931129476584022, 'Location Recall': 0.9602272727272727, 'Location F1-score': 0.9454545454545454}


In [None]:
test_scores = evaluate(nlp_eval, TEST_DATA)
simplified_test_scores = {
    'Overall Entity Precision': test_scores['ents_p'],
    'Overall Entity Recall': test_scores['ents_r'],
    'Overall Entity F1-score': test_scores['ents_f'],
    'Person Precision': test_scores['ents_per_type']['PERSON']['p'],
    'Person Recall': test_scores['ents_per_type']['PERSON']['r'],
    'Person F1-score': test_scores['ents_per_type']['PERSON']['f'],
    'Location Precision': test_scores['ents_per_type']['LOC']['p'],
    'Location Recall': test_scores['ents_per_type']['LOC']['r'],
    'Location F1-score': test_scores['ents_per_type']['LOC']['f'],
}

In [149]:
print(simplified_test_scores)

{'Overall Entity Precision': 0.9793601651186791, 'Overall Entity Recall': 0.996674251706634, 'Overall Entity F1-score': 0.9879413550793789, 'Person Precision': 0.9819037278320666, 'Person Recall': 0.9996315401621223, 'Person F1-score': 0.9906883330290305, 'Location Precision': 0.9305555555555556, 'Location Recall': 0.9403508771929825, 'Location F1-score': 0.9354275741710297}


{'Overall Entity Precision': 0.9793601651186791, 'Overall Entity Recall': 0.996674251706634, 'Overall Entity F1-score': 0.9879413550793789, 'Person Precision': 0.9819037278320666, 'Person Recall': 0.9996315401621223, 'Person F1-score': 0.9906883330290305, 'Location Precision': 0.9305555555555556, 'Location Recall': 0.9403508771929825, 'Location F1-score': 0.9354275741710297}

These scores are near perfect and are thus quite suspicious. I verify the performance on the model on text data from a report we already analyzed above with the base spaCy model.

In [166]:
# Select the second report's text and its number
report_number = '2022-0000603'
report_text = "On February 20, 2022, at 10:24 a.m., Police Officer Jared Kundrat and Police Officer Ryan \
Ritchie responded to after receiving a call from regarding  \
a dispute with his neighbor . told the officers  that he was involved in an argument with \
his neighbor,  who he had an ongoing dispute with. told the officers \
that threatened him; Specifically , told the officers that stated words to \
the effect of , “I’m gonna have  my day with you ,” and “ You’re gonna get yours,”  placing him in \
fear of receiving a battery. Sergeant  (Sgt.)  Majdi Shalabi arrived at the scene  and directed the \
officers to arrest after  said  he wanted  to sign a complaint  and press charges \
against"

extracted_entities = extract_unique_entities_with_labels([report_text], model=model_path, counts=True)

# Create a DataFrame from the extracted entities
df_ner_lg = pd.DataFrame(extracted_entities, columns=['Entity', 'Label', 'Count'])

# Display the DataFrame
print(f"Entities extracted from report {report_number} with nlp_eval")
df_ner_lg.loc[df_ner_lg['Label']=='PERSON']

Entities extracted from report 2022-0000603 with nlp_eval


Unnamed: 0,Entity,Label,Count
0,On February,PERSON,1
1,Jared Kundrat,PERSON,1
2,Ryan Ritchie,PERSON,1
3,him; Specifically,PERSON,1
4,“I,PERSON,1
5,”,PERSON,1
7,Majdi Shalabi,PERSON,1


It appears using augmented data led to worse results when compared to the base model. The nlp_eval model considers dates and places  as PERSONS. This reflects the importance of using diverse contextual training data when trying to create a new model from a pre-built model.

I revert to using bert-large model from Hugging Face.

In [170]:
from transformers import pipeline

def extract_entities_transformers(text, model="dbmdz/bert-large-cased-finetuned-conll03-english", counts=False):
    nlp = pipeline("ner", model=model, tokenizer=model)
    entities = nlp(text)
    
    entity_counts = {}
    for ent in entities:
        # The key combines the word and its entity label.
        key = (ent['word'], ent['entity'])
        entity_counts[key] = entity_counts.get(key, 0) + 1
    
    # Convert the dictionary to a list of tuples
    if counts:
        return [(entity, label, count) for (entity, label), count in entity_counts.items()]
    else:
        return [(entity, label) for (entity, label) in entity_counts.keys()]

In [174]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Print available labels
print(model.config.id2label)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{0: 'O', 1: 'B-MISC', 2: 'I-MISC', 3: 'B-PER', 4: 'I-PER', 5: 'B-ORG', 6: 'I-ORG', 7: 'B-LOC', 8: 'I-LOC'}


I determined that when using BERT transformer models, pre-processing of the text is usually not needed, as these models require the usage of context such as punctuation and capitalizations. So I reload a case example and try on the raw text data.

In [193]:
PATH = text_files_dir + "2022-0000331.txt"

with open(PATH, 'r') as file:
    text = file.read()
#no preprocessing needed
case_text_2022 = text

CIVILIAN OFFICE OF POLICE ACCOUNTABILITY  LOG# 2022 -0000331  
1  
SUMMARY REPORT OF INVESTIGATION 
 
I. EXECUTIVE SUMMARY  
 
Date of Incident:  January 27, 2022  
Time of Incident:  8:29 p.m.  
Location of Incident:  736 N. Ridgeway Ave Chicago, IL 60624  
Date of COPA Notification:  January 28, 2022  
Time of COPA Notification:  10:36 a.m.  
 
COPA was notified of a death in custody by email from CPIC on January 28, 2022, at 
10:36 a.m.1 The subject, was stopped while walking on the street by Officers ; 
Owens, Hinojosa, Carreon, and Hanson for drinking a substance that appeared to be alcohol  and 
smoking  what  appear ed to be marijuana. During the street stop, Officer Hanson asked if he could 
search responded with words to the effect of, “Yeah, go ahead .”2 During the 
rest of the stop, she did not respond to any other questions the officers ’ asked her. Officer s Owens 
and  Hinojosa handcuffed her.  She appeared to walk  backward, shake, and  fall to the ground . The 
officers

In [212]:
extracted_entities = extract_entities_transformers(case_text_2022, counts=True)
# Create a DataFrame from the extracted entities
df_ner_bert = pd.DataFrame(extracted_entities, columns=['entity', 'label', 'count'])
# Display the DataFrame
print(df_ner_bert)
# print(f"Entities extracted from report {report_number} with BERT")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


       entity   label  count
0         ##I  I-MISC      1
1         ##L  I-MISC      1
2        ##IA  I-MISC      1
3           N   I-LOC      1
4       Ridge   I-LOC      1
5       ##way   I-LOC      1
6     Chicago   I-LOC      1
7          IL   I-LOC      1
8          CO   I-ORG      4
9        ##PA   I-ORG      2
10         CP   I-ORG      1
11       ##IC   I-ORG      1
12      Owens   I-PER      3
13         Hi   I-PER      3
14       ##no   I-PER      3
15      ##jos   I-PER      3
16        ##a   I-PER      3
17       Carr   I-PER      1
18      ##eon   I-PER      1
19     Hanson   I-PER      4
20          N  I-MISC      1
21       ##AR  I-MISC      1
22  Norwegian   I-LOC      1
23   Hospital   I-LOC      1
24      Monty   I-PER      1
25    Ricardo   I-PER      1


In [216]:
print(extracted_entities)

[('##I', 'I-MISC', 1), ('##L', 'I-MISC', 1), ('##IA', 'I-MISC', 1), ('N', 'I-LOC', 1), ('Ridge', 'I-LOC', 1), ('##way', 'I-LOC', 1), ('Chicago', 'I-LOC', 1), ('IL', 'I-LOC', 1), ('CO', 'I-ORG', 4), ('##PA', 'I-ORG', 2), ('CP', 'I-ORG', 1), ('##IC', 'I-ORG', 1), ('Owens', 'I-PER', 3), ('Hi', 'I-PER', 3), ('##no', 'I-PER', 3), ('##jos', 'I-PER', 3), ('##a', 'I-PER', 3), ('Carr', 'I-PER', 1), ('##eon', 'I-PER', 1), ('Hanson', 'I-PER', 4), ('N', 'I-MISC', 1), ('##AR', 'I-MISC', 1), ('Norwegian', 'I-LOC', 1), ('Hospital', 'I-LOC', 1), ('Monty', 'I-PER', 1), ('Ricardo', 'I-PER', 1)]


The out of box BERT model performs much better, but subtokenization creates seperations from the entities.

In [248]:
def aggregate_subwords(entities):
    corrected_entities = []
    previous_entity = ""
    previous_label = ""
    previous_count = 0

    for entity, label, count in entities:
        if entity.startswith("##"):
            if previous_entity:  # Check if previous_entity is not empty
                previous_entity += entity[2:]
                previous_count += count  # Aggregate counts for subword continuations
        else:
            if previous_entity:  # Add the previous entity if it exists
                corrected_entities.append((previous_entity, previous_label, previous_count))
            previous_entity = entity
            previous_label = label
            previous_count = count

    # Add the last entity if the loop ends
    if previous_entity:
        corrected_entities.append((previous_entity, previous_label, previous_count))

    return corrected_entities

In [217]:
corrected_entities = aggregate_subwords(extracted_entities)
df_ner_bert = pd.DataFrame(corrected_entities, columns=['entity', 'label', 'count'])
print(df_ner_bert)

       entity   label  count
0           N   I-LOC      1
1    Ridgeway   I-LOC      2
2     Chicago   I-LOC      1
3          IL   I-LOC      1
4        COPA   I-ORG      6
5        CPIC   I-ORG      2
6       Owens   I-PER      3
7    Hinojosa   I-PER     12
8     Carreon   I-PER      2
9      Hanson   I-PER      4
10        NAR  I-MISC      2
11  Norwegian   I-LOC      1
12   Hospital   I-LOC      1
13      Monty   I-PER      1
14    Ricardo   I-PER      1


Because it would be ideal to seperate locations from address name locations and physical building locations, I test bert-addresses for its feasibility in handling this task.

In [237]:
extracted_entities = extract_entities_transformers(case_text_2022, model= "ctrlbuzz/bert-addresses",counts=True)
# Create a DataFrame from the extracted entities
df_ner_bert = pd.DataFrame(extracted_entities, columns=['entity', 'label', 'count'])
# Display the DataFrame
print(df_ner_bert)

     entity   label  count
0        73  B-addr      1
1       ##6  I-addr      1
2         N  I-addr      1
3         .  I-addr      1
4     Ridge  I-addr      1
5     ##way  I-addr      1
6       Ave  I-addr      1
7   Chicago  I-addr      1
8         ,  I-addr      1
9        IL  I-addr      1
10       60  I-addr      1
11    Owens   B-PER      3
12       Hi   B-PER      3
13     ##no   I-PER      3
14    ##jos   I-PER      1
15      ##a   I-PER      3
16     Carr   B-PER      1
17    ##eon   B-PER      1
18   Hanson   B-PER      4
19    ##jos   B-PER      2
20    Monty   B-PER      1
21  Ricardo   B-PER      1


In [249]:
corrected_entities = aggregate_subwords(extracted_entities)
df_ner_bert = pd.DataFrame(corrected_entities, columns=['entity', 'label', 'count'])
print(df_ner_bert)

       entity   label  count
0         736  B-addr      2
1           N  I-addr      1
2           .  I-addr      1
3    Ridgeway  I-addr      2
4         Ave  I-addr      1
5     Chicago  I-addr      1
6           ,  I-addr      1
7          IL  I-addr      1
8          60  I-addr      1
9       Owens   B-PER      3
10   Hinojosa   B-PER     10
11    Carreon   B-PER      2
12  Hansonjos   B-PER      6
13      Monty   B-PER      1
14    Ricardo   B-PER      1


Seems to work just fine. I create a class to handle all the above.

In [300]:
import pandas as pd
from transformers import pipeline

class ExtractEntities:
    def __init__(self, model="dbmdz/bert-large-cased-finetuned-conll03-english"):
        self.model = model
        self.nlp = pipeline("ner", model=model, tokenizer=model)

    def extract_entities(self, text, counts=False):
        entities = self.nlp(text)
        entity_counts = {}
        for ent in entities:
            key = (ent['word'], ent['entity'])
            entity_counts[key] = entity_counts.get(key, 0) + 1
        
        if counts:
            return [(entity, label, count) for (entity, label), count in entity_counts.items()]
        else:
            return [(entity, label) for (entity, label) in entity_counts.keys()]

    def aggregate_subwords(self, entities):
        corrected_entities = []
        previous_entity = ""
        previous_label = ""
        previous_count = 0

        for entity, label, count in entities:
            if entity.startswith("##"):
                if previous_entity:  # Check if previous_entity is not empty
                    previous_entity += entity[2:]
                    previous_count += count  # Aggregate counts for subword continuations
            else:
                if previous_entity:  # Add the previous entity if it exists
                    corrected_entities.append((previous_entity, previous_label, previous_count))
                previous_entity = entity
                previous_label = label
                previous_count = count

        # Add the last entity if the loop ends
        if previous_entity:
            corrected_entities.append((previous_entity, previous_label, previous_count))

        return corrected_entities

    def create_dataframe(self, report_number, text, counts=False):
        extracted_entities = self.extract_entities(text, counts)
        corrected_entities = self.aggregate_subwords(extracted_entities)
        df = pd.DataFrame(corrected_entities, columns=['entity', 'label', 'count'])
        df['report_number'] = report_number  # Add report number column
        return df

In [301]:
# Load and preprocess the data
cases = os.listdir(text_files_dir)
cases = [case for case in cases if case.endswith(".txt")]
random.seed(43)
cases = random.sample(cases, 5)  # Randomly select 5 cases

texts = []
for case in cases:
    # Extract the report number from the filename (assuming the format '1234567.txt')
    report_number = case.split('.')[0]
    with open(os.path.join(PATH, case), 'r') as file:
        text = file.read()
    # Store the tuple (report_number, case_text)
    texts.append((report_number, text))

In [302]:
extractor = ExtractEntities(model="ctrlbuzz/bert-addresses")

dataframes = []
for report_number, text in texts:
    df = extractor.create_dataframe(report_number, text, counts=True)
    dataframes.append(df)

# Concatenate all dataframes to have one unified dataframe
final_df = pd.concat(dataframes, ignore_index=True)

In [303]:
final_df.head(30)

Unnamed: 0,entity,label,count,report_number
0,XX,B-addr,7,1086919
1,S,I-addr,2,1086919
2,.,I-addr,3,1086919
3,Marshfield,I-addr,4,1086919
4,Ave,I-addr,2,1086919
5,",",I-addr,2,1086919
6,Chicago,I-addr,1,1086919
7,IL,I-addr,1,1086919
8,Involved,B-PER,5,1086919
9,Civilian,I-PER,10,1086919


In [304]:
def aggregate_entities(df):
    aggregated_entities = []
    current_entity = ''
    current_label = ''
    report_number = ''

    for i, row in df.iterrows():
        if 'B-' in row['label']:
            if current_entity:  # Add the previous entity
                aggregated_entities.append({'entity': current_entity, 'label': current_label, 'report_number': report_number})
            current_entity = row['entity']
            current_label = row['label'].split('-')[1]  # Update the label
            report_number = row['report_number']
        elif 'I-' in row['label']:
            current_entity += f" {row['entity']}"
        else:  # Handle 'O' labels or other cases
            if current_entity:  # Add the last entity
                aggregated_entities.append({'entity': current_entity, 'label': current_label, 'report_number': report_number})
                current_entity = ''
                current_label = ''
                report_number = ''

    # Add the final entity if it exists
    if current_entity:
        aggregated_entities.append({'entity': current_entity, 'label': current_label, 'report_number': report_number})

    return pd.DataFrame(aggregated_entities)

In [305]:
aggregated_df = aggregate_entities(final_df)
aggregated_df

Unnamed: 0,entity,label,report_number
0,"XX S . Marshfield Ave , Chicago IL",addr,1086919
1,Involved Civilian 1 2 C ivilian Investigation ...,PER,1086919
2,Timothy,PER,1090068
3,Melissa Uldr,PER,1090068
4,the Chicago Police Department,ORG,1090068
5,CP,ORG,1090068
6,Department,ORG,1090068
7,Uldrych,PER,1090068
8,Children and Family Services,ORG,1090068
9,DCFS,ORG,1090068


In [311]:
# Load and preprocess the data
cases = os.listdir(text_files_dir)
cases = [case for case in cases if case.endswith(".txt")]
random.seed(3)
cases = random.sample(cases, 5)  # Randomly select 5 cases

texts = []
for case in cases:
    # Extract the report number from the filename (assuming the format '1234567.txt')
    report_number = case.split('.')[0]
    with open(os.path.join(PATH, case), 'r') as file:
        text = file.read()
    # Store the tuple (report_number, case_text)
    texts.append((report_number, text))

dataframes = []
for report_number, text in texts:
    df = extractor.create_dataframe(report_number, text, counts=True)
    dataframes.append(df)

# Concatenate all dataframes to have one unified dataframe
final_df = pd.concat(dataframes, ignore_index=True)
aggregated_df = aggregate_entities(final_df)
aggregated_df

Unnamed: 0,entity,label,report_number
0,Cedric Baileydric,PER,2020-0001354
1,Bailey,PER,2020-0001354
2,CO,ORG,2020-0001354
3,Airport Operations North - Unit,ORG,2020-0001354
4,the Chicago Police Academy,ORG,2020-0001354
5,IP,ORG,2014-1068753
6,A Sub 1,PER,2014-1068753
7,XX N . Hermitage,addr,2014-1068753
8,Officer A,PER,2014-1068753
9,Department,ORG,2014-1068753
