# Improving spaCy NER models

### Jonathan Juarez
### Date: 2/29/2024

My objective here is to improve the quality of NER models using spaCy. I will first try using out-of-box models, using a smaller and bigger based labeling models and compare their results.

In [1]:
# Import necessary libraries
import spacy
import os
import random
from text_parser import TextParser
import pandas as pd

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jonathanjuarez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jonathanjuarez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Load and preprocess the data
PATH = "/Users/jonathanjuarez/Documents/Advanced ML/NLP-Police-Complaints/text_files" 

cases = os.listdir(PATH)
cases = [case for case in cases if case.endswith(".txt")]
random.seed(43)
cases = random.sample(cases, 5)  # Randomly select 10 cases for processing
text_parser = TextParser(PATH, nlp_task=None)

texts = []
for case in cases:
    # Extract the report number from the filename (assuming the format '1234567.txt')
    report_number = case.split('.')[0]
    
    with open(os.path.join(PATH, case), 'r') as file:
        text = file.read()
        case_text = text_parser.preprocess(text, return_as_list=False, remove_numbers=False, stem=False)
    
    # Store the tuple (report_number, case_text)
    texts.append((report_number, case_text))

Initializing parsers for None


In [10]:
def extract_unique_entities_with_labels(texts, entity_types=None, model="en_core_web_sm", batch_size=1000, counts=False):
    """
    Extracts unique entities and their labels from a list of texts using SpaCy, with an option to include counts.

    Parameters:
    texts (list of str): The texts from which to extract entities.
    entity_types (list of str or str, optional): The types of entities to extract (e.g., 'PERSON', 'ORG').
                                                 If None, all types are extracted.
    model (str, optional): The SpaCy model to use for entity extraction. Defaults to 'en_core_web_sm'.
    batch_size (int, optional): The number of texts to process at a time. Defaults to 1000.
    counts (bool, optional): If True, includes the count of each unique entity. Defaults to False.

    Returns:
    list: A list of tuples, each containing an entity and its label, and optionally its count.
    """
    nlp = spacy.load(model)
    
    if isinstance(entity_types, str):
        entity_types = [entity_types]
    
    entity_counts = {}

    for doc in nlp.pipe(texts, batch_size=batch_size):
        for ent in doc.ents:
            if entity_types is None or ent.label_ in entity_types:
                key = (ent.text, ent.label_)
                entity_counts[key] = entity_counts.get(key, 0) + 1

    # Convert the dictionary to a list of tuples
    if counts:
        return [(entity, label, count) for (entity, label), count in entity_counts.items()]
    else:
        return [(entity, label) for (entity, label) in entity_counts.keys()]


In [41]:
nlp_sm = spacy.load("en_core_web_sm")
nlp_lg = spacy.load("en_core_web_lg")
print(nlp_sm.get_pipe("ner").labels)
print(nlp_lg.get_pipe("ner").labels)
# add entities of interest for analysis
entity_types = ['PERSON', 'ORG', 'LOC', 'DATE', 'TIME', 'ORDINAL']

('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')
('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')


In [42]:
# Select the first report's text and its number
report_number, report_text = texts[0]

# Extract entities from the first report's text
extracted_entities = extract_unique_entities_with_labels([report_text], counts=True)

# Create a DataFrame from the extracted entities
df_ner_small = pd.DataFrame(extracted_entities, columns=['Entity', 'Label', 'Count'])

# Display the DataFrame
print(f"Entities extracted from report {report_number} with en_core_web_sm")
display(df_ner_small.loc[df_ner_small['Label']=='ORDINAL'])

Entities extracted from report 1086919 with en_core_web_sm


Unnamed: 0,Entity,Label,Count
8,first,ORDINAL,40
9,second,ORDINAL,31
60,16cr,ORDINAL,1
77,third,ORDINAL,1
104,2nd,ORDINAL,1
115,1st,ORDINAL,2
135,fourth,ORDINAL,7
140,7th,ORDINAL,2
172,3rd,ORDINAL,1
174,4th,ORDINAL,1


In [35]:
# Select the second report's text and its number
report_number, report_text = texts[1]

# Extract entities from the first report's text
extracted_entities = extract_unique_entities_with_labels([report_text], counts=True)

# Create a DataFrame from the extracted entities
df_ner_small = pd.DataFrame(extracted_entities, columns=['Entity', 'Label', 'Count'])

# Display the DataFrame
print(f"Entities extracted from report {report_number} with en_core_web_sm")
display(df_ner_small.loc[df_ner_small['Label']=='ORG'])

Entities extracted from report 2019-1092559 with en_core_web_sm


Unnamed: 0,Entity,Label,Count
25,cause1,ORG,1
26,dollar3,ORG,1
29,quick mart,ORG,1
45,avery v,ORG,1


In [38]:
# Extract entities from the first report's text using en_core_web_lg
# python3 -m spacy download en_core_web_lg
report_number, report_text = texts[0]

extracted_entities = extract_unique_entities_with_labels([report_text], model='en_core_web_lg', counts=True)

# Create a DataFrame from the extracted entities
df_ner_lg = pd.DataFrame(extracted_entities, columns=['Entity', 'Label', 'Count'])

# Display the DataFrame
print(f"Entities extracted from report {report_number} with en_core_web_lg")
display(df_ner_lg.loc[df_ner_lg['Label']=='PERSON'])

Entities extracted from report 1086919 with en_core_web_lg


Unnamed: 0,Entity,Label,Count
17,marshfield,PERSON,1
18,xxxx2013,PERSON,2
19,dob xxxx1981,PERSON,1
21,xxxx2012,PERSON,1
39,f star xxxx,PERSON,1
43,g star xxxx,PERSON,1
53,h star xxxx,PERSON,1
68,xxxx marshfield avenue,PERSON,4
76,fl oor,PERSON,1
79,pr esent execution search,PERSON,1


In [33]:
# Select the second report's text and its number
report_number, report_text = texts[1]

extracted_entities = extract_unique_entities_with_labels([report_text], model='en_core_web_lg', counts=True)

# Create a DataFrame from the extracted entities
df_ner_lg = pd.DataFrame(extracted_entities, columns=['Entity', 'Label', 'Count'])

# Display the DataFrame
print(f"Entities extracted from report {report_number} with en_core_web_lg")
display(df_ner_lg.loc[df_ner_lg['Label']=='PERSON'])

Entities extracted from report 2019-1092559 with en_core_web_lg


Unnamed: 0,Entity,Label,Count
27,dollar3,PERSON,1
43,le ft10,PERSON,1
44,bwc,PERSON,1
68,andrea kersten,PERSON,2


In [None]:
# Cell 6: Performance Analysis (if you have ground truth labels)
# Load your labeled data (you would need to have this prepared)
# Compute precision, recall, and F1 score comparing extracted entities to ground truth