In [2]:
# Step 1: Load libraries and preprocessed dataset
import pandas as pd
import spacy

# Load the preprocessed dataset saved in Step 1
df = pd.read_csv("data/crime_data_nlppreprocessed.csv")
df.columns = df.columns.str.strip()  # ensure no spaces

print("Dataset loaded for NER. Number of rows:", len(df))
print("Columns available:", df.columns.tolist())


Dataset loaded for NER. Number of rows: 782871
Columns available: ['Report Number', 'Report DateTime', 'Offense ID', 'Offense Date', 'NIBRS Group AB', 'NIBRS Crime Against Category', 'Offense Sub Category', 'Shooting Type Group', 'Block Address', 'Latitude', 'Longitude', 'Beat', 'Precinct', 'Sector', 'Neighborhood', 'Reporting Area', 'Offense Category', 'NIBRS Offense Code Description', 'NIBRS_offense_code', 'Report DateTime_std', 'Offense Date_std', 'Offense_Description_Clean']


In [3]:
# Step 2: Load spaCy NLP model (for NER)
nlp = spacy.load("en_core_web_sm")

print("spaCy NER model loaded.")


spaCy NER model loaded.


In [4]:
# Step 3: Define function to extract entities from cleaned text
def extract_entities(text):
    """
    Extracts PERSON, GPE (locations), and potential weapons from text.
    Returns a dictionary of entities.
    """
    entities = {"persons": [], "locations": [], "weapons": []}
    
    if pd.isna(text) or text.strip() == "":
        return entities
    
    doc = nlp(text)
    
    # Extract PERSON and GPE entities
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            entities["persons"].append(ent.text)
        elif ent.label_ in ["GPE", "LOC"]:
            entities["locations"].append(ent.text)
    
    # Simple weapon extraction using keywords (domain-specific)
    weapon_keywords = ["gun", "knife", "firearm", "weapon", "bat", "knife", "rifle", "pistol"]
    for token in doc:
        if token.text.lower() in weapon_keywords:
            entities["weapons"].append(token.text.lower())
    
    return entities


In [5]:
# Step 4: Apply NER extraction using nlp.pipe for efficiency
texts = df['Offense_Description_Clean'].fillna("").tolist()
entities_list = []

for doc in nlp.pipe(texts, batch_size=1000, n_process=2):  # use 2 CPU cores
    # Extract entities from each doc
    entities = {"persons": [], "locations": [], "weapons": []}
    
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            entities["persons"].append(ent.text)
        elif ent.label_ in ["GPE", "LOC"]:
            entities["locations"].append(ent.text)
    
    # Weapon keywords
    weapon_keywords = ["gun", "knife", "firearm", "weapon", "bat", "rifle", "pistol"]
    for token in doc:
        if token.text.lower() in weapon_keywords:
            entities["weapons"].append(token.text.lower())
    
    entities_list.append(entities)

# Store extracted entities in the dataframe
df['Entities'] = entities_list
print("Entity extraction complete. Sample entities:")
df[['Offense_Description_Clean', 'Entities']].head()


Entity extraction complete. Sample entities:


Unnamed: 0,Offense_Description_Clean,Entities
0,intimidation,"{'persons': [], 'locations': [], 'weapons': []}"
1,Burglary Breaking Entering,"{'persons': [], 'locations': [], 'weapons': []}"
2,false Pretenses swindle confidence Game,"{'persons': [], 'locations': [], 'weapons': []}"
3,theft Motor Vehicle,"{'persons': ['Motor Vehicle'], 'locations': []..."
4,Larceny,"{'persons': [], 'locations': [], 'weapons': []}"


In [6]:
# Save dataset with extracted entities
df.to_csv("data/crime_data_entities.csv", index=False)
print("Entity-extracted dataset saved as 'data/crime_data_entities.csv'.")


Entity-extracted dataset saved as 'data/crime_data_entities.csv'.


In [7]:
# Step 3.1 – Load the dataset with extracted entities
import pandas as pd

df = pd.read_csv("data/crime_data_entities.csv")
df.columns = df.columns.str.strip()  # clean column names

# Quick inspection
print("Dataset loaded with entities. Number of rows:", len(df))
print("Columns:", df.columns.tolist())
df[['Offense_Description_Clean', 'Entities', 'Category_Clean', 'Block Address', 'Neighborhood', 'Offense Date_std']].head()


Dataset loaded with entities. Number of rows: 782871
Columns: ['Report Number', 'Report DateTime', 'Offense ID', 'Offense Date', 'NIBRS Group AB', 'NIBRS Crime Against Category', 'Offense Sub Category', 'Shooting Type Group', 'Block Address', 'Latitude', 'Longitude', 'Beat', 'Precinct', 'Sector', 'Neighborhood', 'Reporting Area', 'Offense Category', 'NIBRS Offense Code Description', 'NIBRS_offense_code', 'Report DateTime_std', 'Offense Date_std', 'Offense_Description_Clean', 'Entities']


KeyError: "['Category_Clean'] not in index"