In [5]:
# 02_Crime_NLP_and_KG.ipynb
# Phase II – Step 1: NLP Preprocessing

import pandas as pd
import spacy

# 1️⃣ Read the cleaned Phase I dataset, trimming stray whitespace from the headers
df = pd.read_csv("data/crime_data_cleaned.csv").rename(columns=str.strip)

# 2️⃣ Show the resulting column names as a sanity check
print("Columns in dataset:", list(df.columns))



Columns in dataset: ['Report Number', 'Report DateTime', 'Offense ID', 'Offense Date', 'NIBRS Group AB', 'NIBRS Crime Against Category', 'Offense Sub Category', 'Shooting Type Group', 'Block Address', 'Latitude', 'Longitude', 'Beat', 'Precinct', 'Sector', 'Neighborhood', 'Reporting Area', 'Offense Category', 'NIBRS Offense Code Description', 'NIBRS_offense_code', 'Report DateTime_std', 'Offense Date_std']


In [7]:
# Step 4: Load the small English spaCy pipeline used by the preprocessing steps
nlp = spacy.load(name="en_core_web_sm")
print("spaCy NLP model loaded successfully.")


spaCy NLP model loaded successfully.


In [8]:
# Step 5: Define preprocessing function
def preprocess_text(text):
    """
    Lowercase, remove stopwords/punctuation, lemmatize words.

    Parameters
    ----------
    text : str or missing value
        Raw text to normalise. None/NaN (anything ``pd.isna`` reports as
        missing) is treated as empty.

    Returns
    -------
    str
        Space-separated lemmas of the tokens that are purely alphabetic and
        not stopwords; "" for missing input.
    """
    if pd.isna(text):
        return ""
    # Lowercase before parsing so the output actually honours the documented
    # "Lowercase" step instead of keeping the input's original casing.
    doc = nlp(text.lower())
    # is_alpha drops punctuation/number tokens; is_stop drops stopwords.
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return " ".join(tokens)

print("Preprocessing function defined.")


Preprocessing function defined.


In [9]:
# Step 6: Clean the offense descriptions with nlp.pipe, parsing each distinct text once
texts = df['NIBRS Offense Code Description'].fillna("").tolist()

# Order-preserving de-duplication: a description that appears on many rows is
# parsed a single time and mapped back to every row below. This is correct for
# any input and only saves work when values repeat.
unique_texts = list(dict.fromkeys(texts))

# Lowercase before parsing so this column follows the same "lowercase, drop
# stopwords/punctuation, lemmatize" recipe that preprocess_text documents.
# n_process is left at its default: after de-duplication the input is usually
# too small for worker start-up to pay off.
lowered_texts = [text.lower() for text in unique_texts]
cleaned_by_text = {}
for text, doc in zip(unique_texts, nlp.pipe(lowered_texts, batch_size=1000)):
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    cleaned_by_text[text] = " ".join(tokens)

# Expand back to one cleaned string per original row, in row order.
cleaned_texts = [cleaned_by_text[text] for text in texts]

df['Offense_Description_Clean'] = cleaned_texts
print("Offense description cleaned using nlp.pipe.")


Offense description cleaned using nlp.pipe.


In [10]:
# Step 9: Persist the preprocessed frame (index column omitted) for later use
df.to_csv(path_or_buf="data/crime_data_nlppreprocessed.csv", index=False)
print("Preprocessed dataset saved as 'data/crime_data_nlppreprocessed.csv'.")


Preprocessed dataset saved as 'data/crime_data_nlppreprocessed.csv'.


In [None]:
# 02_Crime_NLP_and_KG.ipynb
# Phase II – Step 1: NLP Preprocessing

import pandas as pd
import spacy

# 1️⃣ Read the cleaned dataset produced in Phase I
df = pd.read_csv("data/crime_data_cleaned.csv")

# 2️⃣ Trim leading/trailing whitespace from every column name
df.columns = [column.strip() for column in df.columns]

# 3️⃣ Print the column names as a sanity check
print("Columns in dataset:", list(df.columns))

# 4️⃣ Load the small English spaCy pipeline used by preprocess_text
nlp = spacy.load(name="en_core_web_sm")

# 5️⃣ Preprocessing function
def preprocess_text(text):
    """
    Normalise one text value.

    The text is lowercased and run through the spaCy pipeline; the lemmas of
    tokens that are alphabetic and not stopwords are returned joined by single
    spaces. Missing values (None/NaN) give an empty string.
    """
    if not pd.isna(text):
        lemmas = []
        for tok in nlp(text.lower()):
            if tok.is_alpha and not tok.is_stop:
                lemmas.append(tok.lemma_)
        return " ".join(lemmas)
    return ""

# 6️⃣ Apply preprocessing to correct text columns
def _clean_text_column(column):
    """
    Return `column` with preprocess_text applied to every value.

    Each distinct non-missing value is cleaned once and the result is mapped
    back onto the rows, so repeated values do not trigger repeated spaCy
    parses. Missing values become "", which is what preprocess_text returns
    for them.
    """
    cleaned_by_value = {
        value: preprocess_text(value) for value in column.dropna().unique()
    }
    # Rows with missing values find no key in the lookup and come back as NaN.
    return column.map(cleaned_by_value).fillna("")

df['Offense_Description_Clean'] = _clean_text_column(df['NIBRS Offense Code Description'])
df['Category_Clean'] = _clean_text_column(df['Offense Category'])

# 7️⃣ Quick check: raw description next to its cleaned form
df[['NIBRS Offense Code Description', 'Offense_Description_Clean']].head()
