# 🧠 NLP Entity Extraction with spaCy

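Extract named entities (such as locations, dates, and people) from escort profile descriptions using spaCy's pretrained `en_core_web_sm` model. Note that services are not a built-in entity type in this model, so they are not captured by the cells below.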

In [1]:
import pandas as pd
import spacy
from collections import Counter
from spacy import displacy

# Load spaCy's small English pipeline; its entity predictions are read from
# `doc.ents` below.
nlp = spacy.load("en_core_web_sm")

# Load the processed profiles and force every description to a plain string so
# that profiles with a missing description reach nlp.pipe as "" rather than NaN.
df = pd.read_csv("../data/processed/all_profiles.csv")
df["description"] = df["description"].fillna("").astype(str)

# Work on a reproducible random subset to keep the run fast. The size is capped
# at len(df) because DataFrame.sample draws without replacement and raises
# ValueError when asked for more rows than the frame contains.
sample = df.sample(min(200, len(df)), random_state=42).copy()
docs = list(nlp.pipe(sample["description"]))

# One list of (entity text, entity label) pairs per sampled profile, in the
# same row order as `sample` (docs[i] corresponds to sample.iloc[i]).
sample["entities"] = [[(ent.text, ent.label_) for ent in doc.ents] for doc in docs]
sample[["title", "entities"]].head()

Unnamed: 0,title,entities
1239,"Cock, Pussy & Butt models required - satisfy o...",[]
1187,Required massage experienced personnel,"[(an hour, TIME), (3 to 4, CARDINAL), (ABN, OR..."
560,M2M Massage Melb,"[(Melbourne, GPE), (Employed Masseur, PERSON)]"
767,"Nice to meet you, I am Keke, 23",[]
714,Maddie,"[(Fit Aussie, PERSON), (8, CARDINAL)]"


## 🔢 Most Common Entity Types

In [2]:
# Tally how often each entity label occurs across all sampled profiles.
# Labels are fed in row order, so ties in most_common() keep first-seen order.
entity_counter = Counter(
    label
    for profile_entities in sample["entities"]
    for _text, label in profile_entities
)

# most_common() without an argument lists every label, highest count first.
pd.DataFrame(entity_counter.most_common(), columns=["Entity Type", "Count"])

Unnamed: 0,Entity Type,Count
0,CARDINAL,118
1,DATE,105
2,PERSON,101
3,ORG,87
4,GPE,64
5,TIME,29
6,NORP,27
7,PERCENT,14
8,QUANTITY,13
9,ORDINAL,8


## 📌 Example Entities

In [3]:
# Flatten the per-profile entity lists into (text, label) pairs, keeping only
# the place, date, money and person labels, in the order they were found.
samples = [
    (entity_text, entity_label)
    for profile_entities in sample["entities"]
    for entity_text, entity_label in profile_entities
    if entity_label in {"GPE", "DATE", "MONEY", "PERSON"}
]

# Show the first 25 matches as a two-column table.
pd.DataFrame(samples[:25], columns=["Entity", "Label"])

Unnamed: 0,Entity,Label
0,3048,DATE
1,Melbourne,GPE
2,Employed Masseur,PERSON
3,Fit Aussie,PERSON
4,don,PERSON
5,Amanda,PERSON
6,Thai Ladyboy,PERSON
7,0414805956,DATE
8,50 45,DATE
9,0432106848,DATE


## 🖼️ Visualize Entities in a Description

In [4]:
# Pick the first sampled description that actually contains entities: the first
# sampled row can have none (its `entities` list is empty), in which case
# rendering docs[0] would show no highlighted spans at all.
# If no description has entities, fall back to docs[0] as before.
doc = next((candidate for candidate in docs if candidate.ents), docs[0])

# Render the description inline in the notebook with entity spans highlighted.
displacy.render(doc, style="ent", jupyter=True)

