# 🧠 NLP Clustering of Escort Ads
Group similar escort ads using TF-IDF vectorization, KMeans clustering, and UMAP projection.

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import umap
import plotly.express as px
import numpy as np
import plotly.io as pio
import os

# Export location, defined once so the directory that is created, the file
# that is written, and the success message cannot drift apart.
# The path is relative to the kernel's working directory; the "notebooks/"
# prefix in the final message assumes the notebook is run from the
# notebooks/ folder -- TODO confirm.
output_dir = "outputs"
output_path = f"{output_dir}/escort_clusters.html"

# Make sure output folder exists
os.makedirs(output_dir, exist_ok=True)

# Load and prepare data
df = pd.read_csv("../data/processed/all_profiles.csv")
# Replace missing descriptions with empty strings before vectorizing.
df["description"] = df["description"].fillna("")

# TF-IDF vectorization (English stop words dropped, at most 1000 features kept)
tfidf = TfidfVectorizer(stop_words="english", max_features=1000)
X = tfidf.fit_transform(df["description"])

# KMeans clustering (fixed seed so cluster ids are repeatable between runs)
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df["cluster"] = kmeans.fit_predict(X)

# UMAP dimensionality reduction
umap_model = umap.UMAP(random_state=42)
# NOTE(review): .toarray() converts the TF-IDF matrix to a dense array, so
# memory grows with (number of ads x number of features). Fine for small
# corpora; revisit if the dataset gets large.
X_umap = umap_model.fit_transform(X.toarray())
df["umap_x"] = X_umap[:, 0]
df["umap_y"] = X_umap[:, 1]

# Create interactive scatter plot
# The cluster id is cast to str so plotly treats it as a discrete category
# (one legend entry per cluster) rather than a continuous colour scale.
# Because `color` is passed as a derived Series, the legend/hover entry can
# end up labelled with the generic argument name "color"; `labels` maps it
# back to a readable title.
# NOTE(review): "phone" is included in the hover data, so phone numbers are
# embedded in the exported HTML file -- confirm that is acceptable before
# sharing the output.
fig = px.scatter(
    df, x="umap_x", y="umap_y", color=df["cluster"].astype(str),
    hover_data=["title", "location", "category", "phone"],
    labels={"color": "cluster"},
    title="Escort Ad Clusters (TF-IDF + KMeans + UMAP)"
)

# ✅ Save to notebooks/outputs
fig.write_html(output_path)

print(f"✅ File saved to: notebooks/{output_path}")


  from .autonotebook import tqdm as notebook_tqdm
  warn(


✅ File saved to: notebooks/outputs/escort_clusters.html


## 🔍 Top TF-IDF Terms per Cluster

In [2]:
# List the ten highest-weighted TF-IDF terms of every KMeans centroid,
# strongest term first.
terms = tfidf.get_feature_names_out()
centroids = kmeans.cluster_centers_

for cluster_id in range(len(centroids)):
    weights = centroids[cluster_id]
    print(f"\nCluster {cluster_id} top terms:")
    # argsort is ascending: reverse it, then keep the first ten positions.
    ranked = np.argsort(weights)[::-1][:10]
    for term_pos in ranked:
        term = terms[term_pos]
        print(f"  {term}")


Cluster 0 top terms:
  fun
  looking
  let
  guy
  good
  meet
  easy
  host
  discreet
  just

Cluster 1 top terms:
  text
  sexy
  young
  body
  asian
  service
  massage
  girl
  100
  hello

Cluster 2 top terms:
  love
  pussy
  like
  fuck
  man
  cock
  hello
  sex
  men
  hey

Cluster 3 top terms:
  massage
  body
  relaxation
  sensual
  good
  experience
  touch
  available
  private
  deep

Cluster 4 top terms:
  hi
  melbourne
  looking
  new
  make
  want
  time
  experience
  work
  need
