# 02 — Query Typology & Annotation

**Goal.** Label each user query along three dimensions:
- `structure`: token | noun phrase | sentence
- `specificity`: generic | semi-specific | specific
- `intent`: document_access | template_access | methodology | access_help | unknown

**Deliverables.**
- Labeled sample with distribution plots
- Heuristic annotator functions for reproducibility


In [None]:
import pandas as pd, re, pathlib

# Locate the project root (two levels above this file) and its data folder.
try:
    ROOT = pathlib.Path(__file__).resolve().parents[1]
except NameError:
    # `__file__` is not defined when this cell runs interactively.
    # NOTE(review): assumes the working directory is the folder holding this
    # notebook, i.e. one level below the project root — confirm.
    ROOT = pathlib.Path.cwd().resolve().parent
DATA = ROOT / "data"

# Raw user queries; the annotators below read the `query` column.
q = pd.read_csv(DATA / "user_queries.csv")

def guess_structure(text: str) -> str:
    """Classify the surface form of a query.

    Args:
        text: Raw query string. Non-string values (e.g. the NaN pandas uses
            for empty CSV cells, or None) are treated as an empty query.

    Returns:
        "token" when the query holds at most one whitespace-separated word
        (this includes the empty query), "sentence" when it ends with "?" or
        starts with how/why/where/what, and "noun phrase" otherwise.
    """
    if not isinstance(text, str):
        # Missing values would otherwise raise AttributeError on .strip().
        text = ""
    t = text.strip()
    # split() separates on any whitespace, so tab- or newline-separated words
    # are not mistaken for a single token.
    if len(t.split()) <= 1:
        return "token"
    if t.endswith("?") or re.match(r"^(how|why|where|what)\b", t.lower()):
        return "sentence"
    return "noun phrase"

def guess_specificity(text: str) -> str:
    """Estimate how specific a query is.

    Args:
        text: Raw query string. Non-string values (e.g. the NaN pandas uses
            for empty CSV cells, or None) are treated as an empty query.

    Returns:
        "specific" when the query contains a standalone four-digit number
        starting with 1 or 2 (a year-like token) or the word "template",
        "semi-specific" when it has three or more words, else "generic".
    """
    if not isinstance(text, str):
        # Missing values would otherwise raise AttributeError on .lower().
        text = ""
    t = text.lower()
    # [12][0-9]{3} already covers 2020-2029, so no separate branch is needed.
    if re.search(r"\b[12][0-9]{3}\b", t) or "template" in t:
        return "specific"
    if len(t.split()) >= 3:
        return "semi-specific"
    return "generic"

def guess_intent(text: str) -> str:
    """Guess the user's intent from keywords in the query.

    Rules are checked in order and the first match wins: template_access,
    methodology, access_help, document_access, then "unknown".

    Args:
        text: Raw query string. Non-string values (e.g. the NaN pandas uses
            for empty CSV cells, or None) are treated as an empty query.

    Returns:
        One of "template_access", "methodology", "access_help",
        "document_access" or "unknown".
    """
    if not isinstance(text, str):
        # Missing values would otherwise raise AttributeError on .lower().
        text = ""
    t = text.lower()
    if "template" in t:
        return "template_access"
    # Require a non-letter before "format": a plain substring test also fires
    # inside "information" / "informative" and mislabels those queries.
    if re.search(r"(?<![a-z])format", t) or "guide" in t:
        return "methodology"
    if "access denied" in t or "sign in" in t:
        return "access_help"
    if "book" in t or "case study" in t or "pdf" in t:
        return "document_access"
    return "unknown"

# Attach one heuristic label column per annotation dimension.
annotators = {
    "structure_h": guess_structure,
    "specificity_h": guess_specificity,
    "intent_h": guess_intent,
}
for column, annotate in annotators.items():
    q[column] = q["query"].map(annotate)

# Preview the first ten annotated rows.
q.head(10)