# Análisis de Entidades en Consultas de Marca con spaCy
Este notebook analiza consultas extraídas de Google Search Console usando NLP (spaCy) para detectar entidades nombradas: organizaciones o marcas (ORG), ubicaciones (LOC), personas (PER) y otras entidades, como productos (MISC).

In [1]:
# ✅ Instalar spaCy si no está
!pip install -U spacy

Collecting spacy
  Using cached spacy-3.8.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.12-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Using cached cymem-2.0.11-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.5 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Using cached preshed-3.0.9-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Using cached thinc-8.3.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Using cached wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)

In [7]:
# ✅ Descargar el modelo en español
!python -m spacy download es_core_news_md

Collecting es-core-news-md==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.8.0/es_core_news_md-3.8.0-py3-none-any.whl (42.3 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_md')


In [2]:
# ✅ Load the spaCy model and the sample queries
import spacy
from collections import Counter
from spacy.tokens import Doc

# Name of the Spanish pipeline; it must be installed in the kernel's environment
MODEL_NAME = "es_core_news_md"

# Load the Spanish language model
nlp = spacy.load(MODEL_NAME)

# Simulated queries extracted from GSC
queries = [
    "marca acme",
    "mi marca",
    "cosmetica",
    "crema",
    "serum",
    "argireline",
    "botox",
    "retinol",
    "ojeras",
    "marula",
    "msm",
    "dmae",
    "bolsas",
    "niacinamida",
    "pycnogenol",
    "acido hialuronico",
    "vitamina c",
    "cuello",
    "ojos",
    "colageno",
    "mejor",
    "serum",
    "tratamiento",
    "antiedad",
    "hidratante",
    "arrugas",
    "flacidez",
    "luminoso",
    "transformador",
    "facial",
    "solucion",
    "sensible"
]

# Run every query through the pipeline on its own. The queries are independent
# searches, so gluing them into one ". "-separated text would feed the model
# context from neighbouring queries and could let an entity straddle two of them.
query_docs = list(nlp.pipe(queries))

# Merge the per-query results into a single Doc so that `doc.ents` still
# exposes the entities of all queries in one place.
doc = Doc.from_docs(query_docs)

# Plain-text view of the merged document (queries separated by whitespace)
text = doc.text

OSError: [E050] Can't find model 'es_core_news_md'. It doesn't seem to be a Python package or a valid path to a data directory.

In [3]:
# ✅ Collect every named entity as a (surface text, label) pair
entidades = [
    (entity.text, entity.label_)
    for entity in doc.ents
]

# Preview the first ten pairs
entidades[:10]

NameError: name 'doc' is not defined

In [4]:
# ✅ Tally how many times each (text, label) pair occurs
conteo = Counter()
conteo.update(entidades)

# Ten most frequent pairs, most common first
conteo.most_common(10)

NameError: name 'entidades' is not defined

In [5]:
# ✅ Group entities by type (ORG, LOC, PER, etc.)
from collections import defaultdict

# Map each entity label to the set of distinct surface texts seen with it.
agrupadas = defaultdict(set)
# The loop variables are deliberately not called `text`: that name is already
# used at notebook level for the query text, and a `for` target would
# silently overwrite it.
for ent_text, ent_label in entidades:
    agrupadas[ent_label].add(ent_text)

# Show entities per type; sets have no stable order, so sort them to keep the
# printed output reproducible between runs.
for tipo, lista in agrupadas.items():
    print(f"\n🔸 {tipo}:")
    print(sorted(lista))

NameError: name 'entidades' is not defined