<a href="https://colab.research.google.com/github/DavidP0011/apps/blob/main/scrape_linkedin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import time
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from parsel import Selector

def scrap_linkedin_dic(params: dict) -> dict:
    """
    Realiza scraping de un perfil público de LinkedIn:
      - Intenta extraer el bloque JSON con datos estructurados.
      - Extrae el nombre y título profesional mediante selectores CSS.
      - Si falla la extracción directa, utiliza Parsel para procesar el HTML.

    Args:
        params (dict):
            - url (str): URL del perfil de LinkedIn.
            - driver (selenium.webdriver, opcional): instancia de WebDriver para reutilizar.
            - timeout (int, opcional): tiempo máximo de espera (default 10 segundos).

    Returns:
        dict: Diccionario con:
              - profile_json_dic (dict o None): datos estructurados extraídos.
              - nombre_str (str o None): nombre del perfil.
              - titulo_str (str o None): título profesional.
              - raw_html_str (str): HTML completo de la página.

    Raises:
        ValueError: si falta el parámetro 'url'.
    """
    if 'url' not in params:
        raise ValueError("Falta el parámetro 'url' en params.")

    url = params.get('url')
    timeout = params.get('timeout', 10)
    driver = params.get('driver')
    created_driver = False

    # Si no se proporciona driver, se crea uno nuevo con opciones básicas
    if driver is None:
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        # Opcional: agregar cabeceras o configurar proxy si se requiere
        driver = webdriver.Chrome(options=options)
        created_driver = True

    try:
        print(f"[START ▶️] Iniciando scraping del perfil: {url}", flush=True)
        driver.get(url)
        # Agregar una pausa aleatoria para simular navegación humana
        time.sleep(random.uniform(2, 4))

        # Obtener el HTML completo para fallback y debug
        raw_html = driver.page_source

        # ── Extracción del bloque JSON ──────────────────────────────────────────────
        profile_json_dic = None
        try:
            script_elem = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, '//script[@type="application/ld+json"]'))
            )
            json_text = script_elem.get_attribute("innerText")
            data = json.loads(json_text)
            # Si el JSON contiene la clave '@graph', buscar el objeto de tipo "Person"
            if isinstance(data, dict) and '@graph' in data:
                profile_json_dic = next((item for item in data['@graph'] if item.get('@type') == 'Person'), data)
            else:
                profile_json_dic = data
            print("[TRANSFORMATION SUCCESS ✅] Bloque JSON extraído correctamente.", flush=True)
        except Exception as e:
            print(f"[TRANSFORMATION WARNING ⚠️] No se pudo extraer el bloque JSON: {e}", flush=True)

        # ── Extracción de datos con Selenium ───────────────────────────────────────────
        nombre_str = None
        titulo_str = None

        try:
            # Intentar extraer el nombre usando el primer <h1>
            name_elem = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'h1'))
            )
            nombre_str = name_elem.text.strip()
            print("[EXTRACTION SUCCESS ✅] Nombre extraído mediante Selenium.", flush=True)
        except Exception as e:
            print(f"[EXTRACTION WARNING ⚠️] No se extrajo el nombre con Selenium: {e}", flush=True)

        try:
            # Se intenta con un selector principal; si falla, se prueba con uno alternativo
            try:
                title_elem = driver.find_element(By.CSS_SELECTOR, '.text-body-medium')
                titulo_str = title_elem.text.strip()
            except Exception:
                title_elem = driver.find_element(By.CSS_SELECTOR, '.pv-top-card-section__summary-info')
                titulo_str = title_elem.text.strip()
            print("[EXTRACTION SUCCESS ✅] Título profesional extraído mediante Selenium.", flush=True)
        except Exception as e:
            print(f"[EXTRACTION WARNING ⚠️] No se extrajo el título profesional con Selenium: {e}", flush=True)

        # ── Fallback con Parsel: usar el HTML completo en caso de falla parcial ─────────────
        if not nombre_str or not titulo_str:
            sel = Selector(text=raw_html)
            if not nombre_str:
                nombre_sel = sel.css('h1::text').get()
                if nombre_sel:
                    nombre_str = nombre_sel.strip()
                    print("[FALLBACK SUCCESS ✅] Nombre extraído con Parsel.", flush=True)
            if not titulo_str:
                titulo_sel = sel.css('.text-body-medium::text').get()
                if titulo_sel:
                    titulo_str = titulo_sel.strip()
                    print("[FALLBACK SUCCESS ✅] Título profesional extraído con Parsel.", flush=True)

        # ── Fallback final: extraer del JSON si aún faltan datos ───────────────────────────
        if not nombre_str and profile_json_dic and "name" in profile_json_dic:
            nombre_str = profile_json_dic.get("name")
            print("[FALLBACK INFO ℹ️] Nombre obtenido desde el JSON.", flush=True)
        if not titulo_str and profile_json_dic and "jobTitle" in profile_json_dic:
            titulo_str = profile_json_dic.get("jobTitle")
            print("[FALLBACK INFO ℹ️] Título obtenido desde el JSON.", flush=True)

        result_dic = {
            "profile_json_dic": profile_json_dic,
            "nombre_str": nombre_str,
            "titulo_str": titulo_str,
            "raw_html_str": raw_html  # útil para debug
        }

        print("[END ▶️] Scraping del perfil completado.", flush=True)
        return result_dic
    finally:
        if created_driver:
            driver.quit()

# Ejemplo de uso:
if __name__ == "__main__":
    params = {
        "url": "https://www.linkedin.com/in/david-plaza-medina-71981020",
        # Se puede incluir el driver si se quiere reutilizar uno ya configurado
    }
    result = scrap_linkedin_dic(params)
    print(result)


[START ▶️] Iniciando scraping del perfil: https://www.linkedin.com/in/david-plaza-medina-71981020
[TRANSFORMATION SUCCESS ✅] Bloque JSON extraído correctamente.
[EXTRACTION SUCCESS ✅] Nombre extraído mediante Selenium.
  (Session info: chrome=134.0.6998.35); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
#0 0x56f81d62646a <unknown>
#1 0x56f81d0dfed0 <unknown>
#2 0x56f81d131935 <unknown>
#3 0x56f81d131b61 <unknown>
#4 0x56f81d1809d4 <unknown>
#5 0x56f81d15788d <unknown>
#6 0x56f81d17dceb <unknown>
#7 0x56f81d157633 <unknown>
#8 0x56f81d1231be <unknown>
#9 0x56f81d124981 <unknown>
#10 0x56f81d5ec86b <unknown>
#11 0x56f81d5f073c <unknown>
#12 0x56f81d5d3f12 <unknown>
#13 0x56f81d5f12b4 <unknown>
#14 0x56f81d5b80af <unknown>
#15 0x56f81d614ad8 <unknown>
#16 0x56f81d614cb6 <unknown>
#17 0x56f81d6252e6 <unknown>
#18 0x7f277ecb6ac3 <unknown>

[FALLBACK INFO ℹ️] Título obtenido desde e

In [None]:
import pandas as pd

# Supongamos que ya tienes el resultado del scraping:
result = scrap_linkedin_dic(params)
profile_json_dic = result.get("profile_json_dic")

# Convertir el diccionario en un DataFrame
df = pd.json_normalize(profile_json_dic)
print(df.head())


[START ▶️] Iniciando scraping del perfil: https://www.linkedin.com/in/david-plaza-medina-71981020
[TRANSFORMATION SUCCESS ✅] Bloque JSON extraído correctamente.
[EXTRACTION SUCCESS ✅] Nombre extraído mediante Selenium.
  (Session info: chrome=134.0.6998.35); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
#0 0x5d1a9404c46a <unknown>
#1 0x5d1a93b05ed0 <unknown>
#2 0x5d1a93b57935 <unknown>
#3 0x5d1a93b57b61 <unknown>
#4 0x5d1a93ba69d4 <unknown>
#5 0x5d1a93b7d88d <unknown>
#6 0x5d1a93ba3ceb <unknown>
#7 0x5d1a93b7d633 <unknown>
#8 0x5d1a93b491be <unknown>
#9 0x5d1a93b4a981 <unknown>
#10 0x5d1a9401286b <unknown>
#11 0x5d1a9401673c <unknown>
#12 0x5d1a93ff9f12 <unknown>
#13 0x5d1a940172b4 <unknown>
#14 0x5d1a93fde0af <unknown>
#15 0x5d1a9403aad8 <unknown>
#16 0x5d1a9403acb6 <unknown>
#17 0x5d1a9404b2e6 <unknown>
#18 0x7c58d50acac3 <unknown>

[FALLBACK INFO ℹ️] Título obtenido desde e

In [None]:
!pip install selenium
!pip install parsel

Collecting parsel
  Downloading parsel-1.10.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=1.2.0 (from parsel)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting jmespath (from parsel)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting w3lib>=1.19.0 (from parsel)
  Downloading w3lib-2.3.1-py3-none-any.whl.metadata (2.3 kB)
Downloading parsel-1.10.0-py2.py3-none-any.whl (17 kB)
Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Downloading w3lib-2.3.1-py3-none-any.whl (21 kB)
Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Installing collected packages: w3lib, jmespath, cssselect, parsel
Successfully installed cssselect-1.2.0 jmespath-1.0.1 parsel-1.10.0 w3lib-2.3.1


In [1]:
# -------------------------------------
# 1) Instalación de dependencias
# -------------------------------------
!apt-get update -qq
!apt-get install -y -qq chromium-chromedriver
!pip install -q selenium beautifulsoup4 pandas
!pip install -q git+https://github.com/austinoboyle/scrape-linkedin-selenium.git

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Preconfiguring packages ...
Selecting previously unselected package apparmor.
(Reading database ... 124947 files and directories currently installed.)
Preparing to unpack .../0-apparmor_3.0.4-2ubuntu2.4_amd64.deb ...
Unpacking apparmor (3.0.4-2ubuntu2.4) ...
Selecting previously unselected package liblzo2-2:amd64.
Preparing to unpack .../1-liblzo2-2_2.10-2build3_amd64.deb ...
Unpacking liblzo2-2:amd64 (2.10-2build3) ...
Selecting previously unselected package squashfs-tools.
Preparing to unpack .../2-squashfs-tools_1%3a4.5-3build1_amd64.deb ...
Unpacking squashfs-tools (1:4.5-3build1) ...
Selecting previously unselected package udev.
Preparing to unpack .../3-udev_249.11-0ubuntu3.12_amd64.deb ...
Unpacking udev (249.11-0ubuntu3.12) ...
Selecting previously unselected package libfuse3-3:amd64.
Prepari

In [4]:
# -------------------------------------
# 2) Importar librerías necesarias
# -------------------------------------
import time
import random
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrape_linkedin import ProfileScraper

def scrap_linkedin_dic(
    profile_url: str,
    li_at_cookie: str,
    output_csv: str = "linkedin_profile.csv",
    use_proxy: bool = False,
    proxy_address: str = "IP:PUERTO",
    headless: bool = True,
    pause_range: tuple = (2, 5)
):
    # -------------------------------------
    # 3) Configuración Chrome Webdriver
    # -------------------------------------
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
    )
    if use_proxy:
        chrome_options.add_argument(f"--proxy-server=http://{proxy_address}")

    # -------------------------------------
    # 4) Iniciar driver y autenticar sesión manualmente
    # -------------------------------------
    driver = webdriver.Chrome(options=chrome_options)
    driver.get("https://www.linkedin.com")
    driver.add_cookie({'name': 'li_at', 'value': li_at_cookie, 'domain': '.linkedin.com'})
    driver.refresh()

    # Esperar para asegurar sesión autenticada
    time.sleep(random.uniform(*pause_range))

    # -------------------------------------
    # 5) Scraping principal (sesión ya autenticada)
    # -------------------------------------
    scraper = ProfileScraper(driver=driver, timeout=30, scroll_pause=1.0)
    profile = scraper.scrape(url=profile_url)
    profile_dict = profile.to_dict()

    # -------------------------------------
    # 6) Extracción adicional (recomendaciones)
    # -------------------------------------
    driver.get(profile_url)
    time.sleep(random.uniform(*pause_range))
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(random.uniform(*pause_range))
    soup = BeautifulSoup(driver.page_source, "html.parser")

    recommendations_data = {"received": [], "given": []}
    recommendations_section = soup.find('section', {'id': 'recommendations'})
    if recommendations_section:
        rec_received_header = recommendations_section.find(lambda tag: tag.name in ["h2", "span"] and "recibidas" in tag.get_text().lower())
        rec_given_header = recommendations_section.find(lambda tag: tag.name in ["h2", "span"] and ("emitidas" in tag.get_text().lower() or "dadas" in tag.get_text().lower()))
        if rec_received_header:
            list_received = rec_received_header.find_next("ul")
            if list_received:
                for li in list_received.find_all("li", recursive=True):
                    recommendations_data["received"].append(li.get_text(separator=" ", strip=True))
        if rec_given_header:
            list_given = rec_given_header.find_next("ul")
            if list_given:
                for li in list_given.find_all("li", recursive=True):
                    recommendations_data["given"].append(li.get_text(separator=" ", strip=True))
    profile_dict["recommendations"] = recommendations_data

    # -------------------------------------
    # 7) Extraer número de contactos si falta
    # -------------------------------------
    if "connections" not in profile_dict.get("personal_info", {}):
        connections_badge = soup.find("span", string=lambda s: s and "contactos" in s.lower())
        if connections_badge:
            profile_dict.setdefault("personal_info", {})["connections"] = connections_badge.get_text(strip=True)

    # -------------------------------------
    # 8) Guardar resultado en CSV
    # -------------------------------------
    df = pd.json_normalize(profile_dict)
    df.columns = [
        col.replace("personal_info.", "").replace("experiences.", "").replace("accomplishments.", "")
        for col in df.columns
    ]
    df.to_csv(output_csv, index=False)

    # -------------------------------------
    # 9) Cerrar navegador y retornar resultados
    # -------------------------------------
    driver.quit()
    print(f"Scraping completado correctamente. Datos guardados en: {output_csv}")
    return profile_dict

In [5]:
# -------------------------
# 9) Ejemplo de uso
# -------------------------
cookie_li_at = "AQEDAQRkNQUBA8uEAAABlRg9oFwAAAGVisT43E4AV5uC4b-kjN40AITKeRyOnJ4EtEDjRYvg-HA4R5k5C9zbcjuwRf_bneUTiyj5wsP_nROuhAJXv-ZI0E2bgvl2o7MpcVFI3sBN5JiIdFnY6b4wCihp"  # pega aquí tu cookie li_at
linkedin_url = "https://www.linkedin.com/in/david-plaza-medina-71981020/"  # perfil objetivo

resultado = scrap_linkedin_dic(
    profile_url=linkedin_url,
    li_at_cookie=cookie_li_at,
    output_csv="mi_perfil_linkedin.csv",
    use_proxy=False,
    headless=True
)

print("Diccionario resultante:")
print(resultado)


TypeError: 'WebDriver' object is not callable