In [7]:
### Import necessary libraries
import time
import random
import tqdm
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException, WebDriverException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.common.by import By
from selenium.webdriver import Firefox
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse
import json

# Funciones para el proyecto

In [8]:
### Sistema anti-bot (comportamiento humano)
def simulate_human_behavior(min_delay=1, max_delay=3, verbose=False):
    """Simula comportamiento humano con sleeps aleatorios."""
    delay = round(random.uniform(min_delay, max_delay), 2)
    if verbose:
        print(f"Esperando {delay}s...")
    time.sleep(delay)


In [9]:
### Obtener lista de practice areas para aplicar filtros
def extract_trimmed_set(series):
    """
    Recibe una Serie de pandas con valores separados por comas,
    elimina NaN, divide por ',' y devuelve un set de elementos únicos y limpios.
    """
    values = series.dropna().astype(str)
    trimmed_set = set()
    for val in values:
        items = [item.strip() for item in val.split(',')]
        trimmed_set.update(items)
    return trimmed_set

In [10]:
### Obtener paginas totales por cada busqueda una vez aplicado el filtro 
def generate_paginated_urls(url,driver):
    # Encontrar numero de paginas 
    try:
        pages = driver.find_element(By.CSS_SELECTOR,'ul.right > li:nth-child(2) > span:nth-child(1)')
        if pages:
            pages = driver.find_element(By.CSS_SELECTOR,'ul.right > li:nth-child(2) > span:nth-child(1)').text
            pages = int(pages.split()[-1])
    except Exception as e: 
        print("Solo hay una pagina")
        pages = 1

    #print(f"total pages:{pages}")
    parsed = urlparse(url)
    query_list = parse_qsl(parsed.query)  # esto mantiene el orden de los parámetros

    urls = []
    for i in range(1, pages + 1):
        # Creamos una nueva lista sin el parámetro page
        new_query = [(k, v) for (k, v) in query_list if k != "page"]

        # Si no es la primera página, agregamos page=N
        if i != 1:
            new_query.append(("page", str(i)))

        # Reconstruimos la query string manteniendo el orden
        new_query_string = urlencode(new_query)

        # Reconstruimos la URL
        new_url = urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', new_query_string, ''))
        urls.append(new_url)

    return urls

In [34]:
### crawler
def crawler(url, driver,verbose=False):   
    list_urls = []
    urls_unicos = set()  # Usar un set para almacenar URLs únicas
    driver.get(url)
    time.sleep(2)
    # buscar contenedor con info
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="search-result-container"]/div[5]/div[1]/div[2]/div[1]/div/div'))) 
    attorney_table = driver.find_element(By.XPATH, '//*[@id="search-result-container"]/div[5]/div[1]/div[2]/div[1]/div/div')
    # Extraccion de la lista de abogados
    abogados = attorney_table.find_elements(By.XPATH, './/li[@class="detail_title"]')
    
    for li in abogados:
        try:
            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, 'a')))
            enlace = li.find_element(By.TAG_NAME, 'a')  # Busca el <a> dentro del <li>
            href = enlace.get_attribute('href')         # Obtiene el atributo href
            if href not in urls_unicos:
                if verbose:
                    print(href)
                urls_unicos.add(href)
                list_urls.append(href)
        except:
            print("No se encontró <a> en este <li>")
            pass
    if verbose:
        print(driver.current_url)
        print(len(list_urls))
    return list_urls 

### Lista de agentes - Headers

In [None]:
### Set del driver y lista de user-agents
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7; rv:99.0) Gecko/20100101 Firefox/99.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0"
]

def setup_driver():
    """Inicializa el WebDriver de Selenium con configuración de Firefox en modo headless."""
    profile = FirefoxProfile()
    profile.set_preference("general.useragent.override", random.choice(USER_AGENTS))
    profile.set_preference("intl.accept_languages", "es-ES,es")
    firefox_options = Options()
    firefox_options.profile = profile
    firefox_options.add_argument("--headless")  # Run in headleßss mode
    driver = Firefox(options=firefox_options)
    driver.set_page_load_timeout(10)
    return driver


## Primera iteración - Obtención lista "practice areas"

In [42]:
#all_info = pd.DataFrame() # Data frame para almacenar información
url_inicial = 'https://www.martindale.com/search/attorneys-law-firms-articles/?term=Puerto%20Rico%2C%20USA'
lista_urls = []

In [None]:
driver = setup_driver() # fun(set) modo --headless
cont = 0
driver.get(url_inicial)
urls_per_filter = []
try:
    for url in generate_paginated_urls(url_inicial,driver):
        simulate_human_behavior(1,2,verbose=False)
        urls_per_page = crawler(url,driver,verbose=False)
        if len(urls_per_page) == 0:
            urls_per_page = crawler(url,driver,verbose=False)
            time.sleep(2)
        if len(urls_per_page) > 1:
            urls_per_filter = urls_per_filter + urls_per_page
        cont += 1
except TimeoutException:
    print("Loading took too much time!")
    driver.quit()
except Exception as e:
    print(f"Failed to load page due to error: {e}")
    driver.quit()

print(f"paginas por filtro: {cont}")
print(len(urls_per_filter))
driver.quit()


paginas por filtro: 20
475


In [98]:
lista_urls = lista_urls +urls_per_filter
lista_urls

['https://www.martindale.com/attorney/maria-luisa-martinez-lopez-1573711/',
 'https://www.martindale.com/attorney/german-a-rieckehoff-perez-2023007/',
 'https://www.martindale.com/attorney/carlos-cebollero-1573777/',
 'https://www.martindale.com/attorney/neal-r-walters-1571546/',
 'https://www.martindale.com/attorney/peter-diaz-1574180/',
 'https://www.martindale.com/attorney/carla-calaf-garcia-158573129/',
 'https://www.martindale.com/attorney/andres-fortuna-garcia-2780965/',
 'https://www.martindale.com/attorney/eduardo-cobian-27060364/',
 'https://www.martindale.com/attorney/agustin-f-carbo-lugo-2776305/',
 'https://www.martindale.com/attorney/jorge-r-jimenez-1573767/',
 'https://www.martindale.com/attorney/patricia-m-marvez-valiente-2999269/',
 'https://www.martindale.com/attorney/bethzaida-jordan-ramirez-1571295/',
 'https://www.martindale.com/attorney/zelma-davilla-carrasquillo-2529680/',
 'https://www.martindale.com/attorney/antonio-j-cruz-bonilla-2038583/',
 'https://www.martin

# Scrapper interactivo con envio de filtros

## Ciclo 1, extracción practice areas

In [None]:
### función para obtener el HTML
def get_html(drive,url,verbose=False):
    driver.get(url)
    simulate_human_behavior(1.0, 2.0, verbose=False)
    if verbose:
        print(f"accediendo a: {url}")
    WebDriverWait(driver, 15).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'body > div.off-canvas-wrap > div > div.attorney-profile-content.profile-content')))
    
    html = driver.page_source
    return html

In [None]:
#url = "https://www.martindale.com/attorney/carlos-villegas-del-valle-301593670/#officeLocation"
#url = "https://www.martindale.com/attorney/rafael-g-rivera-rosario-1572639/"
url = "https://www.martindale.com/attorney/jairo-jay-mellado-villarreal-1572914/"

resultados_aux = []
count_aux = 1
driver = setup_driver()
wait = WebDriverWait(driver, 15)
info_aux = {}
inicio = time.time()
for i in range(1):
    try:
        html = get_html(driver,url)
        soup = BeautifulSoup(html, 'html.parser')
        
        profile_content = soup.select_one('div.attorney-profile-content.profile-content')
        if not profile_content:
            print(f"Perfil no encontrado para {url}")
            pass

        # Obtencion de elementos princiaples de cada pagina
        attorney_card = profile_content.select_one(".masthead-content")
        firm = attorney_card.find("span").get_text()
        url_martindale_firm = attorney_card.find("a")["href"]

        # En caso de que no haya un link de firma, el primer link sera la caja de rewiew que redirige a la misma pagina, por eso el #
        if "#" in url_martindale_firm:
            url_martindale_firm = None

        # Estructura del diccionario
        info_aux["url_martindale"] = url #URL interna de martindale para cada abogado
        info_aux["name"] = profile_content.find('h1').get_text(strip=True) #Nombre del abogado
        info_aux["firm"] = firm
        info_aux["url_firm_martindale"] = url_martindale_firm

        # Busqueda de la sección de links de contacto, no todas las paginas tienen esta sección. por eso se usa try/except 
        try: 
            web_bar = profile_content.select_one("section.sticky-menu-all:nth-child(2) > nav:nth-child(1)")
            website = web_bar.select('a')[1]["href"]
            info_aux["website"] = website if website else None
        except Exception as e:
            print(f"Error al extraer website: {e}")
            info_aux["website"] = ""

        # la sección "middle" tiene todos los tags de información relevante, se revisa cada caso
        main_content_table = profile_content.select_one(".attorney-profile-content > div:nth-child(5)")
        h2_titles = main_content_table.find_all('h2')
        for h in h2_titles:
            simulate_human_behavior(0.1, 1.2)
            key = h.get_text(strip=True)
            info_aux[key] = "" #inicializar keys que se encuentran
            print(f"titulo: {key}")
            # Caso 1: Practice areas
            if "Areas of Practice" in key:
                key = "Areas of Practice"


            content_h2 = h.find_next_sibling(
                lambda tag: tag.name == 'div' and 'toggle-area__content' in tag.get('class', []))
            if not content_h2:
                del info_aux[key]
                continue
            # Caso 1: contenido en div.truncate-text
            child_text = content_h2.find('div', class_='truncate-text')
            if child_text:
                info_aux[key] = child_text.get_text(strip=True)
            # Caso 2: ul con areas
            row_collapse = content_h2.find('div', class_='row collapse')
            if row_collapse:
                ul_info = row_collapse.find('ul')
                if ul_info:
                    items = [li.get_text(strip=True) for li in ul_info.find_all('li')]
                    info_aux[key] = ', '.join(items)
            # Caso 3: datos tipo educación, experiencia, etc.
            rows_collapse = content_h2.find_all(
                lambda tag: tag.name == 'div' and 
                all(cls in tag.get('class', []) for cls in ['row', 'collapse', 'experience-section', 'clearfix'])
            )
            for text in rows_collapse:
                text_info = text.get_text(strip=True)
                for line in text_info.split('\n'):
                    if ':' in line:
                        k, v = map(str.strip, line.split(":", 1))
                        info_aux[k] = v
                    else:
                        info_aux[key] = line.strip()
            # Caso 4: dirección y contacto
            firm_offices = content_h2.find('ul', id='firmOffices')
            if firm_offices:
                location_items = firm_offices.find_all('div', class_='office-address')
                if location_items:
                    info_aux["Location"] = location_items[0].get_text(strip=True)
        if info_aux:
            resultados_aux.append(info_aux)
        #count += 1 
        driver.quit()
        False
    except (NoSuchElementException, TimeoutException) as e:
        print(f"Error procesando {url}: {str(e)}")
        driver.quit()

    driver.quit()
    final = time.time() - inicio
    print(f"Tiempo por iteración: {final}")
    info_aux

accediendo a 1: https://www.martindale.com/attorney/jairo-jay-mellado-villarreal-1572914/
Error procesando https://www.martindale.com/attorney/jairo-jay-mellado-villarreal-1572914/: Message: 
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:199:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:552:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:136:16

Tiempo por iteración: 15.865663290023804


In [150]:
info_aux

{'url_martindale': 'https://www.martindale.com/attorney/jairo-jay-mellado-villarreal-1572914/',
 'name': 'Jairo "Jay" Mellado-Villarreal',
 'firm': 'Mellado & Mellado Villarreal',
 'url_firm_martindale': 'https://www.martindale.com/organization/mellado-mellado-villarreal-1260661/',
 'website': 'http://www.mellado.com'}

## Funcion Scrapper para varios links 

In [96]:
### Scraper pagina por pagina abogados
def scrape_abogados(lista_urls):
    resultados = []
    count = 1
    for url in lista_urls:
        driver = setup_driver()
        wait = WebDriverWait(driver, 15)
        info = {}
        try:
            simulate_human_behavior(1.0, 2.0, verbose=False)
            driver.get(url)
            simulate_human_behavior(0.8, 2.0, verbose=False)

            print(f"accediendo a {count}: {url}")
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'body > div.off-canvas-wrap > div > div.attorney-profile-content.profile-content')
            ))
            simulate_human_behavior(1.0, 2.0, verbose=False)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            profile_content = soup.select_one('div.attorney-profile-content.profile-content')
            if not profile_content:
                print(f"Perfil no encontrado para {url}")
                continue
            info["url_martindale"] = url
            info["name"] = profile_content.find('h1').get_text(strip=True)
            
            try: 
                web_bar = profile_content.select_one("section.sticky-menu-all:nth-child(2) > nav:nth-child(1)")
                website = web_bar.select('a')[1]["href"]
                if website:
                    info["website"] = website
                else:
                    None
            except:
                print("no hay sitio web")
            
                h2_title = profile_content.find_all('h2')
            for h in h2_title:
                simulate_human_behavior(0.1, 0.5)
                key = h.get_text(strip=True)
                if "Areas of Practice" in key:
                    key = "Areas of Practice"
                info[key] = ""
                content_h2 = h.find_next_sibling(
                    lambda tag: tag.name == 'div' and 'toggle-area__content' in tag.get('class', [])
                )
                if not content_h2:
                    del info[key]
                    continue

                # Caso 1: contenido en div.truncate-text
                child_text = content_h2.find('div', class_='truncate-text')
                if child_text:
                    info[key] = child_text.get_text(strip=True)

                # Caso 2: ul con areas
                row_collapse = content_h2.find('div', class_='row collapse')
                if row_collapse:
                    ul_info = row_collapse.find('ul')
                    if ul_info:
                        items = [li.get_text(strip=True) for li in ul_info.find_all('li')]
                        info[key] = ', '.join(items)

                # Caso 3: datos tipo educación, experiencia, etc.
                rows_collapse = content_h2.find_all(
                    lambda tag: tag.name == 'div' and 
                    all(cls in tag.get('class', []) for cls in ['row', 'collapse', 'experience-section', 'clearfix'])
                )
                for text in rows_collapse:
                    text_info = text.get_text(strip=True)
                    for line in text_info.split('\n'):
                        if ':' in line:
                            k, v = map(str.strip, line.split(":", 1))
                            info[k] = v
                        else:
                            info[key] = line.strip()

                # Caso 4: dirección y contacto
                firm_offices = content_h2.find('ul', id='firmOffices')
                if firm_offices:
                    location_items = firm_offices.find_all('div', class_='office-address')
                    if location_items:
                        info["Location"] = location_items[0].get_text(strip=True)
            if info:
                resultados.append(info)
            count += 1 
            driver.quit()
        except (NoSuchElementException, TimeoutException) as e:
            print(f"Error procesando {url}: {str(e)}")
            driver.quit()
            break
        
    driver.quit()
    # Resultado ya es una lista de diccionarios
    return resultados


In [99]:
datos = scrape_abogados(lista_urls)

accediendo a 1: https://www.martindale.com/attorney/maria-luisa-martinez-lopez-1573711/
no hay sitio web
accediendo a 2: https://www.martindale.com/attorney/german-a-rieckehoff-perez-2023007/
no hay sitio web
accediendo a 3: https://www.martindale.com/attorney/carlos-cebollero-1573777/
no hay sitio web
accediendo a 4: https://www.martindale.com/attorney/neal-r-walters-1571546/
no hay sitio web
accediendo a 5: https://www.martindale.com/attorney/peter-diaz-1574180/
no hay sitio web
accediendo a 6: https://www.martindale.com/attorney/carla-calaf-garcia-158573129/
no hay sitio web
accediendo a 7: https://www.martindale.com/attorney/andres-fortuna-garcia-2780965/
no hay sitio web
accediendo a 8: https://www.martindale.com/attorney/eduardo-cobian-27060364/
no hay sitio web
accediendo a 9: https://www.martindale.com/attorney/agustin-f-carbo-lugo-2776305/
no hay sitio web
accediendo a 10: https://www.martindale.com/attorney/jorge-r-jimenez-1573767/
no hay sitio web
accediendo a 11: https://ww

## Extracción de links aplicando filtros

In [105]:
datos
#df = pd.DataFrame(datos)
#df.to_excel("datos_abogados.xlsx", index=False)
#df.tail()


[{'url_martindale': 'https://www.martindale.com/attorney/maria-luisa-martinez-lopez-1573711/',
  'name': 'Maria Luisa Martinez-Lopez',
  'Education & Credentials': '',
  'University Attended': 'University of Puerto Rico, B.B.A., 1983; University of Puerto Rico, B.B.A., 1983',
  'Law School Attended': 'University of Puerto Rico, J.D., 1986; University of Puerto Rico, J.D., 1986',
  'Year of First Admission': '1987',
  'Admission': '1987, Puerto Rico; 1987, U.S. District Court, District of Puerto Rico and U.S. Court of Appeals, First Circuit',
  'ISLN': '905123941',
  'Areas of Practice': 'Commercial Litigation',
  'Peer Reviews': '',
  'Location': '',
  'Other Practice Areas in San Juan, Puerto Rico': 'Alternative Dispute Resolution, Corporate Law, White Collar Crime, Insurance, Finance, Personal Injury, Administrative Law, International Law, Professional Liability, Workers Compensation, Appellate Practice'},
 {'url_martindale': 'https://www.martindale.com/attorney/german-a-rieckehoff-p

In [None]:

### Lista de Areas of Practice para emplear en filtros
practice_areas = set()
attorney_name = set() # Set de nombres para hacer validación cuando se extraiga la info. 
for abogado in datos:
    areas = abogado.get("Areas of Practice", "")
    #name = abogado.get("name","")
    #print(attorney_name)
    for area in areas.split(","):
        area = area.strip()
        #print(area)
        if area:
            practice_areas.add(area)
practice_areas
len(practice_areas)

{'Accidents',
 'Administrative',
 'Administrative Agencies',
 'Administrative Agency Practice',
 'Administrative Agency Proceedings',
 'Administrative Law',
 'Administrative Litigation',
 'Administrative and Public',
 'Admiralty',
 'Admiralty and Maritime Law',
 'Advertising',
 'Affidavit',
 'All interjurisdictional related matters',
 'Alternative Dispute Resolution',
 'Antitrust',
 'Antitrust Law',
 'Antitrust and Trade Regulation',
 'Appeals',
 'Appellate',
 'Appellate Practice',
 'Aquatic Injuries',
 'Arbitration',
 'Automobile Accidents',
 'Aviation',
 'Aviation Law',
 'Bank Foreclosures',
 'Banking',
 'Banking Law',
 'Banking and Finance Law',
 'Banking and Financial Institutions Law',
 'Bankruptcy',
 'Bankruptcy Law',
 'Banks and Banking',
 'Bicycle Accidents',
 'Bid Protests',
 'Boating Accidents',
 'Brain Injury',
 'Breach of Contract',
 'Bus Accidents',
 'Business',
 'Business Immigration',
 'Business Interruption',
 'Business Law',
 'Business Litigation',
 'Business Planning'

## Ciclo 2: Scrapper accediendo a filtros

### Validación Practice areas que "funcionana en martindale - filtro" 

In [115]:
### Limpieza set Areas of practice - insumo para nuevo crowler

clean = [] # Lista con las practice areas que si disparan una coincidencia en el buscador. 
len(practice_areas) # Toma el set de todas las posibles practice areas, escritas por abogado
url_base = 'https://www.martindale.com/search/attorneys/?term=Puerto%20Rico%2C%20USA' ## CAMBIAR MAS ADELANTE
driver = setup_driver()
wait = WebDriverWait(driver, 15)
driver.get(url_base)

for area in practice_areas: 
    
    simulate_human_behavior(0.5,2,verbose=False)
    ### Elementos clickeables y request
    wait.until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="filtersContainer"]'))) 
    wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="practiceAreas"]')))
    #print("es clickeable")
    searchBar = driver.find_element(By.ID, 'practiceAreas')
    searchBar.clear()
    searchBar.send_keys(area)

    simulate_human_behavior(1,2,verbose=False)
    suggestion = wait.until(EC.visibility_of_element_located(
            (By.CSS_SELECTOR, "ul.ui-autocomplete li.ui-menu-item div.ui-menu-item-wrapper")
        ))
        ## Busqueda de practice area
    #print("Sugerencia encontrada:", suggestion.text)
    if "No Areas Of Practice matching" in suggestion.text:   
            pass
    else:
        clean.append(area)
        print(area)
    ## Numero de paginas
driver.quit()
len(clean)

Wage and Hour
Products Liability
Complex Litigation
Federal
Consumer Law
Bankruptcy
Trademarks
Immigration
Residential Real Estate
Admiralty
Antitrust and Trade Regulation
Foreclosures
School Law
Property Damage
Wills
Breach of Contract
Boating Accidents
Criminal Law
Aviation
Foreclosure
Litigation
Insurance
Antitrust
Class Actions
Health Care
Government
Government Contracts
Contracts
Equine Law
Municipal Law
Internet Law
Motorcycle Accidents
Commercial
Estate Planning
Wrongful Death
Business Law
Franchises and Franchising
Corporate Taxation
Government Procurement
Tort
Environmental Law
Patents
Banking
Family
Federal Criminal Law
Business
Copyrights
Franchise
Employee Benefits
Construction Law
Labor and Employment
Wills and Probate
Trusts and Estates
CRIM
Telecommunications
Torts
Cruise Ship Injuries
Premises Liability
Civil Practice
Employment
Tax Planning
Business Litigation
Aviation Law
Trusts
Corporate Governance
Administrative Law
Debtor and Creditor
Civil
Medical Malpractice
Gene

141

In [117]:
clean

['Wage and Hour',
 'Products Liability',
 'Complex Litigation',
 'Federal',
 'Consumer Law',
 'Bankruptcy',
 'Trademarks',
 'Immigration',
 'Residential Real Estate',
 'Admiralty',
 'Antitrust and Trade Regulation',
 'Foreclosures',
 'School Law',
 'Property Damage',
 'Wills',
 'Breach of Contract',
 'Boating Accidents',
 'Criminal Law',
 'Aviation',
 'Foreclosure',
 'Litigation',
 'Insurance',
 'Antitrust',
 'Class Actions',
 'Health Care',
 'Government',
 'Government Contracts',
 'Contracts',
 'Equine Law',
 'Municipal Law',
 'Internet Law',
 'Motorcycle Accidents',
 'Commercial',
 'Estate Planning',
 'Wrongful Death',
 'Business Law',
 'Franchises and Franchising',
 'Corporate Taxation',
 'Government Procurement',
 'Tort',
 'Environmental Law',
 'Patents',
 'Banking',
 'Family',
 'Federal Criminal Law',
 'Business',
 'Copyrights',
 'Franchise',
 'Employee Benefits',
 'Construction Law',
 'Labor and Employment',
 'Wills and Probate',
 'Trusts and Estates',
 'CRIM',
 'Telecommunicatio

In [None]:
### Extracción multiple

for area in tqdm.tqdm(clean):
    url_base = 'https://www.martindale.com/search/attorneys/?term=Puerto%20Rico%2C%20USA' #Va a cambiar mas adelante
    try: 
        ### configuración del webdriver
        driver = setup_driver()
        wait = WebDriverWait(driver, 15)
        driver.get(url_base)
        simulate_human_behavior(0.5,3,verbose=False)

        ### Elementos clickeables y request
        # Ubicar barra de busqueda "Filters - Practice Areas"
        wait.until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="filtersContainer"]'))) 
        wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="practiceAreas"]')))
        searchBar = driver.find_element(By.ID, 'practiceAreas')
        searchBar.clear()
        
        # Envio de texto "practice area" para aplicar filtro y tener nuevos resultados
        searchBar.send_keys(area)

        simulate_human_behavior(1,3,verbose=False)
        suggestion = wait.until(EC.visibility_of_element_located(
            (By.CSS_SELECTOR, "ul.ui-autocomplete li.ui-menu-item div.ui-menu-item-wrapper")
        ))

        # Busqueda de practice area - Si hay una sugerencia, se da click. 
        suggestion.click() # ----> Entra a nueva pagina con filtro. 
        current_url = driver.current_url # Actualizar la url para obtener N° de paginas y urls de paginas
        
        # links y N° de paginas
        urls = generate_paginated_urls(current_url, total_pages=pages)

        # For 
        for url in urls:
            urls_per_page = crawler(url,driver,verbose=False)
            if len(urls_per_page) == 0:
                urls_per_page = crawler(url,driver,verbose=False)
                time.sleep(2)
            if len(urls_per_page) > 1:
                urls_per_filter = urls_per_filter + urls_per_page
            cont += 1

    except TimeoutException:
        print("Loading took too much time!")
        driver.quit()
    except Exception as e:
        print(f"Failed to load page due to error: {e}")
        driver.quit()
    driver.quit()

In [None]:
len(list_url_attorneys)
#list_url_attorneys
ordenado = sorted(list_url_attorneys, key=lambda x: x['name'])
len(ordenado)

In [None]:
unicos = list({json.dumps(d, sort_keys=True) for d in list_url_attorneys})
resultado = [json.loads(d) for d in unicos]
len(resultado)
#resultado
aux_links = pd.DataFrame(resultado)
aux_links.to_excel("links.xlsx", index=False)

In [None]:
lista_urls = []
for i in resultado:
    lista_urls.append(i["url"])
len(lista_urls)


In [None]:
def scrape_abogados(lista_urls):
    resultados = []
    count = 1
    for url in lista_urls:
        #driver = webdriver.Firefox(options=options)
        driver = setup_driver()
        wait = WebDriverWait(driver, 15)
        info = {}
        try:
            simulate_human_behavior(1.0, 2.0, verbose=False)
            driver.get(url)
            simulate_human_behavior(0.8, 2.0, verbose=False)

            print(f"accediendo a {count}: {url}")
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'body > div.off-canvas-wrap > div > div.attorney-profile-content.profile-content')
            ))

            simulate_human_behavior(1.0, 2.0, verbose=False)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            profile_content = soup.select_one('div.attorney-profile-content.profile-content')

            if not profile_content:
                print(f"Perfil no encontrado para {url}")
                continue

            info["url"] = url
            info["name"] = profile_content.find('h1').get_text(strip=True)
            h2_title = profile_content.find_all('h2')

            for h in h2_title:
                simulate_human_behavior(0.1, 0.5)
                key = h.get_text(strip=True)
                if "Areas of Practice" in key:
                    key = "Areas of Practice"
                info[key] = ""

                content_h2 = h.find_next_sibling(
                    lambda tag: tag.name == 'div' and 'toggle-area__content' in tag.get('class', [])
                )
                if not content_h2:
                    del info[key]
                    continue

                # Caso 1: contenido en div.truncate-text
                child_text = content_h2.find('div', class_='truncate-text')
                if child_text:
                    info[key] = child_text.get_text(strip=True)

                # Caso 2: ul con areas
                row_collapse = content_h2.find('div', class_='row collapse')
                if row_collapse:
                    ul_info = row_collapse.find('ul')
                    if ul_info:
                        items = [li.get_text(strip=True) for li in ul_info.find_all('li')]
                        info[key] = ', '.join(items)

                # Caso 3: datos tipo educación, experiencia, etc.
                rows_collapse = content_h2.find_all(
                    lambda tag: tag.name == 'div' and 
                    all(cls in tag.get('class', []) for cls in ['row', 'collapse', 'experience-section', 'clearfix'])
                )
                for text in rows_collapse:
                    text_info = text.get_text(strip=True)
                    for line in text_info.split('\n'):
                        if ':' in line:
                            k, v = map(str.strip, line.split(":", 1))
                            info[k] = v
                        else:
                            info[key] = line.strip()

                # Caso 4: dirección y contacto
                firm_offices = content_h2.find('ul', id='firmOffices')
                if firm_offices:
                    location_items = firm_offices.find_all('div', class_='office-address')
                    if location_items:
                        info["Location"] = location_items[0].get_text(strip=True)
            if info:
                resultados.append(info)
            
            count += 1 
            driver.quit()
        except (NoSuchElementException, TimeoutException) as e:
            print(f"Error procesando {url}: {str(e)}")
            driver.quit()
            pass
        
    driver.quit()
    # Resultado ya es una lista de diccionarios
    return resultados

In [None]:
datos = scrape_abogados(lista_urls[272:])
len(datos)
#data_list_total.append(datos)

In [None]:
list_aux = []

In [None]:
list_aux.append(datos)
print(len(list_aux))

#df_aux.tail()

In [None]:
len(list_aux)
lista_aplanada = [item for sublista in list_aux for item in sublista]
df_aux = pd.DataFrame(lista_aplanada)
df_aux.tail()





In [None]:
df_attorneys = pd.concat([df_aux, df], ignore_index=True)
#df_attorneys = df_attorneys(how='all')
# Eliminar filas duplicadas
df_attorneys = df_attorneys.drop_duplicates()
df_attorneys.tail()
#df_attorneys.to_excel("Martindale_attorneys.xlsx",index=False)