In [191]:
### Import necessary libraries
import time
import random
import tqdm
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException, WebDriverException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.common.by import By
from selenium.webdriver import Firefox
from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse
import json
import pickle

# Funciones para el proyecto

In [192]:
### Set del driver y lista de user-agents
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7; rv:99.0) Gecko/20100101 Firefox/99.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0"
]

def setup_driver(headless=False):
    """Inicializa el WebDriver de Selenium con configuración de Firefox en modo headless."""
    profile = FirefoxProfile()
    profile.set_preference("general.useragent.override", random.choice(USER_AGENTS))
    profile.set_preference("intl.accept_languages", "es-ES,es")
    firefox_options = Options()
    firefox_options.profile = profile
    if headless:
        firefox_options.add_argument("--headless")  # Run in headleßss mode
    driver = Firefox(options=firefox_options)
    driver.set_page_load_timeout(10)
    return driver


In [193]:
### Sistema anti-bot (comportamiento humano)
def simulate_human_behavior(min_delay=1, max_delay=3, verbose=False):
    """Simula comportamiento humano con sleeps aleatorios."""
    delay = round(random.uniform(min_delay, max_delay), 2)
    if verbose:
        print(f"Esperando {delay}s...")
    time.sleep(delay)


In [194]:
### Obtener lista de practice areas para aplicar filtros
def extract_trimmed_set(series):
    """
    Recibe una Serie de pandas con valores separados por comas,
    elimina NaN, divide por ',' y devuelve un set de elementos únicos y limpios.
    """
    values = series.dropna().astype(str)
    trimmed_set = set()
    for val in values:
        items = [item.strip() for item in val.split(',')]
        trimmed_set.update(items)
    return trimmed_set

In [195]:
### Obtener paginas totales por cada busqueda una vez aplicado el filtro 
def generate_paginated_urls(url,driver):
    # Encontrar numero de paginas 
    try:
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR,'ul.right')))
        simulate_human_behavior(1,2,verbose=False)
        pages_tab = driver.find_element(By.CSS_SELECTOR,'ul.right')       
        if pages_tab:
            pages = pages_tab.find_elements(By.TAG_NAME,"li")
            pages = pages[1].text
            pages = int(pages.split()[-1])
            print(pages)
    except Exception as e: 
        print(f"{e}")
        
        pages = 1

    #print(f"total pages:{pages}")
    parsed = urlparse(url)
    query_list = parse_qsl(parsed.query)  # esto mantiene el orden de los parámetros

    urls = []
    for i in range(1, pages + 1):
        # Creamos una nueva lista sin el parámetro page
        new_query = [(k, v) for (k, v) in query_list if k != "page"]

        # Si no es la primera página, agregamos page=N
        if i != 1:
            new_query.append(("page", str(i)))

        # Reconstruimos la query string manteniendo el orden
        new_query_string = urlencode(new_query)

        # Reconstruimos la URL
        new_url = urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', new_query_string, ''))
        urls.append(new_url)

    return urls

In [196]:
### crawler
def crawler(url, driver,verbose=False):   
    list_urls = []
    urls_unicos = set()  # Usar un set para almacenar URLs únicas
    driver.get(url)
    # buscar contenedor con info
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.results__title')))
    result_title = driver.find_element(By.CSS_SELECTOR, '.results__title')
    if result_title:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.results__result-row'))) 
        attorney_table = driver.find_element(By.CSS_SELECTOR, '.results__result-row')
        if verbose:
            print("se encontro la tabla de abogados")
        
        # Extraccion de la lista de abogados
        abogados = attorney_table.find_elements(By.XPATH, '(//div[@class="row collapse card__content"])')
        if verbose:
            print(len(abogados))
            print(abogados[0])

        for ul in abogados:
            try:
                WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.TAG_NAME, 'ul')))
                enlaces = ul.find_elements(By.TAG_NAME, 'a')
                href_attorney = enlaces[1].get_attribute('href')         # Obtiene el atributo href
                if verbose:
                        print(href_attorney)
                if href_attorney not in urls_unicos:
                    urls_unicos.add(href_attorney)
                    list_urls.append(href_attorney)
            except:
                print("No se encontró <a> en este <li>")
                pass
    if verbose:
        print(driver.current_url)
        print(len(list_urls))
    return list_urls 

In [212]:
### función para obtener el HTML
def get_html(driver,url,verbose=False):
    driver.get(url)
    simulate_human_behavior(1, 2, verbose=False)
    if verbose:
        print(f"accediendo a: {url}")
    WebDriverWait(driver, 15).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'body > div.off-canvas-wrap > div > div.attorney-profile-content.profile-content')))
    
    html = driver.page_source
    simulate_human_behavior(1,1.5,verbose=False)
    return html

In [221]:
### funcion Scraper pagina por pagina abogados
def scrape_abogados(lista_urls):
    resultados = []
    count = 1
    for url in lista_urls:
        driver = setup_driver(headless=False)
        simulate_human_behavior(1,3,verbose=False)
        info_attorney = {}
        try:
            print(f"accediendo a {count}: {url}")
            html = get_html(driver,url)
            soup = BeautifulSoup(html, 'html.parser')

            profile_content = soup.select_one('div.attorney-profile-content.profile-content')
            if not profile_content:
                print(f"Perfil no encontrado para {url}")
                pass

            # Obtencion de elementos princiaples de cada pagina
            attorney_card = profile_content.select_one(".masthead-content")
            firm = attorney_card.find("span")
            if firm:
                firm = firm.get_text(strip=True)
            url_martindale_firm = attorney_card.find("a")["href"]

            # En caso de que no haya un link de firma, el primer link sera la caja de rewiew que redirige a la misma pagina, por eso el #
            if "#" in url_martindale_firm:
                url_martindale_firm = None

            # Estructura del diccionario
            info_attorney["url_martindale"] = url #URL interna de martindale para cada abogado
            info_attorney["name"] = profile_content.find('h1').get_text(strip=True) #Nombre del abogado
            info_attorney["firm"] = firm
            info_attorney["url_firm_martindale"] = url_martindale_firm

            # Busqueda de la sección de links de contacto, no todas las paginas tienen esta sección. por eso se usa try/except 
            try: 
                web_bar = profile_content.select_one("section.sticky-menu-all:nth-child(2) > nav:nth-child(1)")
                website_firm = web_bar.select('a')[1]["href"]
                info_attorney["website_firm"] = website_firm if website_firm else None
            except Exception as e:
                #print(f"Error al extraer website: {e}")
                info_attorney["website"] = ""

            #Extraccion de los tags relevantes de cada pagina
            main_content_table = soup.find_all("div", class_="small-12 columns")[0]
            for section_div in main_content_table.find_all("div", recursive=False):
                content_div = section_div.find("div", class_="toggle-area__content")
                h2_key = section_div.find("h2") #Titulo de sección por separado
                if h2_key and content_div:
                    h2_key = h2_key.get_text(strip=True)
                    if "Areas of Practice" in h2_key:
                        h2_key = "Areas of Practice"
                    #print(h2_key)
                    info_attorney[h2_key] = "" #inicializar keys que se encuentran

                    # Biography and practice areas 
                    child_text = content_div.find('div', class_='truncate-text')
                    if child_text:
                        info_attorney[h2_key] = child_text.get_text(strip=True)
                    Aoe_area = content_div.find('div', class_='row collapse')
                    if Aoe_area:
                        ul_info = Aoe_area.find('ul')
                        if ul_info:
                            items = [li.get_text(strip=True) for li in ul_info.find_all('li')]
                            info_attorney[h2_key] = ', '.join(items)

                    # Education and credentials
                    EnC_area = content_div.find_all(
                        lambda tag: tag.name == 'div' and 
                        all(cls in tag.get('class', []) for cls in ['row', 'collapse', 'experience-section', 'clearfix'])
                    )
                    for text in EnC_area:
                        text_info = text.get_text(strip=True)
                        for line in text_info.split('\n'):
                            if ':' in line:
                                k, v = map(str.strip, line.split(":", 1))
                                info_attorney[k] = v
                            else:
                                info_attorney[h2_key] = line.strip()

                    # Peer rewiew - Pendiente
                    PrA_section = content_div.find("section", class_="profile-peer-reviews")
                    if PrA_section: 
                        rewiew_area = PrA_section.find('div', class_="review-top review-top-peer-reviews")
                        rewiew_list = PrA_section.find('div', class_="attorney-reviews-list attorney-peer-reviews-list")
                        if rewiew_area and rewiew_list:
                            text_rewiew_area = rewiew_area.find_all('span')[2].get_text().strip().splitlines()
                            stars = text_rewiew_area[0].split("/")[0].strip()   
                            reviews = text_rewiew_area[1].strip("()").split()[0]
                            info_attorney["profile_peer_review_star"] = stars
                            info_attorney["profile_peer_review_count"] = reviews

                    # Location
                    firm_offices = content_div.find('ul', id='firmOffices')
                    if firm_offices:
                        location_items = firm_offices.find_all('div', class_='office-address')
                        if location_items:
                            info_attorney["Location"] = location_items[0].get_text(strip=True)
            if info_attorney:
                resultados.append(info_attorney)
            count += 1 
            driver.quit()
        except TimeoutException:
            print("Loading took too much time!")
            driver.quit()
        except Exception as e:
            print(f"Failed to load page due to error: {e}")
            driver.quit()
        except NoSuchElementException:
            driver.quit()
        driver.quit()
    # Resultado ya es una lista de diccionarios
    return resultados


## Primera iteración - Obtención lista "practice areas"

In [159]:
#all_info = pd.DataFrame() # Data frame para almacenar información
url_inicial = 'https://www.martindale.com/search/attorneys-law-firms-articles/?term=Puerto%20Rico%2C%20USA'
lista_urls = []


In [214]:
urls_per_filter = []  # se inicializa los links que hay por filtro o busqueda. 
datos_preliminares = []

In [None]:
### Lista de Areas of Practice para emplear en filtros 
try:
    # Setup del driver
    driver = setup_driver()
    driver.get(url_inicial)
    pages = generate_paginated_urls(url_inicial,driver)
    # Crawler inicial para obtener los urls de abogados de la primera pagina
    for url in pages: #se obtiene el numero de paginas en cada busqueda
        simulate_human_behavior(1,2,verbose=False)
        links_per_page = crawler(url,driver,verbose=False) #deberian ser 25 links de abogados por cada pagina de busqueda
        if len(links_per_page) == 0:
            urls_per_page = crawler(url,driver,verbose=False)
            time.sleep(2)
        else:
            urls_per_filter = urls_per_filter + links_per_page
except TimeoutException:
    print("Loading took too much time!")
    driver.quit()
except Exception as e:
    print(f"Failed to load page due to error: {e}")
    driver.quit()
except NoSuchElementException:
    driver.quit()
driver.quit()

20


In [224]:
urls_per_filter
#len(urls_per_filter)
with open("urls.pkl", "wb") as f:
    pickle.dump(urls_per_filter, f)

In [222]:
### obtencion de lista de practice area 
data_for_PA = scrape_abogados(urls_per_filter)

accediendo a 1: https://www.martindale.com/attorney/tessie-leal-garabis-esq-169104324/
Failed to load page due to error: Message: Browsing context has been discarded
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:199:5
NoSuchWindowError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:747:5
assert.that/<@chrome://remote/content/shared/webdriver/Assert.sys.mjs:559:13
assert.open@chrome://remote/content/shared/webdriver/Assert.sys.mjs:147:4
GeckoDriver.prototype.findElement@chrome://remote/content/marionette/driver.sys.mjs:1746:15
despatch@chrome://remote/content/marionette/server.sys.mjs:318:40
execute@chrome://remote/content/marionette/server.sys.mjs:289:16
onPacket/<@chrome://remote/content/marionette/server.sys.mjs:262:20
onPacket@chrome://remote/content/marionette/server.sys.mjs:263:9
_onJSONObjectReady/<@chrome://remote/content/marionette/transport.sys.mjs:494:20



KeyboardInterrupt: 

In [207]:
data_for_PA

[{'url_martindale': 'https://www.martindale.com/attorney/jennie-marielle-espada-ocasio-esq-167698736/',
  'name': 'Jennie Marielle Espada-Ocasio, Esq',
  'firm': 'Luis E. Miñana & Associates Abogados-Notarios',
  'url_firm_martindale': 'https://www.martindale.com/organization/luis-e-minana-associates-abogados-notarios-11652240/',
  'website_firm': 'http://securitiesatty.com',
  'Biography': 'Criminal Justice Act Panel, Federal and State.',
  'Areas of Practice': 'Criminal Law, Criminal Defense, DUI/DWI, Federal Criminal Law, Sex Crimes, Weapons Charges',
  'Education & Credentials': '',
  'Contact Information': '787-303-0720Phone787-761-5155Fax',
  'University Attended': '2007, TSPR',
  'Year of First Admission': '2007',
  'Admission': '2007 TSPR',
  'Memberships': 'PRADCL.',
  'Languages': 'Spanish and English and French and Bulgarian',
  'ISLN': '921906788',
  'Hobbies & interests': 'Paddle Board',
  'Peer Reviews': '',
  'Location': 'San Juan, Puerto Rico122 Calle Dómenech Altos Urb

In [208]:
practice_areas = set()
# Recorrer cada diccionario
for entry in data_for_PA:
    areas = entry.get("Areas of Practice", "")
    # Separar por coma, quitar espacios y agregar al set
    for area in areas.split(","):
        area = area.strip()
        if area:
            practice_areas.add(area)
with open("practice_area_set.pkl", "wb") as f:
    pickle.dump(practice_areas, f)
print(practice_areas)

{'Healthcare Services and Life Sciences Business Development', 'Labor and Employment Law', 'General Civil Law', 'Corporate Law', 'Bankruptcy', 'Medicare and Medicaid Expert Witness', 'Life Sciences', 'Trademarks', 'Corporate and Estate Planning', 'Real Estate', 'Managed Care', 'Commercial Litigation', 'Weapons Charges', 'Notarial', 'Criminal Defense', 'Bankruptcy Law', 'Federal Criminal Law', 'Distribution Law', 'Small Business Law', 'Immigration Law', 'Regulatory Affairs', 'Government Affairs', 'Employee Benefits Law', 'Intellectual Property', 'Medicare and Medicaid Reimbursement', 'Business Law', 'Medicare and Medicaid', 'Government Contracts', 'Health Care', 'Litigation', 'Criminal Law', 'Securities Law', 'DUI/DWI', 'Tax Law', 'Labor Litigation', 'Sex Crimes'}


In [210]:
### Limpieza set Areas of practice - insumo para nuevo crowler

clean = [] # Lista con las practice areas que si disparan una coincidencia en el buscador. 
len(practice_areas) # Toma el set de todas las posibles practice areas, escritas por abogado
url_base = 'https://www.martindale.com/search/attorneys/?term=Puerto%20Rico%2C%20USA' ## CAMBIAR MAS ADELANTE
driver = setup_driver()
wait = WebDriverWait(driver, 15)
driver.get(url_base)

for area in practice_areas: 
    
    simulate_human_behavior(0.5,2,verbose=False)
    ### Elementos clickeables y request
    wait.until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="filtersContainer"]'))) 
    wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="practiceAreas"]')))
    #print("es clickeable")
    searchBar = driver.find_element(By.ID, 'practiceAreas')
    searchBar.clear()
    searchBar.send_keys(area)

    simulate_human_behavior(1,2,verbose=False)
    suggestion = wait.until(EC.visibility_of_element_located(
            (By.CSS_SELECTOR, "ul.ui-autocomplete li.ui-menu-item div.ui-menu-item-wrapper")
        ))
        ## Busqueda de practice area
    #print("Sugerencia encontrada:", suggestion.text)
    if "No Areas Of Practice matching" in suggestion.text:   
            pass
    else:
        clean.append(area)
        print(area)
    ## Numero de paginas
driver.quit()
len(clean)

Corporate Law
Bankruptcy
Trademarks
Real Estate
Commercial Litigation
Weapons Charges
Federal Criminal Law
Small Business Law
Intellectual Property
Business Law
Medicare and Medicaid
Government Contracts
Health Care
Litigation
Criminal Law
Sex Crimes


16

In [None]:
### primera corrida scrapper para analisis de 
try:
    # obtencion de datos para lista de practice area
    data_for_PA = scrape_abogados(urls_per_filter)
    predata_practice_area = scrape_abogados(url_inicial,driver)
    # Crawler inicial para obtener los urls de abogados de la primera pagina
    for url in pages: #se obtiene el numero de paginas en cada busqueda
        simulate_human_behavior(1,2,verbose=False)
        links_per_page = crawler(url,driver,verbose=False) #deberian ser 25 links de abogados por cada pagina de busqueda
        if len(links_per_page) == 0:
            urls_per_page = crawler(url,driver,verbose=False)
            time.sleep(2)
        else:
            urls_per_filter = urls_per_filter + links_per_page
except TimeoutException:
    print("Loading took too much time!")
    driver.quit()
except Exception as e:
    print(f"Failed to load page due to error: {e}")
    driver.quit()
driver.quit()

In [None]:
datos_preliminares = datos_preliminares + datos_pre 

In [98]:
lista_urls = lista_urls +urls_per_filter
lista_urls

['https://www.martindale.com/attorney/maria-luisa-martinez-lopez-1573711/',
 'https://www.martindale.com/attorney/german-a-rieckehoff-perez-2023007/',
 'https://www.martindale.com/attorney/carlos-cebollero-1573777/',
 'https://www.martindale.com/attorney/neal-r-walters-1571546/',
 'https://www.martindale.com/attorney/peter-diaz-1574180/',
 'https://www.martindale.com/attorney/carla-calaf-garcia-158573129/',
 'https://www.martindale.com/attorney/andres-fortuna-garcia-2780965/',
 'https://www.martindale.com/attorney/eduardo-cobian-27060364/',
 'https://www.martindale.com/attorney/agustin-f-carbo-lugo-2776305/',
 'https://www.martindale.com/attorney/jorge-r-jimenez-1573767/',
 'https://www.martindale.com/attorney/patricia-m-marvez-valiente-2999269/',
 'https://www.martindale.com/attorney/bethzaida-jordan-ramirez-1571295/',
 'https://www.martindale.com/attorney/zelma-davilla-carrasquillo-2529680/',
 'https://www.martindale.com/attorney/antonio-j-cruz-bonilla-2038583/',
 'https://www.martin

# Scrapper interactivo con envio de filtros

## Ciclo 1, extracción practice areas

In [None]:
### Prueba scrapper 1 url
#url = "https://www.martindale.com/attorney/carlos-villegas-del-valle-301593670/#officeLocation"
#url = "https://www.martindale.com/attorney/rafael-g-rivera-rosario-1572639/"
url = "https://www.martindale.com/attorney/jairo-jay-mellado-villarreal-1572914/"

resultados = []
count = 1
driver = setup_driver()
wait = WebDriverWait(driver, 15)
info_attorney = {}
inicio = time.time()
for i in range(1):
    try:
        html = get_html(driver,url)
        soup = BeautifulSoup(html, 'html.parser')
        
        profile_content = soup.select_one('div.attorney-profile-content.profile-content')
        if not profile_content:
            print(f"Perfil no encontrado para {url}")
            pass

        # Obtencion de elementos princiaples de cada pagina
        attorney_card = profile_content.select_one(".masthead-content")
        firm = attorney_card.find("span").get_text()
        url_martindale_firm = attorney_card.find("a")["href"]

        # En caso de que no haya un link de firma, el primer link sera la caja de rewiew que redirige a la misma pagina, por eso el #
        if "#" in url_martindale_firm:
            url_martindale_firm = None

        # Estructura del diccionario
        info_attorney["url_martindale"] = url #URL interna de martindale para cada abogado
        info_attorney["name"] = profile_content.find('h1').get_text(strip=True) #Nombre del abogado
        info_attorney["firm"] = firm
        info_attorney["url_firm_martindale"] = url_martindale_firm

        # Busqueda de la sección de links de contacto, no todas las paginas tienen esta sección. por eso se usa try/except 
        try: 
            web_bar = profile_content.select_one("section.sticky-menu-all:nth-child(2) > nav:nth-child(1)")
            website_firm = web_bar.select('a')[1]["href"]
            info_attorney["website_firm"] = website_firm if website_firm else None
        except Exception as e:
            #print(f"Error al extraer website: {e}")
            info_attorney["website"] = ""

        #Extraccion de los tags relevantes de cada pagina
        main_content_table = soup.find_all("div", class_="small-12 columns")[0]
        for section_div in main_content_table.find_all("div", recursive=False):
            content_div = section_div.find("div", class_="toggle-area__content")
            h2_key = section_div.find("h2") #Titulo de sección por separado
            if h2_key and content_div:
                h2_key = h2_key.get_text(strip=True)
                if "Areas of Practice" in h2_key:
                    h2_key = "Areas of Practice"
                print(h2_key)
                info_attorney[h2_key] = "" #inicializar keys que se encuentran

                # Biography and practice areas 
                child_text = content_div.find('div', class_='truncate-text')
                if child_text:
                    info_attorney[h2_key] = child_text.get_text(strip=True)
                Aoe_area = content_div.find('div', class_='row collapse')
                if Aoe_area:
                    ul_info = Aoe_area.find('ul')
                    if ul_info:
                        items = [li.get_text(strip=True) for li in ul_info.find_all('li')]
                        info_attorney[h2_key] = ', '.join(items)

                # Education and credentials
                EnC_area = content_div.find_all(
                    lambda tag: tag.name == 'div' and 
                    all(cls in tag.get('class', []) for cls in ['row', 'collapse', 'experience-section', 'clearfix'])
                )
                for text in EnC_area:
                    text_info = text.get_text(strip=True)
                    for line in text_info.split('\n'):
                        if ':' in line:
                            k, v = map(str.strip, line.split(":", 1))
                            info_attorney[k] = v
                        else:
                            info_attorney[h2_key] = line.strip()
                
                # Peer rewiew - Pendiente
                PrA_section = content_div.find("section", class_="profile-peer-reviews")
                if PrA_section: 
                    rewiew_area = PrA_section.find('div', class_="review-top review-top-peer-reviews")
                    rewiew_list = PrA_section.find('div', class_="attorney-reviews-list attorney-peer-reviews-list")
                    if rewiew_area and rewiew_list:
                        text_rewiew_area = rewiew_area.find_all('span')[2].get_text().strip().splitlines()
                        stars = text_rewiew_area[0].split("/")[0].strip()   
                        reviews = text_rewiew_area[1].strip("()").split()[0]
                        info_attorney["profile_peer_review_star"] = stars
                        info_attorney["profile_peer_review_count"] = reviews
                    
                # Location
                firm_offices = content_div.find('ul', id='firmOffices')
                if firm_offices:
                    location_items = firm_offices.find_all('div', class_='office-address')
                    if location_items:
                        info_attorney["Location"] = location_items[0].get_text(strip=True)
        if info_attorney:
            resultados.append(info_attorney)
        count += 1 
        driver.quit()
        False
    except (NoSuchElementException, TimeoutException) as e:
        print(f"Error procesando {url}: {str(e)}")
        driver.quit()
    driver.quit()
    final = time.time() - inicio
    print(f"Tiempo por iteración: {final}")

## Funcion Scrapper para varios links 

In [40]:
### Si ya se cuenta con la lista de links se obtiene de un xlsx para comenzar
df_urls = pd.read_excel("links.xlsx")
df_urls.head()
list_urls_exl = df_urls["url"].tolist()
len(list_urls_exl)

2637

In [None]:
datos = scrape_abogados(list_urls_exl[201:250])

accediendo a 1: https://www.martindale.com/attorney/pedro-j-salicrup-1574006/
accediendo a 2: https://www.martindale.com/attorney/hector-perez-1572834/
accediendo a 3: https://www.martindale.com/attorney/hector-pedrosa-41531869/
accediendo a 4: https://www.martindale.com/attorney/patricia-lorenzi-julia-1571896/
accediendo a 5: https://www.martindale.com/attorney/vanessa-rivera-aquino-1571909/
accediendo a 6: https://www.martindale.com/attorney/jose-l-gandara-sanchez-1571867/
accediendo a 7: https://www.martindale.com/attorney/frank-zorrilla-1573562/
accediendo a 8: https://www.martindale.com/attorney/jorge-cela-urena-1572996/
accediendo a 9: https://www.martindale.com/attorney/maritza-i-munich-1573903/
accediendo a 10: https://www.martindale.com/attorney/irma-r-valldejuli-1574270/
accediendo a 11: https://www.martindale.com/attorney/dora-l-monserrate-penagaricano-1572181/
accediendo a 12: https://www.martindale.com/attorney/luis-carlos-marini-2298729/
accediendo a 13: https://www.marti

## Extracción de links aplicando filtros

In [58]:
datos_total =[]


In [None]:
datos_total = datos_total + datos


In [77]:
df = pd.DataFrame(datos_total)
df.to_excel("datos_abogados.xlsx", index=False) ## Ya se corriendo los primeros 100 links.
df.tail()

Unnamed: 0,url_martindale,name,firm,url_firm_martindale,website,Education & Credentials,University Attended,Law School Attended,Year of First Admission,Admission,...,"Other Practice Areas in Mayaguez, Puerto Rico","Other Practice Areas in Cabo Rojo, Puerto Rico","Other Practice Areas in Aguadilla, Puerto Rico","Other Practice Areas in Arecibo, Puerto Rico",Responsibilities,"Other Practice Areas in Humacao, Puerto Rico",Biography,Special Agencies,"Other Practice Areas in Rio Piedras, Puerto Rico","Other Practice Areas in Carolina, Puerto Rico"
243,https://www.martindale.com/attorney/olga-b-ros...,Olga B. Rosas-Vélez,,,,,"University of Puerto Rico, B.S.B.A., 1998; Uni...","University of Puerto Rico, J.D., 2004",2005,"2005, Puerto Rico; 2006, U.S. Court of Appeals...",...,,,,,,,,,,Intellectual Property
244,https://www.martindale.com/attorney/raymond-p-...,Raymond P. Burgos,Vila-Carrion & Burgos,,,,"University of Puerto Rico, B.A.S., 1984","Interamerican University of Puerto Rico, J.D.,...",1989,"1989, Puerto Rico; 1990, U.S. District Court, ...",...,,,,,,,,,,
245,https://www.martindale.com/attorney/omar-javie...,Omar Javier Marrero,,,,,,,2009,"2009, New York",...,,,,,,,,,,
246,https://www.martindale.com/attorney/angel-e-ro...,Angel E. Rotger-Sabat,,,,,"Brown University, B.A., 1989","University of Puerto Rico, J.D., 1992",1992,"1992, Puerto Rico; 1993, U.S. Court of Appeals...",...,,,,,,,,,,
247,https://www.martindale.com/attorney/pedro-a-ji...,Pedro A. Jimenez,"Adsuar Muñiz Goyco Seda & Perez-Ochoa, P.S.C.",,,,"University of Puerto Rico, B.A., 1973","University of Puerto Rico Law School, J.D., 1976",1976,"1976, Puerto Rico; 1976, U.S. District Court, ...",...,,,,,,,,,,


## Ciclo 2: Scrapper accediendo a filtros

### Validación Practice areas que "funcionana en martindale - filtro" 

In [117]:
clean

['Wage and Hour',
 'Products Liability',
 'Complex Litigation',
 'Federal',
 'Consumer Law',
 'Bankruptcy',
 'Trademarks',
 'Immigration',
 'Residential Real Estate',
 'Admiralty',
 'Antitrust and Trade Regulation',
 'Foreclosures',
 'School Law',
 'Property Damage',
 'Wills',
 'Breach of Contract',
 'Boating Accidents',
 'Criminal Law',
 'Aviation',
 'Foreclosure',
 'Litigation',
 'Insurance',
 'Antitrust',
 'Class Actions',
 'Health Care',
 'Government',
 'Government Contracts',
 'Contracts',
 'Equine Law',
 'Municipal Law',
 'Internet Law',
 'Motorcycle Accidents',
 'Commercial',
 'Estate Planning',
 'Wrongful Death',
 'Business Law',
 'Franchises and Franchising',
 'Corporate Taxation',
 'Government Procurement',
 'Tort',
 'Environmental Law',
 'Patents',
 'Banking',
 'Family',
 'Federal Criminal Law',
 'Business',
 'Copyrights',
 'Franchise',
 'Employee Benefits',
 'Construction Law',
 'Labor and Employment',
 'Wills and Probate',
 'Trusts and Estates',
 'CRIM',
 'Telecommunicatio

In [None]:
### Extracción multiple

for area in tqdm.tqdm(clean):
    url_base = 'https://www.martindale.com/search/attorneys/?term=Puerto%20Rico%2C%20USA' #Va a cambiar mas adelante
    try: 
        ### configuración del webdriver
        driver = setup_driver()
        wait = WebDriverWait(driver, 15)
        driver.get(url_base)
        simulate_human_behavior(0.5,3,verbose=False)

        ### Elementos clickeables y request
        # Ubicar barra de busqueda "Filters - Practice Areas"
        wait.until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="filtersContainer"]'))) 
        wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="practiceAreas"]')))
        searchBar = driver.find_element(By.ID, 'practiceAreas')
        searchBar.clear()
        
        # Envio de texto "practice area" para aplicar filtro y tener nuevos resultados
        searchBar.send_keys(area)

        simulate_human_behavior(1,3,verbose=False)
        suggestion = wait.until(EC.visibility_of_element_located(
            (By.CSS_SELECTOR, "ul.ui-autocomplete li.ui-menu-item div.ui-menu-item-wrapper")
        ))

        # Busqueda de practice area - Si hay una sugerencia, se da click. 
        suggestion.click() # ----> Entra a nueva pagina con filtro. 
        current_url = driver.current_url # Actualizar la url para obtener N° de paginas y urls de paginas
        
        # links y N° de paginas
        urls = generate_paginated_urls(current_url, total_pages=pages)

        # For 
        for url in urls:
            urls_per_page = crawler(url,driver,verbose=False)
            if len(urls_per_page) == 0:
                urls_per_page = crawler(url,driver,verbose=False)
                time.sleep(2)
            if len(urls_per_page) > 1:
                urls_per_filter = urls_per_filter + urls_per_page
            cont += 1

    except TimeoutException:
        print("Loading took too much time!")
        driver.quit()
    except Exception as e:
        print(f"Failed to load page due to error: {e}")
        driver.quit()
    driver.quit()

In [None]:
len(list_url_attorneys)
#list_url_attorneys
ordenado = sorted(list_url_attorneys, key=lambda x: x['name'])
len(ordenado)

In [None]:
unicos = list({json.dumps(d, sort_keys=True) for d in list_url_attorneys})
resultado = [json.loads(d) for d in unicos]
len(resultado)
#resultado
aux_links = pd.DataFrame(resultado)
aux_links.to_excel("links.xlsx", index=False)

In [None]:
lista_urls = []
for i in resultado:
    lista_urls.append(i["url"])
len(lista_urls)


In [None]:
len(list_aux)
lista_aplanada = [item for sublista in list_aux for item in sublista]
df_aux = pd.DataFrame(lista_aplanada)
df_aux.tail()





In [None]:
df_attorneys = pd.concat([df_aux, df], ignore_index=True)
#df_attorneys = df_attorneys(how='all')
# Eliminar filas duplicadas
df_attorneys = df_attorneys.drop_duplicates()
df_attorneys.tail()
#df_attorneys.to_excel("Martindale_attorneys.xlsx",index=False)