In [1]:
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm import notebook
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Keywords looked for in a doctor's education entries: functional diagnostics
# ("функциональная диагностика", abbreviated "фд") and ultrasound ("узи").
# NOTE: the first entry is written as a regular expression (".+" absorbs the
# Russian word endings); it only has that meaning if applied with `re`.
FDorUSD = [
    r'функциональн.+ диагностик.+',
    ' фд ',
    'узи',
    'ультразвук',
    'ультразвуковая диагностика',
]

# Keywords looked for in a doctor's courses and "about" text: electroneuromyography
# (ЭНМГ), nerve ultrasound, brachiocephalic-artery ultrasound (БЦА), duplex scans.
SKILLS = [
    'ЭНМГ',
    'электронейромиограф',
    'электромиограф',
    'узи нервов',
    'узи бца',
    'узи брахиоцефальных артерий',  # was misspelled "артертий" and could never match
    'дуплекс',
]

In [3]:
def _first_text(tag):
    """Return the whitespace-normalised text of ``tag``'s first child.

    Returns ``None`` when the tag has no children, so callers can skip the
    element instead of relying on an ``IndexError``.
    """
    if not tag.contents:
        return None
    first = tag.contents[0]
    # Text nodes are str subclasses; anything else is a nested tag whose text
    # has to be extracted explicitly (calling .split() on a tag would fail).
    text = first if isinstance(first, str) else first.get_text(" ")
    return " ".join(text.split())


def _titled_years(section):
    """Pair each list-item title inside ``section`` with its number as 'title (number)'."""
    numbers = section.find_all('div', {"class": "b-doctor-details__number"})
    titles = section.find_all('div', {"class": "b-doctor-details__list-item-title"})
    entries = []
    for title, number in zip(titles, numbers):
        title_text, number_text = _first_text(title), _first_text(number)
        # Skip incomplete pairs as a unit so titles and numbers never get misaligned.
        if title_text is not None and number_text is not None:
            entries.append(title_text + ' (' + number_text + ')')
    return entries


def _parse_doctor(dr_soup, dr_url):
    """Extract one doctor's record from the parsed profile page ``dr_soup``."""
    doctor = {'link': dr_url}

    name_tag = dr_soup.find(
        'span',
        {"itemprop": "name", "class": "d-block ui-text ui-text_h5 ui-text_color_black mb-2"},
    )
    name = _first_text(name_tag) if name_tag is not None else None
    if name is not None:
        doctor['name'] = name

    paragraphs = dr_soup.find_all('div', {"class": "b-doctor-details__paragraph"})
    doctor['about'] = [text for text in map(_first_text, paragraphs) if text is not None]

    profile_links = dr_soup.find_all(
        'a', {"class": "b-doctor-details__link b-doctor-details__link_column"}
    )
    doctor['profile'] = [text for text in map(_first_text, profile_links) if text is not None]

    # 'courses' / 'education' are only set when the page has the section,
    # so a missing section shows up as a missing key.
    for key, section_id in (('courses', 'courses'), ('education', 'educations')):
        section = dr_soup.find('div', {"id": section_id})
        if section is not None:
            doctor[key] = _titled_years(section)

    return doctor


def get_doctors(first_page=21, last_page=40, driver=None):
    """Scrape neurologist profiles from the prodoctorov.ru St. Petersburg listing.

    Each listing page in ``first_page..last_page`` (inclusive) is opened, every
    doctor card link on it is followed, and the profile page is parsed.

    Parameters
    ----------
    first_page, last_page : int
        Inclusive range of listing pages to visit (defaults: 21 to 40).
    driver : selenium WebDriver, optional
        Browser to drive. When omitted, a Safari driver is started and shut
        down by this function. Pass a pre-configured driver (e.g. headless
        Chrome) to use other browser options; a driver passed in is left
        open for the caller to dispose of.

    Returns
    -------
    list of dict
        One dict per doctor with the key 'link' and, where found on the page,
        'name', 'about', 'profile', 'courses' and 'education'.
    """
    base_url = 'https://prodoctorov.ru'

    owns_driver = driver is None
    if owns_driver:
        # Chrome-specific options do not apply to Safari, so none are built here.
        driver = webdriver.Safari()

    doctors = []
    try:
        for page in notebook.tqdm(range(first_page, last_page + 1)):
            driver.get(base_url + '/spb/nevrolog/?page=' + str(page))
            html = driver.execute_script("return document.documentElement.outerHTML;")
            soup = BeautifulSoup(html, 'html.parser')

            for link in soup.find_all('a', {"class": "b-doctor-card__name-link"}):
                href = link.get('href')
                if not href:
                    # A card without a target cannot be followed.
                    continue

                dr_url = base_url + href
                driver.get(dr_url)
                dr_html = driver.execute_script("return document.documentElement.outerHTML;")
                doctors.append(_parse_doctor(BeautifulSoup(dr_html, 'html.parser'), dr_url))
    finally:
        # quit() ends the whole browser session even if scraping failed midway;
        # close() would only close the current window.
        if owns_driver:
            driver.quit()

    return doctors

def _as_text_list(value):
    """Coerce a scraped field to a list of strings; missing or blank values become []."""
    if isinstance(value, str):
        return [value] if value else []
    if isinstance(value, (list, tuple)):
        return [item for item in value if isinstance(item, str)]
    return []


def _matching_items(items, patterns):
    """Return the items in which at least one regex pattern is found, ignoring case."""
    compiled = [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
    return [item for item in items if any(rx.search(item) for rx in compiled)]


def get_doctor_features(row, education_patterns=None, skill_patterns=None):
    """Fill 'relevant_education' and 'relevant_skills' on one doctor record.

    Parameters
    ----------
    row : mapping (dict or pandas Series)
        Doctor record; the 'education', 'courses' and 'about' fields are read
        when present. Blank or missing fields are treated as empty.
    education_patterns : list of str, optional
        Regular expressions searched in the education entries.
        Defaults to the module-level ``FDorUSD``.
    skill_patterns : list of str, optional
        Regular expressions searched in the courses and 'about' entries.
        Defaults to the module-level ``SKILLS``.

    Returns
    -------
    The same ``row``, with 'relevant_education' set to the matching education
    entries and 'relevant_skills' set to the matching course entries followed
    by the matching 'about' entries. Entries keep their original casing.
    """
    if education_patterns is None:
        education_patterns = FDorUSD
    if skill_patterns is None:
        skill_patterns = SKILLS

    education = _as_text_list(row.get('education'))
    courses = _as_text_list(row.get('courses'))
    about = _as_text_list(row.get('about'))

    # Patterns are applied with re.search + IGNORECASE so regex keywords work
    # and upper-case keywords still match lower/mixed-case text.
    row['relevant_education'] = _matching_items(education, education_patterns)
    row['relevant_skills'] = _matching_items(courses + about, skill_patterns)

    return row

In [None]:
# Run the scraper with its defaults; the result is a list with one dict per doctor profile.
drs = get_doctors()

  0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
# Turn the scraped records into a frame, blank out missing fields, add the two
# (initially empty) result columns, then annotate every row.
df = (
    pd.DataFrame(drs)
    .fillna('')
    .assign(relevant_education='', relevant_skills='')
    .apply(get_doctor_features, axis=1)
)

In [None]:
# Columns that hold lists of strings and must be flattened for the spreadsheet.
# The result columns are named 'relevant_*' everywhere else in the notebook.
list_columns = ['about', 'profile', 'education', 'courses', 'relevant_education', 'relevant_skills']


def _join_items(value):
    """Join a list of entries with '; '; leave anything else (e.g. an already-joined string) as is."""
    if isinstance(value, (list, tuple)):
        return "; ".join(str(item) for item in value)
    # Returning strings untouched keeps the cell safe to re-run: joining a
    # string again would split it into single characters.
    return value


for column in list_columns:
    df[column] = df[column].apply(_join_items)

# Keep only doctors with at least one relevant education entry or skill.
df = df.query('relevant_education != "" or relevant_skills != ""')
df.to_excel('neuros2.xlsx', index=False)