# Setup For Scrape

#### Import Packages

In [45]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
import pandas as pd
from tqdm.auto import tqdm

from PIL import Image
from io import StringIO
import numpy as np

import pytesseract
import cv2

#### Define website and chromedriver locations

In [None]:
url = 'https://aaop.clubexpress.com/content.aspx?page_id=2720&club_id=508439'
chromedriver_location = 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe'

In [7]:
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Scrape the website

#### Create functions to scrape the website

In [8]:
def open_browser_and_get_list_of_members():
    
    # open the browser
    driver = webdriver.Chrome(chromedriver_location)
    driver.get(url)
    
    # click on the search button    
    search_button = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_search_button"]/span')
    search_button.click()
    
    scraped_data = iterate_through_pages_and_scrape_data(driver)
    
    return scraped_data

In [18]:
def iterate_through_pages_and_scrape_data(driver):
    
    all_members_df = pd.DataFrame()
    for i in tqdm(range(1, 21)):
        
        # print the page we are on
        page_num = driver.find_element(By.XPATH, '//*[@id="page_content"]/table[2]/tbody/tr[2]/td[1]/table/tbody/tr[1]/td[2]/div/select/option[{}]'.format(i))
        print(page_num.text)
        
        # Iterate through the members on this page
        one_page_of_members_df = scrape_member_details_from_page(driver)

        # add this dataframe to the dataframe of all members
        all_members_df = pd.concat([all_members_df, one_page_of_members_df], axis=0, ignore_index=True)
        
        # go to the next page of members
        next_page = driver.find_element(By.XPATH, '//*[@id="page_content"]/table[2]/tbody/tr[2]/td[1]/table/tbody/tr[1]/td[2]/div/a[2]')
        next_page.click()
        
        break
        
    return all_members_df

In [26]:
def scrape_member_details_from_page(driver):
    
    # create a table to store the data
    standard_info_piece_list = ["Name", "City", "State"]
    aggregate_columns = ["Membership_status"]
    optional_info_piece_list = ["Zip"]
    bio_info_cols_list = ["Location","Email_Address_Image_link", "Phone", "Mobile_Phone", "Fax", "Practice", "Title",
                          "Unordered_Address_lines", "Website", "Degrees_and_Credentials", "Licenses", "ABOP_Certified",
                          "Board_Certifications"
                         ]
    all_df_cols = standard_info_piece_list + aggregate_columns + optional_info_piece_list + bio_info_cols_list
    df = pd.DataFrame(columns=all_df_cols)
    
    # iterate through the members
    for memb_num in tqdm(range(2, 7)):#32)):
        
        member_info_dict = {}
        
        try:
            # scrape the data that every member will have
            for standard_info_piece_num in range(2, len(standard_info_piece_list) + 2):

                info_piece_name = standard_info_piece_list[standard_info_piece_num - 2]

                info_piece = driver.find_element(By.XPATH, '//*[@id="page_content"]/table[2]/tbody/tr[2]/td[1]/table/tbody/tr[3]/td/table/tbody/tr[{}]/td[{}]'.format(memb_num, standard_info_piece_num))

                # split up the name and the affiliation
                if info_piece_name == "Name":
                    name, affiliation = str(info_piece.text).split("\n")

                    print(name)
                    
                    member_info_dict["Name"] = name
                    member_info_dict["Membership_status"] = affiliation

                else:
                    member_info_dict[info_piece_name] = str(info_piece.text)


            # scrape the zip code
            if len(member_info_dict["State"]) == 2:
                # this is an american state so there is a zip code
                zip_code_info = driver.find_element(By.XPATH, '//*[@id="page_content"]/table[2]/tbody/tr[2]/td[1]/table/tbody/tr[3]/td/table/tbody/tr[{}]/td[5]'.format(memb_num))
                member_info_dict["Zip"] = str(zip_code_info.text)
                zip_info_present_bool = True

            else:
                # there is no zip code
                member_info_dict["Zip"] = None
                zip_info_present_bool = False


            # scrape the Bio info
            try:
                bio_info = driver.find_element(By.XPATH, '//*[@id="page_content"]/table[2]/tbody/tr[2]/td[1]/table/tbody/tr[3]/td/table/tbody/tr[{}]/td[6]'.format(memb_num, standard_info_piece_num))

                # click on the 'view bio' button
                bio_info.click()

                # scrape the info from the members bio webpage
                bio_info_dict = scrape_members_bio_info(driver)

                # update the member information with the scraped bio info
                member_info_dict.update(bio_info_dict)

                # go back out of this members bio
                button_back_to_main_page = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_return_link"]')
                button_back_to_main_page.click()

            except WebDriverException:
                # there is no bio information
                for col in bio_info_cols_list:
                    member_info_dict[col] = None


            # add the created dictionary for this member to the dataframe
            member_row = pd.DataFrame(member_info_dict, columns=all_df_cols, index=[0])

            df = pd.concat([df, member_row], axis=0, ignore_index=True)
            
        except NoSuchElementException:
            print("Error - could not scrape details for member number {}".format(memb_num))
            if info_piece_name in locals():
                print("Info Being Scraped =", info_piece_name)
                
            if name in locals():
                print("Member Name =", name)


    return df

In [None]:
def scrape_members_bio_info(driver):

            
    dict_of_member_info = {}
    
    # scrape location
    try:
        location_info = driver.find_element(By.XPATH, '//*[@id="content_right"]/div/div[4]')
        dict_of_member_info["Location"] = str(location_info.text)
    except NoSuchElementException:
        dict_of_member_info["Location"] = None
    
    # scrape email
    try:
        email_img = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_membership_bio_info_standard_panel_email_image"]')
        email_image_source = email_img.get_attribute('src')
        
        # call function to extact the email from this image
        #email_address_info = get_email_address_from_image(email_image_source)
        
        dict_of_member_info["Email_Address_Image_link"] = email_image_source
    except NoSuchElementException:
        dict_of_member_info["Email_Address_Image_link"] = None
        
    # scrape phone
    try:
        phone_info = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_membership_bio_info_standard_panel_phone_panel"]/div[2]')
        dict_of_member_info["Phone"] = str(phone_info.text)
    except NoSuchElementException:
        dict_of_member_info["Phone"] = None
        
    # scrape mobile number
    try:
        mobile_info = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_membership_bio_info_standard_panel_cell_phone_panel"]/div[2]')
        dict_of_member_info["Mobile_Phone"] = str(mobile_info.text)
    except NoSuchElementException:
        dict_of_member_info["Mobile_Phone"] = None
        
    # scrape fax
    try:
        fax_info = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_membership_bio_info_standard_panel_fax_panel"]/div[2]')
        dict_of_member_info["Fax"] = str(fax_info.text)
    except NoSuchElementException:
        dict_of_member_info["Fax"] = None
        
    # scrape practice/organisation
    try:
        practice_info = driver.find_element(By.XPATH, '//*[@id="content_right"]/div/div[12]')
        dict_of_member_info["Practice"] = str(practice_info.text)
    except NoSuchElementException:
        dict_of_member_info["Practice"] = None
        
    # scrape title
    try:
        title_info = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_membership_bio_info_standard_panel_title_panel"]/div[2]')
        dict_of_member_info["Title"] = str(title_info.text)
    except NoSuchElementException:
        dict_of_member_info["Title"] = None
    
    # scrape website
    try:
        website_info = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_membership_bio_info_standard_panel_website_panel"]/div[2]')
        dict_of_member_info["Website"] = str(website_info.text)
    except NoSuchElementException:
        dict_of_member_info["Website"] = None

    # scrape degrees and credentials
    try:
        degree_info = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_membership_bio_info_standard_panel_additional_member_data_panel_questions_panel"]/div[2]/div[2]')
        dict_of_member_info["Degrees_and_Credentials"] = str(degree_info.text)
    except NoSuchElementException:
        dict_of_member_info["Degrees_and_Credentials"] = None

    # scrape license details
    try:
        license_info = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_membership_bio_info_standard_panel_additional_member_data_panel_questions_panel"]/div[3]/div[2]')
        dict_of_member_info["Licenses"] = str(license_info.text)
    except NoSuchElementException:
        dict_of_member_info["Licenses"] = None

    # scrape ABOP certified
    try:
        abop_info = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_membership_bio_info_standard_panel_additional_member_data_panel_questions_panel"]/div[4]/div[2]')
        dict_of_member_info["ABOP_Certified"] = str(abop_info.text)
    except NoSuchElementException:
        dict_of_member_info["ABOP_Certified"] = None

    # scrape the Board certifications
    try:
        board_cert_info = driver.find_element(By.XPATH, '//*[@id="ctl00_ctl00_membership_bio_info_standard_panel_additional_member_data_panel_questions_panel"]/div[5]/div[2]')
        dict_of_member_info["Board_Certifications"] = str(board_cert_info.text)
    except NoSuchElementException:
        dict_of_member_info["Board_Certifications"] = None

    # scrape address
    set_of_all_scraped_vals = set(dict_of_member_info.values())
    
    if set_of_all_scraped_vals == {None}:
        dict_of_member_info["Address"] = None
            
    else:
        set_of_all_scraped_vals.remove(str(location_info.text))
    
        set_of_content = set()
        all_content = driver.find_elements_by_class_name('content-text')
        for content in all_content:
            set_of_content.add(str(content.text).strip())

        address_val_set = set(set_of_content).difference(set_of_all_scraped_vals)
        address_val_set.discard("Note: Email addresses are displayed in a non-clickable format as a security measure.")

        address_info = ", ".join(address_val_set)
        dict_of_member_info["Unordered_Address_lines"] = address_info
    
    return dict_of_member_info

In [None]:
def get_email_address_from_image(email_image_source):
    
    # ------------------------------------------------
    # THIS FUNCTION WAS NOT FINISHED AND DOES NOT WORK
    # ------------------------------------------------
    
    # open the image link and scrape the email
    print(0)
    driver = webdriver.Chrome(chromedriver_location)
    driver.get(email_image_source)
    driver.save_screenshot('temp.png')
    driver.close()
    print(1)
    #     img = Image.open(StringIO(img_data))
    #     print(4)
    #     img_as_np_array = np.asarray(img)

    #     print(img_as_np_array)

    # Read in data, Grayscale the image & get the Otsu's threshold
    img_as_np_array = cv2.imread('temp.png')
    print(2)
    grayscale_img = cv2.cvtColor(img_as_np_array, cv2.COLOR_BGR2GRAY)
    threshold = cv2.threshold(grayscale_img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    print(3)
    
    # Morph open to remove noise
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2,2))
    opening = cv2.morphologyEx(threshold, cv2.MORPH_OPEN, kernel, iterations=1)
    print(4)
    
    # Find contours and remove small noise
    cnts = cv2.findContours(opening, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        area = cv2.contourArea(c)
        if area < 50:
            cv2.drawContours(opening, [c], -1, 0, -1)
    
    print(5)
    # Invert and apply slight Gaussian blur
    result = 255 - opening
    result = cv2.GaussianBlur(result, (3,3), 0)
    
    print(6)
    cv2.startWindowThread()
    cv2.imshow('screenshot', img_as_np_array)
    cv2.imshow('thresh', threshold)
    cv2.imshow('opening', opening)
    cv2.imshow('result', result)
    cv2.waitKey()

    
    print(7)
    data = pytesseract.image_to_string(result, lang='eng', config='--psm 6')
    print(data)

    return data

#### Run the functions

In [None]:
scraped_df = open_browser_and_get_list_of_members()

In [26]:
len(scraped_df)

598

In [27]:
scraped_df

Unnamed: 0,Name,City,State,Membership_status,Zip,Location,Email_Address_Image_link,Phone,Mobile_Phone,Fax,Practice,Title,Unordered_Address_lines,Website,Degrees_and_Credentials,Licenses,ABOP_Certified,Board_Certifications
0,Jeremy Abbott DDS,Bethesda,MD,Active Member,20814,"Bethesda, MD",,301-530-8570,,301-530-8572,20814,,"10401 Old Georgetown Road, USA, Bethesda, MD, ...",,DDS,Maryland,Yes,
1,Doraida L Abramowitz DMD-Dentist and Orthodont...,Tampa,FL,Active Member,33606,,,,,,,,,,,,,
2,"Jonathan M Adams B.SC, DDS",Victoria,Canada,Fellow of the Academy,,,,,,,,,,,,,,
3,"Robert C Adler DMD, MS",St. Helena,CA,Life/Life-Fellow Member,94574,"St. Helena, CA",https://s3.amazonaws.com/ClubExpressClubFiles/...,707-963-4867,310-500-6625,707-6717273,Dr. Robert Adler,,", 94574, USA, St. Helena, CA, PO Box 281",www.advwinery.com,"DMD, MS",CA,Yes,
4,Mohammed Abdullah Al Roshaidan DDS,Fresno,CA,Fellow of the Academy,93720,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,Henry Young DDS,Brooklyn,NY,Fellow of the Academy,11209,"Brooklyn, NY",https://s3.amazonaws.com/ClubExpressClubFiles/...,718-238-7878,917-971-7188,,Dr Henry Young,Dentist,", 415 73rd Street, USA, Brooklyn, NY, 11209",http://www.drhenryyoung.com/,DDS,NY,Yes,ABOP
594,"Keith A Yount DDS, MAGD, ABOP",Raleigh,NC,Fellow of the Academy,27607-7511,"Raleigh, NC",,919-781-6600,919-696-4752,919-781-6430,Suite 107,Owner/Orofacial Pain Practitioner,"Raleigh Facial Pain Center, Raleigh, NC, 27607...",www.raleighfacialpain.com,"DDS, MAGD, ABOP",North Carolina,Yes,ABOP
595,Julianne A Yuziuk N/A,Greenville,NC,Student/Grad Members,27834,"Greenville, NC",https://s3.amazonaws.com/ClubExpressClubFiles/...,8282085494,8282085494,,ECU School of Dental Medicine,Student,", 1608 Treybrooke Circle, Greenville, NC, 2783...",,,,No,
596,"Maria T Zerjav BSc (H) Psych, BSc, PT, CCTT, CODN",Vancouver,Canada,Affiliate Member,,,,,,,,,,,,,,


In [28]:
scraped_df.to_csv("AAOP_member_details.csv", index=False)