# Webscraping with Selenium

In the project below, I used Selenium to scrape data from a website (https://healthengine.com.au/find/text/Australia/) that contains listings of different Australian clinics.

I focused on the dental clinic category and extracted the following information:

- Name of the practice
- Address of the practice
- City where the practice is located
- State where the practice is located
- Postal code
- Phone number
- Fax number
- Staff names
- Website URL

I scraped the first 500 listings and saved the extracted data in a CSV file.

In [1]:
import time
from selenium import webdriver
import pandas as pd

In [2]:
# Raw string: the Windows path is full of backslashes, and sequences such as
# "\P" and "\c" are invalid escapes that newer Python versions warn about.
chromedriver = r"C:\Program Files\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(chromedriver)

# Open the HealthEngine search results for "Dentist" across Australia.
driver.get('https://healthengine.com.au/find/text/Australia/?search=Dentist')

**The webpage has an infinite-scroll mechanism, so I first load 50 pages of results and then grab all the links at once.**

In [3]:
# Click the "load more" control once per additional page of results.
# Counting from page 1 up to (but not including) 50 gives 49 clicks, the same
# number the original while-loop performed.
for page in range(1, 50):
    driver.find_element_by_class_name('ResultCountstyles__LoadMore-gdg866-4').click()
    # Short pause between clicks -- presumably to let the next batch of
    # results render before the button is looked up again.
    time.sleep(1)

In [4]:
# Every result heading is expected to contain an <a> whose href is the
# clinic's profile page; collect those hrefs in page order.
list_of_dental_links = []
dental_list = driver.find_elements_by_class_name("ProfileNamestyles__Heading-tgn88t-1")
for el in dental_list:
    anchors = el.find_elements_by_tag_name("a")
    if not anchors:
        # Heading without a link: skip it instead of failing on anchors[0].
        continue
    # No sleep needed here: the elements are already loaded in the browser,
    # so reading their href does not request a new page.
    list_of_dental_links.append(anchors[0].get_property('href'))

In [5]:
# Put the collected profile URLs into a one-column DataFrame named 'links'.
df = pd.DataFrame(data={'links': list_of_dental_links})

In [6]:
# index=False keeps the row index out of the file; otherwise it comes back as
# an extra "Unnamed: 0" column when the CSV is read again.
df.to_csv(r"C:\Users\Babatolatemi\Desktop\Jupyter\A programss\DTS\data train\linklist.csv", index=False)

In [9]:
# Reload the saved link list and keep its 'links' column as a Series.
df = pd.read_csv('linklist.csv')
links = df.loc[:, 'links']

In [10]:
# Preview the first few profile URLs as a quick sanity check.
links.head()

0    https://healthengine.com.au/dentist/wa/vasse/v...
1    https://healthengine.com.au/dentist/wa/clarkso...
2    https://healthengine.com.au/dentist/wa/pinjarr...
3    https://healthengine.com.au/dentist/wa/alkimos...
4    https://healthengine.com.au/dentist/wa/spearwo...
Name: links, dtype: object

In [95]:
from tqdm import tqdm
def _split_address(address):
    """Split a contact-panel address string into its four parts.

    The street is the text before the first comma (and before the first line
    break). City, state and postcode are the last three space-separated
    tokens; any text before a line break is dropped from the city token.

    NOTE(review): a multi-word city keeps only its last word, and the city
    token may retain a trailing comma -- confirm against the page layout.

    Returns:
        A ``(street, city, state, postcode)`` tuple of strings.

    Raises:
        IndexError: if the address has fewer than three space-separated
            tokens.
    """
    address = address.strip()
    tokens = address.split(' ')
    city = tokens[-3].split('\n')[-1]
    state = tokens[-2]
    postcode = tokens[-1]
    street = address.split(',')[0].split('\n')[0]
    return street, city, state, postcode


def scrape():
    """
    Visit every clinic profile in ``links`` and collect its contact details.

    Uses the module-level ``driver`` (Selenium WebDriver), ``links``
    (iterable of profile URLs) and ``tqdm`` (progress bar). For each page it
    extracts:
            - name of dental clinic
            - street of clinic
            - city clinic is located
            - state the clinic is located
            - post code
            - phone number
            - fax number
            - clinic website url
            - staff names (lines of the staff panel starting with 'Dr')

    A field that cannot be located is recorded as 'not available', so every
    column receives exactly one entry per link and the result can always be
    turned into a DataFrame.

    Returns:
        dict mapping column name to a list holding one value per link.
    """
    not_available = 'not available'

    # The clinic heading sits at slightly different positions depending on
    # the page layout; the candidates are tried in this order.
    name_xpaths = (
        '//*[@id="panelMain"]/div[3]/div[2]/div[1]/div/div[2]/h1',
        '//*[@id="panelMain"]/div[2]/div[2]/div[1]/div/div[2]/h1',
        '//*[@id="panelMain"]/div[3]/div[2]/div[1]/div/div/h1',
    )
    contact_item = '//*[@id="contact-info"]/div/div[1]/div/ul/li[{}]/span'

    clinic_names = []
    street_names = []
    city_names = []
    state_names = []
    post_codes = []
    phone_numbers = []
    fax_numbers = []
    clinic_urls = []
    staff_names = []

    for profile_url in tqdm(links):
        driver.get(profile_url)

        # Clinic name: first layout that matches wins. If none match, fall
        # back to the placeholder instead of aborting the whole run.
        name = not_available
        for xpath in name_xpaths:
            try:
                name = driver.find_element_by_xpath(xpath).text
                break
            except Exception:
                continue
        clinic_names.append(name)

        # Address: parse all four parts before appending anything, so a
        # failure half-way through cannot leave the lists with different
        # lengths.
        try:
            address = driver.find_element_by_xpath(contact_item.format(1)).text
            street, city, state, post = _split_address(address)
        except Exception:
            street = city = state = post = not_available
        street_names.append(street)
        city_names.append(city)
        state_names.append(state)
        post_codes.append(post)

        # Phone and fax numbers are read from the 'data-tel' attribute of
        # the second and third contact items.
        try:
            phone = driver.find_element_by_xpath(contact_item.format(2)).get_attribute('data-tel')
        except Exception:
            phone = not_available
        phone_numbers.append(phone)

        try:
            fax = driver.find_element_by_xpath(contact_item.format(3)).get_attribute('data-tel')
        except Exception:
            fax = not_available
        fax_numbers.append(fax)

        try:
            url = driver.find_element_by_xpath('//*[@id="practice-link"]').text
        except Exception:
            url = not_available
        clinic_urls.append(url)

        # Staff: keep only the lines of the staff panel that start with 'Dr'.
        try:
            staff_text = driver.find_element_by_xpath('//*[@id="practice-staff"]').text
            staff = [line for line in staff_text.split('\n') if line.startswith('Dr')]
        except Exception:
            staff = not_available
        staff_names.append(staff)

        # Pause between page loads.
        time.sleep(2)

    info = {
        'Clinic Name': clinic_names,
        'Clinic Street Address': street_names,
        'City': city_names,
        'State': state_names,
        'Postcode': post_codes,
        'Phone Number': phone_numbers,
        'Fax Number': fax_numbers,
        'Website address': clinic_urls,
        'Staff Names': staff_names,
    }

    return info

In [96]:
# Run the scraper over every saved link; the result maps column names to lists.
output = scrape()

100%|██████████| 500/500 [1:06:38<00:00,  8.00s/it]


In [97]:
# Turn the scraped columns into a DataFrame (one row per clinic link).
df = pd.DataFrame(output)

In [114]:
# Preview the first few scraped rows.
df.head()

Unnamed: 0,Clinic Name,Clinic Street Address,City,State,Postcode,Phone Number,Fax Number,Website address,Staff Names
0,Vasse Dental,3/21 Napoleon Promenade,"Vasse,",WA,6280,(08)97550548,(08)97550542,http://www.vassedental.com.au,"[Dr Tara Gopal, Dr Bryan Fleming, Dr Timothy S..."
1,Allied Dental Ocean Keys,Ocean Keys Shopping Centre,"Clarkson,",WA,6030,(08)61746705,1300859359,http://www.allieddental.com.au,"[Dr Jeff Luebbert, Dr Cina Yaqub, Dr Sally Cho..."
2,All Dental Pinjarra,39 McLarty road,"Pinjarra,",WA,6208,(08)65556503,,http://www.alldentalpinjarra.com.au,"[Dr Lee Jasson, Dr Faisal Syed, Dr Sharifa Shaik]"
3,Alkimos Beach Dental Centre,Suite 8 / 15 Graceful Blvd,"Alkimos,",WA,6038,(08)95029901,,https://www.alkimosbeachdental.net.au/,[Dr Paul Boulos - Dentist]
4,Exceptional Dental Care,Suite 2 / 21 Mell road,"Spearwood,",WA,6163,(08)61540330,,http://www.exceptionaldentalcare.com.au,"[Dr Jeremy Foster, Dr Sharifa Shaik, Dr Alexis..."


In [118]:
# Fill missing values with the same placeholder the scraper writes, so the
# data set uses a single spelling for "no value".
df.fillna(value='not available', inplace=True)

# The parsed city carries a trailing comma (e.g. "Vasse,"); remove it.
df['City'] = df['City'].str.replace(',', '', regex=False)

# Drop the brackets around area codes. regex=False is explicit because '('
# and ')' are regex metacharacters and the default of `regex` differs
# between pandas versions.
for column in ('Phone Number', 'Fax Number'):
    df[column] = (
        df[column]
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )

In [119]:
# index=False keeps the row index out of the file; otherwise it comes back as
# an extra "Unnamed: 0" column when the CSV is read again.
df.to_csv(r"C:\Users\Babatolatemi\Desktop\Jupyter\A programss\DTS\data train\dental_clinic_data.csv", index=False)