In [5]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import re, time, csv
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [10]:
# the scrape function

# Pause after each page load so the site is not requested too frequently.
_PAGE_DELAY_SECONDS = 3
# Scrolls to the bottom of the page (the original 'window,scrollTo' only
# worked by accident through the JavaScript comma operator).
_SCROLL_TO_BOTTOM_JS = 'window.scrollTo(0,document.body.scrollHeight)'
# Abort thresholds, both compared against one shared error counter.
_ERROR_LIMIT = 10
_EMPTY_PAGE_ERROR_LIMIT = 3


def _build_driver():
    """Create a Chrome driver with the automation-hiding options applied."""
    options = webdriver.ChromeOptions()
    # avoid robot detection of anti spider on the Indeed website
    options.add_argument("--disable-blink-features")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    # add user-agent
    options.add_argument('user-agent=Mozilla/5.0')
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


def _collect_job_links(driver, initUrl, num):
    """Walk the search result pages and gather job posting links.

    Starts at ``initUrl`` and follows the "Next" link until at least ``num``
    links are stored or no further page can be found.

    Returns:
        list[str] of collected hrefs, or ``None`` when too many errors
        occurred and the whole run should be abandoned.
    """
    links = []
    jobCountNum = 0
    errorCount = 0
    url = initUrl

    while True:
        driver.get(url)
        # avoid requesting url too frequently
        time.sleep(_PAGE_DELAY_SECONDS)
        driver.execute_script(_SCROLL_TO_BOTTOM_JS)

        try:
            # get job links on current page
            jobLinks = driver.find_elements(by=By.XPATH, value='//a[contains(@class,"tapItem")]')
        except Exception:  # not a bare except: Ctrl-C / kernel interrupt must still work
            print('unable get jobLinks on the current page')
            errorCount += 1
            if errorCount >= _ERROR_LIMIT:
                return None
            continue  # retry the same page

        if len(jobLinks) == 0:
            print('jobLinks is empty, please run again')
            errorCount += 1
            # '>=' instead of '==': the counter is shared with the other
            # error paths, so it may already be past the limit; an equality
            # test would then never fire and this loop would spin forever.
            if errorCount >= _EMPTY_PAGE_ERROR_LIMIT:
                return None
            continue  # retry the same page

        for jobLink in jobLinks:
            print('job count number is: ', jobCountNum)
            jobCountNum += 1

            link = jobLink.get_attribute(name="href")
            if link:
                links.append(link)
                continue

            print('empty in job_title or link')
            errorCount += 1
            if errorCount >= _ERROR_LIMIT:
                return None

        try:
            nextLinkBox = driver.find_element(by=By.XPATH, value='.//a[contains(@aria-label,"Next")]')
        except Exception:
            print('unable get the next page link box')
            break

        # get the next page link
        urlLink = nextLinkBox.get_attribute(name="href")
        print('next page link: ', urlLink)
        print('current length of stored job links: ', len(links))

        if len(links) >= num:
            break

        if not urlLink:
            # Without an href the loop would reload the current page and
            # store the same links again, so stop paging instead.
            print('next page link has no href, stop paging')
            break

        # No click on the "Next" button: driver.get(url) at the top of the
        # loop performs the navigation, so clicking only doubled the requests.
        url = urlLink

    return links


def scrape(initUrl, num, fileName, mode, title):
    """Scrape job descriptions from Indeed search results into a CSV file.

    Args:
        initUrl: URL of the first search result page.
        num: maximum number of job postings to visit.
        fileName: output path without the '.csv' extension.
        mode: file mode passed to ``open`` (e.g. 'w' or 'a+').
        title: label written in the 'job title' column of every row whose
            description could be read.

    Returns:
        None. One row ``[text, job title]`` is written per visited posting;
        ``['NA', 'NA']`` is written when the description element is missing.
        Nothing is scraped if link collection aborts after too many errors.
    """
    outpath = fileName + '.csv'  # get output path

    # newline='' is what the csv module asks for; the with-block guarantees
    # the file is closed on every exit path, including the early return.
    with open(outpath, mode, encoding='utf8', newline='') as fw:
        writer = csv.writer(fw, lineterminator='\n')

        # Write the header only into an empty file, so that repeated runs in
        # append mode do not scatter header rows through the data.
        fw.seek(0, 2)  # 2 == SEEK_END
        if fw.tell() == 0:
            writer.writerow(['text', 'job title'])

        driver = _build_driver()
        try:
            driver.maximize_window()

            links = _collect_job_links(driver, initUrl, num)
            if links is None:
                return  # too many errors, quit the program

            # traverse the stored links
            for idx, link in enumerate(links):
                if idx >= num:
                    break

                print('current idx: ', idx)

                try:
                    driver.get(link)
                except Exception:
                    print('unable open the job link: ', link)
                    continue

                # avoid requesting url too frequently
                time.sleep(_PAGE_DELAY_SECONDS)
                driver.execute_script(_SCROLL_TO_BOTTOM_JS)

                # initialize key attributes
                text, job_title = 'NA', 'NA'

                try:
                    descriptionBox = driver.find_element(by=By.ID, value='jobDescriptionText')
                except Exception:
                    descriptionBox = None
                    print('unable get the current job description')

                # get the job description
                if descriptionBox is not None:
                    text = descriptionBox.text.replace('\n', ' ')
                    job_title = title

                writer.writerow([text, job_title])
        finally:
            # Always shut the browser down, otherwise every run leaves a
            # Chrome/chromedriver process behind.
            driver.quit()

    print('program finished....')

## Note
Because of Indeed's anti-bot (robot-detection) mechanism, the scraping code usually has to be run several times. Between runs we change the start URLs and the number of jobs, and each batch is appended to the same output file until the final data set is complete.

In [12]:
# Start pages and labels for the two job categories; edit the URLs as needed.
# Paginated alternatives that can be swapped in:
#   'https://www.indeed.com/jobs?q=Software+Engineer&l=California&start=10'
#   'https://www.indeed.com/jobs?q=data+scientist&l=California&start=10'
sde_url, sde_title = (
    'https://www.indeed.com/jobs?q=Software%20Engineer&l=California&vjk=28c2b38ed84e3b75',
    'Software Engineer',
)
ds_url, ds_title = (
    'https://www.indeed.com/jobs?q=data%20scientist&l=California&vjk=5ef83d01db120213',
    'Data Scientist',
)

# Number of job postings to scrape per run. Keep it at or below 1000:
# larger values may trigger the site's robot detection and crash the run.
# (Use a small value such as 10 for a quick trial.)
number = 500

# If a run prints 'jobLinks is empty', re-execute the scrape cell until the
# message 'job count number is:' appears.
# The message 'program finished....' means the data was scraped successfully.

In [8]:
# Scrape Software Engineer postings and append them to training_set_new.csv.
scrape(
    initUrl=sde_url,
    num=number,
    fileName='training_set_new',
    mode='a+',
    title=sde_title,
)



Current google-chrome version is 101.0.4951
Get LATEST chromedriver version for 101.0.4951 google-chrome
Driver [/Users/aaron/.wdm/drivers/chromedriver/mac64/101.0.4951.41/chromedriver] found in cache


job count number is:  0
job count number is:  1
job count number is:  2
job count number is:  3
job count number is:  4
job count number is:  5
job count number is:  6
job count number is:  7
job count number is:  8
job count number is:  9
job count number is:  10
job count number is:  11
job count number is:  12
job count number is:  13
job count number is:  14
next page link:  https://www.indeed.com/jobs?q=Software+Engineer&l=California&start=10
current length of stored job links:  15
current idx:  0
current idx:  1
current idx:  2
current idx:  3
current idx:  4
current idx:  5
current idx:  6
current idx:  7
current idx:  8
current idx:  9
program finished....


In [11]:
# Scrape Data Scientist postings and append them to training_set_new.csv.
scrape(
    initUrl=ds_url,
    num=number,
    fileName='training_set_new',
    mode='a+',
    title=ds_title,
)



Current google-chrome version is 101.0.4951
Get LATEST chromedriver version for 101.0.4951 google-chrome
Driver [/Users/aaron/.wdm/drivers/chromedriver/mac64/101.0.4951.41/chromedriver] found in cache


job count number is:  0
job count number is:  1
job count number is:  2
job count number is:  3
job count number is:  4
job count number is:  5
job count number is:  6
job count number is:  7
job count number is:  8
job count number is:  9
job count number is:  10
job count number is:  11
job count number is:  12
job count number is:  13
job count number is:  14
next page link:  https://www.indeed.com/jobs?q=data+scientist&l=California&start=10
current length of stored job links:  15
current idx:  0
current idx:  1
current idx:  2
current idx:  3
current idx:  4
current idx:  5
current idx:  6
current idx:  7
current idx:  8
current idx:  9
program finished....
