In [16]:
import requests #we can also use "urllib" but requests is more powerful

#fetch the first page of Indeed search results for the query "python" (limit=50)
indeed_result = requests.get('https://www.indeed.com/jobs?q=python&limit=50')
print(indeed_result) #sanity check that requests.get() worked: <Response [200]> means "OK"
print(indeed_result.text[:100]) #preview only the first 100 characters of the returned HTML

<Response [200]>
<!DOCTYPE html>
<html lang="en" dir="ltr">
<head>
<meta http-equiv="content-type" content="text/html


### 1. Extract number of pages
We need the maximum page number in order to fetch all the jobs on the site.

In [17]:
from bs4 import BeautifulSoup
indeed_soup = BeautifulSoup(indeed_result.text, 'html.parser') #parse the HTML so that elements can be searched

#NOTE: everything from here on depends on indeed.com's HTML layout and must be
#updated whenever that markup changes
pagination = indeed_soup.find("div", {"class":"pagination"}) #find() returns None if no such div exists
pages = pagination.find_all('a') #all anchors inside the pagination block
spans = [] #page numbers read from the anchors
for page in pages[:-1]: #skip the last anchor (presumably the "next" link, which has no numeric label -- confirm against the live HTML)
    spans.append(int(page.find('span').string))
max_page = max(spans)
print("max page: ",max_page)

max page:  5


**For reusability, we'll write a function that returns the <u>max page number</u>**

In [18]:
#value of the "limit" query parameter; extract_jobs() also multiplies it by the page index to build the "start" offset
LIMIT = 50
#base Indeed search URL for "python" jobs
#NOTE: the StackOverflow cell further down rebinds URL (and redefines get_last_page,
#extract_job and extract_jobs), so the Indeed scrape must run before that cell
URL = f'https://www.indeed.com/jobs?q=python&limit={LIMIT}'

def get_last_page(): #return max page number
    """Return the highest page number shown in Indeed's pagination block.

    Requests ``URL`` and reads the numeric labels of the anchors inside the
    ``<div class="pagination">`` element.

    Returns:
        int: the largest page number found, or 1 when the page has no
        pagination block or no numeric page links (for example when all
        results fit on a single page).
    """
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, 'html.parser') #parse the HTML so that elements can be searched
    pagination = soup.find("div",{"class":"pagination"})
    if pagination is None: #find() returns None when nothing matches
        return 1

    page_numbers = [] #numeric labels collected from the pagination anchors
    for anchor in pagination.find_all('a'):
        span = anchor.find('span')
        label_source = span if span is not None else anchor
        label = label_source.get_text(strip=True)
        #non-numeric anchors (such as a "next" link) are skipped by content
        #rather than by position, so a numeric last anchor is never dropped
        if label.isdecimal():
            page_numbers.append(int(label))

    #default=1 avoids ValueError when no numeric link was found
    return max(page_numbers, default=1)

### 2. Extract job title, company name, location, and link

In [19]:
def extract_job(html):
    """Build a job record from one Indeed result card.

    Args:
        html: the BeautifulSoup tag of a single result card. It must carry a
            ``data-jk`` attribute, which is used to build the job link.

    Returns:
        dict: keys ``title``, ``company``, ``location`` and ``link``. A field
        whose element is absent from the card is set to ``None`` instead of
        raising, so one incomplete card does not abort the whole scrape.

    Raises:
        KeyError: if the card has no ``data-jk`` attribute.
    """
    title_tag = html.find("span", title=True) #first <span> that has a title attribute
    company_tag = html.find("span",{"class":"companyName"})
    location_tag = html.select_one("pre>div")
    job_id=html['data-jk']
    return {
        'title':title_tag.string if title_tag is not None else None,
        'company':company_tag.string if company_tag is not None else None,
        'location':location_tag.text if location_tag is not None else None,
        'link':f"https://www.indeed.com/viewjob?jk={job_id}"}


def extract_jobs(last_page):
    """Scrape Indeed result pages 0 .. last_page-1 and return all job dicts.

    Args:
        last_page: number of result pages to request.

    Returns:
        list: one dict per result card, as produced by extract_job().
    """
    collected = []
    for page_index in range(last_page):
        print(f"Scrapping Indeed: Page {page_index}...") #progress indicator
        offset = page_index * LIMIT #the "start" parameter advances by LIMIT per page
        response = requests.get(f"{URL}&start={offset}")
        page_soup = BeautifulSoup(response.text, 'html.parser')
        cards = page_soup.find_all("a", {"class": "tapItem"})
        collected.extend(extract_job(card) for card in cards)
    return collected
            
#test if functions above work well
def get_indeed_jobs():
    """Look up the last Indeed page number, then scrape and return the jobs from every page up to it."""
    return extract_jobs(get_last_page())

#run the Indeed scrape once and keep the result; it is merged into `jobs` for the CSV export later
indeed_jobs = get_indeed_jobs()
print(indeed_jobs) #prints the entire list of job dicts

Scrapping Indeed: Page 0...
Scrapping Indeed: Page 1...
Scrapping Indeed: Page 2...
Scrapping Indeed: Page 3...
Scrapping Indeed: Page 4...
[{'title': 'Python Software Engineering Teaching Assistant (Part-Time)', 'company': 'Hackbright Academy', 'location': '+2 locationsRemote', 'link': 'https://www.indeed.com/viewjob?jk=cc10e115d1ed386e'}, {'title': 'DBA Apprentice', 'company': 'WebMD', 'location': 'Newark, NJ 07102 (Central Business District area)', 'link': 'https://www.indeed.com/viewjob?jk=a7727fb4996b872c'}, {'title': 'Data Analyst (Python)', 'company': 'Qcentrio', 'location': '+1 locationRemote', 'link': 'https://www.indeed.com/viewjob?jk=d1218a12442b5c05'}, {'title': 'Software Engineer Python (Entry Level) - Remote', 'company': 'XR Trading LLC', 'location': 'Chicago, IL•Remote', 'link': 'https://www.indeed.com/viewjob?jk=9be66eb9484eee6f'}, {'title': 'Junior Python Developer', 'company': 'GTS', 'location': 'New York, NY 10022 (Midtown area)', 'link': 'https://www.indeed.com/view

### StackOverflow

In [20]:
import requests
from bs4 import BeautifulSoup

#NOTE: this rebinds the module-level URL that the Indeed cell also uses;
#after this cell runs, URL points at StackOverflow
URL = f"https://stackoverflow.com/jobs?q=python&sort=i" #the f-prefix is unused here: the string has no placeholders

#step 1: get the last page number
#step 2: request each result page
#step 3: extract jobs from each page

def get_last_page():
    """Return the last page number shown in StackOverflow's pagination block.

    Requests ``URL`` and reads the label of the second-to-last anchor inside
    the ``<div class="s-pagination">`` element (the last anchor is presumably
    the "next" link -- confirm against the live HTML).

    NOTE: this redefines the get_last_page() declared for Indeed earlier in
    this notebook.

    Returns:
        int: the last page number, or 1 when the page has no pagination block
        or fewer than two pagination anchors.

    Raises:
        ValueError: if the second-to-last anchor's label is not an integer.
    """
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, 'html.parser')
    pagination = soup.find("div",{"class":"s-pagination"})
    if pagination is None: #find() returns None when nothing matches
        return 1
    pages = pagination.find_all("a")
    if len(pages) < 2: #pages[-2] would raise IndexError
        return 1
    last_page=pages[-2].get_text(strip=True)
    return int(last_page)

def extract_job(html): #singular: handles one job card at a time
    """Build a job record from one StackOverflow job card.

    Args:
        html: the BeautifulSoup tag of a single job card, carrying a
            ``data-jobid`` attribute.

    Returns:
        dict: keys ``title``, ``company``, ``location`` and ``link``.
    """
    heading_link = html.find("h2", {"class": "mb4"}).find("a")
    title = heading_link["title"]

    subheading = html.find("h3", {"class": "fc-black-700"})
    #recursive=False restricts the search to direct children of the <h3>;
    #exactly two <span> children are expected (company, then location)
    company_span, location_span = subheading.find_all("span", recursive=False)
    company = company_span.get_text(strip=True)
    location = location_span.get_text(strip=True)

    job_id = html['data-jobid']
    return dict(
        title=title,
        company=company,
        location=location,
        link=f"https://stackoverflow.com/jobs/{job_id}",
    )

def extract_jobs(last_page):
    """Scrape StackOverflow result pages 1 .. last_page and return all job dicts.

    Args:
        last_page: number of result pages to request.

    Returns:
        list: one dict per job card, as produced by extract_job().
    """
    collected = []
    for page_index in range(last_page):
        print(f"Scrapping SO: Page {page_index}...") #progress indicator (0-based)
        page_number = page_index + 1 #the "pg" query parameter is 1-based
        response = requests.get(f"{URL}&pg={page_number}")
        page_soup = BeautifulSoup(response.text, 'html.parser')
        cards = page_soup.find_all("div", {"class": "-job"})
        collected.extend(extract_job(card) for card in cards)
    return collected
    
    
def get_so_jobs(): #get_jobs()
    """Look up the last StackOverflow page number, then scrape and return the jobs from every page up to it.

    Author's note: the number of pages can differ depending on the
    'user-agent' (information about the user's OS and browser). To see your
    own user-agent: https://www.whatismybrowser.com/detect/what-is-my-user-agent
    """
    return extract_jobs(get_last_page())

#run the StackOverflow scrape once and keep the result; it is merged into `jobs` for the CSV export later
so_jobs = get_so_jobs()
print(so_jobs) #prints the entire list of job dicts

Scrapping SO: Page 0...
Scrapping SO: Page 1...
Scrapping SO: Page 2...
Scrapping SO: Page 3...
Scrapping SO: Page 4...
Scrapping SO: Page 5...
Scrapping SO: Page 6...
Scrapping SO: Page 7...
Scrapping SO: Page 8...
Scrapping SO: Page 9...
Scrapping SO: Page 10...
Scrapping SO: Page 11...
Scrapping SO: Page 12...
Scrapping SO: Page 13...
Scrapping SO: Page 14...
Scrapping SO: Page 15...
Scrapping SO: Page 16...
Scrapping SO: Page 17...
Scrapping SO: Page 18...
Scrapping SO: Page 19...
Scrapping SO: Page 20...
Scrapping SO: Page 21...
Scrapping SO: Page 22...
Scrapping SO: Page 23...
Scrapping SO: Page 24...
Scrapping SO: Page 25...
Scrapping SO: Page 26...
Scrapping SO: Page 27...
Scrapping SO: Page 28...
Scrapping SO: Page 29...
Scrapping SO: Page 30...
Scrapping SO: Page 31...
Scrapping SO: Page 32...
Scrapping SO: Page 33...
Scrapping SO: Page 34...
Scrapping SO: Page 35...
Scrapping SO: Page 36...
Scrapping SO: Page 37...
Scrapping SO: Page 38...
Scrapping SO: Page 39...
Scrapping 

### Get lists of jobs from two sites (Indeed & StackOverflow) and save as a CSV file

In [21]:
#concatenate both result lists; every entry is a dict with title/company/location/link keys
jobs = indeed_jobs + so_jobs

In [22]:
#version 1: classical method with file object
import csv

def save_to_file(jobs):
    """Write the job dicts to ``jobs.csv`` using an explicitly managed file object.

    Args:
        jobs: iterable of dicts with the keys ``title``, ``company``,
            ``location`` and ``link``. A missing key is written as an
            empty cell.

    The file is created in the current working directory and overwritten if
    it already exists.
    """
    columns = ["title", "company", "location", "link"]
    #newline="" is what the csv module requires for files it writes to;
    #without it, extra blank rows appear on platforms that translate "\n"
    file = open("jobs.csv", mode="w", newline="") #open a file
    try:
        writer = csv.writer(file) #make a writer object
        writer.writerow(columns)
        for job in jobs:
            #pick values by column name so each row lines up with the header
            #regardless of the dict's key order
            writer.writerow([job.get(column, "") for column in columns])
    finally:
        file.close() #always close, so buffered rows are flushed to disk
    return

#write the merged job list to jobs.csv (relative path, so it lands in the current working directory)
save_to_file(jobs)

In [24]:
#version 2: with open() handler (safer way to write a file, recommended)
import csv

def save_to_file(jobs):
    """Write the job dicts to ``jobs.csv`` with a header row.

    Args:
        jobs: iterable of dicts whose keys are ``title``, ``company``,
            ``location`` and ``link``.

    The file is created in the current working directory, overwritten if it
    already exists, and closed automatically by the ``with`` block.
    """
    fieldnames = ["title", "company", "location", "link"]
    with open("jobs.csv", 'w', newline='') as handle:
        dict_writer = csv.DictWriter(handle, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(jobs) #one CSV row per job dict, ordered by fieldnames
            
#write the merged job list to jobs.csv again, this time with the DictWriter-based version
save_to_file(jobs)