In [37]:
import requests #we can also use "urllib" but "requests" is more powerful

# Download the first results page for the "python" query (limit=50 in the query string).
indeed_result = requests.get('https://www.indeed.com/jobs?q=python&limit=50')

# Sanity check, one item per line: the Response repr (<Response [200]> means "OK"),
# then the first 100 characters of the returned HTML.
print(indeed_result, indeed_result.text[:100], sep='\n')

<Response [200]>
<!DOCTYPE html>
<html lang="en" dir="ltr">
<head>
<meta http-equiv="content-type" content="text/html


### 1. Extract number of pages
We need the maximum page number in order to fetch all of the jobs on the site.

In [38]:
from bs4 import BeautifulSoup
# Parse the downloaded HTML; the soup object is what we query for elements.
indeed_soup = BeautifulSoup(indeed_result.text, 'html.parser')

# NOTE: everything from here on depends on indeed.com's markup and must be
# revised whenever the site changes its HTML.
pagination = indeed_soup.find("div", {"class":"pagination"})
print(pagination)

pages = pagination.find_all('a')  # every anchor inside the pagination bar
spans = []
for page in pages[:-1]:  # the final anchor is left out of the scan
    page_span = page.find('span')  # looked up once, then printed and parsed
    print(page_span)
    spans.append(int(page_span.string))

max_page = max(spans)
print("max page: ",max_page)

<div class="pagination" onmousedown="pclk(event);">
<ul class="pagination-list">
<li><b aria-current="true" aria-label="1" tabindex="0">1</b></li><li><a aria-label="2" data-pp="gQAyAAAAAAAAAAAAAAABvK5TOQBeAQMBDiYGHgYGLKMkGOV31hS4dUbB8rTosXdt9HP3IRbpkCcD5vIGu5tLASBV42tKkgpjWzzKdEmo_sShi-EW8rLbUTVh3_NHyLUL64wqx-LiibajvdWYhkkv_Wa59gAA" href="/jobs?q=python&amp;limit=50&amp;start=50" onmousedown="addPPUrlParam &amp;&amp; addPPUrlParam(this);" rel="nofollow"><span class="pn">2</span></a></li><li><a aria-label="3" data-pp="gQBkAAAAAAAAAAAAAAABvK5TOQCdAQIBHUwGAuytlawyZ3fk1rURh7dm2SmyD-GclUrxmWGwyWwf_hCsKP6BYchUEpZBRrvP--1NR5jPnHtGvrTa7yUAygd0WA_D_zp6l3juY1KTczVw2tWGihJMehl3xwD_1-HA4uYl95t_yIKxumsgSrXTXCM4RXeyHTveAWj-jtdORs9aZdnvF1NxGKLOTgU_WI1W1ekvEqF3UzcyYwAA" href="/jobs?q=python&amp;limit=50&amp;start=100" onmousedown="addPPUrlParam &amp;&amp; addPPUrlParam(this);" rel="nofollow"><span class="pn">3</span></a></li><li><a aria-label="4" data-pp="gQCWAAAAAAAAAAAAAAABvK5TOQDKAQQBIEwGHgZQBh4s1A

**For reusability, we'll write a function that returns the <u>max page number</u>**

In [39]:
LIMIT = 50  # value sent as the `limit` query parameter; also the step between `start` offsets
URL = f'https://www.indeed.com/jobs?q=python&limit={LIMIT}'  # search URL for the "python" query

def extract_indeed_page(timeout=10):
    """Return the highest page number shown in the pagination bar of URL.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        int: The largest numeric page label found among the pagination
        anchors, or 1 when the page has no pagination bar or no numbered
        links (i.e. all results fit on a single page).

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status.
    """
    result = requests.get(URL, timeout=timeout)
    # Fail loudly on 4xx/5xx: an error page has no pagination bar and would
    # otherwise be indistinguishable from a genuine single-page result.
    result.raise_for_status()
    soup = BeautifulSoup(result.text, 'html.parser') #soup is data extractor

    pagination = soup.find("div",{"class":"pagination"})
    if pagination is None:
        return 1

    page_numbers = [] #list containing page numbers
    for anchor in pagination.find_all('a'):
        span = anchor.find('span')
        label = span.get_text(strip=True) if span is not None else ''
        # Keep only numeric labels. This skips non-numeric anchors (such as a
        # "next" arrow) wherever they appear, instead of assuming that the
        # last anchor is always the one to drop.
        if label.isdecimal():
            page_numbers.append(int(label))
    return max(page_numbers, default=1)

### 2. Extract job title, company name, location, and link

In [40]:
def _node_text(node):
    """Return the stripped text of a parsed element as a plain str, or None if the element is missing."""
    if node is None:
        return None
    # get_text() flattens nested markup (where `.string` would give None) and
    # returns an ordinary str rather than a NavigableString, so the stored
    # value does not keep the whole parse tree alive.
    return node.get_text().strip()


def extract_job(html):
    """Build a job record from one job-card element.

    Args:
        html: A parsed job-card element that supports ``find``,
            ``select_one`` and attribute lookup by key (the caller passes
            the ``a.tapItem`` tags it finds on a results page).

    Returns:
        dict: Keys ``'title'``, ``'company'``, ``'location'`` and ``'link'``.
        A text field is None when its element is absent from the card, so a
        single incomplete card no longer aborts the whole scrape.

    Raises:
        KeyError: If the card has no ``data-jk`` attribute.
    """
    title = _node_text(html.find("span", title=True))
    company = _node_text(html.find("span",{"class":"companyName"}))
    location = _node_text(html.select_one("pre>div"))
    job_id=html['data-jk']
    return {'title':title, 'company':company, 'location':location, 'link':f"https://www.indeed.com/viewjob?jk={job_id}"}


def extract_indeed_jobs(last_page, timeout=10):
    """Scrape the job cards from result pages 0 .. last_page - 1.

    Args:
        last_page: Number of result pages to fetch. Page ``n`` is requested
            with ``start = n * LIMIT``.
        timeout: Seconds to wait for each page before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        list: One record per job card, as produced by ``extract_job``, in
        page order.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status for any page.
    """
    jobs = []
    for page in range(last_page):
        print(f"Scrapping page {page}")
        response = requests.get(f"{URL}&start={page*LIMIT}", timeout=timeout)
        # Without this check a blocked or failed page parses to zero cards
        # and the missing jobs go unnoticed.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Distinct names for the HTTP response and the individual cards; the
        # original reused `result` for both.
        cards = soup.find_all("a",{"class":"tapItem"})
        jobs.extend(extract_job(card) for card in cards)
    return jobs
            
# Smoke test: look up the page count, then scrape that many result pages.
# The bare call on the last line makes the notebook display the returned job list.
last_indeed_page = extract_indeed_page()
extract_indeed_jobs(last_page=last_indeed_page)

Scrapping page 0
Scrapping page 1
Scrapping page 2
Scrapping page 3
Scrapping page 4


[{'title': 'Python Engineer',
  'company': 'Outside Online',
  'location': 'Boulder, CO',
  'link': 'https://www.indeed.com/viewjob?jk=896f0683f822304e'},
 {'title': 'DBA Apprentice',
  'company': 'WebMD',
  'location': 'Newark, NJ 07102 (Central Business District area)',
  'link': 'https://www.indeed.com/viewjob?jk=a7727fb4996b872c'},
 {'title': 'Software Engineer (PHP/Python)',
  'company': 'Catchpoint',
  'location': 'Remote',
  'link': 'https://www.indeed.com/viewjob?jk=e06b448fecf5553d'},
 {'title': 'Entry Level Business Analyst-Remote',
  'company': 'Agama Solutions',
  'location': '+2 locationsRemote',
  'link': 'https://www.indeed.com/viewjob?jk=6f6a1c18da31655b'},
 {'title': 'Python Software Engineering Teaching Assistant (Part-Time)',
  'company': 'Hackbright Academy',
  'location': '+2 locationsRemote',
  'link': 'https://www.indeed.com/viewjob?jk=cc10e115d1ed386e'},
 {'title': 'Game Designer',
  'company': 'Microsoft',
  'location': 'Redmond, WA 98052 (Overlake area)',
  'l