In [7]:
import requests
import json
from bs4 import BeautifulSoup
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = Options()
options.add_argument('--headless')

headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}

greenhouse = "https://boards.greenhouse.io"

jobs = []


Get Adobe jobs (Where .json response is obtained on applying filters)

In [9]:
adobe_url = "https://careers.adobe.com/widgets"

# no of jobs to be fetched in one request
size = 10

# body for the request
body = {
    "lang":"en_us",
    "deviceType":"desktop",
    "country":"us",
    "pageName":"Engineering and Product jobs",
    "ddoKey":"refineSearch",
    "sortBy":"",
    "subsearch":"",
    "from":0,
    "jobs":True,
    "counts":True,
    "all_fields":["remote","country","state","city","experienceLevel","category","profession","employmentType","jobLevel"],
    "pageType":"category",
    "size":size,
    "clearAll":False,
    "jdsource":"facets",
    "isSliderEnable":False,
    "pageId":"page62",
    "siteType":"external",
    "keywords":"",
    "global":True,
    "selected_fields":{"category":["Engineering and Product","Information Technology"],"country":["India"],"state":["Uttar Pradesh","Karnataka"]},
    "locationData":{}
}

adobe_headers = {
    'User-Agent': "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
    'content-type': "application/json"
}

adobe_response = requests.post(adobe_url, headers=adobe_headers, json=body).json()
print(adobe_response)

# get adobe jobs from response consisting of 10 jobs inside refineSearch object consitsting data object with jobs array
adobe_jobs = adobe_response['refineSearch']['data']['jobs']

# get total number of jobs from the json response stored in key 'totalHits'
adobe_jobs_count = adobe_response['refineSearch']['totalHits']
# print(adobe_jobs_count)

# resend the request with size = adobe_jobs_count - 10
size = adobe_jobs_count - 10
body['size'] = size

# also set from = 10
body['from'] = 10

adobe_response = requests.post(adobe_url, headers=adobe_headers, json=body).json()
# print(adobe_response)

# get the jobs list from the json response stored in array 'jobs' inside data key which is a object inside refineSearch Object in the response and append it to adobe_jobs array
adobe_jobs += adobe_response['refineSearch']['data']['jobs']

# jobs array consists objects with keys like title, location, reqId, postedDate
adobe_jobs_title = [job['title'] for job in adobe_jobs]
adobe_jobs_location = [job['location'] for job in adobe_jobs]
adobe_jobs_id = [job['reqId'] for job in adobe_jobs]
adobe_jobs_posted = [job['postedDate'] for job in adobe_jobs]
adobe_jobs_description = [job['descriptionTeaser'] for job in adobe_jobs]
adobe_jobs_link = ["https://careers.adobe.com/us/en/job/"+ job['jobSeqNo'] for job in adobe_jobs ]

# write results into a json file named as jobs.json in objects list with each object containing job title, location, link, posted date, description, id, key (where key increments by 1) and company name

for i in range(len(adobe_jobs_title)):
    jobs.append({
        'title': adobe_jobs_title[i],
        'location': adobe_jobs_location[i],
        'link': adobe_jobs_link[i],
        'posted': adobe_jobs_posted[i],
        'description': adobe_jobs_description[i],
        'id': adobe_jobs_id[i],
        'company': 'Adobe'
    })

print(len(jobs))


{'refineSearch': {'status': 200, 'hits': 10, 'totalHits': 63, 'data': {'jobs': [{'cityState': 'Noida, Uttar Pradesh', 'country': 'India', 'city': 'Noida', 'ml_skills': ['development', 'computer science', 'typescript', 'architecture', 'influence', 'presentations', 'software development', 'integration', 'azure services', 'automation', 'integration automation', 'cloud-native applications', 'algorithms', 'aws', 'product management', 'applications', 'java', 'design', 'devops'], 'latitude': '28.6092086', 'type': 'Full time', 'multi_location': ['Noida, Uttar Pradesh, India'], 'locale': 'en_US', 'title': 'Sr. Computer Scientist', 'multi_location_array': [{'latlong': {'lon': 77.3482038, 'lat': 28.6092086}, 'location': 'Noida, Uttar Pradesh, India'}], 'jobSeqNo': 'ADOBUSR133541EXTERNALENUS', 'postedDate': '2023-01-03T00:00:00.000Z', 'descriptionTeaser': 'Responsible for design and architecture of new products. Be well versed in emerging industry technologies and trends, and have the ability to c

Get Able jobs (where it has greenhouse board and initial html response contains data)

In [None]:
able_url = 'https://boards.greenhouse.io/able'
able_response = requests.get(able_url, headers=headers)
able_soup = BeautifulSoup(able_response.content, 'html5lib')
# print(able_soup.prettify())

able_jobs = able_soup.find_all('a', attrs={"data-mapped":"true"})
# print(able_jobs)

# get the job title from able_jobs list stored in text
able_jobs_title = [job.text for job in able_jobs]
# print(able_jobs_title)

# get job link from attribute href
able_jobs_link = [greenhouse+job['href'] for job in able_jobs]
# print(able_jobs_link)

# get job location from span.location innertext
able_jobs_location = [able_soup.find_all('span', class_="location")[i].text for i in range(len(able_jobs))]
# print(able_jobs_location)

# writing results into a json file named as jobs.json in objects list with each object containing job title, location, link, posted date, description, id and key(we will use index of jobs[]).

for i in range(len(able_jobs_title)):
    jobs.append({
        'title': able_jobs_title[i],
        'location': able_jobs_location[i],
        'link': able_jobs_link[i],
        'posted': '',
        'description': '',
        'id': '',
        'company': 'Able'
    })


Get Affinidi jobs (Not working)

In [15]:
affinidi_url = 'https://www.affinidi.com/careers'

# use selenium to get the page source
driver = Firefox()
driver.get(affinidi_url)

# wait for the page to load till div.opening is visible
try:
    element = WebDriverWait(driver, 60).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "section.level-0"))
    )
    affinidi_soup = BeautifulSoup(driver.page_source, 'html5lib')
finally:
    driver.quit()

print (affinidi_soup.prettify())
affinidi_jobs = affinidi_soup.find_all('div', class_='opening')
print(affinidi_jobs)
# get span.location innertext from affinidi_jobs list consisting India.
affinidi_jobs_location = [affinidi_soup.find_all('span', class_="location")[i].text for i in range(len(affinidi_jobs))]
affinidi_jobs_location = [location for location in affinidi_jobs_location if 'India' in location]
print(affinidi_jobs_location)

TimeoutException: Message: 
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:182:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:394:5
element.find/</<@chrome://remote/content/marionette/element.sys.mjs:275:16


Get AgileSolutions jobs 
Can't find website _sed lyf_.

Get Agnikul jobs (where data renders through javascript probably a react website)  
Send email to humancapital@agnikul.in

In [None]:
agnikul_url = 'https://agnikul.in/#/career'

# use selenium to get the page source
driver = Firefox(options=options)
driver.get(agnikul_url)

# wait for the page to load till div.career__description is not found
try:
    elem = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "div.career__description"))
    )
    agnikul_soup = BeautifulSoup(driver.page_source, 'html5lib')

finally:
    driver.quit()

# print(agnikul_soup.prettify())

# get agnikul_jobs list from div.career__description leaving the first 3 elements
agnikul_jobs = agnikul_soup.find_all('div', class_='career__description')[3:]
# print(agnikul_jobs)

# get the job title from agnikul_jobs list stored in p.career__job innertext
agnikul_jobs_title = [
    job.find('p', class_='career__job').text for job in agnikul_jobs]
# print(agnikul_jobs_title)

# get job location from p.career__head innertext which is direct child of agnikul_jobs consisting "Location" ignoring which are not direct child
agnikul_jobs_location = [
    job.find('p', class_='career__head', recursive=False).text.split("Location: ")[1] for job in agnikul_jobs]
# print(agnikul_jobs_location)

# get job link which is same as career url
agnikul_jobs_link = [agnikul_url for job in agnikul_jobs]
# print(agnikul_jobs_link)

# get job description from ul innertext sibling of p.career__head innertext equal to "RESPONSIBILITIES"
agnikul_jobs_description = [
    job.find('p', class_='career__head', text="RESPONSIBILITIES").find_next_sibling().text for job in agnikul_jobs]
# print(agnikul_jobs_description)

# write results into a json file named as jobs.json in objects list with each object containing job title, location, link, posted date, description, id, key(we will use index of jobs[]) and company name.

for i in range(len(agnikul_jobs_title)):
    jobs.append({
        'title': agnikul_jobs_title[i],
        'location': agnikul_jobs_location[i],
        'link': agnikul_jobs_link[i],
        'posted': '',
        'description': agnikul_jobs_description[i],
        'id': '',
        'company': 'Agnikul'
    })


Get Airbase jobs

In [None]:
airbase_url = 'https://boards.greenhouse.io/airbase'
airbase_response = requests.get(airbase_url, headers=headers)
airbase_soup = BeautifulSoup(airbase_response.content, 'html5lib')
# print(airbase_soup.prettify())

airbase_jobs = airbase_soup.find_all('a', attrs={"data-mapped": "true"})
# print(airbase_jobs)

# get job location from span.location innertext only if it consists "India"
airbase_jobs_location = [airbase_soup.find_all('span', class_="location")[i].text for i in range(
    len(airbase_jobs)) if "India" in airbase_soup.find_all('span', class_="location")[i].text]

# get the job title from airbase_jobs list stored in text only corresponding to airbase_jobs_location list made above
airbase_jobs = [airbase_job for airbase_job in airbase_jobs if airbase_job.find_next_sibling().find_next_sibling().text in airbase_jobs_location]
airbase_jobs_title = [airbase_job.text for airbase_job in airbase_jobs]
# print(airbase_jobs_title)

# get job link from attribute href
airbase_jobs_link = [greenhouse+job['href'] for job in airbase_jobs]
# print(airbase_jobs_link)

# writing results into a json file named as jobs.json in objects list with each object containing job title, location, link, posted date, description, id and key(we will use index of jobs[]).

for i in range(len(airbase_jobs_title)):
    jobs.append({
        'title': airbase_jobs_title[i],
        'location': airbase_jobs_location[i],
        'link': airbase_jobs_link[i],
        'posted': '',
        'description': '',
        'id': '',
        'company': 'Airbase'
    })

# print(jobs)

Get Airbnb jobs (It has .json file endpoint)

In [41]:
airbnb_url = 'https://careers.airbnb.com/wp-admin/admin-ajax.php?action=fetch_greenhouse_jobs&which-board=airbnb&strip-empty=true'
airbnb_response = requests.get(airbnb_url, headers=headers).json()

airbnb_jobs = airbnb_response['jobs']
# print()

# get objects which consists "location" as "Bangalore, India" or "Gurugram, India" or "India"
airbnb_jobs = [airbnb_job for airbnb_job in airbnb_jobs if airbnb_job['location'] in [
    "Bangalore, India", "Gurugram, India", "India"]]
# print(airbnb_jobs)

# get job title from airbnb_jobs list stored in title key
airbnb_jobs_title = [airbnb_job['title'] for airbnb_job in airbnb_jobs]
# print(airbnb_jobs_title)

# get job location from airbnb_jobs list stored in location key
airbnb_jobs_location = [airbnb_job['location'] for airbnb_job in airbnb_jobs]
# print(airbnb_jobs_location)

# get job link from airbnb_jobs list stored in id key
airbnb_jobs_link = ["https://careers.airbnb.com/positions/"+str(airbnb_job['id']) for airbnb_job in airbnb_jobs]
# print(airbnb_jobs_link)

# writing results into a json file named as jobs.json in objects list with each object containing job title, location, link, posted date, description, id and key(we will use index of jobs[]).

for i in range(len(airbnb_jobs_title)):
    jobs.append({
        'title': airbnb_jobs_title[i],
        'location': airbnb_jobs_location[i],
        'link': airbnb_jobs_link[i],
        'posted': '',
        'description': '',
        'id': '',
        'company': 'Airbnb'
    })

['Analytics and Insights Senior Analyst', 'Analytics & Insight Sr. Analyst', 'Associate Principal, Enterprise Tech (Oracle Cloud)', 'Systems Engineer', 'Associate Principal, Finance Technology', 'Senior Analytics Developer', 'Staff Product Manager- Payments Platform', 'Salesforce Engineer', 'Senior Software Engineer', 'Senior Software Engineer Payments and Hosting', 'Senior Staff Software Engineer- Trust Platform', 'Sr Engineering Manager - Information Security', 'Staff Software Engineer, Ambassador Platforms', 'Staff Software Engineer - Cities', 'Staff Software Engineer- Payment Economics', 'Accountant', 'Senior Accountant', 'Senior Tax Associate', 'Business Analyst', 'Manager, Appeals & Escalations', 'Manager, Claims', 'Supervisor Claims', 'Supervisor, Training', 'WFM Analyst', 'Operations Engineer, Biztech', 'Senior System Engineer', 'Senior Integration Engineer', 'Specialist, Accounts Payable', 'Talent Partner ', 'HR Business Systems Analyst, BizTech ']


Get AirBus jobs (Extracted .json)

In [None]:
airbus_headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
    "Accept": "application/json",
    "Accept-Language": "en-US",
    "Content-Type": "application/json"
}

airbus_url = 'https://ag.wd3.myworkdayjobs.com/wday/cxs/ag/Airbus/jobs'

# store offset for pagination
offset = 0

# send request to get jobs by sending offset as 0 and limit as 20
airbus_response = requests.post(airbus_url, headers=airbus_headers, json=
    {"appliedFacets": {"locationCountry": ["c4f78be1a8f14da0ab49ce1162348a5e"]}, "limit": 20, "offset": offset, "searchText": ""}).json()
# print(airbus_response)

# get total jobs count stored in total key of response
airbus_total_jobs = airbus_response['total']
# print(airbus_total_jobs)

# get jobs list stored in jobPosings key of response
airbus_jobs = airbus_response['jobPostings']

# increment for next request
offset += 20
# send request to get jobs by sending offset increased by 20 and limit as 20 till offset is less than total jobs count
while offset < airbus_total_jobs:
    airbus_response = requests.post(airbus_url, headers=airbus_headers, json={
        "appliedFacets": {"locationCountry": ["c4f78be1a8f14da0ab49ce1162348a5e"]}, "limit": 20, "offset": offset, "searchText": ""}).json()
    offset += 20
    airbus_jobs += airbus_response['jobPostings']

# print(len(airbus_jobs))

# get job title from airbus_jobs list stored in title key
airbus_jobs_title = [airbus_job['title'] for airbus_job in airbus_jobs]
# print(airbus_jobs_title)

# get job location from airbus_jobs list stored in locationsText key
airbus_jobs_location = [airbus_job['locationsText'] for airbus_job in airbus_jobs]
print(airbus_jobs_location)

# get job link from airbus_jobs list stored in externalPath key
airbus_jobs_link = ["https://ag.wd3.myworkdayjobs.com/en-US/Airbus"+airbus_job['externalPath'] for airbus_job in airbus_jobs]
# print(airbus_jobs_link)

# get posted date from airbus_jobs list stored in postedOn key
airbus_jobs_posted = [airbus_job['postedOn'] for airbus_job in airbus_jobs]
# print(airbus_jobs_posted)

# get job id from airbus_jobs list stored at 0th index of bulletFields array
airbus_jobs_id = [airbus_job['bulletFields'][0] for airbus_job in airbus_jobs]
# print(airbus_jobs_id)

# writing results into a json file named as jobs.json in objects list with each object containing job title, location, link, posted date, description, id and key(we will use index of jobs[]).

for i in range(len(airbus_jobs_title)):
    jobs.append({
        'title': airbus_jobs_title[i],
        'location': airbus_jobs_location[i],
        'link': airbus_jobs_link[i],
        'posted': airbus_jobs_posted[i],
        'description': '',
        'id': airbus_jobs_id[i],
        'company': 'Airbus'
    })