In [85]:
from bs4 import BeautifulSoup
import time
import pandas as pd
import requests

In [86]:
template="https://in.indeed.com/jobs?q={}&l={}"

In [87]:
#Defining it in a function 
# We don't need to use formatting a url to fill in the spaces the requests will interpret correctly without them on this website
# However we shouldn't include any puntuation or characters which we dont typically find in a url

In [88]:
def get_url(position,location):
    #to generate a url from position and location
    template='https://in.indeed.com/jobs?q={}&l={}'
    url=template.format(position,location)
    return url

In [89]:
url=get_url('data scientist','Mumbai')
url

'https://in.indeed.com/jobs?q=data scientist&l=Mumbai'

# Extracting the raw html

In [90]:
# Request libraries returns a response object passing in the url we created as the argument this will send a request to the
# website and response will be sent back 
# if the request is successfull the response object will return a code of 200

In [91]:
response=requests.get(url)

In [92]:
response

<Response [200]>

In [93]:
# BeautifulSoup To navigate html tree stucture other webpage and extract the relevent parts

In [None]:
soup=BeautifulSoup(response.text,features='lxml')
soup

In [None]:
print(soup.prettify())

In [None]:
cards=soup.find_all('div','job_seen_beacon')
cards

# Prototype of the model with single record

In [None]:
card=cards[0]
card

In [98]:
atag=card.h2.a
atag

<a aria-label="full details of Decision Scientist" class="jcs-JobTitle" data-hide-spinner="true" data-hiring-event="false" data-jk="385095c9ea636f76" data-mobtk="1g5hf3tdfjv4t800" href="/rc/clk?jk=385095c9ea636f76&amp;fccid=5c439b65a0bd045b&amp;vjs=3" id="job_385095c9ea636f76" role="button" target="_blank"><span id="jobTitle-385095c9ea636f76" title="Decision Scientist">Decision Scientist</span></a>

In [99]:
stag=atag.span
job_title=stag['title']
job_title

'Decision Scientist'

In [100]:
# get will return the required criteria
# we need to specify 'https://in.indeed.com' domain name to guide to the url

In [101]:
job_url='https://in.indeed.com'+atag.get('href')
job_url

'https://in.indeed.com/rc/clk?jk=385095c9ea636f76&fccid=5c439b65a0bd045b&vjs=3'

In [102]:
company=card.find('span','companyName').text
company

'FedEx Express MEISA'

In [103]:
location=card.find('div','companyLocation').text
location

'Mumbai, Maharashtra'

In [104]:
job_summary=card.find('div','job-snippet').text.strip()
job_summary=job_summary.split('\n')
for i in job_summary:
    print(i)

Acts as a technical expert on complex and specialist subject(s).
Supports management with the analysis, interpretation and application of complex information,…


In [105]:
post_date=card.find('span','date').text
post_date

'PostedToday'

In [106]:
try:
    salary_range=card.find('div','metadata salary-snippet-container').text
except AttributeError:
    salary_range=''
salary_range

''

In [107]:
from datetime import datetime

In [108]:
today=datetime.today().strftime('%d-%m-%Y')
today

'14-06-2022'

# generalizing the function

In [109]:
def get_record(card):
    '''Extract job record from single record'''
    atag=card.h2.a
    stag=atag.span
    job_title=stag['title']
    job_url='https://in.indeed.com'+atag.get('href')
    company=card.find('span','companyName').text
    location=card.find('div','companyLocation').text
    post_date=card.find('span','date').text
    job_summary=card.find('div','job-snippet').text.strip()
    job_summary=job_summary.split('\n')
    today=datetime.today().strftime('%d-%m-%Y')
    try:
        salary_range=card.find('div','metadata salary-snippet-container').text
    except AttributeError:
        salary_range=''
    record=(job_title,company,location,job_summary,post_date,today,salary_range,job_url)
    return record

In [None]:
records=[]
for card in cards:
    record=get_record(card)
    records.append(record)
records

In [111]:
# We're going to use get method to extract the value of href property which will gives us the URL of the next page
# This serves two purposes first it's going to tell us where to go next but it will also be an incdication that we've reached the end of the program
# if the find method can't find this tag on the page if program can't find the tag it will return the Attribute error because 
# we are trying to pull href property from an object that doen't exist
# This is relative URL so pre-pin the root of the web page to get the full URL

In [112]:
url='https://in.indeed.com'+soup.find('a',{'aria-label':'Next'}).get('href')
url

'https://in.indeed.com/jobs?q=data+scientist&l=Mumbai&start=10'

In [113]:
# Creating a while loop that continues to run until this url block returns an attribute error at which the loop can break out

In [114]:
while True:
    try:
        url='https://in.indeed.com'+soup.find('a',{'aria-label':'Next'}).get('href')
    except AttributeError:
        break
    response=requests.get(url)
    soup=BeautifulSoup(response.text,'html.parser')
    cards=soup.find_all('div','job_seen_beacon')
    for card in cards:
        record=get_record(card)
        records.append(record)
        

In [115]:
len(records)

270

In [32]:
#creating a main funtion using all the defined functions

In [83]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import csv
from datetime import datetime

def get_url(position,location):
    template='https://in.indeed.com/jobs?q={}&l={}'
    url=template.format(position,location)
    return url
def get_record(card):
    '''Extract job record from single record'''
    atag=card.h2.a
    stag=atag.span
    job_title=stag['title']
    job_url='https://in.indeed.com'+atag.get('href')
    company=card.find('span','companyName').text
    location=card.find('div','companyLocation').text
    post_date=card.find('span','date').text
    job_summary=card.find('div','job-snippet').text.strip().replace('\n','')
    today=datetime.today().strftime('%d-%m-%Y')
    try:
        salary_range=card.find('div','metadata salary-snippet-container').text
    except AttributeError:
        salary_range=''
    record=(job_title,company,location,post_date,today,salary_range,job_summary,job_url)
    return record
def main(position,location):
    records=[]
    url=get_url(position,location)
    while True:
        response=requests.get(url)
        soup=BeautifulSoup(response.text,'html.parser')
        cards=soup.find_all('div','job_seen_beacon')
        for card in cards:
            record=get_record(card)
            records.append(record)
        try:
            url='https://in.indeed.com'+soup.find('a',{'aria-label':'Next'}).get('href')#--goes to next page
        except AttributeError:
            break
            
        with open('results.csv','w',newline='',encoding='utf-8') as f:
            writer=csv.writer(f)
            writer.writerow(['job_title','company','location','post_date','today','salary_range','job_summary','job_url'])
            writer.writerows(records)


In [41]:
main('full time','India')

In [35]:
df=pd.read_csv('results.csv')

In [36]:
#created 1200+ records using webscraping

In [37]:
df

Unnamed: 0,job_title,company,location,post_date,today,salary_range,job_summary,job_url
0,Data Entry Lead,Talent500 Tech (India) Private Limited,"Hyderabad, Telangana",PostedToday,14-06-2022,"₹5,00,000 a year",The Lead Data Entry Associate is responsible f...,https://in.indeed.com/pagead/clk?mo=r&ad=-6NYl...
1,BVHMI - Human Resources Manager - (Full Time P...,BVH Services,"Gandhidham, Gujarat",EmployerActive 6 days ago,14-06-2022,"₹45,000 - ₹60,000 a month",Organization and time management skills.Job Ty...,https://in.indeed.com/pagead/clk?mo=r&ad=-6NYl...
2,Customer Success Executive,LIVINGFOODCO PRIVATE LIMITED,"Bengaluru, Karnataka",EmployerActive 2 days ago,14-06-2022,"₹3,00,000 - ₹5,00,000 a year","Job Types: Full-time, Regular / Permanent.Answ...",https://in.indeed.com/pagead/clk?mo=r&ad=-6NYl...
3,Office Administrator,LIVINGFOODCO PRIVATE LIMITED,"Bengaluru, Karnataka",EmployerActive 8 days ago,14-06-2022,"₹4,00,000 - ₹4,50,000 a year",Coordinate office activities and operations to...,https://in.indeed.com/pagead/clk?mo=r&ad=-6NYl...
4,Fresher Jobs - US Recruiter - Night Shift,Denken Solutions India Private Limited,"Guntur, Andhra Pradesh",Posted11 days ago,14-06-2022,,Source candidates using job portals including ...,https://in.indeed.com/pagead/clk?mo=r&ad=-6NYl...
...,...,...,...,...,...,...,...,...
1255,General Nurses - work in the UK,Servisource Recruitment Ltd,"Delhi, Delhi",Hiring ongoing,14-06-2022,"₹35,00,000 a year",Servisource Healthcare UK in conjunction with ...,https://in.indeed.com/pagead/clk?mo=r&ad=-6NYl...
1256,Lead / Principal Biostatistician,Labcorp,"Bengaluru, Karnataka",Posted30+ days ago,14-06-2022,,Lead complex studies such as NDA submissions o...,https://in.indeed.com/pagead/clk?mo=r&ad=-6NYl...
1257,Senior QA Engineer,7-Eleven,"Bengaluru, Karnataka",Posted30+ days ago,14-06-2022,,"For 90 years, 7-Eleven has been successfully m...",https://in.indeed.com/pagead/clk?mo=r&ad=-6NYl...
1258,Occupational Therapist,Health care,"Chennai, Tamil Nadu",Posted30+ days ago,14-06-2022,,Bachelors of Occupational Therapy/Masters of O...,https://in.indeed.com/pagead/clk?mo=r&ad=-6NYl...
