Packages that must be installed before running this notebook:
scrapy, scrapy-crawler, and gspread

In [None]:
# importing the libraries
import scrapy
import gspread

In [None]:
#Creating a class for scraping
class LinkedJobsSpider(scrapy.Spider):
    """Scrape LinkedIn guest job listings and copy each one to Google Sheets.

    The spider requests ``api_url`` with an increasing ``start`` offset,
    yields one dict per ``<li>`` element found in each response, appends the
    same data as a row to the ``linkfind`` spreadsheet, and stops paginating
    as soon as a response contains no ``<li>`` elements.
    """

    # Name of the spider
    name = "linkedin_jobs"

    # LinkedIn API URL for worldwide job fetching except (USA and Canada).
    # The ``start`` offset of the requested page is appended to this string.
    api_url = 'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?currentJobId=3822072449&f_CR=102890883%2C105015875%2C101282230%2C106057199%2C101165590%2C102713980%2C102890719%2C103350119%2C101620260%2C102454443%2C103291313%2C101452733%2C105646813&geoId=92000000&location=Worldwide&origin=JOB_SEARCH_PAGE_JOB_FILTER&refresh=true&sortBy=R&start='

    # Amount added to the ``start`` offset to reach the next page.
    page_size = 25

    # Item keys in the order they are written as spreadsheet columns.
    _SHEET_FIELDS = (
        'job_title',
        'company_name',
        'company_location',
        'job_listed',
        'job_detail_url',
        'company_link',
    )

    # Worksheet handle; opened on first use and then reused for every row.
    _worksheet = None

    def start_requests(self):
        """Yield the request for the first page of results (offset 0)."""
        yield self._page_request(0)

    def _page_request(self, first_job_on_page):
        """Build the request for the page whose first job is at the given offset.

        The offset is also stored in ``meta`` so ``parse_job`` can compute
        the offset of the following page.
        """
        return scrapy.Request(
            url=self.api_url + str(first_job_on_page),
            callback=self.parse_job,
            meta={'first_job_on_page': first_job_on_page},
        )

    def parse_job(self, response):
        """Extract every job on the page, then request the next page.

        Yields:
            One dict per ``<li>`` element in ``response``, followed by a
            request for the next page when at least one ``<li>`` was found.
        """
        first_job_on_page = response.meta['first_job_on_page']
        jobs = response.css("li")

        for job in jobs:
            # A fresh dict per job: reusing a single dict would make every
            # yielded item the same object, so later iterations would
            # overwrite items that were already handed to Scrapy.
            job_item = self._extract_job(job)
            # Yield the job information for further processing
            yield job_item
            # Store the extracted data in the sheet
            self.write_to_sheet(job_item)

        # An empty page means there is nothing further to fetch.
        if len(jobs) > 0:
            yield self._page_request(int(first_job_on_page) + self.page_size)

    @staticmethod
    def _extract_job(job):
        """Return a new dict with the fields of one job ``<li>`` selector.

        Any field whose selector matches nothing is set to ``'not-found'``.
        """
        def first(query):
            # First match for the CSS query, stripped of surrounding whitespace.
            return job.css(query).get(default='not-found').strip()

        return {
            'job_title': first("h3::text"),
            'company_name': first('h4 a::text'),
            'company_location': first('.job-search-card__location::text'),
            'job_listed': first('time::text'),
            'job_detail_url': first(".base-card__full-link::attr(href)"),
            'company_link': first('h4 a::attr(href)'),
        }

    def _get_worksheet(self):
        """Return the first sheet of ``linkfind``, opening it on first call.

        Authenticating and opening the spreadsheet does not depend on the
        row being written, so it is done once instead of once per row.
        """
        if self._worksheet is None:
            # Authenticate with Google Sheets using the service-account file
            gc = gspread.service_account(filename='mycred.json')
            # Open the specified Google Sheet by name
            self._worksheet = gc.open('linkfind').sheet1
        return self._worksheet

    def write_to_sheet(self, job_item):
        """Append one job as a new row at the end of the Google Sheet.

        Args:
            job_item: Mapping of job fields; missing keys are written as
                empty strings.
        """
        row = [job_item.get(field, '') for field in self._SHEET_FIELDS]
        self._get_worksheet().append_row(row)

In [None]:
# Import necessary modules from Scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

In [None]:
# Create the CrawlerProcess that will run the spider, passing it a freshly
# constructed Settings object (no overrides are applied in this file).
process = CrawlerProcess(settings=Settings())

In [None]:
# Register the LinkedJobsSpider class with the process.
# NOTE(review): per the Scrapy docs this only schedules the spider; the crawl
# itself runs once process.start() is called - confirm.
process.crawl(LinkedJobsSpider)

In [None]:
# Start the CrawlerProcess to run the registered spider.
# NOTE(review): per the Scrapy docs this call blocks until crawling finishes,
# and the underlying Twisted reactor cannot be restarted, so re-running this
# cell presumably requires a fresh kernel - confirm.
process.start()