In [6]:
# Total number of job postings to collect across all locations.
num_of_jobs_needed = 100

# Comma-separated locations to search, in order. Scraping stops once every location has been
# visited, even if fewer than num_of_jobs_needed postings were collected.
countries_or_states = "United States California,Canada Toronto,India Delhi"

# The topic to search for. It becomes part of the search URL.
topic = "Machine Learning"


In [7]:

# Make the topic usable inside the search URL: spaces become "+".
topic = topic.replace(" ", "+")

# Split the comma-separated locations, trim surrounding whitespace and skip empty entries
# (e.g. from a trailing comma), which would otherwise start a search without any location.
countries_states = [
    location
    for location in (part.strip() for part in countries_or_states.split(","))
    if location
]


# Path to the ChromeDriver executable. This is a hard-coded literal, not an environment variable.
# TODO: change this to the chromedriver path on your machine, i.e. driver_path = r"YOUR PATH"
driver_path = r"C:\Users\dilsh\Downloads\chromedriver_win32\chromedriver.exe"

# Pause (in seconds) between browser events; tune it to the network and machine speed.
time_out = 0.5

# Whether the browser window is shown while scraping (False runs Chrome headless).
browser_visible = True

# Time zone used for the scraped timestamps; must be a name from the pytz list.
scraping_timezone = "Canada/Pacific"

# Folder the raw (unprocessed) CSV files are written to.
pre_location = r"data/unprocessedData/"

# Folder the cleaned (processed) CSV files are written to.
process_location = r"data/processedData/"

In [8]:
# Imports for program function

import time
from datetime import datetime, timedelta

import pandas as pd
from bs4 import BeautifulSoup
from pytz import timezone
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By


class jobScraper:
    """Scrape Google job-search postings for a topic across several locations.

    Constructing the object runs the whole scrape: for each location a Chrome session is
    started, the job search is opened, postings are collected, and one CSV per location
    is written. ``links_collected`` counts postings across all locations, so scraping
    stops as soon as ``links_needed`` postings have been gathered in total.
    """

    # Detail texts that are recognised as the job type of a posting.
    JOB_TYPES = ("Full-time", "Part-time", "Internship", "Contractor")

    # Column order of the CSV files written by saveToCsv.
    CSV_COLUMNS = ["Job-Title", "Date-Posted", "Date-Scraped", "Url", "Company", "Job-Type", "Salary",
                   "Location", "Description"]

    def __init__(self, topic, country_states, driver_path, links_needed, time_out, broswer_vis, timezone, pre_location):
        """Configure the scraper and immediately scrape every requested location.

        Args:
            topic: Search topic, already prepared for use in a URL (spaces as "+").
            country_states: Iterable of location strings to search, in order.
            driver_path: Path to the ChromeDriver executable.
            links_needed: Total number of postings to collect across all locations.
            time_out: Seconds to sleep between browser events.
            broswer_vis: If falsy, Chrome runs headless.
            timezone: pytz time zone name used for the scraped dates.
            pre_location: Folder the per-location CSV files are written to.
        """
        self.links_collected = 0
        self.num_links_needed = links_needed
        self.driver_path = driver_path
        self.time_out = time_out
        self.timezone = timezone
        self.driver = None

        options = self._build_options(broswer_vis)

        for country_state in country_states:
            if self.links_collected >= self.num_links_needed:
                break

            # A fresh service and browser session per location.
            self.driver = webdriver.Chrome(service=Service(self.driver_path), options=options)
            try:
                self.driver.get(f"https://www.google.com/search?q={topic}+Jobs+{country_state.replace(' ', '+')}")
                self.saveToCsv(self.getJobData(), country_state.replace(' ', '_'), pre_location)
            finally:
                # quit() (unlike close()) ends the whole session, and the finally block makes
                # sure that also happens when scraping fails part-way through.
                self.driver.quit()

    @staticmethod
    def _build_options(browser_visible):
        """Return the ChromeOptions used for every browser session."""
        options = webdriver.ChromeOptions()

        # Disable location prompts and geolocation, as they may affect which jobs are shown
        options.add_argument('--deny-permission-prompts')
        options.add_argument('--disable-geolocation')

        # Run Chrome headless (no visible window) with an explicit desktop user agent
        if not browser_visible:
            options.add_argument("--window-size=1920,1080")
            options.add_argument("--headless")
            user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
            options.add_argument(f'user-agent={user_agent}')

        return options

    @staticmethod
    def _days_ago(text):
        """Return the number of days mentioned in a "<n> days ago" style text.

        Texts that do not mention days (e.g. "14 hours ago"), or that contain no number,
        yield 0 so the posting is dated today instead of raising.
        """
        if "day" not in text:
            return 0
        # "30+ days ago" carries a trailing "+" on the number.
        tokens = (token.rstrip("+") for token in text.split())
        return next((int(token) for token in tokens if token.isdecimal()), 0)

    def _scrape_posting(self):
        """Parse the job posting currently open in the browser into a dict of CSV fields."""
        # Get HTML of only the specific job posting
        detail_html = self.driver.find_element(By.ID, "tl_ditsc").get_attribute("outerHTML")
        soup = BeautifulSoup(detail_html, 'html.parser')

        # One timezone-aware "now" so Date-Scraped and Date-Posted share the same reference.
        now = datetime.now(timezone(self.timezone))

        post_data = {
            # Default values
            "Job-Type": "NA",
            "Date-Posted": "NA",
            "Salary": "NA",
            "Job-Title": soup.find("h2").text,
            "Date-Scraped": now.strftime('%Y-%m-%d %H:%M:%S'),
        }

        # Gets the job type, date posted, and salary
        for element in soup.find_all("div", {"class": "I2Cbhb"}):
            text = element.text
            if text in self.JOB_TYPES:
                post_data["Job-Type"] = text
            elif "ago" in text:
                posted = now - timedelta(days=self._days_ago(text))
                post_data["Date-Posted"] = posted.strftime("%Y-%m-%d")
            elif "a year" in text:
                post_data["Salary"] = text

        post_data["Url"] = self.driver.current_url
        post_data["Company"] = soup.find_all("div", {"class": "nJlQNd"})[0].text
        post_data["Location"] = soup.find_all("div", {"class": "sMzDkb"})[1].text
        post_data["Description"] = soup.find("span", {"class": "HBvzbc"}).text
        return post_data

    def getJobData(self):
        """Scroll through the job list of the open search and collect the postings.

        Collection stops when the end of the list is reached or when the total number of
        postings gathered by this scraper reaches the requested amount.

        Returns:
            A list with one dict per collected posting, keyed by CSV column name.
        """
        # Click on the Jobs tab and select the list we will scroll
        self.driver.find_element(By.ID, 'fMGJ3e').click()
        job_list = self.driver.find_element(By.CLASS_NAME, 'zxU94d')

        # Contains all the data saved to the csv
        all_job_data = []

        li_count = 0

        while self.links_collected < self.num_links_needed:

            # Finds multiple lists of jobs in the scrollable div
            li_focus = self.driver.find_elements(By.CLASS_NAME, "nJXhWc")

            # Checks if we reached the end of the scrollable div
            if len(li_focus) <= li_count:
                break

            # Clicks on the job postings in the li and stores the data
            for li in li_focus[li_count].find_elements(By.TAG_NAME, "li"):
                # Stop inside a batch too, so exactly the requested number is collected.
                if self.links_collected >= self.num_links_needed:
                    break

                li.click()
                time.sleep(self.time_out)

                all_job_data.append(self._scrape_posting())
                self.links_collected += 1

            # Scroll down to the next list of jobs
            self.driver.execute_script(
                'arguments[0].scrollTop = arguments[0].scrollTop + arguments[0].offsetHeight;', job_list
            )

            li_count += 1
            time.sleep(self.time_out)

        return all_job_data

    def saveToCsv(self, data, country, pre_location):
        """Save the data to a csv file, overwriting any previous file for that location.

        Args:
            data: List of posting dicts as returned by getJobData.
            country: Location name used in the file name.
            pre_location: Target folder; it is created if it does not exist yet.
        """
        import os

        # Create the folder up front so a finished scrape is not lost to a missing directory.
        if pre_location:
            os.makedirs(pre_location, exist_ok=True)

        df = pd.DataFrame(data, columns=self.CSV_COLUMNS)
        df.to_csv(os.path.join(pre_location, f'machineLearningJobData_{country}.csv'), mode='w', index=False)


In [9]:
import os
import re

import pandas as pd
from bs4 import BeautifulSoup
from langdetect import detect


class jobProcessor():
    """Clean every scraped CSV file of one folder and write the result to another folder.

    Constructing the object runs the whole pipeline. The file currently being processed
    is held in ``self.df``.
    """

    # Columns that together identify a posting when looking for duplicates.
    DUPLICATE_KEYS = ['Job-Title', 'Company', 'Description']

    # Every character NOT matched by this class is removed from descriptions.
    # NOTE: " -:" inside the class is a range (space through colon), so digits and
    # punctuation such as "#$%&'()*+/ are kept as well.
    _DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9 -:,;.!]')

    def __init__(self, process_location, pre_location):
        """Process each CSV file found in pre_location and save it to process_location.

        Steps per file, in order: drop duplicates, drop rows containing HTML, drop
        non-English rows, normalise the description text, fill missing values with "NA",
        normalise salary whitespace, write the file.

        Args:
            process_location: Folder the cleaned CSV files are written to.
            pre_location: Folder containing the scraped CSV files.
        """
        for file in sorted(os.listdir(pre_location)):
            # Skip anything that is not a CSV file (sub-folders, checkpoints, ...).
            if not file.lower().endswith(".csv"):
                continue

            self.df = pd.read_csv(os.path.join(pre_location, file))
            self.check_for_dup()
            self.check_html()
            self.check_lang()
            self.format_text()  # It may actually be better to drop this
            self.df = self.df.fillna("NA")
            self.salary_fix()
            self.pushCsv(file, process_location)

    def check_for_dup(self):
        """Drop postings with the same title, company and description, keeping the last one."""
        self.df = self.df.drop_duplicates(self.DUPLICATE_KEYS, keep='last')

    def format_text(self):
        """Collapse whitespace in every description and strip disallowed characters."""
        self.df["Description"] = self.df["Description"].map(self._clean_description)

    @classmethod
    def _clean_description(cls, text):
        """Return text with whitespace runs collapsed and disallowed characters removed."""
        if not isinstance(text, str):
            return text  # leave missing values untouched
        return cls._DISALLOWED_CHARS.sub('', " ".join(text.split()))

    def salary_fix(self):
        """Collapse runs of whitespace in every salary value."""
        self.df["Salary"] = self.df["Salary"].map(
            lambda value: " ".join(value.split()) if isinstance(value, str) else value
        )

    def check_lang(self):
        """Drop rows whose description is not detected as English."""
        # astype(bool) keeps the mask boolean even when the frame is empty.
        is_english = self.df["Description"].map(self._is_english).astype(bool)
        self.df = self.df.loc[is_english]

    @staticmethod
    def _is_english(text):
        """Return True if text is detected as English.

        Missing values and texts for which no language can be detected count as
        not English.
        """
        from langdetect.lang_detect_exception import LangDetectException

        if not isinstance(text, str):
            return False
        try:
            return detect(text) == "en"
        except LangDetectException:
            return False

    def check_html(self):
        """Drop rows whose description contains at least one HTML tag."""
        has_html = self.df["Description"].map(
            lambda text: isinstance(text, str) and bool(BeautifulSoup(text, "html.parser").find())
        ).astype(bool)
        self.df = self.df.loc[~has_html]

    def pushCsv(self, file, process_location):
        """Write the current frame to process_location/file, overwriting an existing file.

        The folder is created if it does not exist yet.
        """
        if process_location:
            os.makedirs(process_location, exist_ok=True)
        self.df.to_csv(os.path.join(process_location, file), mode='w', index=False)


In [10]:

if __name__ == "__main__":
    # Scrape first, then clean what was scraped. The scraper instance gets its own name:
    # rebinding `jobScraper` would replace the class with an instance, so running this
    # cell a second time would fail with "'jobScraper' object is not callable".
    scraper = jobScraper(topic, countries_states, driver_path, num_of_jobs_needed, time_out, browser_visible,
                         scraping_timezone, pre_location)
    jobProcessor(process_location, pre_location)