In [None]:
import csv
from datetime import datetime, date
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from random import random
import time

import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")


def get_URL(position, location):
    """Build the Indeed search url for a job query.

    Args:
        position (str): job title or search expression for the query
        location (str): location for the query; may be an empty string

    Returns:
        str: the search url, carrying the encoded position and location
            plus the fixed ``fromage=7`` and ``sort=date`` parameters
    """
    # Imported locally so the function only relies on the standard library
    # and not on the order in which notebook cells were run.
    from urllib.parse import quote_plus

    template = 'https://www.indeed.com/jobs?q={}&l={}&fromage=7&sort=date'
    # quote_plus still turns spaces into '+', and additionally percent-encodes
    # reserved characters such as '&', '#' and '+', which would otherwise
    # truncate or corrupt the query string.
    return template.format(quote_plus(position), quote_plus(location))


def _job_description(job_url):
    """Fetch a posting's own page and extract its requirements and description.

    Args:
        job_url (str): url of the specific posting

    Returns:
        tuple of str: (requirements, description); either element is the
            string 'None' when the matching element is not found on the page

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            or the request times out
    """
    # Many Indeed scrapers skip the descriptive text contained in the actual
    # posting. Here the url given by the search result is followed to dig a
    # bit deeper.
    # The timeout keeps a single stalled connection from hanging the scrape.
    response_jobDesc = requests.get(job_url, timeout=30)
    soup = BeautifulSoup(response_jobDesc.text, 'html.parser')
    # https://stackoverflow.com/questions/63231164/indeed-web-scraping-python-selenium-beautifulsoup
    # soup.find returns None when nothing matches, so the attribute access
    # raises AttributeError; only that case falls back to 'None'.
    try:
        requirements = soup.find(class_="icl-u-xs-block jobsearch-ReqAndQualSection-item--title").text.replace("\n", "").strip()
    except AttributeError:
        requirements = 'None'
    try:
        description = soup.find(id="jobDescriptionText").text.replace('\n', '')
    except AttributeError:
        description = 'None'
    # Randomised pause between requests to make detection less likely.
    time.sleep(3+random()*2)
    return requirements, description


def get_features(web):
    """Extract the desired features from a single job card and do initial processing.

    Args:
        web (bs4 element): a single job posting taken from a search results page

    Returns:
        tuple: (job_title, company, job_location, post_date, today, summary,
            salary, job_url, requirements, description)

    Raises:
        AttributeError: if the card lacks the title link, company, location,
            date or summary element
        requests.exceptions.RequestException: if the posting's own page
            cannot be fetched or the request times out
    """
    job_title = web.h2.a.get('title')
    company = web.find('span', 'company').text.strip()
    job_location = web.find('div', 'recJobLoc').get('data-rc-loc')
    post_date = web.find('span', 'date').text
    summary = web.find('div', 'summary').text.strip().replace('\n', ' ')
    today = datetime.today().strftime('%Y-%m-%d')
    job_url = 'https://www.indeed.com' + web.h2.a.get('href')

    requirements, description = _job_description(job_url)

    # The salary element does not exist for every job, so default to ''.
    salary_tag = web.find('span', 'salaryText')
    salary = salary_tag.text.strip() if salary_tag else ''

    return (job_title, company, job_location, post_date, today, summary, salary, job_url, requirements, description)


def main(position, location):
    """Scrape the Indeed search results for a query and save them to a csv file.

    Args:
        position (str): job position for the indeed.com query
        location (str): job location for the indeed.com query

    Returns:
        None: the scraped rows are written to
            ``../data/scraped_{position}_{location}_{date}.csv`` (spaces in
            position and location replaced by underscores)

    Raises:
        requests.exceptions.RequestException: if a results page cannot be
            fetched or the request times out
    """
    data = []
    url = get_URL(position, location)

    # Walk the result pages, following the "Next" link until there is none.
    while True:
        # The timeout keeps a single stalled connection from hanging the scrape.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        for web in soup.find_all('div', 'jobsearch-SerpJobCard'):
            data.append(get_features(web))
            # Randomised pause between postings to make detection less likely.
            time.sleep(2+random()*3)
        next_link = soup.find('a', {'aria-label': 'Next'})
        if next_link is None:
            # No "Next" link: this was the last page (or a page without results).
            break
        url = 'https://www.indeed.com' + next_link.get('href')

    name = position.replace(' ', '_')
    loc = location.replace(' ', '_')
    day = date.today()
    # Save the job data.
    with open(f'../data/scraped_{name}_{loc}_{day}.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Pay', 'JobUrl', 'Requirements', 'Description'])
        writer.writerows(data)


In [None]:
# An empty location is passed straight through to the query's `l=` parameter.
l = ''
main(position='("data scientist" or "data science")', location=l)

In [None]:
# Use this to test for captcha block or IP ban
def get_URL(position,location):
    """Build the url for a dummy call used to verify the site isn't returning a captcha.

    Args:
        position (str): job title or search expression for the query
        location (str): location for the query; may be an empty string

    Returns:
        str: the search url, carrying the encoded position and location
            plus the fixed ``fromage=3`` and ``sort=date`` parameters
    """
    # Imported locally so the function only relies on the standard library
    # and not on the order in which notebook cells were run.
    from urllib.parse import quote_plus

    template = 'https://www.indeed.com/jobs?q={}&l={}&fromage=3&sort=date'
    # quote_plus still turns spaces into '+', and additionally percent-encodes
    # reserved characters such as '&', '#' and '+', which would otherwise
    # truncate or corrupt the query string.
    return template.format(quote_plus(position), quote_plus(location))

position = '("data scientist" or "data science")'
location = ''

# Fire a single search request; the body shown below is either the HTML of a
# captcha page or that of a real search result.
response = requests.get(url=get_URL(position=position, location=location))
response.text

In [None]:
# Use this to use tor after activating it in terminal when needed.
import socks
import socket
socks.setdefaultproxy(proxy_type=socks.PROXY_TYPE_SOCKS5, addr="127.0.0.1", port=9050)
# Setting the default proxy alone does not redirect anything: connections only
# go through it once the standard socket class is replaced by the SOCKS-aware one.
socket.socket = socks.socksocket

In [21]:
# a: one day's scrape output; b: the accumulated total file.
a = pd.read_csv('../data/scraped_("data_scientist"_or_"data_science")__2021-06-18.csv')
b = pd.read_csv('../data/total.csv')

In [23]:
# Stack the rows of a on top of the rows of b.
c = pd.concat(objs=[a, b])

In [24]:
# Number of rows in the combined frame.
c.shape[0]

6105

In [25]:
# Overwrite the total file with the combined rows; the index is not written.
c.to_csv('../data/total.csv', index=False)

In [26]:
# Read the saved total back in and display it.
d = pd.read_csv('../data/total.csv')
d

Unnamed: 0,JobTitle,Company,Location,PostDate,ExtractDate,Summary,Pay,JobUrl,Requirements,Description
0,Analyst - Data Scientist,United Airlines Inc.,"Chicago, IL",Today,2021-06-18,Ready to learn and an interest in data science...,,https://www.indeed.com/rc/clk?jk=a1d3fe07f5b59...,,We have a wide variety of career opportunities...
1,Deep Learning Data Scientist,Intel,"Santa Clara, CA",Today,2021-06-18,"Bachelors in Computer science, Data science, C...",,https://www.indeed.com/rc/clk?jk=d4acb9696d3f5...,,Job DescriptionJoin Intel-and build a better t...
2,Data Scientist,Density Inc.,Remote,Today,2021-06-18,3+ years experience as a data scientist. Exten...,,https://www.indeed.com/company/Density-Inc./jo...,,"At Density, we build one of the most advanced ..."
3,Data Scientist Intermediate,Cone Health,"Greensboro, NC",Today,2021-06-18,The ability to design and apply multiple advan...,,https://www.indeed.com/rc/clk?jk=fae261040a53b...,,"LOCATION: Cone Health, SW-Enter Analytics - Co..."
4,DATA ANALYST,University of Washington,"Seattle, WA",Today,2021-06-18,Proven interest in health financing or malaria...,,https://www.indeed.com/rc/clk?jk=58b049c2565c2...,,"As a UW employee, you have a unique opportunit..."
...,...,...,...,...,...,...,...,...,...,...
6100,Senior Data Analyst – Managed Services (Servic...,CDW,"Lincolnshire, IL",30+ days ago,2021-05-18,8 years of work experience in an analytical ro...,,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,,"The Senior Data Analyst ensures the integrity,..."
6101,Business Analysis Engineer - Risk Data Managem...,Freddie Mac,"McLean, VA",30+ days ago,2021-05-18,IDEA has a critical function to manage and mai...,,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,,"Job DescriptionAt Freddie Mac, you will do imp..."
6102,Senior Data Scientist,Engtal,"Boston, MA",Active 5 days ago,2021-05-18,Own the technical and project management compo...,"$130,000 - $160,000 a year",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Post-education Data Science: 4 years (Required),Senior Data ScientistWe are looking to expand ...
6103,"Senior Director, Data Science",Salesforce,"San Francisco, CA",12 days ago,2021-05-18,Hire and lead a team of high-caliber data scie...,,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,,"To get the best candidate experience, please c..."
