# Job Sieve
---

Development documentation for screen-scraping multiple job boards to produce useful statistics, a filtered list of high-value prospects, and correlative analytics.

---

## <a name="toc"></a> Table of Contents
1. [Proof of Concept](#poc)



## <a name="poc"></a> [Proof of Concept](#toc)

...



In [4]:
# -------------------- LOAD DEPENDENCIES -------------------- #

# Environment hard reset
%reset -f

# Standard math and data libraries
import numpy as np
import pandas as pd

# Plotting libraries
import matplotlib.pyplot as plt
%matplotlib inline

# Libraries for scraping
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import lxml.html as lh
import ssl

# Date time for date operations
import datetime

# Levenshtein fuzzy comparisons
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process

# Import string cleaning functions
import re

# Flask support
from flask import request, jsonify

# Configure paths
from pathlib import Path
# data_path = Path('Datasets')


## ...




In [12]:
# -------------------- FORM QUERY -------------------- #

def form_query(keywords, location=None):
    """Build an Indeed job-search URL from keywords and an optional location.

    Args:
        keywords: Iterable of search terms. They are joined with spaces and
            sent as the ``q`` parameter; an empty iterable omits ``q``.
        location: Optional location string sent as the ``l`` parameter.
            ``None`` or an empty string omits it.

    Returns:
        The full search URL as a string, with both parameters
        percent-encoded (spaces become ``+``).
    """
    # Imported here so the function does not rely on another cell's imports.
    from urllib.parse import urlencode

    base = "https://www.indeed.com/jobs?"

    params = {}
    if keywords:
        # A space between terms encodes to "+", the separator Indeed expects.
        params["q"] = " ".join(keywords)
    if location:
        params["l"] = location

    # urlencode escapes reserved characters (e.g. "+" in "C++", or commas in
    # a location) so they cannot be misread as query-string syntax.
    return base + urlencode(params)


# TEST #

# Sample inputs: one keyword and one city
keywords, location = ["CCNA"], "Albuquerque"

# Last expression in the cell, so the notebook displays the resulting URL
form_query(keywords, location)


'https://www.indeed.com/jobs?q=CCNA&l=Albuquerque'

In [11]:
# -------------------- RUN QUERY -------------------- #

def run_query(query):
    """Fetch ``query`` with an HTTP GET and return the page as prettified HTML.

    Args:
        query: URL to request.

    Returns:
        The response body parsed with the ``lxml`` parser and re-serialised
        by BeautifulSoup's ``prettify()``, as a string.
    """
    page = requests.get(query)
    return BeautifulSoup(page.text, 'lxml').prettify()


# TEST #

keywords = ["CCNA"]
location =  "Albuquerque"

# form_query is the URL builder defined in this notebook; the result is
# passed straight to run_query to download the results page.
query = form_query(keywords, location)
html = run_query(query)


In [24]:
# -------------------- PARSE HTML -------------------- #

keywords = ["CCNA"]
location =  "Albuquerque"

# form_query is the URL builder defined in this notebook.
query = form_query(keywords, location)

# ---------- #

# def parse_html(query):
    
# NOTE(review): this context disables certificate checks but is never passed
# to urlopen below, so the request still uses urllib's default verification.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Send a browser-like User-Agent header with the request
req = Request(query, headers={'User-Agent': 'Mozilla/5.0'})
# Close the HTTP response as soon as the body has been read
with urlopen(req) as response:
    webpage = response.read()

# Creating a BeautifulSoup object of the HTML page for easy extraction of data.
soup = BeautifulSoup(webpage, 'html.parser')
html = soup.prettify('utf-8')
profile = {}
trading = {}
fundamentals = {}

# Iterate through cards
# NOTE: parse_card_title is defined in a later cell; run that cell first.
for td in soup.findAll("td", attrs={"id": "resultsCol"}):
    for card in td.findAll("div", attrs={"class": "jobsearch-SerpJobCard unifiedRow row result"}):
        
        # Parse title (only the last card's title remains bound after the loop)
        title = parse_card_title(card)

# ---------- #


In [None]:
#   HELPER   #
# ---------- #

def parse_card_title(card):
    """Return the stripped title text of a job-result card.

    Takes the first ``<h2 class="title">`` inside ``card`` and, within it,
    the first ``<a class="jobtitle turnstileLink">``.

    Raises:
        IndexError: If either element is missing from the card.
    """
    heading = card.findAll("h2", attrs={"class": "title"})[0]
    anchors = heading.findAll("a", attrs={"class": "jobtitle turnstileLink"})
    return anchors[0].text.strip()


In [None]:
def _collect_td_spans(soup, target, fields):
    """Copy span text from ``data-test``-tagged table cells into ``target``.

    For each ``(data_test, key)`` pair in ``fields``, every ``<td>`` whose
    ``data-test`` attribute equals ``data_test`` is searched for direct-child
    ``<span>`` elements. The stripped text of the last span found is stored
    under ``target[key]``; keys with no matching span are left unset.
    """
    for data_test, key in fields:
        for td in soup.findAll('td', attrs={'data-test': data_test}):
            for span in td.findAll('span', recursive=False):
                target[key] = span.text.strip()


def ParseHTML(query):
    """Download the page at ``query`` and scrape quote fields into a dict.

    The selectors target stock-quote style fields (previous close, bid/ask,
    market cap, ...) identified by ``data-test`` attributes and CSS classes.
    Fields that are not present on the page are simply omitted.

    Args:
        query: URL of the page to fetch.

    Returns:
        A dict with the keys ``'Trading'`` and ``'Fundamental'`` (each a dict
        of scraped strings) and, when found, ``'Present Growth'``.

    Raises:
        urllib.error.URLError: If the page cannot be fetched. Certificates
            are verified with urllib's default settings.
    """
    # Send a browser-like User-Agent header with the request
    req = Request(query, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response as soon as the body has been read
    with urlopen(req) as response:
        webpage = response.read()

    # Creating a BeautifulSoup object of the HTML page for easy extraction of data.
    soup = BeautifulSoup(webpage, 'html.parser')
    profile = {}
    trading = {}
    fundamentals = {}

    # TRADING

    _collect_td_spans(soup, trading, (
        ('PREV_CLOSE-value', 'Previous Close'),
        ('OPEN-value', 'Open'),
    ))

    # Present Value is identified by its CSS class string rather than a data-test cell
    for span in soup.findAll('span', attrs={'class': 'Trsdu(0.3s) Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(b)'}):
        trading['Present Value'] = span.text.strip()

    _collect_td_spans(soup, trading, (
        ('BID-value', 'Bid'),
        ('ASK-value', 'Ask'),
        ('DAYS_RANGE-value', 'Day Range'),
        ('FIFTY_TWO_WK_RANGE-value', 'Fifty-Two Week Range'),
        ('TD_VOLUME-value', 'Day Volume'),
        ('AVERAGE_VOLUME_3MONTH-value', 'Average 3M Volume'),
    ))

    # Present Growth is stored on the top-level profile, not under 'Trading'
    for div in soup.findAll('div', attrs={'class': 'D(ib) Va(t)'}):
        for span in div.findAll('span', recursive=False):
            profile['Present Growth'] = span.text.strip()

    # FUNDAMENTALS

    _collect_td_spans(soup, fundamentals, (
        ('MARKET_CAP-value', 'Market Capitalization'),
        ('BETA_3Y-value', 'Beta 3Y'),
        ('PE_RATIO-value', 'PE Ratio'),
        ('EPS_RATIO-value', 'EPS Ratio'),
    ))

    # Earnings Date
    for td in soup.findAll('td', attrs={'data-test': 'EARNINGS_DATE-value'}):
        # NOTE(review): 'Trading' receives an empty-list placeholder while the
        # scraped value goes to 'Fundamental'. Kept so the returned shape is
        # unchanged -- confirm whether the placeholder is intended.
        trading['Earnings Date'] = []
        for span in td.findAll('span', recursive=False):
            fundamentals['Earnings Date'] = span.text.strip()

    # Dividend and Yield: the cell text is split on whitespace into the
    # dividend and a yield wrapped in "(...%)". Tolerate cells with fewer
    # than two tokens instead of raising IndexError.
    yield_cleanup = str.maketrans('', '', '()%')
    for td in soup.findAll('td', attrs={'data-test': 'DIVIDEND_AND_YIELD-value'}):
        parts = td.text.split()
        if parts:
            fundamentals['Dividend'] = parts[0]
        if len(parts) > 1:
            fundamentals['Dividend Yield'] = parts[1].translate(yield_cleanup)

    # NOTE(review): the key reads 'Ex Dividend Rate' although the cell is
    # EX_DIVIDEND_DATE; kept as-is because callers may depend on the key.
    _collect_td_spans(soup, fundamentals, (
        ('EX_DIVIDEND_DATE-value', 'Ex Dividend Rate'),
        ('ONE_YEAR_TARGET_PRICE-value', 'One Year Target Price'),
    ))

    # Other Details
    profile['Trading'] = trading
    profile['Fundamental'] = fundamentals

    # Return full profile
    return profile