In [3]:
import selenium
from selenium import webdriver
import sqlite3
from selenium.webdriver.common.keys import Keys
import os
import time
import re

import urllib.request
import requests
from bs4 import BeautifulSoup

# Helper Functions

In [4]:
def setup_webdriver(chromedriver=r"C:\Users\Anthony\Desktop\chromedriver.exe"):
    '''Create a Chrome Selenium driver and return it.

    Parameters
    ----------
    chromedriver : str, optional
        Path to the chromedriver executable. The default is the path this
        notebook was originally written against; pass a different path to
        run it on another machine.

    Returns
    -------
    The object returned by ``webdriver.Chrome(chromedriver)``.
    '''
    # set up web driver
    os.environ["webdriver.chrome.driver"] = chromedriver
    return webdriver.Chrome(chromedriver)


def create_soup(url, timeout=30):
    '''Download ``url`` and parse the response body into a bs4 object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up. Without
        a timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    BeautifulSoup
        The response content parsed with the "html5lib" parser.
    '''
    # send a browser-like User-Agent header with the request
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(r.content, "html5lib")


def scrape_job_links(url):
    '''Collect the job posting links from one search results page.

    Parameters
    ----------
    url : str
        Address of the search results page.

    Returns
    -------
    list of str
        The href of the first ``<a>`` in every result row, prefixed with
        'https://www.indeed.com'. Rows with class ' row result' come first,
        followed by rows with class 'lastRow row result'. Rows that have no
        ``<a>`` or no href are skipped.
    '''
    soup = create_soup(url)

    # get hrefs ONLY for non- header and footer job postings;
    # the last row has a different class name, so both are searched
    hrefs = []
    for class_name in (' row result', 'lastRow row result'):
        for div in soup.find_all('div', {'class': class_name}):
            anchor = div.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                hrefs.append(href)

    return ['https://www.indeed.com' + href for href in hrefs]


def next_page_url(url):
    '''Return the address of the last link found in the pagination area.

    The page at ``url`` is downloaded, every ``<a>`` inside elements of class
    'pagination' is collected in document order, and the last one is
    returned with 'https://www.indeed.com' prepended. The notebook relies on
    that last link being the "next page" link.

    Raises IndexError when no pagination link is found.
    '''
    page = create_soup(url)

    # gather the "Results Page" links shown at the bottom of the screen
    pagination_links = [
        'https://www.indeed.com' + anchor['href']
        for block in page.find_all(attrs={'class': 'pagination'})
        for anchor in block.find_all('a')
    ]

    return pagination_links[-1]


def filter_links(links_lst):
    '''Split scraped links into two lists by their URL prefix.

    Links starting with 'https://www.indeed.com/rc' go into the second list
    (treated as redirects to an external company website); every other link
    goes into the first list (treated as an internal indeed job post).

    Returns the tuple ``(indeed_links, non_indeed_links)``; the order of the
    input is preserved inside each list.
    '''
    redirect_prefix = 'https://www.indeed.com/rc'
    hosted, redirected = [], []

    for link in links_lst:
        bucket = redirected if link.startswith(redirect_prefix) else hosted
        bucket.append(link)

    return hosted, redirected


def bottom_scroll(webdriver, xpath='//*[@id="resultsCol"]/div[18]'):
    '''Scroll a search results page down to the element at ``xpath``.

    Intended for search result pages, not individual job postings.

    Parameters
    ----------
    webdriver
        A driver object exposing ``find_element_by_xpath``.
    xpath : str, optional
        XPath of the element to bring into view. The default targets the
        18th ``div`` inside the element with id "resultsCol".

    Returns
    -------
    The element's ``location_once_scrolled_into_view`` value; reading that
    property is what triggers the scroll.
    '''
    element = webdriver.find_element_by_xpath(xpath)
    return element.location_once_scrolled_into_view

# Main Functions

In [5]:
def indeed_scraper(job_name, job_location, int_pages_to_search):
    '''Search indeed.com in a browser and collect job links across result pages.

    Parameters
    ----------
    job_name : str
        Text typed into the "what" search box.
    job_location : str
        Text typed into the "where" search box.
    int_pages_to_search : int
        Maximum number of result pages to scrape (converted with ``int``).

    Returns
    -------
    list of str
        Every link returned by ``scrape_job_links`` for the visited pages,
        in visit order. Fewer pages are scraped when no next-page link can
        be found.

    The browser opened for the search is always closed before returning.
    '''
    # local import: only this error type is treated as "element not on page"
    from selenium.common.exceptions import NoSuchElementException

    driver = setup_webdriver()
    try:
        # set website to scrape and query
        driver.get("https://www.indeed.com/")
        driver.find_element_by_id('what').clear()
        driver.find_element_by_id('what').send_keys(job_name)
        driver.find_element_by_id('where').clear()
        driver.find_element_by_id('where').send_keys(job_location)
        driver.find_element_by_id('fj').click()

        # x out advertisement -- it is not always shown, so its absence is fine
        try:
            driver.find_element_by_id('prime-popover-x').click()
        except NoSuchElementException:
            pass

        # get current url and pass into bs4
        date_soup = create_soup(driver.current_url)

        # sort by date
        date_button = date_soup.find(attrs={'class': 'no-wrap'})
        sort_by_date = 'https://www.indeed.com' + date_button.find('a')['href']

        # pass url to driver so the browser shows the date-sorted results
        driver.get(sort_by_date)

        # scrape em
        pages_to_search = int(int_pages_to_search)
        all_job_links = []
        for page_number in range(pages_to_search):
            # get job links & append to list
            all_job_links.extend(scrape_job_links(driver.current_url))

            # nothing left to do after the final requested page
            if page_number == pages_to_search - 1:
                break

            # scroll to the bottom of screen; purely visual, so a page with
            # too few result rows must not abort the scrape
            try:
                bottom_scroll(driver)
            except NoSuchElementException:
                pass

            # go to next page; stop early (keeping what was collected) when
            # the page has no pagination links
            try:
                url_next = next_page_url(driver.current_url)
            except IndexError:
                break
            driver.get(url_next)

        return all_job_links
    finally:
        driver.quit()

# scrape text from only the indeed postings -- text stored in paragraph and list tags 

def job_description_to_query(url, keywords):
    '''Return ``url`` if the job description contains any keyword, else None.

    The page is downloaded and the text of its ``<li>`` tags and of its
    ``<p>`` tags without a ``style`` attribute is split on whitespace. Each
    word is stripped of every character that is not a letter or digit and
    lower-cased; empty results are dropped. A keyword matches when it is a
    substring of one of those cleaned words.

    Parameters
    ----------
    url : str
        Address of the job posting.
    keywords : iterable of str
        Keywords to look for; compared case-insensitively.

    Returns
    -------
    str or None
        ``url`` on the first match, ``None`` when nothing matches.
    '''
    # create soup
    soup = create_soup(url)

    # paragraphs carrying an inline style attribute are skipped
    texts = [p.get_text() for p in soup.find_all('p') if 'style' not in p.attrs]
    texts += [li.get_text() for li in soup.find_all('li')]

    # the page words are lower-cased below, so the keywords must be too
    wanted = [word.lower() for word in keywords]

    for text in texts:
        for raw_word in text.split():
            # removes anything thats not a letter or number
            cleaned = re.sub('[^A-Za-z0-9]+', '', raw_word).lower()
            if cleaned and any(word in cleaned for word in wanted):
                return url

    return None
        

def view_jobs(jobs_filtered, lst_of_keywords, sleep_time):
    ''' given a list of ONLY INDEED JOB LINKS -- filters by keyword,
        opens each matching job post in a new tab, gives user time to read,
        then opens the next matching post in a new tab in the same window --
        all tabs that remain open are the posts user deemed relevant

        jobs_filtered   -- iterable of job posting urls
        lst_of_keywords -- keywords passed to job_description_to_query
        sleep_time      -- seconds to pause after opening each post
                           (converted with int)

        Posts for which job_description_to_query returns None (no keyword
        found) are skipped. The browser is left open on purpose so the user
        can keep reading the tabs. '''

    driver = setup_webdriver()

    for j in jobs_filtered:
        job_url = job_description_to_query(j, lst_of_keywords)
        if job_url is None:
            # no keyword matched this posting -- nothing to open
            continue

        # pass the url as a script argument instead of splicing it into the
        # JavaScript source, so quotes in a url cannot break the script
        driver.execute_script('window.open(arguments[0], "_blank");', job_url)
        time.sleep(int(sleep_time))

# search indeed for job name and location 

In [6]:
# Live run: opens a browser and searches indeed.com; the third argument is the number of result pages to scrape.
jobs = indeed_scraper('python developer', 'New York, NY', 5)

# Separate the links into Indeed-hosted postings and links that redirect to a company website -- the job descriptions of Indeed-hosted postings are easier to search

In [7]:
# filter_links returns a tuple: jobs_filt[0] holds the indeed links, jobs_filt[1] the non-indeed links.
jobs_filt = filter_links(jobs)

# Take the Indeed-hosted links and check whether any keyword appears in each job description...
# every matching posting is opened in a new tab of the same browser window...
# the program sleeps after each posting to give the user time to read it...
# if the user is not interested in a posting, they close its tab...
# and the next posting opens in a new tab of the same window.
# All tabs left open are the postings the user is interested in.

In [6]:
keywords = ['python', 'training']

# Only the first element of jobs_filt (the indeed links) is viewed; the last argument is the sleep time passed to view_jobs.
view_jobs(jobs_filt[0], keywords, 5)

KeyboardInterrupt: 

In [None]:
# Opens a browser at notebook top level; the functions above each create their own driver instead of using this one.
driver = setup_webdriver()

In [32]:
def create_db_table(full_path_to_db):
    '''Make sure the ``indeed_jobs`` table exists in the SQLite database.

    Parameters
    ----------
    full_path_to_db : str
        Path of the SQLite database file.

    The table has three text columns: ``url``, ``company_name`` and
    ``job_title``. The table is only created when it is missing, so calling
    this repeatedly for the same database file is safe. The connection is
    closed even when the statement fails.
    '''
    conn = sqlite3.connect(full_path_to_db)
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS indeed_jobs
            (url text,
            company_name text,
            job_title text)''')
        conn.commit()
    finally:
        conn.close()

def post_to_db(jobs_filt, full_path_to_db):
    '''Visit each job link and store its url, company and title in SQLite.

    Parameters
    ----------
    jobs_filt : iterable of str
        Job posting urls to visit.
    full_path_to_db : str
        Path of the SQLite database file; ``create_db_table`` is called on
        it first.

    Returns
    -------
    list of tuple
        One ``(job_title, company_name, url)`` tuple per stored posting.

    Each row is committed as soon as it is inserted. Postings whose page
    has no job title or no company element are skipped. The database
    connection and the browser are always closed before returning.
    '''
    create_db_table(full_path_to_db)

    conn = sqlite3.connect(full_path_to_db)
    try:
        driver = setup_webdriver()
        try:
            c = conn.cursor()
            tup_db = []
            for url in jobs_filt:
                driver.get(url)
                # store the address the browser ended up on, which may
                # differ from the requested url
                current_url = driver.current_url

                soup = create_soup(current_url)
                title_tag = soup.find('b', attrs={'class': 'jobtitle'})
                company_tag = soup.find('span', attrs={'class': 'company'})
                if title_tag is None or company_tag is None:
                    # not a page with the expected posting layout
                    continue

                job_name = title_tag.get_text()
                company = company_tag.get_text()

                c.execute("insert into indeed_jobs (url, company_name, job_title) values (?, ?, ?)",
                    (current_url, company, job_name))
                conn.commit()
                tup_db.append((job_name, company, current_url))

            return tup_db
        finally:
            driver.quit()
    finally:
        conn.close()

In [34]:
# NOTE(review): absolute, machine-specific path -- change it before running elsewhere.
db_path = r'C:\Users\Anthony\Documents\db\indeed8.sqlite'

# Only the first element of jobs_filt (the indeed links) is stored.
post_to_db(jobs_filt[0], db_path)

[('Ruby on Rails Engineer',
  'Intelletec',
  'https://www.indeed.com/cmp/Intelletec/jobs/Ruby-Rail-Engineer-7d5e112d9ae5fe2b'),
 ('OOP - Python',
  'IITS',
  'https://www.indeed.com/cmp/IITS/jobs/Oop-754be9f354c37f91'),
 ('Web Architect',
  'Access Staffing',
  'https://www.indeed.com/cmp/Access-Staffing-LLC/jobs/Web-Architect-9c07bc52cbe296f8'),
 ('Scala Developer',
  'Ascon Soft',
  'https://www.indeed.com/cmp/Ascon-Soft/jobs/Scala-Developer-e1f6d27e50cdacd0'),
 ('ETL Architect',
  'Afactory HUB, Inc',
  'https://www.indeed.com/cmp/Afactory-HUB-Inc/jobs/ETL-Architect-abafff36e03d6133'),
 ('DevOps',
  'Alagen',
  'https://www.indeed.com/cmp/Alagen/jobs/Devop-af642c1243af04f4'),
 ('Java Developer',
  'Strivector',
  'https://www.indeed.com/cmp/Strivector/jobs/Java-Developer-3364e453fc19fe80'),
 ('Java Developer with Kafka',
  'Amiga Informatics',
  'https://www.indeed.com/cmp/Amiga-Informatics/jobs/Java-Developer-Kafka-abef2b4680f585c3'),
 ('Lead Systems Analyst/Programmer',
  'The La

In [12]:
# Display the first list returned by filter_links (the indeed links).
jobs_filt[0]

['https://www.indeed.com/company/Intelletec/jobs/Ruby-Rail-Engineer-7d5e112d9ae5fe2b?fccid=c72a0a2653e9c59d',
 'https://www.indeed.com/company/IITS/jobs/Oop-754be9f354c37f91?fccid=8d387370c9d53b88',
 'https://www.indeed.com/company/Access-Staffing-LLC/jobs/Web-Architect-9c07bc52cbe296f8?fccid=28912ef34bc6ef48',
 'https://www.indeed.com/company/Ascon-Soft/jobs/Scala-Developer-e1f6d27e50cdacd0?fccid=cf817a4f78ecb56a',
 'https://www.indeed.com/company/Afactory-HUB-Inc/jobs/ETL-Architect-abafff36e03d6133?fccid=20075c7f8d5406e8',
 'https://www.indeed.com/company/Alagen/jobs/Devop-af642c1243af04f4?fccid=8762a3e16717dc82',
 'https://www.indeed.com/company/Strivector/jobs/Java-Developer-3364e453fc19fe80?fccid=9ed4d98a4f5cad27',
 'https://www.indeed.com/company/Amiga-Informatics/jobs/Java-Developer-Kafka-abef2b4680f585c3?fccid=531057b22748b190',
 'https://www.indeed.com/company/The-Lamont--Doherty-Earth-Observatory-of/jobs/Lead-System-Analyst-Programmer-d0b1a31fedf072b8?fccid=6f303f69f85262e0',

In [29]:
# NOTE(review): create_db_table is defined with a required full_path_to_db parameter; called with no arguments it raises TypeError.
create_db_table()

In [19]:
cd db

C:\Users\Anthony\Documents\db


In [20]:
# Ad-hoc run of the same CREATE TABLE / INSERT statements used by create_db_table and post_to_db,
# against 'indeed4.sqlite' resolved relative to the current working directory.
conn = sqlite3.connect('indeed4.sqlite')
c = conn.cursor()
c.execute('''CREATE TABLE indeed_jobs
    (url text, 
    company_name text, 
    job_title text)''')

# NOTE(review): current_url, company and job_name are not assigned at notebook top level in any
# visible cell, so this insert only works with leftover kernel state -- confirm or remove.
c.execute("insert into indeed_jobs (url, company_name, job_title) values (?, ?, ?)",
            (current_url, company, job_name))
conn.commit()
conn.close()

In [None]:
def scroll(webdriver):
    '''Bring the 18th div inside the "resultsCol" element into view.

    Looks the element up by XPath on the given driver and returns its
    ``location_once_scrolled_into_view`` value. Same lookup as
    ``bottom_scroll``.
    '''
    # scroll to bottom of page
    return webdriver.find_element_by_xpath(
        '//*[@id="resultsCol"]/div[18]'
    ).location_once_scrolled_into_view

In [34]:
def smooth_scrolling():
    '''Open each indeed job posting from one search and scroll through it.

    Runs a fixed search ('python developer' in 'New York, NY') on
    indeed.com, scrapes the job links from the first results page, keeps
    only the indeed links, and then makes two passes over them. For every
    posting visited a counter is incremented and the div at that index
    inside the job content is scrolled into view; when no such div exists
    the scroll is skipped. Each posting is then reloaded and its url
    printed.

    Takes no arguments and returns nothing. The browser is always closed
    at the end.
    '''
    # local import: only this error type is treated as "element not on page"
    from selenium.common.exceptions import NoSuchElementException

    driver = setup_webdriver()
    try:
        # get specifics
        driver.get("https://www.indeed.com/")
        driver.find_element_by_id('what').clear()
        time.sleep(1)
        driver.find_element_by_id('what').send_keys('python developer')
        time.sleep(1)
        driver.find_element_by_id('where').clear()
        time.sleep(1)
        driver.find_element_by_id('where').send_keys('New York, NY')
        time.sleep(1)
        driver.find_element_by_id('fj').click()
        time.sleep(1)

        # x out advertisement -- it is not always shown, so its absence is fine
        try:
            driver.find_element_by_id('prime-popover-x').click()
        except NoSuchElementException:
            pass
        time.sleep(1)

        # scrape jobs from that url
        job_links = scrape_job_links(driver.current_url)
        # filter to only indeed jobs: filter_links returns
        # (indeed_links, non_indeed_links), so take the first list
        indeed_links = filter_links(job_links)[0]

        # start at a good xpath and count up div tags that appear in every page to 'smooth' scroll
        count = 0
        for _ in range(2):
            for j in indeed_links:
                driver.get(j)
                time.sleep(2)
                count += 1

                concat_div = '//*[@id="job-content"]/tbody/tr/td[1]/table/tbody/tr/td/div[2]/div[2]/div[' + str(count) + ']'
                try:
                    element = driver.find_element_by_xpath(concat_div)
                    # reading this property is what performs the scroll
                    element.location_once_scrolled_into_view
                    time.sleep(2)
                except NoSuchElementException:
                    # the posting has fewer divs than the running count
                    pass

                driver.get(j)
                print(j)
    finally:
        driver.quit()

In [35]:
# Live run: opens a browser, performs the fixed search, and prints each posting url as it is visited.
smooth_scrolling()

https://www.indeed.com/company/NYTP/jobs/Python-Engineer-8578a5b36c166821?fccid=ccff84ac3dd19bb9
https://www.indeed.com/company/RAPS-consulting/jobs/Senior-Python-Developer-21d5e8cd39ab722f?fccid=9b7a3860892e2fcd
https://www.indeed.com/company/Kasisto/jobs/Front-End-Software-Engineer-e8ee0083bfbe3831?fccid=521ca7ccc49e32d5
https://www.indeed.com/company/Chase-Dream-LLC/jobs/Software-Engineer-Python-98563184e467ab84?fccid=ed0eadb761b45e00
https://www.indeed.com/company/LIS-Solutions/jobs/Python-Developer-a424fc091c993864?fccid=58aef85eed32eea8
https://www.indeed.com/company/BlindData/jobs/Software-Engineer-010d7c76f5d1fa19?fccid=55962a85574fa94f
https://www.indeed.com/company/NYTP/jobs/Python-Engineer-8578a5b36c166821?fccid=ccff84ac3dd19bb9
https://www.indeed.com/company/RAPS-consulting/jobs/Senior-Python-Developer-21d5e8cd39ab722f?fccid=9b7a3860892e2fcd
https://www.indeed.com/company/Kasisto/jobs/Front-End-Software-Engineer-e8ee0083bfbe3831?fccid=521ca7ccc49e32d5
https://www.indeed.com

In [29]:
keywords = ['python', 'training']

# Single digits from 3 to 9; a hit is reported below as "too many years".
skip_keywoords = ['3', '4', '5', '6', '7', '8', '9']

# NOTE(review): this cell appears to predate the current helper definitions:
#   * job_description_to_query is defined with two required parameters (url, keywords),
#     so the one-argument call below raises TypeError;
#   * that function returns the url or None, not the description text, so the `in job_desc`
#     checks would not inspect the job description;
#   * jobs_filt is assigned from filter_links, which returns a tuple of two lists, so each
#     `url` here would be a list unless jobs_filt has been reassigned -- confirm.
for url in jobs_filt:
    job_desc =job_description_to_query(url)
    if any(word in job_desc for word in keywords):
        print(url)
    elif any(s in job_desc for s in skip_keywoords):
        print('!!!! too many years', url)
        pass
    else:
        print('!!!!!bad', url)

https://www.indeed.com/company/Intelletec/jobs/Ruby-Rail-Engineer-7d5e112d9ae5fe2b?fccid=c72a0a2653e9c59d
https://www.indeed.com/company/Alagen/jobs/Devop-af642c1243af04f4?fccid=8762a3e16717dc82
https://www.indeed.com/company/Ascon-Soft/jobs/Scala-Developer-e1f6d27e50cdacd0?fccid=cf817a4f78ecb56a
https://www.indeed.com/company/Afactory-HUB-Inc/jobs/ETL-Architect-abafff36e03d6133?fccid=20075c7f8d5406e8
https://www.indeed.com/company/Strivector/jobs/Java-Developer-3364e453fc19fe80?fccid=9ed4d98a4f5cad27
https://www.indeed.com/company/Amiga-Informatics/jobs/Java-Developer-Kafka-abef2b4680f585c3?fccid=531057b22748b190
!!!! too many years https://www.indeed.com/company/The-Lamont--Doherty-Earth-Observatory-of/jobs/Lead-System-Analyst-Programmer-d0b1a31fedf072b8?fccid=6f303f69f85262e0
https://www.indeed.com/company/CA--One-Tech-Cloud-Inc./jobs/Lead-Devop-Engineer-8be511513fa2ef69?fccid=3902f8b39489930b
https://www.indeed.com/company/Lorven-Technologies-Inc/jobs/Salesforce-Consultant-Net-961f

In [81]:
# Open the first scraped link in a fresh browser.
driver = setup_webdriver()
driver.get(jobs[0])


# Scroll to the bottom and record the page height reported by the browser.
lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
match = False
# Keep scrolling until two consecutive measurements agree, i.e. the page stopped growing.
while not match:
    lastCount = lenOfPage
    time.sleep(3)
    lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
    match = lastCount == lenOfPage