In [1]:
from selenium.webdriver.common.keys import Keys
from selenium import webdriver 
from bs4 import BeautifulSoup
from datetime import datetime
from math import ceil
import urllib.request
import requests
import sqlite3
import os
import time
import re

In [None]:
from selenium import webdriver

# Shared WebDriver handle: stays None until Initialize() assigns a Chrome driver to it.
Instance = None

def Initialize(path_to_driver='/Users/Anthony/Desktop/chromedriver', implicit_wait=5):
    '''Start a Chrome WebDriver and store it in the module-level ``Instance``.

    path_to_driver -- location of the chromedriver executable. The default is
        the machine-specific path this cell was originally written with, so
        calling ``Initialize()`` with no arguments behaves as before.
    implicit_wait -- number of seconds handed to ``implicitly_wait``.

    Returns the newly created driver (the same object stored in ``Instance``).
    '''
    global Instance
    Instance = webdriver.Chrome(path_to_driver)
    Instance.implicitly_wait(implicit_wait)
    return Instance

def CloseDriver():
    '''Quit the shared WebDriver held in ``Instance`` and clear the handle.

    Does nothing when no driver has been stored, so it is safe to call before
    Initialize() and safe to call more than once.
    '''
    global Instance
    if Instance is None:
        return
    try:
        Instance.quit()
    finally:
        # Drop the reference even if quit() fails so a stale driver is never reused.
        Instance = None
 

In [None]:
def setup_webdriver(path_to_driver):
    '''Launch Chrome through the chromedriver found at ``path_to_driver``.

    The same path is first exported in the "webdriver.chrome.driver"
    environment variable; the started driver object is returned.
    '''
    os.environ["webdriver.chrome.driver"] = path_to_driver
    return webdriver.Chrome(path_to_driver)


def create_soup(url):
    ''' create bs4 object

    Downloads ``url`` with urllib and parses the body with the "lxml" parser.
    The HTTP response is closed as soon as its body has been read.
    '''
    with urllib.request.urlopen(url) as response:
        html = response.read()
    return BeautifulSoup(html, "lxml")


def scrape_job_post_links(url):
    ''' returns all links from a query on indeed mobile

    The result is a dict mapping each job url to a (job_title, company) tuple.
    An empty dict is returned when the page contains no <h2> element at all.
    '''

    soup = create_soup(url)

    first_heading = soup.find('h2')
    if first_heading is None:
        # Nothing to anchor the scrape on (NOTE(review): presumably an empty
        # or error page -- confirm against a real response).
        return {}

    # get employers: the company text is taken three nodes after each link that
    # follows the first heading. Only the first 10 links are used, so stop the
    # search there instead of walking every link on the page.
    company = [link.next_element.next_element.next_element
               for link in first_heading.find_all_next('a', limit=10)]
    company_stripped = [name.strip().replace(' -', '') for name in company]

    # get job title and job url
    job_title = []
    job_url = []
    for heading in soup.find_all('h2', {'class': 'jobTitle'}):
        job_title.append(heading.text)
        # Only real <a href=...> tags: iterating the heading's children directly
        # also yields text nodes, which cannot be indexed with ['href'].
        for link in heading.find_all('a', href=True):
            job_url.append('https://www.indeed.com/m/' + link['href'])

    # turn unique identifiers into dictionary of tuples
    return dict(zip(job_url, zip(job_title, company_stripped)))


def iterate_job_pages(driver, int_pages_to_scrape, show_uid_dict=None):
    '''returns a dict of job postings collected across several result pages

    driver -- WebDriver already showing a results page.
    int_pages_to_scrape -- number of pages to walk in the main loop (int or
        numeric string).
    show_uid_dict -- unused; kept so existing callers keep working.

    The returned dict has the shape produced by scrape_job_post_links:
    {job_url: (job_title, company)}.
    '''

    jobs_dict = {}

    if len(driver.current_url.split('=')) > 3:

        # store postings from the 1st page
        jobs_dict.update(scrape_job_post_links(driver.current_url))

        # extend and next page
        driver.find_element_by_xpath('/html/body/p[22]/a').click()

    total_pages = int(int_pages_to_scrape)

    for page_count in range(1, total_pages + 1):
        dates = [i.text for i in driver.find_elements_by_class_name('date')]
        # A page without any date element must not abort the whole scrape.
        last_date = dates[-1] if dates else None
        print('Last date on page', page_count, 'of', int_pages_to_scrape, '==>', last_date)

        # merge this page's postings into the running dict
        jobs_dict.update(scrape_job_post_links(driver.current_url))

        # Only move on when another page is still wanted: clicking after the
        # final page is wasted work and, if the link is missing, would raise
        # and throw away everything collected so far.
        if page_count < total_pages:
            time.sleep(2)
            driver.find_element_by_xpath('/html/body/p[22]/a[2]').click()

    return jobs_dict


def create_db_table(full_path_to_db):
    ''' create database table & names columns

    Creates the ``indeed_jobs`` table in the SQLite file at ``full_path_to_db``.
    Calling it again on a database that already has the table is a no-op
    instead of an error, and the connection is always closed.
    '''

    conn = sqlite3.connect(full_path_to_db)
    try:
        # NOTE(review): the untyped "data" column looks like a leftover; it is
        # kept so that new databases have the same layout as existing ones.
        conn.execute('''CREATE TABLE IF NOT EXISTS indeed_jobs
        (id integer primary key, data,
        url text, 
        company_name text, 
        job_title text,
        date_added timestamp )''')
        conn.commit()
    finally:
        conn.close()
    
def check_db_for_job(uid_dict, path_to_db, new_db=None):
    ''' store unseen jobs in the database and return their urls

    uid_dict -- {job_url: (job_title, company)} as built by scrape_job_post_links.
    path_to_db -- path of the SQLite file holding the ``indeed_jobs`` table.
    new_db -- when truthy, create_db_table() is called first.

    A job counts as already known when a row with the same url, company and
    title exists, whatever its ``date_added``. This way rows stored on earlier
    days, and rows stored without a date by applied_jobs(), are recognised
    as duplicates too.

    Returns the list of urls that were newly inserted.
    '''

    if new_db:
        create_db_table(path_to_db)

    conn = sqlite3.connect(path_to_db)
    try:
        c = conn.cursor()

        print('Check database for duplicates...')
        new_urls = []
        for job_url, details in uid_dict.items():
            job_title, job_company = details[0], details[1]

            c.execute('SELECT 1 FROM indeed_jobs WHERE (url=? AND company_name=? AND job_title=?)',
                      (job_url, job_company, job_title))

            if c.fetchone() is None:
                # Store the date as ISO text explicitly rather than relying on
                # sqlite3's implicit date adapter; the stored value is the same.
                c.execute("insert or ignore into indeed_jobs (url, company_name, job_title, date_added) values (?, ?, ?, ?)",
                    (job_url, job_company, job_title, datetime.now().date().isoformat()))
                conn.commit()

                new_urls.append(job_url)

                print ('\n', 'New entry added', '\n', job_title.encode("utf-8"), job_company.encode("utf-8"), '\n')

            else:
                print ('Entry found')
    finally:
        conn.close()

    return new_urls
    
    
def indeed_scraper(path_to_driver, job_name, job_location, pages_to_search_int, db_path, sleep_int,
                   new_db=False):
    ''' search indeed mobile, store unseen jobs and open each one in a new tab

    path_to_driver -- chromedriver path handed to setup_webdriver().
    job_name, job_location -- text typed into the two search boxes.
    pages_to_search_int -- number of result pages passed to iterate_job_pages().
    db_path -- SQLite file passed to check_db_for_job().
    sleep_int -- seconds to wait after opening each posting (int or numeric string).
    new_db -- forwarded to check_db_for_job().

    After scraping, the user is asked how to page through the new urls:
    "n" opens them all without pausing; a positive number pauses after every
    that-many postings. Any other answer pauses only once, after the last one.
    '''

    driver = setup_webdriver(path_to_driver)

    driver.get('https://indeed.com/m/')
    driver.find_element_by_xpath('/html/body/form/p[1]/input').clear()
    driver.find_element_by_xpath('/html/body/form/p[1]/input').send_keys(job_name)
    driver.find_element_by_xpath('/html/body/form/p[2]/input').clear()
    driver.find_element_by_xpath('/html/body/form/p[2]/input').send_keys(job_location)
    driver.find_element_by_xpath('/html/body/form/p[3]/input').click()

    driver.get(driver.current_url  +  '&sort=date')

    # returns a dictionary
    job_dict = iterate_job_pages(driver, pages_to_search_int)

    # return a list
    unique_urls = check_db_for_job(job_dict, db_path, new_db=new_db)
    len_urls = len(unique_urls)
    pause_seconds = int(sleep_int)
    minutes = len_urls * pause_seconds / 60

    msg = '{} {} {} {} {}'.format('There are',  len_urls,
                              'urls and this is going to take', minutes,
                              'minutes to view. Break it up into chunks by entering number. Enter n to skip.  ')

    ques = input(msg)

    # Pass the url as a script argument rather than splicing it into the
    # JavaScript source, so quotes inside a url cannot break the script.
    open_tab_script = 'window.open(arguments[0], "_blank");'

    if ques == 'n':
        for u in unique_urls:
            driver.execute_script(open_tab_script, u)
            time.sleep(pause_seconds)
        return

    try:
        chunk_size = int(ques)
    except ValueError:
        chunk_size = 0
    if chunk_size <= 0:
        # Unusable answer: fall back to a single pause after the last posting.
        chunk_size = len_urls or 1

    # A plain loop always terminates, including when there are no new urls.
    for count, u in enumerate(unique_urls, start=1):
        driver.execute_script(open_tab_script, u)
        time.sleep(pause_seconds)
        if count % chunk_size == 0:
            print(count, 'of', len_urls)
            input('Press enter to continue')

    input('Press enter to CLOSE CURRENT WEBDRIVER WINDOW')
    # Do what the prompt promises; otherwise the browser process is left running.
    driver.quit()
                    
def applied_jobs(url, full_path_to_db, new_db=False):
    ''' create db of jobs already applied to -- use new_db if 1st time creating db

    url -- job posting page to record.
    full_path_to_db -- SQLite file holding the ``indeed_jobs`` table.
    new_db -- when truthy, create_db_table() is called before inserting.

    The company is read from the node right after the first <br> that follows
    the first <p>; the job title from the first <font size="+1"> element.
    The row is inserted only if no row with the same url, company and title
    exists yet.
    '''

    soup = create_soup(url)
    p = soup.find('p')

    company = p.find_all_next('br', limit=1)[0].next_element
    # The name has to come from ``company``: this function defines no other
    # variable holding it.
    company_stripped = company.strip().replace(' -', '')

    job_name = soup.find('font', {'size': '+1'}).text

    if new_db:
        create_db_table(full_path_to_db)

    conn = sqlite3.connect(full_path_to_db)
    try:
        c = conn.cursor()
        c.execute('SELECT * FROM indeed_jobs WHERE (url=? AND company_name=? AND job_title=?)', (url, company_stripped, job_name))
        entry = c.fetchone()

        if entry is None:
            c.execute("insert or ignore into indeed_jobs (url, company_name, job_title) values (?, ?, ?)",
                (url, company_stripped, job_name))
            conn.commit()
            print ('\n', 'New entry added', '\n', company_stripped.encode("utf-8"), job_name.encode("utf-8"), '\n')

        else:
            print ('Entry found')
    finally:
        conn.close()

def ask_questions():
    ''' interactive entry point: search for jobs or record a job applied to

    Asks whether to "search" or "add", collects the inputs for that mode and
    calls indeed_scraper() or applied_jobs(). Answers are compared after
    trimming whitespace and lower-casing. An unrecognised mode is reported
    instead of being silently ignored.
    '''
    add_or_search = input('Do you want to search for jobs or add job applied to into database? search/add   ').strip().lower()

    if add_or_search == 'search':
        job_name = input('Enter a job name.  ')
        job_location = input('Enter a City, State location.  ')
        pages_to_scrape = input('Enter the number of pages to scrape.  ')
        sleep_int = input('Enter amount of time alloted to read job posting (in seconds).  ')
        db_path = input('Enter FULL PATH to folder you want database (must end with .sqlite).  ')
        driver_path = input('Enter FULL PATH to webdriver.  ')
        new_db = input('Is this a new database? y/n  ').strip().lower() == 'y'

        indeed_scraper(driver_path, job_name, job_location, pages_to_scrape, db_path, sleep_int, new_db=new_db)

    elif add_or_search == 'add':
        url = input('Enter the url you would like to add to database.  ')
        db_path_applied = input('Enter FULL PATH to folder you want database (must end with .sqlite).  ')
        new_db_applied = input('Is this a new database? y/n  ').strip().lower() == 'y'

        applied_jobs(url, db_path_applied, new_db=new_db_applied)

    else:
        print("Unrecognized choice {!r}; expected 'search' or 'add'.".format(add_or_search))

In [4]:
# Notebook-level settings. Both paths are absolute and machine-specific;
# change them before running on another machine.
# driver_path is the chromedriver location handed to setup_webdriver() further down.
driver_path = 'C:\\Users\\Anthony\\Desktop\\chromedriver.exe'
# NOTE(review): db_path, job_name, job_location and pages_to_search are not read
# by any cell visible here -- presumably inputs for indeed_scraper(); confirm.
db_path = 'C:\\Users\\Anthony\\Documents\\DB\\indeed66.sqlite'
job_name = 'python developer'
job_location = 'new york, ny'
pages_to_search = 4

In [204]:
# Start the interactive flow; it prompts on stdin for every input it needs.
ask_questions()

Do you want to search for jobs or add job applied to into database? search/add   search
Enter a job name.  python developer
Enter a City, State location.  new york, ny
Enter the number of pages to scrape.  2
Enter amount of time alloted to read job posting (in seconds).  2
Enter FULL PATH to folder you want database (must end with .sqlite).  C:\\Users\\Anthony\\Documents\\DB\\indeed33.sqlite
Enter FULL PATH to webdriver.  C:\\Users\\Anthony\\Desktop\\chromedriver.exe
Is this a new database? y/n  y
Last date on page 1 of 2 ==> 16 hours ago
Last date on page 2 of 2 ==> 14 hours ago
Check database for duplicates...

 New entry added 
 b'QE-GG2-Technical Engineer' b'SQS Group' 


 New entry added 
 b'Chatbot Developer Intern Fall 2017' b'Fresh Digital Group' 


 New entry added 
 b'Senior Front End Developer' b'Smith & Keller' 


 New entry added 
 b'Sr. Software Engineer - Java Development' b'Tenable' 


 New entry added 
 b'Applications Bioinformatician' b'Oxford Nanopore Technologies' 


In [None]:
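# Scratch paths (chromedriver and database file), kept as comments so this
# cell does not raise a SyntaxError when the notebook is run top to bottom:
# C:\\Users\\Anthony\\Desktop\\chromedriver.exe
# C:\\Users\\Anthony\\Documents\\DB\\indeed35.sqlite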

In [22]:
driver = setup_webdriver(driver_path)
driver.get('https://indeed.com/m/')

# Clear each search box and type the query into it, then submit the form.
for field_xpath, field_text in (('/html/body/form/p[1]/input', 'python'),
                                ('/html/body/form/p[2]/input', 'new york')):
    driver.find_element_by_xpath(field_xpath).clear()
    driver.find_element_by_xpath(field_xpath).send_keys(field_text)
driver.find_element_by_xpath('/html/body/form/p[3]/input').click()

# Reload the result page with the sort-by-date parameter appended.
driver.get(driver.current_url  +  '&sort=date')

# Walk 3 result pages; the result is a dictionary keyed by job url.
job_dict = iterate_job_pages(driver, 3)

Last date on page 1 of 3 ==> 1 hour ago
Last date on page 2 of 3 ==> 1 hour ago
Last date on page 3 of 3 ==> 2 hours ago


In [None]:
# working version                 
def query_job_posting(url, query_list_include, query_list_exclude):
    ''' |QUERIES MUST BE LOWERCASE|
        Checks one job posting against two word lists.

        The text of every <div id="desc"> on the page is lower-cased and cut
        into alphanumeric words. The posting matches when at least one word of
        query_list_include is present and no word of query_list_exclude is.

        Prints and returns ``url`` on a match; returns None otherwise. '''

    soup = create_soup(url)

    # Same tokens as splitting on whitespace and then on every run of
    # non-alphanumeric characters, collected in a set for fast membership tests.
    words = set()
    for desc in soup.find_all('div', {'id': 'desc'}):
        words.update(re.findall('[A-Za-z0-9]+', desc.get_text().lower()))

    has_included = any(w in words for w in query_list_include)
    has_excluded = any(w in words for w in query_list_exclude)

    if has_included and not has_excluded:
        print(url)
        return url
    return None


def iter_query(jobs_dict, query_list_include, query_list_exclude):
    ''' Runs query_job_posting over every url key of ``jobs_dict``.

        Returns the list of urls whose posting matched, in dict order
        (an empty list when nothing matched or the dict is empty). '''

    matches = []
    for u in jobs_dict:
        # Keep the accumulator separate from the per-url result so that
        # earlier matches are not overwritten.
        if query_job_posting(u, query_list_include, query_list_exclude) is not None:
            matches.append(u)
    return matches

In [32]:
# Find all scraped jobs whose description mentions "python" but not "javascript".
# Query words are lowercase because query_job_posting() lower-cases the
# description text before comparing.
query_list_include = ['python']
query_list_exclude = ['javascript']

iter_query(job_dict, query_list_include,  query_list_exclude)

https://www.indeed.com/m/viewjob?jk=15d586a5d4f1d0bd
https://www.indeed.com/m/viewjob?jk=cba50f3ef28f77a6
https://www.indeed.com/m/viewjob?jk=ff4ce51c9cdf1adc
https://www.indeed.com/m/viewjob?jk=e6f6bacb55e74937
https://www.indeed.com/m/viewjob?jk=d746d33b2d2c596d
https://www.indeed.com/m/viewjob?jk=36798c7ca32ed8d3
https://www.indeed.com/m/viewjob?jk=d5ed5321c9c867fd
https://www.indeed.com/m/viewjob?jk=cbeaae72bfb2f860
https://www.indeed.com/m/viewjob?jk=e25f549a3d98b75b
https://www.indeed.com/m/viewjob?jk=ecdfa9fdfee59449
https://www.indeed.com/m/viewjob?jk=be5f20ec5a46f151
https://www.indeed.com/m/viewjob?jk=c275fbd6e7832d67
https://www.indeed.com/m/viewjob?jk=4a96ece7d11fa109
https://www.indeed.com/m/viewjob?jk=dc4c827f92698eff
https://www.indeed.com/m/viewjob?jk=b1d2ee5e934f8e04
https://www.indeed.com/m/viewjob?jk=6f9bdba265edb2cd
https://www.indeed.com/m/viewjob?jk=3aa075762cd73fe9
https://www.indeed.com/m/viewjob?jk=2c7df1ac2499e3b7
https://www.indeed.com/m/viewjob?jk=ae96d2e070

In [12]:
# Record one job posting in the jobs database through applied_jobs().
# ur is the posting url, d the SQLite file (absolute, machine-specific path).
ur = 'https://www.indeed.com/m/viewjob?jk=c41178aee2681cc3'

d = 'C:\\Users\\Anthony\\Documents\\DB\\indeed25.sqlite'

# new_db=True makes applied_jobs() call create_db_table() before inserting.
applied_jobs(ur, d, new_db=True)

'https://www.indeed.com/m/viewjob?jk=622a1204304c05ca'