In [265]:
import collections
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
nltk.download('stopwords')

from selenium import webdriver
import time
import pandas as pd

PATH = 'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(PATH)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Cisco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [266]:
def get_user_info():
    # Prompting user for job title and zip-code
    while True:
        job_title = str(input("What job title are you searching for? (Don't enter digits or symbols)"))
        if all(i.isalpha() or i.isspace() for i in job_title):
            break
        else:
            print("Please do not enter digits or symbols. ")

    # Handling non-integer exceptions
    while True:
        try:
            user_zip_code = int(input("What is your 5 digit zip code? "))
            if len(str(user_zip_code)) == 5:
                break
        except ValueError:
            print("That was not a valid 5 digit zip code. Please try again. ")

    job_title = "front end developer"
    user_zip_code = "33976"
    get_url(job_title, user_zip_code)


In [267]:
def get_url(job_title, user_zip_code):
    # Using user input to create url
    cleaned_url = job_title.replace(" ", "+")
    url = "https://www.indeed.com/jobs?q=" + cleaned_url + "&l=" + str(user_zip_code)
    driver.get(url)

    # Give page time to load to see if pop up actually appears
    time.sleep(3)

    # Check if pop up box appears
    try:
        # Exits "creating a account" popup window
        driver.find_element_by_xpath("//button[@class='popover-x-button-close icl-CloseButton']").click()
    except:
        pass

    print(f"Searching through {url} for results...")

    scrape()


In [268]:
def scrape():
    page_info = []
    main_info = []
    job_website_links = []

    '''
    Alternative form of getting job text
    main_content = driver.find_element_by_id('resultsCol')
    # print(main_content.text)

    time.sleep(5)

    jobs = main_content.find_elements_by_class_name('resultContent')
    time.sleep(5)

    for job in jobs:
        time.sleep(5)
        print(job.text)
        print("-----------------------")
    '''

    main_content = driver.find_element_by_id('mosaic-provider-jobcards')
    time.sleep(10)
    jobs = main_content.find_elements_by_tag_name('a')
    time.sleep(10)

    # Looping through every 'a' tag within the main_content
    for job in jobs:
        # Getting correct href and printing it
        #   there is several 'a' tags within the original 'a' tag.
        correct_link = job.get_attribute('data-mobtk')
        if correct_link is None:
            continue
        else:
            href = job.get_attribute('href')
            time.sleep(10)

            # Put info into list
            page_info.append(job.text + '\n' +href)

    # Looping through page_info to append the main info we want to a new list
    for item in page_info:

        if not item.startswith('new'):
            [position_name, company_name, company_location, *_, link] = item.split('\n')
            main_info.append({'Position Title' : position_name, 'Company Name': company_name, 'Location': company_location, 'Link': link})
        else:
            [_, position_name, company_name, company_location, *_, link] = item.split('\n')
            main_info.append({'Position Title' : position_name, 'Company Name': company_name, 'Location': company_location, 'Link': link})

        job_website_links.append(link)

    create_dataframe(main_info)
    analyzing_sites(job_website_links)

    '''
    Works but takes much longer

    print("-------------------Job Titles--------------------")
    titles = driver.find_elements_by_tag_name("h2")
    for title in titles:
        print(title.text)

    time.sleep(5)

    print("===================Job Locations====================")
    time.sleep(5)

    company_location = driver.find_elements_by_class_name('companyLocation')
    for location in company_location:
        print(location.text)

    time.sleep(5)
    print("===================Company Names=====================")
    time.sleep(5)
    company_name = driver.find_elements_by_class_name('companyName')
    for company in company_name:
        print(company.text)
    '''


In [269]:
def create_dataframe(main_info):
    df = pd.DataFrame(main_info)
    # creating a csv
    file_name = 'first_page_jobs_list.csv'
    df.to_csv(file_name)


In [270]:
def analyzing_sites(job_website_links):

    STOP = stopwords.words("english")

    # writing to a file
    with open('analysis.txt', 'w') as wf:

        # looping through the href list
        for item in job_website_links:
            driver.get(item)

            # getting job description of a site
            job_description_text = driver.find_element_by_id('jobDescriptionText').text
            time.sleep(10)

            wf.write(job_description_text)
            time.sleep(10)

    with open('analysis.txt', 'r') as rf:
        text = rf.read()
        text_tokens = word_tokenize(text)

        # creating bag of words
        word_bag = [word.lower() for word in text_tokens if len(word) > 2 and word not in STOP]

        # creating ngrams of different length
        unigram = ngrams(word_bag, 1)
        bigram = ngrams(word_bag, 2)
        trigram = ngrams(word_bag, 3)
        quadrigram = ngrams(word_bag, 4)

    unigram_freq = collections.Counter(unigram)
    bigram_freq = collections.Counter(bigram)
    trigram_freq = collections.Counter(trigram)
    quadrigram_freq = collections.Counter(quadrigram)


    frequency(unigram_freq, bigram_freq, trigram_freq, quadrigram_freq)




In [271]:
def frequency(unigram_freq, bigram_freq, trigram_freq, quadrigram_freq):


    for word, freq in unigram_freq.most_common(20):
        print(word, "appears", freq, "times")
    print()
    for words, freq in bigram_freq.most_common(15):
        print(words, "appears", freq, "times")
    print()
    for words, freq in trigram_freq.most_common(15):
        print(words, "appears", freq, "times")
    print()
    for words, freq in quadrigram_freq.most_common(15):
        print(words, "appears", freq, "times")



In [272]:
interested = input("Are you interested in finding a job in your area? (yes/no)")

if interested.lower().startswith("y"):
    get_user_info()
else:
    print("Have A Great Day!")


Searching through https://www.indeed.com/jobs?q=front+end+developer&l=33976 for results...
('experience',) 91
('development',) 39
('work',) 33
('team',) 32
('sales',) 30
('web',) 28
('years',) 23
('business',) 21
('required',) 21
('preferred',) 21
('design',) 20
('software',) 19
('projects',) 18
('new',) 17
('time',) 17

('development', 'team') 10
('design', 'development') 9
('preferred', 'experience') 7
('physician', 'group') 7
('best', 'practices') 6
('web', 'developer') 6
('millennium', 'physician') 6
('3-5', 'years') 6
('help', 'grow') 6
('grow', 'business') 6
('independent', 'contractor') 6
('sales', 'experience') 6
('please', 'briefly') 6
('briefly', 'describe') 6
('team', 'members') 5

('millennium', 'physician', 'group') 6
('please', 'briefly', 'describe') 6
('the', 'senior', 'front-end') 4
('senior', 'front-end', 'engineer') 4
('experience', 'bachelor', 'degree') 3
('equal', 'opportunity', 'employer') 3
('job', 'type', 'full-time') 3
('experience', '3-5', 'years') 3
('3-5', 'y