In [4]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Employer Postings

In [5]:
# Specify path of employer postings:
# CB 7.16 - In future, read this from hosted db. For now, Excel file will suffice
# NOTE(review): this is an absolute path into one user's Documents folder, so the
# notebook only runs on a machine with this exact folder layout. Consider building
# it from a configurable data directory instead.
path = "C://Users//Colin Bryan//Documents//Personal//TiO//TiO-Job-Connections//data//TiO - Employer Partner Job Postings.xlsx"

# Load the employer postings workbook and collect the posting URLs it lists
def get_employer_postings(path):
    """Read the employer postings Excel file and extract its posting URLs.

    Parameters
    ----------
    path : str
        Location of the Excel workbook. It must contain an 'Opening URL' column.

    Returns
    -------
    tuple
        ``(postings_df, url_list)``: the postings with rows lacking an
        'Opening URL' removed, and the remaining URLs as a list in row order.
    """
    url_column = 'Opening URL'

    # Load the workbook and keep only the rows that actually carry a URL
    postings_df = pd.read_excel(path).dropna(subset=[url_column])

    # Collect the surviving URLs, preserving row order
    url_list = list(postings_df[url_column])

    return postings_df, url_list

def _split_indeed_page_title(page_title):
    """Split an Indeed page title into ``(job_title, location)``, or None if it does not fit."""
    parts = page_title.split(' - ')
    if len(parts) < 2:
        return None

    job_title, location = parts[0], parts[1]

    # If there is a hyphen in the job title, the location sits one slot further right.
    # Hypothesis: a real location contains a comma ("City, ST") unless it is remote,
    # so a second segment with neither is treated as the tail of the job title.
    if ',' not in location and 'remote' not in location.lower():
        if len(parts) < 3:
            return None
        job_title = parts[0] + ' - ' + parts[1]
        location = parts[2]

    return job_title, location


def scrape_indeed_postings(url_list, timeout=10):
    """Scrape title, location and description from every Indeed URL in ``url_list``.

    Parameters
    ----------
    url_list : iterable
        Candidate posting URLs. Entries that are not strings, or that do not
        contain 'indeed.com', are ignored.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout, so a stalled
        server cannot hang the whole scrape.

    Returns
    -------
    pandas.DataFrame
        Columns 'Title', 'Location', 'Description' and 'URL', one row per
        posting whose description was found, in input order. Postings whose
        page could not be fetched, or that have no ``#jobDescriptionText``
        element, are left out. When the page title cannot be split, 'Title'
        and 'Location' hold the 'Could not find ...' placeholders.
    """
    title_missing = 'Could not find title'
    location_missing = 'Could not find location'

    # Create list of Indeed postings only; a non-string entry cannot be a URL
    indeed_url_list = [
        url for url in url_list
        if isinstance(url, str) and 'indeed.com' in url
    ]

    records = []

    # Get data from each Indeed URL:
    for url in indeed_url_list:

        # Get html data from the URL
        try:
            html_data = requests.get(url, timeout=timeout).text
        except requests.RequestException:
            # No page means no description; skip it like any other posting
            # that returned no result instead of aborting the whole run.
            continue

        # Pass into parser
        soup = BeautifulSoup(html_data, 'html.parser')

        # Rows without a description are dropped from the result, so stop early
        description_tag = soup.select_one("#jobDescriptionText")
        if description_tag is None:
            continue
        description = description_tag.get_text(strip=True, separator="\n")

        # Split page title to get job title and location
        title_tag = soup.title
        parsed_title = None
        if title_tag is not None:
            parsed_title = _split_indeed_page_title(title_tag.get_text(strip=True))

        if parsed_title is None:
            job_title, location = title_missing, location_missing
        else:
            job_title, location = parsed_title

        records.append({
            'Title': job_title,
            'Location': location,
            'Description': description,
            'URL': url,
        })

    # Package everything into a dataframe; explicit columns keep the schema
    # stable even when nothing was scraped
    return pd.DataFrame(records, columns=['Title', 'Location', 'Description', 'URL'])
    

In [6]:
# Load the postings workbook; keep both the cleaned frame and its list of URLs
employer_df, url_list = get_employer_postings(path)

In [7]:
# Scrape the Indeed links from the workbook (one HTTP request per Indeed URL)
df = scrape_indeed_postings(url_list)

In [8]:
# Display the scraped postings
df

Unnamed: 0,Title,Location,Description,URL
0,Controls Engineer,"Souderton, PA 18964",Are you ready to break free of the mundane?\nL...,https://www.indeed.com/viewjob?jk=80d9cb72bee4...
1,MECHANICAL ENGINEER,"Souderton, PA 18964",MECHANICAL ENGINEER\nWe are currently seeking ...,https://www.indeed.com/viewjob?jk=e2579ee28fab...
2,JANITOR FT/PT,"Souderton, PA 18964",Are you ready to join a winning team?\nLooking...,https://www.indeed.com/viewjob?jk=f68b80a593f2...
3,Payroll Specialist/HR Support,"Souderton, PA 18964",Are you ready to join a winning team?\nLooking...,https://www.indeed.com/viewjob?jk=5ffd7d307573...
4,INDUSTRIAL COATING HELPER,"Souderton, PA 18964",INDUSTRIAL COATING HELPER\nAre you ready to jo...,https://www.indeed.com/viewjob?jk=aef5651413f2...
5,Production Manager,"Souderton, PA 18964",Are you ready to break free of the mundane?\nL...,https://www.indeed.com/viewjob?jk=0a7a4f606c7a...
6,WELDER,"Souderton, PA 18964",WELDERS 1ST AND 2ND SHIFT\nAre you ready to jo...,https://www.indeed.com/viewjob?jk=0d1089220c54...
7,Junior Buyer/Purchaser,"Souderton, PA 18964",Are you ready to join a winning team?\nLooking...,https://www.indeed.com/viewjob?jk=b9ff8f377567...
8,MILLWRIGHT MECHANIC,"Souderton, PA 18964",MILLWRIGHT MECHANIC\nAre you ready to break fr...,https://www.indeed.com/viewjob?jk=8ba9de851371...
9,Assembler - 1st Shift,"Malvern, PA 19355",*We are currently offering a $1000 sign-on bon...,https://www.indeed.com/viewjob?jk=b0d3146de697...


In [10]:
# Keep only the postings whose location mentions Concordville.
# NOTE(review): the frame displayed earlier in this notebook shows no Concordville
# rows, so that output is probably stale or truncated - confirm with Restart & Run All.
sub_df = df[df['Location'].str.contains('Concordville')]

In [13]:
# List the distinct location strings among the Concordville postings
sub_df.Location.unique()

array(['Concordville, PA 19331', 'Concordville, PA'], dtype=object)

# API

In [68]:
def skills_API(skill, api_key=None, timeout=10):
    """Look up skills matching ``skill`` in API Layer's skills database.

    Parameters
    ----------
    skill : str
        Search text. It is sent as the ``q`` query parameter and URL-encoded
        by ``requests``, so characters such as ``&``, ``#`` or ``+`` are safe.
    api_key : str, optional
        API Layer key. When omitted, the ``APILAYER_API_KEY`` environment
        variable is used (an empty string if it is unset), which keeps the
        key out of the notebook.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout.

    Returns
    -------
    object
        The decoded JSON body of a successful (HTTP 200) response.

    Raises
    ------
    Exception
        For any non-200 response. The text is the API's ``message`` field
        when the body provides one, otherwise a note of the HTTP status.
    """
    import os

    # Use API Layer's skills database
    url = "https://api.apilayer.com/skills"

    # Read the key from the environment rather than hard-coding it in the notebook
    if api_key is None:
        api_key = os.environ.get("APILAYER_API_KEY", "")

    headers = {"apikey": api_key}

    # Get response; passing the query via params lets requests encode it
    response = requests.get(url, headers=headers, params={"q": skill}, timeout=timeout)

    if response.status_code == 200:
        return response.json()

    # Error bodies are not guaranteed to be a JSON object, so extract the
    # message defensively instead of letting a decode error mask the failure
    try:
        message = response.json().get('message')
    except (ValueError, AttributeError):
        message = None

    raise Exception(
        message
        or "Skills API request failed with HTTP status {}".format(response.status_code)
    )

In [69]:
# Query the skills API for 'java'. Despite its name, `response` holds the
# value returned by skills_API (the parsed JSON), not an HTTP response object.
response = skills_API('java')

In [70]:
# Display the skills returned for 'java'
response

['Java',
 'Java2D',
 'JavaFX',
 'Java3D',
 'Java 6',
 'Java 5',
 'JavaSE',
 'JavaCC',
 'JavaOS',
 'Java 1']

In [21]:
# NOTE(review): scratch arithmetic that nothing else in the notebook uses, and its
# execution count (21) is out of sequence - candidate for removal.
5 % 3

2