In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Employer Postings

In [2]:
# Specify path of employer postings:
# CB 7.16 - In future, read this from hosted db. For now, Excel file will suffice
path = "C://Users//Colin Bryan//Documents//Personal//TiO//TiO-Job-Connections//data//TiO - Employer Partner Job Postings.xlsx"

# Create employer postings dataframe and return URLs
def get_employer_postings(path):

    # Read file path into dataframe
    postings_df = pd.read_excel(path)

    # Drop NaNs
    postings_df.dropna(subset = ['Opening URL'], inplace = True)
    
    # Save URLs into a list
    url_list = list(postings_df['Opening URL'])

    return postings_df, url_list

## Indeed

In [18]:
def scrape_job_postings(url_list):
    
    # Create list of indeed_postings onl
    indeed_url_list = []
    
    # Get list of posts from Indeed
    for url in url_list:
        if 'indeed.com' in url:
            indeed_url_list.append(url)
    
    # Create lists to store data
    job_titles = []
    job_locations = []
    job_descriptions = []

    # Get data from each Indeed URL:    
    for url in indeed_url_list:
        
        # Get html data from the URL
        html_data = requests.get(url).text
        
        # Pass into parser
        soup = BeautifulSoup(html_data, 'html.parser')
        
        # Save job titles and locations into list
        try:
            # Get page title
            page_title = soup.title.get_text(strip = True)
            
            # Split page title to get job title and location
            job_title = page_title.split(' - ')[0]            
            location = page_title.split(' - ')[1]
            
            # If there is a hyphen in the job title, resplit. 
            # Hypothesis is that a location will be missing a comma if there is a hypen in the title and it's not remote
            if ',' not in location:
                if 'remote' not in location.lower():
                    # Split page title to get job title and location
                    job_title = page_title.split(' - ')[0] + ' - ' + page_title.split(' - ')[1]         
                    location = page_title.split(' - ')[2]
            
            # Append title and location to lists
            job_titles.append(job_title)
            job_locations.append(location)
            
        except:
            job_titles.append('Could not find title')
            job_locations.append('Could not find location')
        
        # Save job descriptions into list
        try:
            job_descriptions.append(
                soup.select_one("#jobDescriptionText").get_text(strip=True, separator="\n")
            )
        except:
            job_descriptions.append('Could not find description')
            
        
    # Package up everything into dataframe by creating dictionary first
    indeed_dict = {'Title':job_titles, 'Location':job_locations, 'Description':job_descriptions, 'URL':indeed_url_list}

    # Create dataframe
    indeed_df = pd.DataFrame(indeed_dict)
    
    # Drop rows that didn't return results
    indeed_df = indeed_df[indeed_df['Description'] != 'Could not find description'].reset_index(drop=True)
    
    return indeed_df      
    

In [19]:
employer_df, url_list = get_employer_postings(path)

In [20]:
df = scrape_indeed_postings(url_list)

In [21]:
df.head()

Unnamed: 0,Title,Location,Description,URL
0,Controls Engineer,"Souderton, PA 18964",Are you ready to break free of the mundane?\nL...,https://www.indeed.com/viewjob?jk=80d9cb72bee4...
1,MECHANICAL ENGINEER,"Souderton, PA 18964",MECHANICAL ENGINEER\nWe are currently seeking ...,https://www.indeed.com/viewjob?jk=e2579ee28fab...
2,JANITOR FT/PT,"Souderton, PA 18964",Are you ready to join a winning team?\nLooking...,https://www.indeed.com/viewjob?jk=f68b80a593f2...
3,Payroll Specialist/HR Support,"Souderton, PA 18964",Are you ready to join a winning team?\nLooking...,https://www.indeed.com/viewjob?jk=5ffd7d307573...
4,INDUSTRIAL COATING HELPER,"Souderton, PA 18964",INDUSTRIAL COATING HELPER\nAre you ready to jo...,https://www.indeed.com/viewjob?jk=aef5651413f2...


## LinkedIn

In [17]:
def scrape_job_postings(url_list):
    
    # Create list of indeed_postings onl
    linkedin_url_list = []
    
    # Get list of posts from Indeed
    for url in url_list:
        if 'linkedin.com' in url:
            linkedin_url_list.append(url)
    
   # Re-initialize lists to store data while looping
    job_titles = []
    job_locations = []
    job_descriptions = []

    # Get data from each Indeed URL:    
    for url in linkedin_url_list:
        
        # Get html data from the URL
        html_data = requests.get(url).text
        # Pass into parser
        soup = BeautifulSoup(html_data, 'html.parser')
                
        
        print(soup)
        #job_location = soup.find_all('h3', class_='sub-nav-cta__header')
         
        
        #print(job_location)
                

        # Save job titles into list
        try:
            # Get job title
            job_title = soup.find_all('h1', class_='top-card-layout__title font-sans text-lg papabear:text-xl font-bold leading-open text-color-text mb-0 topcard__title')[0].text.strip()
            
            # Append title to list
            job_titles.append(job_title)
            
        except:
            job_titles.append('Could not find title')
        
        # Save job locations into list
        try:
            # Get job location
            job_location = soup.find_all('span', class_="topcard__flavor topcard__flavor--bullet")[0].text.strip()
            
            # Remote processing:
            if soup.find_all('span', class_="jobs-unified-top-card__workplace-type")[0].text.strip() == 'Remote':
                job_location = job_location + " (Remote)"            
            
            # Append location to list
            job_locations.append(job_location)
            
        except:
            job_locations.append('Could not find location')
        
    #     # Save job descriptions into list
    #     try:
    #         job_descriptions.append(
    #             soup.select_one("#jobDescriptionText").get_text(strip=True, separator="\n")
    #         )
    #     except:
    #         job_descriptions.append('Could not find description')
            
        
    # Package up everything into dataframe by creating dictionary first
    linkedin_dict = {'Title':job_titles, 'Location':job_locations, 'URL':linkedin_url_list}
    
    print(job_titles)
    print(job_locations)
    # Create dataframe
    linkedin_df = pd.DataFrame(linkedin_dict)

    # Add "Source" column
    linkedin_df['Source'] = 'LinkedIn'

    return linkedin_df

In [18]:
employer_df, url_list = get_employer_postings(path)

In [19]:
df = scrape_job_postings(url_list)

<!DOCTYPE html>

<html lang="en">
<head>
<meta content="d_jobs_guest_details" name="pageKey"/>
<!-- --> <meta content="en_US" name="locale"/>
<meta data-app-version="2.0.1064" data-browser-id="8e6aed2e-0f9a-4d34-8273-584ce0206e31" data-call-tree-id="AAXkUXGtb9mV4U9dTBXNkg==" data-disable-jsbeacon-pagekey-suffix="false" data-enable-page-view-heartbeat-tracking="" data-multiproduct-name="jobs-guest-frontend" data-page-instance="urn:li:page:d_jobs_guest_details;CLwzMk4fSo6WTmrmAVz1mg==" data-service-name="jobs-guest-frontend" id="config"/>
<link href="https://www.linkedin.com/jobs/view/director-of-product-ecommerce-remote-at-under-armour-3178004598" rel="canonical"/>
<!-- --><!-- -->
<!-- -->
<meta content="https://www.linkedin.com/jobs/view/director-of-product-ecommerce-remote-at-under-armour-3178004598" property="al:android:url"/>
<meta content="com.linkedin.android" property="al:android:package"/>
<meta content="LinkedIn" property="al:android:app_name"/>
<meta content="https://www.link

<!DOCTYPE html>

<html lang="en">
<head>
<meta content="d_jobs_guest_details" name="pageKey"/>
<!-- --> <meta content="en_US" name="locale"/>
<meta data-app-version="2.0.1064" data-browser-id="7c9202c7-bf7d-4efc-8e36-da8e1e5a6e47" data-call-tree-id="AAXkUXG08fEkQVpXJo+PTA==" data-disable-jsbeacon-pagekey-suffix="false" data-enable-page-view-heartbeat-tracking="" data-multiproduct-name="jobs-guest-frontend" data-page-instance="urn:li:page:d_jobs_guest_details;n221GN33RP2MP91GcvNqKg==" data-service-name="jobs-guest-frontend" id="config"/>
<link href="https://www.linkedin.com/jobs/view/director-recordkeeping-product-architect-at-fidelity-investments-2792829814" rel="canonical"/>
<!-- --><!-- -->
<!-- -->
<meta content="https://www.linkedin.com/jobs/view/director-recordkeeping-product-architect-at-fidelity-investments-2792829814" property="al:android:url"/>
<meta content="com.linkedin.android" property="al:android:package"/>
<meta content="LinkedIn" property="al:android:app_name"/>
<meta c

In [6]:
df

Unnamed: 0,Title,Location,URL,Source
0,"Director of Product, Ecommerce (REMOTE)",Could not find location,https://www.linkedin.com/jobs/view/3178004598/...,LinkedIn
1,"Director, Recordkeeping Product Architect",Could not find location,https://www.linkedin.com/jobs/view/2792829814/...,LinkedIn


# API

In [68]:
def skills_API(skill):
    # Use API Layer's skills database
    url = "https://api.apilayer.com/skills?q={}".format(skill)

    # CB 7.16 - Hide API key in a secrets file before sharing?
    headers= {
    "apikey": ""
    }

    # Get response and result
    response = requests.request('GET', url, headers=headers)
    result = response.json()
    
    if response.status_code == 200:
        return result
    raise Exception(result.get('message'))

In [69]:
response = skills_API('java')

In [70]:
response

['Java',
 'Java2D',
 'JavaFX',
 'Java3D',
 'Java 6',
 'Java 5',
 'JavaSE',
 'JavaCC',
 'JavaOS',
 'Java 1']

In [21]:
5 % 3

2