# Objective
Speed up my job search by automating searching through the websites I frequently use.

## Next steps
- Add more websites
- Filter jobs based on currently active jobs (date filter)
- Make filter criteria configurable externally
- Make script executable
- Send update to email
- Schedule script on server

# Preparation

In [1]:
import numpy as np
import pandas as pd
import os
import datetime
import requests
import pprint
import re
from bs4 import BeautifulSoup

In [2]:
# Make root folder the current working directory
os.chdir('..')
# Folders
folder_settings = './data/'
folder_raw = './data/raw/'
folder_interim = './data/interim/'
folder_clean = './data/processed/'

In [3]:
# Create empty arrays per datapoint
job_titles = []
job_links = []
job_locations = []
job_companies = []
job_exp_dates = []
job_source = []

In [4]:
date_today = datetime.date.today().strftime("%Y-%m-%d")

# Scrape websites

## Nextbillion

In [5]:
urls = ['https://nextbillion.net/jobs/?jobs-page=1', 'https://nextbillion.net/jobs/?jobs-page=2',  'https://nextbillion.net/jobs/?jobs-page=2']

In [6]:
# Use loop to extract required data per 
for url in urls:
    soup_page = requests.get(url)
    soup_page = BeautifulSoup(soup_page.content, 'html.parser')
    # grabs each job
    jobs = soup_page.findAll("li", {"class":"clearfix"})
    for job in jobs:
        job_title = job.h3.text
        job_titles.append(job_title)
        job_location = job.findAll("dd")[1].text
        job_locations.append(job_location)
        job_company = job.findAll("dd")[0].text
        job_companies.append(job_company)
        job_exp_date = job.findAll("dl")[3].dd.text
        job_exp_dates.append(job_exp_date)
        job_link = job.h3.a.get('href')
        job_links.append(job_link)
        job_source.append('nextbillion')

## Findevgateway

In [7]:
urls = ['https://www.findevgateway.org/jobs?job_type=All&regions=&countries=&remote=All&f%5B0%5D=job_remote%3A4111&f%5B1%5D=job_type%3A3906']

In [8]:
# Use loop to extract required data per website
for url in urls:
    soup_page = requests.get(url)
    soup_page = BeautifulSoup(soup_page.content, 'html.parser')
    # select each available job
    jobs = soup_page.findAll("div", {"class":"listing__text"})
    # extract data per job in arrays
    for job in jobs:
        try:
            job_title = job.find("a", {"hreflang":"en"}).text
        except IndexError:
            job_title = 'Not available'
        job_titles.append(job_title)
        try:
            job_location = job.findAll("div", {"class":"postmeta"})[1].text.split(':')[1]
        except IndexError:
            job_location = 'Not available'
        job_locations.append(job_location)
        try:
            job_company = job.find("div", {"class":"postmeta"}).text[10:]
        except IndexError:
            job_company = 'Not available'
        job_companies.append(job_company)
        try:
            job_exp_date = job.find("time", {"class":"datetime"}).text
        except IndexError:
            job_exp_date = 'Not available'
        job_exp_dates.append(job_exp_date)
        try:
            job_link = 'https://www.findevgateway.org' + job.h3.a.get('href')
        except IndexError:
            job_post_date = 'Not available'
        job_links.append(job_link)
        job_source.append('findevgateway')

# Create dataframe

In [9]:
# create dataframe based on extracted data
jobs_df = pd.DataFrame({'job_title': job_titles,
                        'company': job_companies,
                        'location': job_locations,
                        'expiration_date': job_exp_dates,
                        'source': job_source,
                        'weblink': job_links})

In [10]:
# Count unfiltered jobs
jobs_df['job_title'].count()

32

# Filter relevant jobs

In [11]:
jobs_filtered_df = jobs_df.copy()

In [12]:
# make job titles lower case to prevent 
jobs_filtered_df['job_title'] = jobs_filtered_df['job_title'].str.lower()

In [13]:
filter_criteria = 'manager|data science|data scientist|analytics'

In [14]:
jobs_filtered_df = jobs_filtered_df[jobs_filtered_df['job_title'].str.contains(filter_criteria)]

# Export

In [16]:
# Export complete dataframe
filename = date_today + ' - Job search social data scientist - UNFILTERED.csv'
jobs_df.to_csv(folder_interim + filename)

In [17]:
# Export filtered data
filename = date_today + ' - Job search social data scientist - FILTERED.csv'
jobs_filtered_df.to_csv(folder_clean + filename)

In [18]:
# count jobs after filtering
jobs_filtered_df['job_title'].count()

9

In [19]:
jobs_filtered_df

Unnamed: 0,job_title,company,location,expiration_date,source,weblink
7,senior engagement manager,Sattva Consulting,Mumbai,05/27/2021,nextbillion,https://nextbillion.net/jobs/senior-engagement...
9,product manager,Taptap,"London, Paris, or New York",05/02/2021,nextbillion,https://nextbillion.net/jobs/product-manager/
13,community manager,MovingWorlds,Global / Virtual,05/15/2021,nextbillion,https://nextbillion.net/jobs/community-manager/
14,grants & impact manager,Pawame Kenya Ltd,"Nairobi, Kenya",04/25/2021,nextbillion,https://nextbillion.net/jobs/grants-impact-man...
17,product manager,Taptap,"London, Paris, or New York",05/02/2021,nextbillion,https://nextbillion.net/jobs/product-manager/
21,community manager,MovingWorlds,Global / Virtual,05/15/2021,nextbillion,https://nextbillion.net/jobs/community-manager/
22,grants & impact manager,Pawame Kenya Ltd,"Nairobi, Kenya",04/25/2021,nextbillion,https://nextbillion.net/jobs/grants-impact-man...
24,data scientist,Angaza,"San Francisco, CA",31 Jul 2021,findevgateway,https://www.findevgateway.org/jobs-internships...
26,manager,Instiglio,"Nairobi, Kenya",30 Jun 2021,findevgateway,https://www.findevgateway.org/jobs-internships...
