In [123]:
import requests
#!pip install selenium
from selenium import webdriver
from bs4 import BeautifulSoup
#!pip install webdriver-manager
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

# Start the Chrome browser session that every scraping helper below shares.
# ChromeDriverManager().install() supplies the chromedriver path (the cell output
# shows it being found in the local cache).
driver = webdriver.Chrome(ChromeDriverManager().install())


Checking for mac64 chromedriver:76.0.3809.126 in cache
Driver found in /Users/chi/.wdm/chromedriver/76.0.3809.126/mac64/chromedriver


# Scraping data off of Indeed for Data Scientist and Machine Learning Engineer job posts 

* I decided to include not only Data Scientist job posts but also similar job titles such as machine learning engineer and machine learning developer. Many jobs out there are similar to Data Scientist, and including them will definitely help in collecting more data for better performance when modelling. The quality of the data that goes into the model will strongly affect the quality of the modelling results. 

* I will be only scraping from Indeed

In [87]:
# Let's define a use_soup function so we can reuse it every time a url needs to be read
def use_soup(url):
    """Open ``url`` in the shared Selenium ``driver`` and parse the loaded page.

    The HTML is taken from ``driver.page_source`` after the navigation call
    returns and is parsed with BeautifulSoup's 'html.parser'.

    Returns the resulting BeautifulSoup object.
    """
    driver.get(url)
    return BeautifulSoup(driver.page_source, 'html.parser')
    
# Search-results URL template for the Indeed website: the first placeholder is
# the query (q) and the second is the location (l).
basic_url = 'https://www.indeed.ca/jobs?q={}&l={}'

# Search queries, already URL-encoded ('+' stands in for a space).
machine_learning_engineer = 'machine+learning+engineer'
data_scientist = 'data+scientist'

# Search locations as URL-encoded "City, Province" strings ('%2C' is the comma).
# The location of a search is easy to change: pass a different one of these.
calgary = 'Calgary%2C+AB'
montreal = 'Montreal%2C+QC'
ottawa = 'Ottawa%2C+ON'
toronto = 'Toronto%2C+ON'
vancouver = 'Vancouver%2C+BC'



# extract_post_links takes the position to search for (such as data_scientist),
# a location (such as vancouver, toronto, etc.), and num_pages, the number of result pages to go through for that search

def extract_post_links(position, location, num_pages, posts_per_page=20):
    """Collect job-post URLs and titles from paginated search results.

    Builds the search URL from the module-level ``basic_url`` template, loads
    each results page through ``use_soup`` and reads the link inside every
    ``<div class="title">`` element.

    Parameters
    ----------
    position : str
        URL-encoded search query, e.g. ``'data+scientist'``.
    location : str
        URL-encoded search location, e.g. ``'Vancouver%2C+BC'``.
    num_pages : int
        Number of result pages to visit.
    posts_per_page : int, optional
        Step between the ``start`` offsets of consecutive pages.  Defaults to
        20, the page size the original author assumed.

    Returns
    -------
    tuple of (list, list)
        ``(url_posts, titles)``: the full post URLs and the ``title``
        attribute of each link.  The two lists always have the same length and
        order.  A title is ``None`` when the link has no ``title`` attribute.
    """
    url_posts = []
    titles = []
    search_url = basic_url.format(position, location)

    for page_index in range(num_pages):
        offset = page_index * posts_per_page
        # The first page is requested without '&start='; the original author
        # noted that including it there raised an error.
        if offset == 0:
            url = search_url
        else:
            url = search_url + '&start=' + str(offset)

        soup = use_soup(url)
        # The links are all stored under <div class="title">.
        for link in soup.find_all('div', {'class': 'title'}):
            anchor = link.a
            partial = anchor.get('href') if anchor is not None else None
            if not partial:
                # No usable link in this entry: skip it rather than crash, and
                # skip the title too so both lists stay aligned.
                continue
            titles.append(anchor.get('title'))
            # href is site-relative, so prefix the host to get a complete url.
            url_posts.append('https://ca.indeed.com' + partial)

    return (url_posts, titles)


In [124]:
# the extract_text function visits each url we obtained from the extract_post_links function
# and extracts the job description from that page

def extract_text(url_posts):
    """Load every job-post page and return its description text.

    Parameters
    ----------
    url_posts : list of str
        Full job-post URLs, e.g. the first list returned by
        ``extract_post_links``.

    Returns
    -------
    list
        One entry per URL, in the same order: the text of the page's
        ``<div id="jobDescriptionText">`` element, or ``None`` when the page
        has no such element (for example a posting that has since been taken
        down).
    """
    extracted = []
    for link in url_posts:
        soup = use_soup(link)
        description = soup.find('div', {'id': 'jobDescriptionText'})
        if description is None:
            # find() returns None when nothing matches.  Record the gap instead
            # of raising, so one dead link does not throw away the whole run
            # and the result stays aligned with url_posts.
            extracted.append(None)
        else:
            extracted.append(description.get_text())
    return extracted

In [89]:
# I have already manually checked all the urls in the different locations 
# since they all have different number of pages 
# this is subject to change since employers can take down job posts

# ds for data scientist, and ml for machine learning engineer

# Collect Vancouver Links
ds_van, ds_van_titles = extract_post_links(data_scientist, vancouver, 12)
ml_van, ml_van_titles = extract_post_links(machine_learning_engineer, vancouver, 8)

# Collect Toronto Links
# The data scientist search must use `toronto`: it previously passed `vancouver`,
# which made the "Toronto" rows a second copy of the Vancouver results.
# NOTE(review): the page count 12 is kept from the original call - re-check it
# against the Toronto search.
ds_toronto, ds_toronto_titles = extract_post_links(data_scientist, toronto, 12)
ml_toronto, ml_toronto_titles = extract_post_links(machine_learning_engineer, toronto, 10)

# Collect Ottawa Links
ds_ottawa, ds_ottawa_titles = extract_post_links(data_scientist, ottawa, 3)

# Collect Montreal Links
ds_montreal, ds_montreal_titles = extract_post_links(data_scientist, montreal, 9)
ml_montreal, ml_montreal_titles = extract_post_links(machine_learning_engineer, montreal, 8)

# Collect Calgary Links
ds_Calgary, ds_Calgary_titles = extract_post_links(data_scientist, calgary, 4)

In [105]:
# Load each Vancouver posting collected above and keep its description text
# (one browser page load per url).
ds_van_text = extract_text(ds_van)
ml_van_text = extract_text(ml_van)

In [125]:
# Load each posting in the ds_toronto / ml_toronto link lists and keep its
# description text (one browser page load per url).
ds_toronto_text = extract_text(ds_toronto)
ml_toronto_text = extract_text(ml_toronto)

In [126]:
# Load each Ottawa and Calgary posting and keep its description text.
# Note the casing: the Calgary link list is ds_Calgary, the text list is ds_calgary_text.
ds_ottawa_text = extract_text(ds_ottawa)
ds_calgary_text = extract_text(ds_Calgary)

In [127]:
# NOTE: "montral" in these variable names is a misspelling of "montreal". It is only
# a variable name, and later cells refer to these exact names.
# Extracting the data takes some time, so this cell is not re-run just to rename them.
ds_montral_text = extract_text(ds_montreal)
ml_montral_text = extract_text(ml_montreal)

# Let's quickly check that the number of rows is the same 
* A job post may have been taken down while I was doing this, in which case its url may no longer work
* This check would usually be unnecessary if more thorough try/except handling were implemented. 

In [131]:
# For each search, print the number of links, titles and descriptions on one
# line; the three numbers on a line should be equal.
_scrape_results = [
    (ds_van, ds_van_titles, ds_van_text),
    (ml_van, ml_van_titles, ml_van_text),
    (ds_toronto, ds_toronto_titles, ds_toronto_text),
    (ml_toronto, ml_toronto_titles, ml_toronto_text),
    (ds_ottawa, ds_ottawa_titles, ds_ottawa_text),
    (ds_montreal, ds_montreal_titles, ds_montral_text),
    (ml_montreal, ml_montreal_titles, ml_montral_text),
    (ds_Calgary, ds_Calgary_titles, ds_calgary_text),
]
for _links, _titles, _texts in _scrape_results:
    print(len(_links), len(_titles), len(_texts))


302 302 302
194 194 194
305 305 305
252 252 252
53 53 53
221 221 221
202 202 202
86 86 86


# Create dataframe and save to csv file

In [153]:
def _postings_frame(titles, descriptions, location, urls):
    """Build the table of postings for one search in one city.

    Parameters
    ----------
    titles, descriptions, urls : list
        Parallel lists with one entry per posting.  pandas raises ValueError
        if their lengths differ.
    location : str
        City name written on every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'Job Title', 'Description', 'Location', 'url', in that order.
    """
    return pd.DataFrame(
        {'Job Title': titles,
         'Description': descriptions,
         # Sized from the data rather than a count hard-coded from one run,
         # so the cell still works when a re-scrape returns a different
         # number of postings.
         'Location': [location] * len(urls),
         'url': urls
        })


vancouver_list1 = _postings_frame(ds_van_titles, ds_van_text, 'Vancouver', ds_van)
vancouver_list2 = _postings_frame(ml_van_titles, ml_van_text, 'Vancouver', ml_van)

toronto_list1 = _postings_frame(ds_toronto_titles, ds_toronto_text, 'Toronto', ds_toronto)
toronto_list2 = _postings_frame(ml_toronto_titles, ml_toronto_text, 'Toronto', ml_toronto)

ottawa_list = _postings_frame(ds_ottawa_titles, ds_ottawa_text, 'Ottawa', ds_ottawa)

montreal_list1 = _postings_frame(ds_montreal_titles, ds_montral_text, 'Montreal', ds_montreal)
montreal_list2 = _postings_frame(ml_montreal_titles, ml_montral_text, 'Montreal', ml_montreal)

calgary_list = _postings_frame(ds_Calgary_titles, ds_calgary_text, 'Calgary', ds_Calgary)


In [154]:
# Every per-city, per-search table, listed in the order its rows should appear
# in the combined data.
frames = [
    vancouver_list1,
    vancouver_list2,
    toronto_list1,
    toronto_list2,
    ottawa_list,
    montreal_list1,
    montreal_list2,
    calgary_list,
]

In [155]:
# Stack all tables into one and number the rows 0..n-1, discarding the
# per-table indexes.
df = pd.concat(frames, ignore_index=True)

In [156]:
# Save the combined postings to job_postings.csv (a relative path);
# index = False leaves the row index out of the file.
df.to_csv('job_postings.csv', index = False)