# Download Webpages

In [17]:
import os
import urllib.request
import uuid

import bs4
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Generate URL of 104 Website

In [2]:
def get_url_job_list(num_pages=50):
    """
    Build the search-result page URLs of the 104 website for the keyword
    "machine learning".

    Pages are numbered from 1 up to and including ``num_pages``; only the
    ``page`` query parameter differs between the generated URLs, e.g.
        ...&asc=0&page=1&mode=s&jobsource=2018indexpoc
        ...&asc=0&page=2&mode=s&jobsource=2018indexpoc

    Args:
        num_pages (int): number of result pages to generate URLs for.

    Return: list of URL strings, ordered by page number (empty when
        ``num_pages`` is smaller than 1).
    """
    # The single '{}' placeholder receives the page number.
    page_template = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=machine%20learning&order=1&asc=0&page={}&mode=s&jobsource=2018indexpoc'

    return [page_template.format(page_no) for page_no in range(1, num_pages + 1)]


# Build the search-result URLs for the default number of pages (num_pages=50).
url_job_list = get_url_job_list()

In [3]:
# Preview the first five generated URLs.
url_job_list[:5]

['https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=machine%20learning&order=1&asc=0&page=1&mode=s&jobsource=2018indexpoc',
 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=machine%20learning&order=1&asc=0&page=2&mode=s&jobsource=2018indexpoc',
 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=machine%20learning&order=1&asc=0&page=3&mode=s&jobsource=2018indexpoc',
 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=machine%20learning&order=1&asc=0&page=4&mode=s&jobsource=2018indexpoc',
 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=machine%20learning&order=1&asc=0&page=5&mode=s&jobsource=2018indexpoc']

# Declare Parser Class

In [11]:
class ListParser():
    """
    Fetch one search-result page and iterate over the job entries on it.

    ``content`` holds the parsed page; ``num_list`` counts the entries
    yielded by the most recent ``get_lists()`` iteration.
    """

    def __init__(self, url):
        """
        Download ``url`` and parse it with BeautifulSoup's "html.parser".

        Args:
            url (str): address of the search-result page to fetch.

        Raises:
            requests.RequestException: on connection failure, timeout
                (10 seconds) or an HTTP error status.
        """
        response = requests.get(url, timeout=10)
        # Without this check an HTTP error page would be parsed silently and
        # the parser would simply report zero job entries.
        response.raise_for_status()

        self.content = BeautifulSoup(response.content, "html.parser")
        self.num_list = 0

    def get_lists(self):
        """
        Yield every non-advertisement job entry of the page.

        ``num_list`` is reset at the start of each iteration and incremented
        just before each entry is yielded, so while iterating it equals the
        number of entries produced so far.

        Yields:
            bs4 element: an <article> tag describing one job entry.
        """
        self.num_list = 0

        # A list passed for 'class' matches tags carrying any of the listed classes.
        for element in self.content.find_all('article', attrs={'class': ['job-mode', 'js-job-item']}):
            # Entries marked with 'b-block--ad' are skipped and not counted.
            if 'b-block--ad' in element.attrs['class']:
                continue

            self.num_list += 1
            yield element
        
        
class EntryParser():
    """
    Extract the detail-page URL and a local filename from one job entry.
    """

    def __init__(self, element):
        """
        Args:
            element (bs4.element): element in BeautifulSoup
        """

        self.element = element

    @staticmethod
    def _safe_component(text):
        """Replace path separators so ``text`` cannot escape the target folder."""
        return text.replace('/', '_').replace('\\', '_')

    def get_url(self):
        """
        Return the URL of the job's detail page.

        The 'http:' scheme is prepended to the ``href`` of the entry's
        'js-job-link' anchor.
        NOTE(review): this assumes the href is protocol-relative
        ('//www.104.com.tw/...') - confirm against the live markup.
        """
        # Use the element stored on this instance, not a same-named global.
        job = self.element.find('a', attrs={'class': 'js-job-link'})

        return 'http:' + job['href']

    def get_filename(self):
        """
        Return '<company>_<job title>_<uuid4>.html' for this entry.

        The company and job title come from the element's 'data-cust-name'
        and 'data-job-name' attributes, with path separators replaced by '_'.
        A new random uuid4 is drawn on every call, so two calls on the same
        entry return different names.
        """
        # get a random unique id
        str_uuid = uuid.uuid4()

        company = self._safe_component(self.element['data-cust-name'])
        job_title = self._safe_component(self.element['data-job-name'])

        return '{}_{}_{}.html'.format(company, job_title, str_uuid)
    

# Define Utility: Save Function

In [14]:
def save_page(url, filename):
    """
    Download the resource at ``url`` and store it at the local path ``filename``.

    Args:
        url (str): address of the page to download.
        filename (str): destination path of the saved copy.
    """
    urllib.request.urlretrieve(url, filename=filename)

# Main Program

In [21]:
folder_name = './dataset/'

# The download step does not create missing directories, so make sure the
# target folder exists before the first page is saved.
os.makedirs(folder_name, exist_ok=True)

# Only the first three search-result pages are crawled here.
for url in tqdm(url_job_list[:3]):
    parser = ListParser(url)

    for element in parser.get_lists():
        entry = EntryParser(element)

        # get_filename() embeds a fresh uuid4 on every call, so resolve the
        # URL and the path once: the printed path is then the file written.
        entry_url = entry.get_url()
        target_path = folder_name + entry.get_filename()

        print(entry_url)
        print(target_path)

        save_page(entry_url, target_path)

        # Stop after two entries per page to keep this sample run small.
        if parser.num_list == 2:
            break

  0%|          | 0/3 [00:00<?, ?it/s]

http://www.104.com.tw/job/6lng1?jobsource=jolist_a_relevance
./dataset/現觀科技股份有限公司_Machine Learning Engineer_fb6fc616-8ff8-45a6-917c-e75f249c5e8a.html
http://www.104.com.tw/job/6arae?jobsource=jolist_a_relevance
./dataset/新馳科技有限公司_Machine Learning Technical Manager_1e3479da-6053-4618-add8-afbebc431bc7.html


 33%|███▎      | 1/3 [00:04<00:09,  4.90s/it]

http://www.104.com.tw/job/6c3kr?jobsource=jolist_a_relevance
./dataset/英屬開曼群島商鼎峰智能股份有限公司台灣分公司_Machine Learning RD (Hsinchu)_b7352ed4-9451-4dfe-820b-dc4e535a7fc8.html
http://www.104.com.tw/job/5nub2?jobsource=jolist_a_relevance
./dataset/英屬開曼群島商鼎峰智能股份有限公司台灣分公司_Machine Learning RD (Taipei)_431af9c6-8e4e-4c0f-a7e3-a3dab91bebfe.html


 67%|██████▋   | 2/3 [00:09<00:04,  4.90s/it]

http://www.104.com.tw/job/6gu8v?jobsource=2018indexpoc
./dataset/LINE Taiwan_台灣連線股份有限公司_(LTEG) Data Scientist - Machine Learning (Junior)_a8a76b8b-c9aa-4a9e-90bb-58301b5e9ab7.html
http://www.104.com.tw/job/6gu8p?jobsource=2018indexpoc
./dataset/LINE Taiwan_台灣連線股份有限公司_(LTEG) Data Scientist - Machine Learning (Senior)_98ef7d61-6557-4e64-92eb-682cf6500f07.html


100%|██████████| 3/3 [00:14<00:00,  4.93s/it]
