# Download Webpages

In [1]:
import os
import urllib.request
import uuid

import bs4
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Generate URLs for the 104 Website

In [2]:
def get_url_job_list(num_pages=50):
    """
    Build the search-result URLs of the 104 website, one per result page.

    The query is fixed to the keyword "machine learning"; only the ``page``
    query parameter varies, counting from 1 up to ``num_pages`` inclusive.

    Example (first two entries):
        ['https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=machine%20learning&order=1&asc=0&page=1&mode=s&jobsource=2018indexpoc',
         'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=machine%20learning&order=1&asc=0&page=2&mode=s&jobsource=2018indexpoc']

    Args:
        num_pages (int): number of result pages to generate URLs for.

    Returns:
        list of str: the URLs in ascending page order (empty when
        ``num_pages`` is less than 1).
    """
    # The single '{}' placeholder is the 1-based page number.
    template = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=machine%20learning&order=1&asc=0&page={}&mode=s&jobsource=2018indexpoc'

    return [template.format(page) for page in range(1, num_pages + 1)]


# Build the search-result URLs with the default page count (num_pages=50).
url_job_list = get_url_job_list()

In [3]:
# Preview the first five generated URLs.
url_job_list[:5]

['https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=machine%20learning&order=1&asc=0&page=1&mode=s&jobsource=2018indexpoc',
 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=machine%20learning&order=1&asc=0&page=2&mode=s&jobsource=2018indexpoc',
 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=machine%20learning&order=1&asc=0&page=3&mode=s&jobsource=2018indexpoc',
 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=machine%20learning&order=1&asc=0&page=4&mode=s&jobsource=2018indexpoc',
 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=machine%20learning&order=1&asc=0&page=5&mode=s&jobsource=2018indexpoc']

# Declare Parser Class

In [13]:
class ListParser():
    """Download one search-result page and iterate over its job entries."""

    def __init__(self, url):
        """
        Fetch ``url`` and parse the response body as HTML.

        Args:
            url (str): address of a search-result page.

        Raises:
            requests.exceptions.RequestException: when the request fails or
                times out (10 s), or when the server answers with an HTTP
                error status.
        """

        # fetch html page and parse it.
        response = requests.get(url, timeout=10)

        # Fail fast on 4xx/5xx answers; otherwise an error page would be
        # parsed like a normal one and simply yield no job entries.
        response.raise_for_status()

        self.content = BeautifulSoup(response.content, "html.parser")

        # Number of entries yielded so far by the latest get_lists() run.
        self.num_list = 0

    def get_lists(self):
        """
        Yield the job ``article`` elements of the page, skipping ads.

        ``self.num_list`` is reset to 0 when iteration starts and is
        incremented just before each element is yielded, so a caller can
        read it while iterating to know how many entries it has received.

        Yields:
            bs4 element: every ``article`` tag matched by the
            'job-mode' / 'js-job-item' class filter whose class list does
            not contain 'b-block--ad'.
        """
        self.num_list = 0

        for element in self.content.find_all('article', attrs={'class': ['job-mode', 'js-job-item']}):
            # Advertisement blocks share the article markup; leave them out.
            if 'b-block--ad' in element.attrs['class']:
                continue

            self.num_list += 1
            yield element
        
        
class EntryParser():
    """Extract the detail-page URL and a local filename from one job entry."""

    def __init__(self, element):
        """
        Args:
            element (bs4.element): element in BeautifulSoup
        """

        self.element = element

    def get_url(self):
        """
        Return the URL of the job's detail page.

        The ``href`` of the entry's 'js-job-link' anchor is prefixed with
        'http:' (the captured output suggests the href is scheme-relative,
        i.e. starts with '//').

        Returns:
            str: 'http:' followed by the anchor's ``href`` value.
        """
        # Read from the instance, not from a same-named global variable.
        job = self.element.find('a', attrs={'class': 'js-job-link'})

        return 'http:' + job['href']

    def get_filename(self):
        """
        Build a filename of the form '<company>_<title>_<uuid>.html'.

        Company and title come from the element's 'data-cust-name' and
        'data-job-name' attributes; '/' is replaced by '-' so the values
        cannot introduce extra path components.

        Note: a new random UUID is generated on every call, so two calls on
        the same entry return different names.

        Returns:
            str: the generated filename.
        """

        # get a random unique id
        str_uuid = uuid.uuid4()

        company = self.element['data-cust-name'].replace('/', '-')
        title = self.element['data-job-name'].replace('/', '-')

        filename = '{}_{}_{}.html'.format(company, title, str_uuid)
        return filename
    

# Define Utility: Save Function

In [7]:
def save_page(url, filename):
    """
    Download ``url`` and store the response body at ``filename``.

    Args:
        url (str): address of the resource to download.
        filename (str or path-like): destination path; converted with
            ``str()`` before use. Missing parent directories are created.
    """
    target = str(filename)

    # urlretrieve does not create directories, so make sure the destination
    # folder exists before writing into it.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    urllib.request.urlretrieve(url, target)

# Main Program

In [15]:
# Destination folder for the downloaded job pages.
folder_name = './dataset/'

# Only the first 3 search pages are crawled here.
for url in tqdm(url_job_list[:3]):
    parser = ListParser(url)

    for element in parser.get_lists():
        entry = EntryParser(element)

        # Resolve both values once: get_filename() embeds a fresh UUID on
        # every call, so calling it again would print one path and save
        # the page under a different one.
        entry_url = entry.get_url()
        target_path = folder_name + entry.get_filename()

        print(entry_url)
        print(target_path)

        save_page(entry_url, target_path)

        # Stop after two job entries per search page.
        if parser.num_list == 2:
            break

  0%|          | 0/3 [00:00<?, ?it/s]

http://www.104.com.tw/job/6lng1?jobsource=jolist_c_relevance
./dataset/現觀科技股份有限公司_Machine Learning Engineer_bc6f1ada-0fa6-4f24-9c3b-96ffd781d41a.html
http://www.104.com.tw/job/6arae?jobsource=jolist_c_relevance
./dataset/新馳科技有限公司_Machine Learning Technical Manager_12b8c011-8950-4b0e-b25f-ae4393bd7307.html


 33%|███▎      | 1/3 [00:09<00:18,  9.05s/it]

http://www.104.com.tw/job/6af3o?jobsource=jolist_b_relevance
./dataset/Spring Professional_藝珂人事顧問股份有限公司躍科分公司_Deep Learning- Machine Learning RD Engineer_9b4ecd37-d5b6-4ada-8bed-b4421b6fe93d.html
http://www.104.com.tw/job/61rxb?jobsource=jolist_b_relevance
./dataset/阿福股份有限公司_電腦視覺與機器學習研究員 Computer Vision-Machine Learning Researcher_558d7e42-21ed-48c1-b335-1de730ef9f5f.html


 67%|██████▋   | 2/3 [00:13<00:07,  7.63s/it]

http://www.104.com.tw/job/6gu8p?jobsource=2018indexpoc
./dataset/LINE Taiwan_台灣連線股份有限公司_(LTEG) Data Scientist - Machine Learning (Senior)_b9956006-2526-42f5-8435-a45a3a7d938c.html
http://www.104.com.tw/job/52qyx?jobsource=2018indexpoc
./dataset/香港商女媧創造股份有限公司台灣分公司_資深機器學習工程師 (Machine Learning Engineer)_82194652-e892-4614-bd36-31ebd5952650.html


100%|██████████| 3/3 [00:17<00:00,  6.62s/it]
