# Scrape Webpages in a Folder

In [24]:
import glob
import json
import os
import re

import bs4
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

# Declare Field Parsers (Lambdas)

In [33]:
def parser_benefit(content):
    """
    Parse benefit text

    Finds the first ``h2`` whose string matches ``公司福利`` and returns the
    ``.text`` of whatever ``find_next()`` yields for that heading.

    Args:
        content: parsed page exposing a BeautifulSoup-style ``find`` method

    Returns:
        Text of the element following the heading, or
        None if the heading (or an element after it) is not found
    """
    heading = content.find('h2', string=re.compile('公司福利'))
    if heading is None:
        return None

    # find_next() gives None when nothing follows the heading; report that as
    # "not found" instead of failing with AttributeError on ``None.text``.
    following = heading.find_next()
    if following is None:
        return None

    return following.text


# Field name -> callable that extracts that field from a parsed job page.
# Each callable receives the parsed page (``content``) and returns the value.
# NOTE(review): except for 'benefit', a lookup that finds nothing returns None
# and the chained attribute access then raises AttributeError.
# NOTE(review): attrs={'class': [...]} is given a list — presumably this matches
# a header carrying either class; confirm that is the intent.
parsers = {
    'company': lambda content: (
        content.find('header', attrs={'class': ['header', 'mb-0']})
        .find('a', attrs={'class': 'cn'})
        .text
    ),
    # ``string=True`` selects the first text node; it replaces the deprecated
    # ``text=True`` spelling and matches the ``string=`` usage elsewhere here.
    'title': lambda content: (
        content.find('header', attrs={'class': ['header', 'mb-0']})
        .find('h1')
        .find(string=True)
    ),
    'salary': lambda content: content.find('dd', attrs={'class': 'salary'}).find(string=True),
    'requirement': lambda content: (
        content.find('h2', string='工作內容')
        .find_parent('section')
        .find('p')
        .text
    ),
    'requirement_others': lambda content: (
        content.find('dt', string=re.compile('其他條件')).find_next().text
    ),
    'address': lambda content: content.find('dd', attrs={'class': 'addr'}).find(string=True),
    'experience': lambda content: (
        content.find('dt', string=re.compile('工作經歷')).find_next().text
    ),
    # Kept as a lambda so parser_benefit is looked up at call time.
    'benefit': lambda content: parser_benefit(content),
}

# Declare Parser Class

In [7]:
class HtmlParser():
    """
    Apply a set of field parsers to HTML files stored on disk.

    Args:
        parsers (dict): maps an output key to a callable that receives the
            parsed page and returns the value stored under that key
        encoding (str): text encoding used to read the HTML files
    """

    def __init__(self, parsers, encoding='utf8'):
        self.parsers = parsers
        self.encoding = encoding

    def parse(self, filename):
        """
        Parse one HTML file.

        Args:
            filename (str): path of the html file

        Returns:
            dict with one entry per key in ``self.parsers``
        """
        # Decode explicitly instead of relying on the platform default
        # encoding, which is not UTF-8 on every system.
        with open(filename, encoding=self.encoding) as f:
            content = BeautifulSoup(f, "html.parser")

        return {key: parser(content) for key, parser in self.parsers.items()}

    def parse_pages(self, path_folder):
        """
        Args:
            path_folder (str): folder path of html files, with or without a
                trailing path separator

        Returns:
            list of dict, ordered by file path
        """
        # os.path.join copes with a missing trailing separator, where plain
        # string concatenation would build a pattern that matches nothing.
        pattern = os.path.join(path_folder, "*.html")

        # glob gives no ordering guarantee; sort so reruns produce the same list.
        filenames = sorted(glob.glob(pattern))

        return [self.parse(f) for f in tqdm(filenames)]

# Declare Save Function

In [21]:
def save_job_objects(job_objects, filename, encoding='utf8'):
    """
    Write the job records to ``filename`` as one JSON document.

    Non-ASCII characters are written as-is rather than as ASCII escape
    sequences.

    Args:
        job_objects (list of dict): list of jobs
        filename (str): path of the output file; opened in write mode
        encoding (str): text encoding of the output file
    """
    with open(filename, mode='w', encoding=encoding) as out_file:
        json.dump(obj=job_objects, fp=out_file, ensure_ascii=False)
    

# Main: Parse All Pages in the Dataset Folder

In [34]:
# Folder that holds the saved job pages (*.html)
path_folder = './dataset/'

# Run every field parser over each html page in the folder
jobs = HtmlParser(parsers=parsers).parse_pages(path_folder=path_folder)


100%|██████████| 7/7 [00:00<00:00, 17.26it/s]


In [37]:
# Show the record count, then a preview of the first two parsed jobs
print('Number of jobs={}\n'.format(len(jobs)), jobs[:2], sep='\n')

Number of jobs=7

[{'company': '現觀科技股份有限公司', 'title': '\n                        Machine Learning Engineer                        ', 'salary': '待遇面議                                                                                    （', 'requirement': '- Interaction cross-functionally with technical/non-technical departments\n- Ability to develop reliable and scalable data pipelines\n- Knowledge about Machine Learning technique (SVM, Boosting, Regression techniques) \n- Experienced in scripting programming (python, R)\n- Experienced in model design and model tuning ', 'requirement_others': '# Requirements\n- Interaction cross-functionally with technical/non-technical departments\n- Ability to develop reliable and scalable data pipelines\n- Knowledge about Machine Learning technique (SVM, Boosting, Regression techniques) \n- Experienced in scripting programming (python, R)\n- Experienced in model design and model tuning \n\n# Optional \n- Familiar with CS algorithm\n- Familiar with datab

In [38]:
# Output path for the serialised job list
filename_json = 'jobs_104.json'

# Persist the parsed jobs (list of dict) as JSON
save_job_objects(job_objects=jobs, filename=filename_json)

# Inspect JSON file using Pandas

In [39]:
# Read the file back with the encoding it was written in (save_job_objects
# defaults to utf8) instead of depending on the pandas/platform default.
job_df = pd.read_json(filename_json, encoding='utf-8')

In [40]:
# Preview the first five rows of the reloaded table
job_df.head(n=5)

Unnamed: 0,address,benefit,company,experience,requirement,requirement_others,salary,title
0,\n 台北市中...,\n福利制度優於勞基法\n1. 年資滿一年保障年薪14個月\n2. 三節禮金、績效獎金、專案...,現觀科技股份有限公司,1年以上,- Interaction cross-functionally with technica...,# Requirements\n- Interaction cross-functional...,待遇面議 ...,\n Machine Learning Eng...
1,\n 台北市中...,\n年終保障\n 介紹獎金\n 員工配股\n 員工聚餐\n 點心下午茶\n ...,香港商女媧創造股份有限公司台灣分公司,3年以上,Job Description: \nDevelop algorithms to make ...,Good to have: \n-Have interests in natural lan...,待遇面議 ...,\n 資深機器學習工程師 (Machine L...
2,\n 台北市中山區,\n▍公司制度 \n具競爭力的薪資水準、年終獎金與綜合績效獎金 \n彈性上下班時間，標準工時...,阿福股份有限公司,2年以上,1. Facilitate the adoption of computer-vision ...,1. Self-motivated and enthusiastic in artifici...,"月薪 60,000~140,000元 ...",\n 電腦視覺與機器學習研究員 Compute...
3,\n 台北市信...,\n【Competitive salary】\n -提供具競爭力的薪酬，激勵、吸引並留任...,新馳科技有限公司,7年以上,* Design or apply machine learning models in o...,* MS or PhD in computer science or electrical ...,待遇面議 ...,\n Machine Learning Tec...
4,\n 台北市信...,,Spring Professional_藝珂人事顧問股份有限公司躍科分公司,3年以上,Our client is a top tier company which is focu...,未填寫,待遇面議 ...,\n Deep Learning/ Machi...
