# Import Modules

In [1]:
import pandas as pd
import re, time, requests, random
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# 同步版本

In [2]:
# 同步版本
class Job_search104():
    """Synchronous scraper for job listings on 104.com.tw.

    Workflow: ``search_job`` renders the search-result page with headless
    Chrome and returns it as a ``BeautifulSoup`` tree; ``main`` then walks
    the job links found in that tree, downloads every job page whose title
    matches ``key_word`` and appends one row per job to a DataFrame that is
    also written to ``JBLIST_<date>.csv``.
    """

    # Evaluated once, when the class body runs; used in the CSV file name.
    current_date = datetime.now().date()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }
    search_url = 'https://www.104.com.tw/jobs/search/?'

    # Result pages beyond this count need a click on the "load more" button.
    AUTO_LOADED_PAGES = 15
    # Upper bound on "load more" clicks (the original loop stopped at k == 86).
    MAX_MANUAL_CLICKS = 85
    # Seconds before an HTTP request is abandoned instead of hanging forever.
    REQUEST_TIMEOUT = 30
    # Column order of every row produced by ``main``.
    COLUMNS = ['更新日期', '職缺名稱', '公司名稱', '工作內容', '職務類別', '工作待遇',
               '工作性質', '上班地點', '管理責任', '出差外派', '上班時段', '休假制度',
               '可上班日', '需求人數', '工作經歷', '學歷要求', '科系要求', '語文條件',
               '擅長工具', '工作技能', '其他要求', '連結']
    # (label, row index) pairs of the job-description table whose value is the
    # text of the last <div> in that row. Row index 5 is not read.
    JD_FIELDS = (('工作性質', 3), ('上班地點', 4), ('管理責任', 6), ('出差外派', 7),
                 ('上班時段', 8), ('休假制度', 9), ('可上班日', 10), ('需求人數', 11))

    def __init__(self, filter_params, key_word, page = 15):
        """Store the search configuration.

        Args:
            filter_params: Query parameters appended to ``search_url``.
            key_word: Regex pattern; only jobs whose lower-cased title
                matches it are scraped by ``main``.
            page: Number of scroll-to-bottom steps performed on the result
                page before manual "load more" clicks start.
        """
        self.filter_params = filter_params
        self.key_word = key_word
        self.page = page

    @classmethod
    def _manual_clicks_needed(cls, total_page):
        """Return how many "load more" clicks are needed for ``total_page`` pages."""
        return max(0, min(total_page - cls.AUTO_LOADED_PAGES, cls.MAX_MANUAL_CLICKS))

    @staticmethod
    def _last_div_text(node, recursive=True):
        """Return the stripped text of the last <div> found under ``node``."""
        return node.find_all('div', recursive=recursive)[-1].text.strip()

    @staticmethod
    def _joined_u_text(node):
        """Return the text of every <u> under ``node`` joined with ', '."""
        return ', '.join(tag.text for tag in node.find_all('u'))

    def search_job(self):
        """Render the search-result page and return it as a BeautifulSoup tree.

        The page is scrolled ``self.page`` times, then the "load more" button
        is clicked for the remaining pages (best effort: the first failed
        click stops the loading). The browser is always closed, even when an
        error is raised part-way through.

        Returns:
            BeautifulSoup tree of the fully loaded result page.
        """
        # Let requests build the final URL from the query parameters.
        url = requests.get(self.search_url, params=self.filter_params,
                           headers=self.headers, timeout=self.REQUEST_TIMEOUT).url
        option = Options()
        # Drop the automation switch; may help with some anti-bot checks.
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Headless mode: runs entirely in the background.
        option.add_argument('--headless')
        # Disable GPU acceleration; required in some environments.
        option.add_argument("--disable-gpu")
        driver = webdriver.Chrome(options=option)
        try:
            driver.get(url)

            element = driver.find_element(By.XPATH,'//*[@id="js-job-header"]/div[1]/label[1]/select/option[1]')
            # The option text ends with "/ <total pages>"; keep only the digits.
            total_page = int(re.sub(r'\D', '', element.text.split('/')[-1]))
            print(f'Total_page = {total_page}')

            # Scroll to the bottom repeatedly so the page auto-loads more results.
            for _ in range(self.page):
                driver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
                time.sleep(2)

            # Once auto-loading stops, the remaining pages need a button click.
            for k in range(1, self._manual_clicks_needed(total_page) + 1):
                try:
                    button_element = WebDriverWait(driver, 3).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, '#js-job-content > div:last-child > button'))
                    )
                    print(f'手動載入第{15 + k}頁')
                    button_element.click()
                except Exception as e:
                    # Best effort: keep whatever has been loaded so far.
                    print("發生未知錯誤：", e)
                    break

            soup = BeautifulSoup(driver.page_source, 'html.parser')
        finally:
            # Never leave a Chrome process behind, whatever happened above.
            driver.quit()
        print(f'共{len(soup.find_all("a",class_="js-job-link"))}筆資料')
        return soup

    def job_details(self, url):
        """Open ``url`` in Chrome and return the rendered page source.

        Waits up to 3 seconds for the job title element; if the wait fails,
        'Driver Error' is printed and whatever has loaded is still returned.
        """
        option = Options()
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        driver = webdriver.Chrome(options=option)
        try:
            driver.maximize_window()
            driver.get(url)
            try:
                WebDriverWait(driver, 3).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'job-header__title'))
                )
            except Exception as e:
                print('Driver Error', e)
            return driver.page_source
        finally:
            driver.quit()

    def update_date(self, soup) -> str:
        """Return the job's update date text, or None if the header is absent."""
        header = soup.find("div", class_="job-header__title")
        if not header:
            return None
        return header.find('span').text.strip().replace('更新','')

    def company(self, soup) -> str:
        """Return the company name, or None if its container is absent."""
        container = soup.find("div", class_="mt-3")
        if not container:
            return None
        return container.select_one('div > a').text.strip()

    def jd_info(self, soup) -> dict:
        """Extract the job-description table into a label -> text dict.

        Returns an empty dict when the table (or its rows) is missing.
        A table with fewer rows than expected raises (IndexError or
        AttributeError); callers treat that as a failed job.
        """
        JD = soup.find('div', class_='job-description-table row')
        if not JD:
            return {}
        jd_items = JD.find_all('div', recursive=False)
        if not jd_items:
            return {}
        result = {
            '工作內容': jd_items[0].find('p').text,
            '職務類別': self._joined_u_text(jd_items[1]),
            # Only direct children here: the salary row nests further <div>s.
            '工作待遇': self._last_div_text(jd_items[2], recursive=False),
        }
        for label, index in self.JD_FIELDS:
            result[label] = self._last_div_text(jd_items[index])
        return result

    def jr_info(self, soup) -> dict:
        """Extract the job-requirement table into a label -> text dict.

        Returns an empty dict when the table (or its rows) is missing.
        '其他要求' falls back to '無' when the extra-requirement block is absent.
        """
        JR = soup.find('div', class_= 'job-requirement-table row')
        JRO = soup.find('div', class_= 'job-requirement col opened')
        if not JR:
            return {}
        jr_items = JR.find_all('div', recursive=False)
        if not jr_items:
            return {}
        return {
            '工作經歷' : self._last_div_text(jr_items[0]),
            '學歷要求' : self._last_div_text(jr_items[1]),
            '科系要求' : self._last_div_text(jr_items[2]),
            '語文條件' : self._last_div_text(jr_items[3]),
            '擅長工具' : self._joined_u_text(jr_items[4]),
            '工作技能' : self._last_div_text(jr_items[5]),
            '其他要求' : self._last_div_text(JRO) if JRO else '無'
        }

    def _build_row(self, soup, title, Job_link):
        """Build the one-row DataFrame describing a single job page."""
        Data = {
            '更新日期' : [self.update_date(soup)],
            '職缺名稱' : [title],
            '公司名稱' : [self.company(soup)],
            '連結' : [Job_link]
        }
        Data.update(self.jd_info(soup))
        Data.update(self.jr_info(soup))
        return pd.DataFrame(Data, columns=self.COLUMNS)

    def main(self, Job_list : list, DF):
        """Scrape every matching job in ``Job_list`` and append it to ``DF``.

        Args:
            Job_list: Job link elements; each must support ``['title']`` and
                ``['href']`` (href is expected to start with ``//``).
            DF: DataFrame the new rows are appended to.

        Returns:
            The combined DataFrame, which is also written to
            ``JBLIST_<current_date>.csv`` (pipe-delimited). A job that fails
            is reported, skipped, and the progress so far is saved.
        """
        pattern = re.compile(self.key_word)
        csv_path = f"JBLIST_{self.current_date}.csv"
        # Collect the pieces and concatenate once instead of on every row.
        frames = [DF]
        for idx, item in enumerate(Job_list):
            title = item['title']
            if not pattern.search(title.lower()):
                continue

            try:
                Job_link = f"https:{item['href']}"
                response = requests.get(Job_link, timeout=self.REQUEST_TIMEOUT)
                soup = BeautifulSoup(response.text, 'lxml')
                frames.append(self._build_row(soup, title, Job_link))
                if (idx + 1) % 10 == 0:
                    print(f"success {idx+1} !")
                # Random 0.4-0.7 second pause between job pages.
                time.sleep(random.uniform(0.4, 0.7))
            except Exception as e:
                print(f"{idx} 發生錯誤", e)
                # Save what has been collected so far before moving on.
                pd.concat(frames, ignore_index=True).to_csv(csv_path, sep='|', index=False)

        DF = pd.concat(frames, ignore_index=True)
        DF.to_csv(csv_path, sep='|', index=False)
        return DF

In [None]:
start_time = time.time()

# Empty frame that will accumulate the scraped rows.
JBDF = pd.DataFrame()

# Search keyword, plus the regex used to drop listings with unrelated titles.
keywords_pattern = r'工程|資料|python|data|數據'
filter_params = {
    'ro' : 1, # 1 = full-time
    'keyword' : '資料工程',
    # 6001001000 Taipei City, 6001002000 New Taipei, 6001006000 Hsinchu, 6001008000 Taichung City
    # (a stray 'C' in front of the last code has been removed)
    'area' : '6001002000,6001001000,6001006000,6001008000',
    'isnew' : 0, # 0: today, 3: within 3 days, 7: within 1 week, 14, 30
    'jobexp' : '1,3', # experience: under 1 year + 1-3 years
    'mode' : 'l', # list mode (returns more rows)
    'order' : 16 # sort by date
}

JBS = Job_search104(filter_params, keywords_pattern)

# Retry a bounded number of times. Catching Exception (not a bare except)
# keeps Ctrl-C / kernel interrupt working, and the cap avoids looping
# forever when the site or the browser is persistently failing.
MAX_SEARCH_ATTEMPTS = 5
for attempt in range(1, MAX_SEARCH_ATTEMPTS + 1):
    try:
        soup = JBS.search_job()
        break
    except Exception as err:
        print('執行錯誤, retry', err)
else:
    raise RuntimeError(f'search_job failed {MAX_SEARCH_ATTEMPTS} times in a row')
Job_list = soup.find_all("a",class_="js-job-link")

res = JBS.main(Job_list, JBDF)
print("花費：" + str(time.time() - start_time) + "秒")

# 非同步版本

In [2]:
# import grequests # 看起要在.py才能用
from aiohttp import ClientSession, TCPConnector
import asyncio
import nest_asyncio
nest_asyncio.apply()

In [3]:
class eJob_search104():
    """Asynchronous scraper for job listings on 104.com.tw.

    Workflow: ``search_job`` renders the search-result page with headless
    Chrome and returns the job link elements; ``filter_job`` keeps those
    whose title matches ``key_word``; ``main`` downloads the job pages
    concurrently (aiohttp) in batches and writes the rows to
    ``JBLIST_<date>.csv``.
    """

    # Evaluated once, when the class body runs; used in the CSV file name.
    current_date = datetime.now().date()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }
    search_url = 'https://www.104.com.tw/jobs/search/?'

    # Result pages beyond this count need a click on the "load more" button.
    AUTO_LOADED_PAGES = 15
    # Upper bound on "load more" clicks (the original loop stopped at k == 86).
    MAX_MANUAL_CLICKS = 85
    # Seconds before the search-URL request is abandoned instead of hanging.
    REQUEST_TIMEOUT = 30
    # Maximum number of job pages downloaded at the same time.
    MAX_CONCURRENCY = 10
    # Number of jobs handed to one ``scrape`` call by ``main``.
    BATCH_SIZE = 30
    # Column order of every row produced by ``get_job_info``.
    COLUMNS = ['更新日期', '職缺名稱', '公司名稱', '工作內容', '職務類別', '工作待遇',
               '工作性質', '上班地點', '管理責任', '出差外派', '上班時段', '休假制度',
               '可上班日', '需求人數', '工作經歷', '學歷要求', '科系要求', '語文條件',
               '擅長工具', '工作技能', '其他要求', '連結']
    # (label, row index) pairs of the job-description table whose value is the
    # text of the last <div> in that row. Row index 5 is not read.
    JD_FIELDS = (('工作性質', 3), ('上班地點', 4), ('管理責任', 6), ('出差外派', 7),
                 ('上班時段', 8), ('休假制度', 9), ('可上班日', 10), ('需求人數', 11))

    def __init__(self, filter_params, key_word, page = 15):
        """Store the search configuration.

        Args:
            filter_params: Query parameters appended to ``search_url``.
            key_word: Regex pattern used by ``filter_job`` on lower-cased titles.
            page: Number of scroll-to-bottom steps performed on the result
                page before manual "load more" clicks start.
        """
        self.filter_params = filter_params
        self.key_word = key_word
        self.page = page

    @classmethod
    def _manual_clicks_needed(cls, total_page):
        """Return how many "load more" clicks are needed for ``total_page`` pages."""
        return max(0, min(total_page - cls.AUTO_LOADED_PAGES, cls.MAX_MANUAL_CLICKS))

    @staticmethod
    def _last_div_text(node, recursive=True):
        """Return the stripped text of the last <div> found under ``node``."""
        return node.find_all('div', recursive=recursive)[-1].text.strip()

    @staticmethod
    def _joined_u_text(node):
        """Return the text of every <u> under ``node`` joined with ', '."""
        return ', '.join(tag.text for tag in node.find_all('u'))

    def search_job(self):
        """Render the search-result page and return its job link elements.

        The page is scrolled ``self.page`` times, then the "load more" button
        is clicked for the remaining pages (best effort: the first failed
        click stops the loading). The browser is always closed, even when an
        error is raised part-way through.

        Returns:
            List of ``<a class="js-job-link">`` elements from the loaded page.
        """
        # Let requests build the final URL from the query parameters.
        url = requests.get(self.search_url, params=self.filter_params,
                           headers=self.headers, timeout=self.REQUEST_TIMEOUT).url
        print(url)
        option = Options()
        # Drop the automation switch; may help with some anti-bot checks.
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Headless mode: runs entirely in the background.
        option.add_argument('--headless')
        # Disable GPU acceleration; required in some environments.
        option.add_argument("--disable-gpu")
        driver = webdriver.Chrome(options=option)
        try:
            driver.get(url)

            element = driver.find_element(By.XPATH,'//*[@id="js-job-header"]/div[1]/label[1]/select/option[1]')
            # The option text ends with "/ <total pages>"; keep only the digits.
            total_page = int(re.sub(r'\D', '', element.text.split('/')[-1]))
            print(f'Total_page = {total_page}')

            # Scroll to the bottom repeatedly so the page auto-loads more results.
            for _ in range(self.page):
                driver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
                time.sleep(2)

            # Once auto-loading stops, the remaining pages need a button click.
            for k in range(1, self._manual_clicks_needed(total_page) + 1):
                try:
                    button_element = WebDriverWait(driver, 3).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, '#js-job-content > div:last-child > button'))
                    )
                    print(f'手動載入第{15 + k}頁')
                    button_element.click()
                except Exception as e:
                    # Best effort: keep whatever has been loaded so far.
                    print("發生未知錯誤：", e)
                    break

            soup = BeautifulSoup(driver.page_source, 'html.parser')
        finally:
            # Never leave a Chrome process behind, whatever happened above.
            driver.quit()
        raw_Job_list = soup.find_all("a",class_="js-job-link")
        print(f'共{len(raw_Job_list)}筆資料')
        return raw_Job_list

    def filter_job(self, raw_Job_list:list):
        """Return the items whose lower-cased ``['title']`` matches ``key_word``."""
        pattern = re.compile(self.key_word)
        filter_job_list = [job for job in raw_Job_list if pattern.search(job['title'].lower())]
        print(f'過濾完有{len(filter_job_list)}筆')
        return filter_job_list

    def update_date(self, soup) -> str:
        """Return the job's update date text, or None if the header is absent."""
        header = soup.find("div", class_="job-header__title")
        if not header:
            return None
        return header.find('span').text.strip().replace('更新','')

    def company(self, soup) -> str:
        """Return the company name, or None if its container is absent."""
        container = soup.find("div", class_="mt-3")
        if not container:
            return None
        return container.select_one('div > a').text.strip()

    def jd_info(self, soup) -> dict:
        """Extract the job-description table into a label -> text dict.

        Returns an empty dict when the table (or its rows) is missing.
        A table with fewer rows than expected raises (IndexError or
        AttributeError); ``get_job_info`` treats that as a failed job.
        """
        JD = soup.find('div', class_='job-description-table row')
        if not JD:
            return {}
        jd_items = JD.find_all('div', recursive=False)
        if not jd_items:
            return {}
        result = {
            '工作內容': jd_items[0].find('p').text,
            '職務類別': self._joined_u_text(jd_items[1]),
            # Only direct children here: the salary row nests further <div>s.
            '工作待遇': self._last_div_text(jd_items[2], recursive=False),
        }
        for label, index in self.JD_FIELDS:
            result[label] = self._last_div_text(jd_items[index])
        return result

    def jr_info(self, soup) -> dict:
        """Extract the job-requirement table into a label -> text dict.

        Returns an empty dict when the table (or its rows) is missing.
        '其他要求' falls back to '無' when the extra-requirement block is absent.
        """
        JR = soup.find('div', class_= 'job-requirement-table row')
        JRO = soup.find('div', class_= 'job-requirement col opened')
        if not JR:
            return {}
        jr_items = JR.find_all('div', recursive=False)
        if not jr_items:
            return {}
        return {
            '工作經歷' : self._last_div_text(jr_items[0]),
            '學歷要求' : self._last_div_text(jr_items[1]),
            '科系要求' : self._last_div_text(jr_items[2]),
            '語文條件' : self._last_div_text(jr_items[3]),
            '擅長工具' : self._joined_u_text(jr_items[4]),
            '工作技能' : self._last_div_text(jr_items[5]),
            '其他要求' : self._last_div_text(JRO) if JRO else '無'
        }

    async def fetch(self, session, url):
        """GET ``url`` through ``session`` and return the response body as text."""
        async with session.get(url, headers = {'User-Agent':'GoogleBot'}) as response:
            return await response.text()

    async def get_job_info(self, item, session=None):
        """Download and parse one job page.

        Args:
            item: Job link element supporting ``['title']`` and ``['href']``.
            session: Optional shared aiohttp session. When omitted, a
                temporary session is opened for this single request.

        Returns:
            A one-row DataFrame, or None when anything fails (the error is
            printed rather than raised so one bad page cannot abort a batch).
        """
        try:
            title = item['title']
            Job_link = f"https:{item['href']}"
            if session is None:
                connector = TCPConnector(limit=self.MAX_CONCURRENCY)
                async with ClientSession(connector=connector) as own_session:
                    html = await self.fetch(own_session, Job_link)
            else:
                html = await self.fetch(session, Job_link)
            soup = BeautifulSoup(html, 'lxml')

            Data = {
                '更新日期': [self.update_date(soup)],
                '職缺名稱': [title],
                '公司名稱': [self.company(soup)],
                '連結': [Job_link]
            }
            Data.update(self.jd_info(soup))
            Data.update(self.jr_info(soup))
            return pd.DataFrame(Data, columns=self.COLUMNS)
        except Exception as e:
            print("發生錯誤", e)
            return None

    @staticmethod
    async def _gather_limited(worker, items, limit):
        """Await ``worker(item)`` for every item, at most ``limit`` at a time.

        Results are returned in the order of ``items``.
        """
        semaphore = asyncio.Semaphore(limit)

        async def guarded(item):
            # The semaphore is held for the whole duration of the work, not
            # just while the task is being created, so it really caps the
            # number of requests in flight.
            async with semaphore:
                return await worker(item)

        return await asyncio.gather(*(guarded(item) for item in items))

    async def scrape(self, Job_list):
        """Fetch every job in ``Job_list`` concurrently.

        One aiohttp session is shared by all requests and at most
        ``MAX_CONCURRENCY`` downloads run at the same time.

        Returns:
            List with one entry per job, in input order: a one-row
            DataFrame, or None for a job that failed.
        """
        connector = TCPConnector(limit=self.MAX_CONCURRENCY)
        async with ClientSession(connector=connector) as session:
            async def worker(item):
                return await self.get_job_info(item, session=session)

            return await self._gather_limited(worker, Job_list, self.MAX_CONCURRENCY)

    def main(self, Job_list: list, DF):
        """Scrape ``Job_list`` in batches and append the rows to ``DF``.

        Args:
            Job_list: Job link elements (typically the output of ``filter_job``).
            DF: DataFrame the new rows are appended to.

        Returns:
            The combined DataFrame, which is also written to
            ``JBLIST_<current_date>.csv`` (pipe-delimited).
        """
        print(len(Job_list))
        # NOTE: when a loop is already running (e.g. inside Jupyter),
        # run_until_complete only works if nest_asyncio.apply() was called.
        loop = asyncio.get_event_loop()
        # Collect the pieces and concatenate once instead of on every row.
        frames = [DF]
        for start_idx in range(0, len(Job_list), self.BATCH_SIZE):
            Job_list_batch = Job_list[start_idx:start_idx + self.BATCH_SIZE]
            results = loop.run_until_complete(self.scrape(Job_list_batch))
            frames.extend(df for df in results if df is not None)

        DF = pd.concat(frames, ignore_index=True)
        DF.to_csv(f"JBLIST_{self.current_date}.csv", sep='|', index=False)
        return DF

In [None]:
start_time = time.time()

# Regex used to drop listings whose title is unrelated to the keywords.
keywords_pattern = r'工程|資料|python|data|數據'

# Search parameters.
filter_params = {
    'ro' : 1, # 1 = full-time
    'keyword' : '資料工程',
    # 6001001000 Taipei City, 6001002000 New Taipei, 6001006000 Hsinchu, 6001008000 Taichung City
    # (a stray 'C' in front of the last code has been removed)
    'area' : '6001002000,6001001000,6001006000,6001008000',
    'isnew' : 3, # 0: today, 3: within 3 days, 7: within 1 week, 14, 30
    'jobexp' : '1,3', # experience: under 1 year + 1-3 years
    'mode' : 'l', # list mode (returns more rows)
    'order' : 16 # sort by date
}

# Empty frame that will accumulate the scraped rows, plus the scraper itself.
DF = pd.DataFrame()
EJS = eJob_search104(filter_params, keywords_pattern)

# Retry a bounded number of times. Catching Exception (not a bare except)
# keeps Ctrl-C / kernel interrupt working, and the cap avoids looping
# forever when the site or the browser is persistently failing.
MAX_SEARCH_ATTEMPTS = 5
for attempt in range(1, MAX_SEARCH_ATTEMPTS + 1):
    try:
        raw_Job_list = EJS.search_job()
        break
    except Exception as err:
        print('執行錯誤, retry', err)
else:
    raise RuntimeError(f'search_job failed {MAX_SEARCH_ATTEMPTS} times in a row')
Job_list = EJS.filter_job(raw_Job_list)
result_df = EJS.main(Job_list, DF)
print(f"花費 {time.time() - start_time} 秒")

In [None]:
# Check for missing data: keep the incomplete rows in a variable (the bare
# expression was discarded because it was not the last line of the cell),
# report how many there are, then drop them.
missing_rows = result_df[result_df.isnull().any(axis=1)]
print(f"{len(missing_rows)} of {len(result_df)} rows have missing fields and will be dropped")
clean_df = result_df.dropna()

# 儲存

In [42]:
# Write the cleaned rows to a pipe-delimited CSV named after today's date.
current_date = datetime.now().date()
output_path = f"JBLIST_{current_date}.csv"
clean_df.to_csv(output_path, sep='|', index=False)

In [None]:
# Load a previously saved, pipe-delimited snapshot back into a DataFrame.
snapshot_path = 'JBLIST_2023-07-27.csv'
d = pd.read_csv(snapshot_path, sep='|')