#### **Import library**

In [1]:
import os
import logging
import pandas as pd
import requests
import time

#### **Set environment**

In [2]:
cmp_source_dir = "CMP_LIST"
cmp_source_file_name = 'CAMPANY_LIST_v20250424_2.csv'

cmp_business_data_dir = "CMP_BUSINESS_DATA"
cmp_business_data_name = "CAMPANY_BUSINESS_DATA_v20250424_2.CSV"

business_data_format = "json"
business_data_skip_rows = 0
business_data_top_n = 1000 #平台API回傳預設50筆，下限為1，上限為1000

log_file_name = 'campany_business_crawler_log_' + time.strftime("%Y%m%d") + '.log'

#### **Create function - cmp_business_api**

In [3]:
def cmp_business_api(camp_id, business_data_format, business_data_skip_rows, business_data_top_n):
    url = f"https://data.gcis.nat.gov.tw/od/data/api/236EE382-4942-41A9-BD03-CA0709025E7C?$format={business_data_format}&$filter=Business_Accounting_NO eq {camp_id}&$skip={business_data_skip_rows}&$top={business_data_top_n}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,'
                  'image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-TW,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'DNT': '1',  # Do Not Track
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'Referer': 'https://www.google.com/'  # 模擬從 Google 連進來
    }
    
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            logging.info(f"{camp_id}, 處理成功，狀態碼: {response.status_code}")
            return response.json()
        else:
            logging.error(f"{camp_id}, 查詢失敗，狀態碼: {response.status_code}")
            return None
    except Exception as e:
        logging.error(f"{camp_id}, 查詢錯誤，錯誤訊息: {e}")
        return None

#### **Create function - parsing_json**

In [4]:
def parsing_json(json_str):
    for business in json_str[0]['Cmp_Business']:
        df_cmp_business.loc[len(df_cmp_business)] = {
            "SNAP_DATE": time.strftime("%Y%m%d"),
            "BUSINESS_ACCOUNTING_NO": json_str[0]["Business_Accounting_NO"],
            "COMPANY_NAME": json_str[0]["Company_Name"],
            "COMPANY_STATUS": json_str[0]["Company_Status"],
            "COMPANY_STATUS_DESC": json_str[0]["Company_Status_Desc"],
            "COMPANY_SETUP_DATE": json_str[0]["Company_Setup_Date"],
            "BUSINESS_SEQ_NO": business['Business_Seq_NO'],
            "BUSINESS_ITEM": business['Business_Item'],
            "BUSINESS_ITEM_DESC": business['Business_Item_Desc']
        }

#### **Create function - save_df_to_csv**

In [5]:
def save_df_to_csv(df):
    df_cmp_business.to_csv(
        cmp_business_data_path,
        index=False,
        encoding='utf-8-sig'
    ) 

#### **Initial environment**

In [8]:
cmp_source_path = os.path.join(os.getcwd(), cmp_source_dir, cmp_source_file_name)
cmp_business_data_path = os.path.join(os.getcwd(), cmp_business_data_dir, cmp_business_data_name)

columns = ["SNAP_DATE",
           "BUSINESS_ACCOUNTING_NO", "COMPANY_NAME", "COMPANY_STATUS", "COMPANY_STATUS_DESC", "COMPANY_SETUP_DATE",
           "BUSINESS_SEQ_NO", "BUSINESS_ITEM", "BUSINESS_ITEM_DESC"]
df_cmp_business = pd.DataFrame(columns=columns)

logging.basicConfig(
    level=logging.CRITICAL, # 全部訊息都寫入log
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y/%m/%d %H:%M:%S',
    filename=log_file_name,  
    filemode='a' # append log紀錄
)

#### **Read CSV**

In [9]:
df_source = pd.read_csv(cmp_source_path, dtype={"CAMP_ID": str})
df_source = df_source.drop_duplicates()
print(f"本次爬蟲預計處理 {df_source.shape[0]} 間公司")

本次爬蟲預計處理 311200 間公司


#### **Launch  crawler**

In [None]:
for row in df_source.itertuples():
    camp_id_cleaned = row.CAMP_ID if len(row.CAMP_ID) <= 8 else row.CAMP_ID[:8]

    json_str = cmp_business_api(
        camp_id_cleaned,
        business_data_format,
        business_data_skip_rows,
        business_data_top_n
    )

    if json_str is not None:
        parsing_json(json_str)
    else:
        logging.critical(f"{camp_id_cleaned}, 發查失敗，請確認統編是否正確")
    
    if row.Index != 0 and row.Index % 1000 == 0:
        save_df_to_csv(df_cmp_business)
        print(f"處理第{row.Index}筆資料，清理前統一編號{row.CAMP_ID}，清理後統一編號{camp_id_cleaned}")

# 爬蟲完畢後最後一次存成CSV
save_df_to_csv(df_cmp_business)

處理第1000筆資料，清理前統一編號01027772，清理後統一編號01027772
處理第2000筆資料，清理前統一編號01325076，清理後統一編號01325076
處理第3000筆資料，清理前統一編號01612416，清理後統一編號01612416
處理第4000筆資料，清理前統一編號01826487，清理後統一編號01826487
處理第5000筆資料，清理前統一編號02752961，清理後統一編號02752961
處理第6000筆資料，清理前統一編號04181778，清理後統一編號04181778
處理第7000筆資料，清理前統一編號04783770，清理後統一編號04783770
處理第8000筆資料，清理前統一編號05350856，清理後統一編號05350856
處理第9000筆資料，清理前統一編號06649341，清理後統一編號06649341
處理第10000筆資料，清理前統一編號07723699，清理後統一編號07723699
處理第11000筆資料，清理前統一編號08629412，清理後統一編號08629412
處理第12000筆資料，清理前統一編號09618336，清理後統一編號09618336
處理第13000筆資料，清理前統一編號10098116，清理後統一編號10098116
處理第14000筆資料，清理前統一編號10381252，清理後統一編號10381252
處理第15000筆資料，清理前統一編號10651585，清理後統一編號10651585
處理第16000筆資料，清理前統一編號12289554，清理後統一編號12289554
處理第17000筆資料，清理前統一編號12504982，清理後統一編號12504982
處理第18000筆資料，清理前統一編號12618465，清理後統一編號12618465
處理第19000筆資料，清理前統一編號12676056，清理後統一編號12676056
處理第20000筆資料，清理前統一編號12731826，清理後統一編號12731826
處理第21000筆資料，清理前統一編號12800196，清理後統一編號12800196
處理第22000筆資料，清理前統一編號12861382，清理後統一編號12861382
處理第23000筆資料，清理前統一編號12917484，清理後統一編號129174