In [None]:
from pymongo import MongoClient
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

In [None]:
# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017')  # Replace with your MongoDB connection URI
db = client["your_database_name"]
source_collection = db["your_source_collection"]  # Collection holding the raw documents
target_collection = db["your_target_collection"]  # Collection that receives the processed results

In [None]:

# Initialize the CKIP Transformers drivers (all three use the "bert-base" model)
ws_driver = CkipWordSegmenter(model="bert-base")  # word segmentation
pos_driver = CkipPosTagger(model="bert-base")  # part-of-speech tagging
ner_driver = CkipNerChunker(model="bert-base")  # named-entity recognition


In [None]:
# Fetch a limited number of documents from MongoDB
def fetch_data_from_mongodb(limit=10):
    """Return up to ``limit`` source documents as ``{"url", "content"}`` dicts.

    Only documents that have a ``url`` field and a string-typed ``content``
    field are selected. Without that filter a partially populated document
    raises ``KeyError`` in the list comprehension below.

    Args:
        limit: Maximum number of documents to return. The value is passed
            straight to the cursor's ``limit()``; in PyMongo a limit of 0
            means "no limit".

    Returns:
        list[dict]: One ``{"url": ..., "content": ...}`` dict per matching
        document, in cursor order. Empty when nothing matches.
    """
    # Filter server-side so the limit counts usable documents only.
    query = {"url": {"$exists": True}, "content": {"$type": "string"}}
    # _id is not used by the caller, so leave it out of the payload.
    projection = {"_id": 0, "url": 1, "content": 1}
    cursor = source_collection.find(query, projection).limit(limit)
    return [{"url": doc["url"], "content": doc["content"]} for doc in cursor]

In [None]:
# Text-processing helper
def process_text(text_list):
    """Run word segmentation, POS tagging and NER over a batch of texts.

    Args:
        text_list: Sized sequence of raw texts handed to the CKIP drivers.

    Returns:
        list[dict]: One dict per input text, in input order, with the keys
        ``"ws"`` (segmentation), ``"pos"`` (POS tags) and ``"ner"``
        (named entities).
    """
    segmented = ws_driver(text_list)
    tagged = pos_driver(segmented)  # the POS tagger consumes the segmenter output
    entities = ner_driver(text_list)  # NER runs on the raw texts

    combined = []
    for idx in range(len(text_list)):
        combined.append(
            {"ws": segmented[idx], "pos": tagged[idx], "ner": entities[idx]}
        )
    return combined

In [None]:
# Persist the processed results to MongoDB
def save_to_mongodb(results):
    """Upsert every processed result into ``target_collection``, keyed by URL.

    Args:
        results: Iterable of dicts with the keys ``"url"``, ``"content"`` and
            ``"ckip"``; ``"ckip"`` itself holds ``"ws"``, ``"pos"`` and
            ``"ner"``.
    """
    for entry in results:
        ckip = entry["ckip"]
        # Reshape into the stored layout: CKIP output lives under "ckip_results".
        payload = {
            "url": entry["url"],
            "content": entry["content"],
            "ckip_results": {field: ckip[field] for field in ("ws", "pos", "ner")},
        }
        # Upsert so that re-processing a URL overwrites its previous result.
        target_collection.update_one(
            {"url": payload["url"]},
            {"$set": payload},
            upsert=True,
        )

In [None]:

# Main execution logic
def main(limit=10):
    """Fetch documents, run CKIP on their content and save the results.

    Args:
        limit: Maximum number of source documents to process. It is forwarded
            to ``fetch_data_from_mongodb``. Defaults to 10, the value that
            used to be hard-coded here.
    """
    data = fetch_data_from_mongodb(limit=limit)
    if not data:
        print("No data found.")
        return

    # Pull out the raw texts and process them as one batch.
    texts = [item["content"] for item in data]
    ckip_results = process_text(texts)

    # Pair every document with its CKIP output (both lists share one order).
    results = [
        {"url": item["url"], "content": item["content"], "ckip": ckip}
        for item, ckip in zip(data, ckip_results)
    ]

    save_to_mongodb(results)
    print("Data processing and saving completed.")
# Run the main function
main()

In [None]:
# Dry run: process documents and print the results without saving them
def trail(limit=10):
    """Fetch documents, run CKIP on their content and print the results.

    Unlike ``main`` this function writes nothing back to MongoDB; it only
    prints each document together with its CKIP output.

    Args:
        limit: Maximum number of source documents to process. It is forwarded
            to ``fetch_data_from_mongodb``. Defaults to 10, the value that
            used to be hard-coded here.
    """
    data = fetch_data_from_mongodb(limit=limit)
    if not data:
        print("No data found.")
        return

    # Pull out the raw texts and process them as one batch.
    texts = [item["content"] for item in data]
    ckip_results = process_text(texts)

    # Print every document next to its CKIP output (both lists share one order).
    for item, ckip in zip(data, ckip_results):
        print(f"URL: {item['url']}")
        print(f"原文: {item['content']}")
        print("CKIP 處理結果:")
        print(f"  分詞: {ckip['ws']}")
        print(f"  詞性標註: {ckip['pos']}")
        print(f"  命名實體: {ckip['ner']}")
        print("-" * 50)

# Run the print-only trial function
trail()

測試連結 MongoDB

In [4]:
from pymongo import MongoClient
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker
import re

In [None]:
# Connect to MongoDB
client = MongoClient('mongodb://host.docker.internal:27017')  # Replace with your MongoDB connection URI
db = client["kafka"]
collection = db["ptt"]  # Name of your collection

In [6]:
# Initialize the CKIP Transformers drivers (all three use the "bert-base" model).
# NOTE(review): an earlier cell already builds the same three drivers with the
# same arguments; running this cell constructs them again.
ws_driver = CkipWordSegmenter(model="bert-base")  # word segmentation
pos_driver = CkipPosTagger(model="bert-base")  # part-of-speech tagging
ner_driver = CkipNerChunker(model="bert-base")  # named-entity recognition

In [7]:
# Text preprocessing helper
def preprocess_text(text):
    """Remove blank lines, a "Sent from" signature and dash separators.

    Steps, in order:
      1. collapse every run of newlines into a single newline;
      2. drop a "-----" line together with the "Sent from ..." line after it;
      3. drop any remaining run of two or more dashes;
      4. collapse newlines again, because steps 2 and 3 can leave empty
         lines behind;
      5. trim leading and trailing whitespace.

    Args:
        text: Raw text. ``None`` or an empty string yields ``""``.

    Returns:
        str: The cleaned text.
    """
    if not text:
        return ""

    text = re.sub(r"\n+", "\n", text)
    # "." does not match a newline, so only the rest of the "Sent from" line goes.
    text = re.sub(r"-----\nSent from.*", "", text)
    text = re.sub(r"--+", "", text)
    # The removals above may have produced consecutive newlines again.
    text = re.sub(r"\n+", "\n", text)
    return text.strip()

# Fetch documents from MongoDB and run them through the CKIP pipeline
def fetch_and_process_data(limit=1):
    """Fetch documents, preprocess and analyse their text, and print the result.

    For every fetched document the "內容" field of its "value" sub-document
    is cleaned with ``preprocess_text`` and then passed through the CKIP word
    segmenter, POS tagger and NER chunker. The results are only printed;
    nothing is written back to MongoDB.

    Args:
        limit: Maximum number of documents to read from ``collection``.
            Defaults to 1, the value that used to be hard-coded for testing.
    """
    cursor = collection.find({}, {"key": 1, "value": 1}).limit(limit)
    for item in cursor:
        url = item["key"]
        value = item["value"]

        # Extract and preprocess the "內容" field. `or ""` also covers a key
        # that exists but is stored as null, which would otherwise reach the
        # regex calls as None.
        raw_content = value.get("內容") or ""
        preprocessed_content = preprocess_text(raw_content)

        # Run CKIP word segmentation, POS tagging and named-entity recognition.
        # The drivers take a batch, hence the single-element lists.
        ws_result = ws_driver([preprocessed_content])
        pos_result = pos_driver(ws_result)
        ner_result = ner_driver([preprocessed_content])

        # Assemble the processed record: the original metadata fields plus the
        # expanded "內容" entry.
        processed_value = {
            "發佈日期": value.get("發佈日期"),
            "標題": value.get("標題"),
            "作者": value.get("作者"),
            "內容": {
                "原文": raw_content,
                "預處理後": preprocessed_content,
                "分詞": ws_result[0],
                "詞性標註": pos_result[0],
                "命名實體": ner_result[0]
            },
            "推": value.get("推"),
            "噓": value.get("噓"),
            "箭頭": value.get("箭頭"),
            "連結": value.get("連結"),
            "留言": value.get("留言")
        }

        # Print the result
        print(f"URL: {url}")
        print("處理後的內容:")
        print(f"  原文: {processed_value['內容']['原文']}")
        print(f"  預處理後: {processed_value['內容']['預處理後']}")
        print(f"  分詞: {processed_value['內容']['分詞']}")
        print(f"  詞性標註: {processed_value['內容']['詞性標註']}")
        print(f"  命名實體: {processed_value['內容']['命名實體']}")
        print("-" * 50)

# Run the function
fetch_and_process_data()

ServerSelectionTimeoutError: localhost:27017: [Errno 111] Connection refused, Timeout: 30s, Topology Description: <TopologyDescription id: 6728a2509483bad9d4bab1d7, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused')>]>