In [1]:
import os
import re

import pandas as pd
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Both CKIP drivers are built with the same settings, so keep them in one place.
# NOTE(review): device=0 presumably selects the first GPU — confirm against the
# ckip-transformers documentation.
driver_kwargs = {"model": "bert-base", "device": 0}
ws_driver = CkipWordSegmenter(**driver_kwargs)
pos_driver = CkipPosTagger(**driver_kwargs)

# Read the two raw datasets, in order.
df_0, df_1 = map(pd.read_csv, ("./data/data_0.csv", "./data/data_1.csv"))

# Report each frame's row count and show the '內文' value at index label 10
# as a quick sanity sample.
print(f"df_0 資料筆數: {len(df_0)}")
print(f"df_0 原始內文: {df_0['內文'][10]}")
print("=========================================================")
print(f"df_1 資料筆數: {len(df_1)}")
print(f"df_1 原始內文: {df_1['內文'][10]}")

df_0 資料筆數: 1518
df_0 原始內文: 不要輕易向陌生人透露個人金融信息，以防被騙。
df_1 資料筆數: 1814
df_1 原始內文: 配合完成任一方案即可領取福利加碼
限定名額100名
https://line.me/ti/p/QxL551-7u0


In [3]:
def remove_special_characters(text):
    """Strip every character that is not on the allow-list.

    Kept characters: ASCII letters and digits, CJK ideographs in the range
    U+4E00-U+9FFF, the full-width punctuation listed in the pattern, and ASCII
    curly braces. Everything else is removed, including whitespace, newlines
    and ASCII punctuation, so a URL collapses into one alphanumeric run.

    Args:
        text: Raw cell value. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as missing. Any other non-string value
            is converted with ``str()`` before filtering.

    Returns:
        str: The filtered text, or ``''`` when the input is missing.
    """
    # re.sub raises TypeError on non-string input, so one empty cell would
    # abort a whole DataFrame.apply; map missing values to an empty string.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Keep Chinese characters, English letters, digits and common punctuation.
    pattern = r'[^a-zA-Z0-9\u4e00-\u9fff，。！？、；：‘’“”（）《》〈〉【】{}]'
    return re.sub(pattern, '', text)

# 定義過濾詞性的函數
def clean(sentence_ws, sentence_pos):
    """Drop tokens with an excluded POS tag and glue the rest back together.

    Args:
        sentence_ws: Tokens of one sentence.
        sentence_pos: POS tags paired positionally with ``sentence_ws``. The
            two are combined with ``zip``, so any surplus items on the longer
            side are ignored.

    Returns:
        str: The surviving tokens concatenated with no separator.
    """
    # NOTE(review): 'Nep' and 'Nh' are presumably the CKIP tags for
    # demonstrative determiners and pronouns — confirm against the tag set.
    excluded_tags = {'Nep', 'Nh'}
    kept_tokens = [
        token
        for token, tag in zip(sentence_ws, sentence_pos)
        if tag not in excluded_tags
    ]
    return "".join(kept_tokens)

In [4]:
# Run the same clean -> segment -> POS-filter -> save pipeline over both
# datasets. A single loop replaces two copy-pasted stanzas, so the steps cannot
# drift apart. The loop runs at top level on purpose: text_list, ws_result,
# pos_result and filtered_sentences stay module-level names and end up holding
# the df_1 values.
for df, out_path in (
    (df_0, "./train_data/data_0.csv"),
    (df_1, "./train_data/data_1.csv"),
):
    # Create the output folder before the slow inference step, so a missing
    # directory cannot make to_csv fail after the work is already done.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Strip characters outside the allow-list.
    df['內文_清理後'] = df['內文'].apply(remove_special_characters)

    # Word segmentation, then POS tagging on the segmented tokens.
    text_list = df['內文_清理後'].tolist()
    ws_result = ws_driver(text_list)
    pos_result = pos_driver(ws_result)

    # Drop tokens with excluded POS tags and re-join each sentence.
    filtered_sentences = [
        clean(sentence_ws, sentence_pos)
        for sentence_ws, sentence_pos in zip(ws_result, pos_result)
    ]

    # Store the filtered text back on the frame (df is df_0 / df_1 itself, so
    # the new columns are visible on the original variables).
    df['內文_斷詞後'] = filtered_sentences

    # Show the row at index label 10 as a before/after sample.
    print(f"清理後內文: {df['內文_清理後'][10]}")
    print(f"斷詞結果: {df['內文_斷詞後'][10]}")

    df.to_csv(out_path, index=False, encoding="utf-8-sig")

Tokenization: 100%|██████████| 1518/1518 [00:00<00:00, 84375.21it/s]
  attn_output = torch.nn.functional.scaled_dot_product_attention(
Inference: 100%|██████████| 6/6 [00:15<00:00,  2.61s/it]
Tokenization: 100%|██████████| 1518/1518 [00:00<00:00, 108436.43it/s]
Inference: 100%|██████████| 10/10 [00:14<00:00,  1.42s/it]


清理後內文: 不要輕易向陌生人透露個人金融信息，以防被騙。
斷詞結果: 不要輕易向陌生人透露金融信息，以防被騙。


Tokenization: 100%|██████████| 1814/1814 [00:00<00:00, 7092.52it/s]
Inference: 100%|██████████| 8/8 [27:59<00:00, 209.99s/it]
Tokenization: 100%|██████████| 1814/1814 [00:00<00:00, 4047.40it/s]
Inference: 100%|██████████| 58/58 [1:16:07<00:00, 78.75s/it]


清理後內文: 配合完成任一方案即可領取福利加碼限定名額100名httpslinemetipQxL5517u0
斷詞結果: 配合完成任一方案即可領取福利加碼限定名額100名httpslinemetipQxL5517u0
