In [2]:
import pandas as pd
from bertopic import BERTopic
from ckiptagger import construct_dictionary, WS, POS, NER
from transformers import AutoModelForTokenClassification
import numpy as np
import random
import torch

In [3]:
def set_seed(seed: int) -> None:
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  if torch.cuda.is_available():
    # Disable cuDNN benchmark for deterministic selection on algorithm.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
  
set_seed(4698)

In [4]:
keysfile = "data/keys.txt"
with open(keysfile) as file:
    lines = file.read().splitlines() 

print(lines[1])

南韓


In [5]:
keydict = { l: 1 for l in lines}
dictionary = construct_dictionary(keydict)

In [6]:
ws = WS("./data")
pos = POS("./data")
ner = NER("./data")

2024-04-10 13:31:21.802783: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:375] MLIR V1 optimization pass is not enabled


In [7]:
df = pd.read_csv("data/data.csv")
df = df[["year", "name", "label", "description"]]

In [8]:

stoptext = open('data/stopword.txt', encoding='utf-8').read()
stopwords = stoptext.split('\n')


In [9]:
sentence_list = df["description"].tolist()
word_sentence_list = ws(
    sentence_list,
    sentence_segmentation = True, # To consider delimiters
    segment_delimiter_set = {",", "。", ":", "?", "!", ";"}, # This is the defualt set of delimiters
    recommend_dictionary = dictionary # words in this dictionary are encouraged    
)


In [10]:
# 轉換為BERTopic 可接受格式
ws = [" ".join(w) for w in word_sentence_list]

In [11]:
model = AutoModelForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ws")
topic_model = BERTopic(
    language="chinese", 
    embedding_model=model,  
    verbose=True
)
topics, probs = topic_model.fit_transform(ws)


2024-04-10 13:34:52,128 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 99/99 [00:37<00:00,  2.67it/s]
2024-04-10 13:35:32,437 - BERTopic - Embedding - Completed ✓
2024-04-10 13:35:32,438 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-04-10 13:35:41,960 - BERTopic - Dimensionality - Completed ✓
2024-04-10 13:35:41,960 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-04-10 13:35:42,022 - BERTopic - Cluster - Completed ✓
2024-04-10 13:35:42,026 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-04-10 13:35:42,353 - BERTopic - Representation - Completed ✓


In [12]:
timestamps = df.year.tolist()  # 讀取data.csv檔案中的 year 資料，作為我們的timestamp
timestamps = [f"{str(int(t)+1911)}-01-01" for t in timestamps]

In [19]:
# 各 Topic 時間序列圖
topics_over_time = topic_model.topics_over_time(
    ws,     
    timestamps,     
)
tot_fig = topic_model.visualize_topics_over_time(
    topics_over_time, top_n_topics=12, width=1000
)
tot_fig

0it [00:00, ?it/s]

8it [00:01,  4.88it/s]


In [17]:
# 各 Topic TF-IDF 關鍵字直方圖
bar_fig = topic_model.visualize_barchart(
    top_n_topics=12,
    width=230,
)
bar_fig

In [22]:
# 各 Topic 間距離圖
topic_fig = topic_model.visualize_topics(
    top_n_topics=10,
    width=1000,
)
topic_fig