In [901]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import spacy
import pytextrank  # We're not going to execute this at home.
# import pprint
# from collections import Counter
# import matplotlib.pyplot as plt
# from sklearn.metrics import roc_curve, precision_recall_curve, auc
# from sklearn.metrics import precision_score, recall_score, f1_score

In [902]:
#TF-IDF Function
def Tfidf(seg_list:list, top_k:int):
    """Return the ``top_k`` vocabulary terms with the highest TF-IDF weight in row 0.

    ``seg_list`` is handed to ``CountVectorizer.fit_transform``, which treats
    every element as one document. The count matrix is re-weighted with
    ``TfidfTransformer`` and the terms are ranked by their weight in the
    first row of the resulting matrix.

    Args:
        seg_list: Iterable of strings to vectorize.
        top_k: Number of terms to return.

    Returns:
        A list of at most ``top_k`` terms, highest weight first.
    """
    vectorizer = CountVectorizer()
    word_count = vectorizer.fit_transform(seg_list)  # term-count matrix
    tfidf_matrix = TfidfTransformer().fit_transform(word_count)  # counts -> TF-IDF

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # and fall back only on installs that predate get_feature_names_out().
    if hasattr(vectorizer, "get_feature_names_out"):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()

    # Densify only the row that is actually ranked instead of the whole matrix.
    # NOTE(review): only row 0 (the first element of seg_list) is ranked. If
    # callers pass the tokens of a single article, that is the first token
    # only -- confirm this is the intended behaviour.
    tfidf_values = tfidf_matrix[0].toarray().ravel()
    sorted_index = np.argsort(tfidf_values)[::-1]  # descending weight
    return [str(words[i]) for i in sorted_index[:top_k]]

#TextRank Function
# Loaded pipelines keyed by model name, so the model is read from disk once
# rather than on every Txtrank() call.
_TEXTRANK_PIPELINES = {}


def _textrank_pipeline(model_name='zh_core_web_sm'):
    """Return the spaCy pipeline for ``model_name`` with the "textrank" pipe added, loading it once."""
    nlp = _TEXTRANK_PIPELINES.get(model_name)
    if nlp is None:
        nlp = spacy.load(model_name)
        nlp.add_pipe("textrank")
        _TEXTRANK_PIPELINES[model_name] = nlp
    return nlp


def Txtrank(seg_list, top_k:int):
    """Return the text of the first ``top_k`` phrases that TextRank reports.

    Args:
        seg_list: The text to analyse; it is passed directly to the spaCy
            pipeline, so it is expected to be a single string.
        top_k: Maximum number of phrases to return.

    Returns:
        A list of phrase strings in the order given by ``doc._.phrases``.
    """
    doc = _textrank_pipeline()(seg_list)
    textrank_keywords = []
    for phrase in doc._.phrases:
        if len(textrank_keywords) >= top_k:
            break
        textrank_keywords.append(phrase.text)
    return textrank_keywords

#LDA Function

def Lda(seg_list:list, Num_topics:int, Num_keywords:int, Max_iter:int, random_state=None):
    """Fit an LDA model on ``seg_list`` and return the top words of its topics.

    Args:
        seg_list: Iterable of strings; ``CountVectorizer`` treats each element
            as one document.
        Num_topics: Number of LDA components to fit.
        Num_keywords: Number of highest-weighted words taken from each topic.
        Max_iter: Maximum number of LDA iterations.
        random_state: Optional seed forwarded to ``LatentDirichletAllocation``
            so results can be reproduced. The default (``None``) keeps the
            previous unseeded behaviour.

    Returns:
        A flat list of keywords. With one topic this is that topic's top
        ``Num_keywords`` words, highest weight first. With several topics the
        lists are concatenated in topic order and repeated words are kept only
        once (previously every topic but the last was silently discarded).
    """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(seg_list)  # document-term count matrix
    lda = LatentDirichletAllocation(
        n_components=Num_topics, max_iter=Max_iter, random_state=random_state
    )
    lda.fit(X)  # only the fitted components_ are needed, not the transform

    # Fetch the vocabulary once. get_feature_names() was removed in
    # scikit-learn 1.2, so prefer its replacement and fall back on old installs.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    top_keywords = []
    seen = set()
    for topic in lda.components_:
        # Indices of the Num_keywords largest weights, in descending order.
        top_keyword_idxs = topic.argsort()[:-Num_keywords-1:-1]
        for idx in top_keyword_idxs:
            keyword = str(feature_names[idx])
            if keyword not in seen:
                seen.add(keyword)
                top_keywords.append(keyword)
    return top_keywords

#以下檢驗準確率使用
# Accuracy = （TP+TN） / （TP+TN+FP+FN）
# Precision = TP /（TP+FP）
# Recall = TP /（TP+FN）
# extracted_list = TP + FP
# valid_list = TP + FN
# TP = set(extracted_list).intersection(set(valid_list))
# FP = set(extracted_list).difference(set(valid_list))
# FN = set(valid_list).difference(set(extracted_list))
# TP+FP+FN = set(extracted_list).union(set(valid_list))
# TN = ?????
#
# GPT算法
# 這裡用來驗證關鍵字提取準確率的方式是：準確率 = (交集中的關鍵字數 / 驗證關鍵字數) × 100%
# 其中，交集中的關鍵字數是指從文件中提取的關鍵字列表和驗證關鍵字列表的交集中包含的關鍵字數量。

def accuracy_gpt(extracted_list:list, valid_list:list):
    """Per-document share of reference keywords that were extracted.

    For each position ``i`` the score is ``TP / (TP + FN)``: the number of
    distinct keywords present in both ``extracted_list[i]`` and
    ``valid_list[i]``, divided by the number of distinct reference keywords.
    False positives are not part of the denominator.

    Args:
        extracted_list: One collection of predicted keywords per document.
        valid_list: One collection of reference keywords per document, aligned
            with ``extracted_list``.

    Returns:
        A list of scores rounded to four decimals; a document with no
        reference keywords scores 0.0 instead of raising ZeroDivisionError.
    """
    accuracy_results = []
    for i, y_pred in enumerate(extracted_list):
        # Work on sets throughout so duplicates cannot skew the ratio.
        truth = set(valid_list[i])
        if not truth:
            accuracy_results.append(0.0)
            continue
        hits = set(y_pred) & truth  # TP
        accuracy_results.append(round(len(hits) / len(truth), 4))
    return accuracy_results
#
def accuracy_mine(extracted_list:list, valid_list:list):
    """Per-document overlap ratio ``TP / (TP + FP + FN)``.

    For each position ``i`` this is the number of distinct keywords shared by
    ``extracted_list[i]`` and ``valid_list[i]`` divided by the number of
    distinct keywords in their union.

    Args:
        extracted_list: One collection of predicted keywords per document.
        valid_list: One collection of reference keywords per document, aligned
            with ``extracted_list``.

    Returns:
        A list of scores rounded to four decimals; a document where both
        collections are empty scores 0.0 instead of raising ZeroDivisionError.
    """
    accuracy_results = []
    for i, y_pred in enumerate(extracted_list):
        predicted = set(y_pred)
        truth = set(valid_list[i])
        combined = predicted | truth  # TP + FP + FN
        if not combined:
            accuracy_results.append(0.0)
            continue
        shared = predicted & truth  # TP
        accuracy_results.append(round(len(shared) / len(combined), 4))
    return accuracy_results
#
def precision_mine(extracted_list:list, valid_list:list):
    """Per-document precision ``TP / (TP + FP)``.

    For each position ``i`` this is the number of distinct keywords shared by
    ``extracted_list[i]`` and ``valid_list[i]`` divided by the number of
    distinct extracted keywords.

    Args:
        extracted_list: One collection of predicted keywords per document.
        valid_list: One collection of reference keywords per document, aligned
            with ``extracted_list``.

    Returns:
        A list of scores rounded to four decimals; a document for which
        nothing was extracted scores 0.0 instead of raising ZeroDivisionError.
    """
    precision_results = []
    for i, y_pred in enumerate(extracted_list):
        # Distinct predictions only, so a repeated keyword is not counted as an extra FP.
        predicted = set(y_pred)
        truth = set(valid_list[i])
        if not predicted:
            precision_results.append(0.0)
            continue
        hits = predicted & truth  # TP
        precision_results.append(round(len(hits) / len(predicted), 4))
    return precision_results
#
def recall_mine(extracted_list:list, valid_list:list):
    """Per-document recall ``TP / (TP + FN)``.

    For each position ``i`` this is the number of distinct keywords shared by
    ``extracted_list[i]`` and ``valid_list[i]`` divided by the number of
    distinct reference keywords.

    Args:
        extracted_list: One collection of predicted keywords per document.
        valid_list: One collection of reference keywords per document, aligned
            with ``extracted_list``.

    Returns:
        A list of scores rounded to four decimals; a document with no
        reference keywords scores 0.0 instead of raising ZeroDivisionError.
    """
    recall_results = []
    for i, y_pred in enumerate(extracted_list):
        # Distinct reference keywords only, so duplicates cannot deflate recall.
        truth = set(valid_list[i])
        if not truth:
            recall_results.append(0.0)
            continue
        hits = set(y_pred) & truth  # TP
        recall_results.append(round(len(hits) / len(truth), 4))
    return recall_results
#
def f1_score_mine(precision_results:list, recall_results:list):
    """Combine aligned precision and recall values into F1 scores.

    ``precision_results[i]`` is paired with ``recall_results[i]``; each pair
    yields its harmonic mean rounded to four decimals. A pair that sums to
    zero scores 0.
    """
    def _harmonic_mean(p, r):
        total = p + r
        return 0 if total == 0 else 2 * ((p * r) / total)

    return [
        round(_harmonic_mean(p, recall_results[pos]), 4)
        for pos, p in enumerate(precision_results)
    ]

def fbeta_score(precision_results:list, recall_results:list, beta:float):
    """Combine aligned precision and recall values into F-beta scores.

    Each pair is scored as
    ``(1 + beta**2) * p * r / (beta**2 * p + r)`` and rounded to four decimals.

    Args:
        precision_results: Precision value per document.
        recall_results: Recall value per document, aligned with
            ``precision_results``.
        beta: Weight of recall relative to precision.

    Returns:
        A list of F-beta scores. A pair whose denominator is zero scores 0.0;
        this also covers ``beta == 0`` with zero recall, which previously
        raised ZeroDivisionError.
    """
    beta_sq = beta ** 2
    fbeta_results = []
    for i, pc in enumerate(precision_results):
        rc = recall_results[i]
        denominator = (pc * beta_sq) + rc
        # Guard the denominator that is actually divided by, not just pc + rc.
        if pc + rc == 0 or denominator == 0:
            fbeta = 0.0
        else:
            fbeta = ((1 + beta_sq) * pc * rc) / denominator
        fbeta_results.append(round(fbeta, 4))
    return fbeta_results

# 以上使用的是集體檢測預測詞命中驗證詞的狀況，以下是採取將預測詞一一取出驗證真假，並填寫1/0的方法。
# def calculate_roc_auc(extracted_list:list, valid_list:list):
#     # 將預測詞轉換為二元陣列，其中1表示該詞存在於真正的答案中，0表示不存在
#     y_true = [1 if keyword in valid_list else 0 for keyword in extracted_list]
#     y_scores = [0 if keyword in valid_list else 1 for keyword in extracted_list]
#     # 計算ROC Curve下的FPR和TPR以及閾值
#     fpr, tpr, thresholds = roc_curve(y_true, y_scores)
#     # 計算ROC Curve下的AUC
#     roc_auc = auc(fpr, tpr)
#     return roc_auc

# def calculate_pr_auc(extracted_list:list, valid_list:list):
#     # 將預測詞轉換為二元陣列，其中1表示該詞存在於真正的答案中，0表示不存在
#     y_true = [1 if keyword in valid_list else 0 for keyword in extracted_list]
#     y_scores = [0 if keyword in valid_list else 1 for keyword in extracted_list]
#     # 計算Precision-Recall Curve下的precision和recall以及閾值
#     precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
#     # 計算Precision-Recall Curve下的AUC
#     pr_auc = auc(recall, precision)
#     return pr_auc

# def calculate_precision(extracted_list:list, valid_list:list):
#     # 將預測詞轉換為二元陣列，其中1表示該詞存在於真正的答案中，0表示不存在
#     y_true = [1 if keyword in valid_list else 0 for keyword in extracted_list]
#     y_pred = [1 if keyword in extracted_list else 0 for keyword in valid_list]
#     # 計算Precision
#     precision = precision_score(y_true, y_pred)
#     return precision

# def calculate_recall(extracted_list:list, valid_list:list):
#     # 將預測詞轉換為二元陣列，其中1表示該詞存在於真正的答案中，0表示不存在
#     y_true = [1 if keyword in valid_list else 0 for keyword in extracted_list]
#     y_pred = [1 if keyword in extracted_list else 0 for keyword in valid_list]
#     # 計算Recall
#     recall = recall_score(y_true, y_pred)
#     return recall

# def calculate_f1_score(extracted_list:list, valid_list:list):
#     # 將預測詞轉換為二元陣列，其中1表示該詞存在於真正的答案中，0表示不存在
#     y_true = [1 if keyword in valid_list else 0 for keyword in extracted_list]
#     y_pred = [1 if keyword in extracted_list else 0 for keyword in valid_list]
#     # 計算F1-Score
#     f1_score = f1_score(y_true, y_pred)
#     return f1_score

#以上用來跑多個K

In [903]:
# Load the news dataset from the CSV file, decoding it as UTF-8.
data_topic = pd.read_csv('taiwan_charity_news_topic.csv', encoding='UTF-8')

In [904]:
# Preview the first three rows of the loaded frame.
data_topic.head(3)

Unnamed: 0,Foreign_key,Date,Title,Media,Content,Topic_test,Seg_list,News_tags
0,96,2020/10/22,視力僅剩0.3，愛盲助其考證照、工作、帶大4子長大,Chinatimes,視障媽媽林佳臻，在視障、單親、低收入戶等窘迫條件下，15年來，不但一手撐起整個家、帶大四個孩...,Visually_impaired_mom,視障 媽媽 林佳臻 視障 單親 低收入戶 窘迫 條件 下 15 年 來 撐起 整 個 家 帶...,"愛盲, 服務, 視障, 媽媽"
1,97,2020/10/22,視力0.3的「超能力」。她靠1把掃把，養活4子女。,United_Daily_News,「天下的媽媽都是一樣的」，誰說視障女性無法勝任媽媽的角色？\n愛盲基金會表示，基金會成立近3...,Visually_impaired_mom,天下 媽媽 一樣 說 視障 女性 勝任 媽媽 角色 愛盲 基金會 表示 基金會 成立 近 3...,"愛盲, 服務, 視障, 媽媽"
2,98,2020/10/22,視力僅剩0.3，愛盲助其考證照、工作、帶大4子長大。,Commercial_Times,視障媽媽林佳臻，在視障、單親、低收入戶等窘迫條件下，15年來，不但一手撐起整個家、帶大四個孩...,Visually_impaired_mom,視障 媽媽 林佳臻 視障 單親 低收入戶 窘迫 條件 下 15 年 來 撐起 整 個 家 帶...,"愛盲, 服務, 視障, 媽媽"


In [905]:
# Split every space-delimited 'Seg_list' string into its list of tokens.
segs = [seg_text.split(' ') for seg_text in data_topic['Seg_list']]

In [906]:
# Parse the comma-separated 'News_tags' string of every row into a list of tags.
# Splitting on the comma itself and trimming each piece keeps tags intact when
# no space follows a comma or when a tag contains a space, and drops empty
# pieces that would otherwise inflate the reference-keyword count.
tags = [
    [piece.strip() for piece in tag_text.split(',') if piece.strip()]
    for tag_text in data_topic['News_tags']
]

In [907]:
# Number of keywords requested from each extractor; also embedded in the
# names of the CSV files written further down.
K_value = 35

In [908]:
# TF-IDF keywords (up to K_value each) for every tokenized article.
tfidf_list = [Tfidf(tokens, K_value) for tokens in segs]

In [909]:
# print(tfidf_list[0])
# print(len(tfidf_list))

In [910]:
# Txtrank takes one string, so each article's tokens are re-joined with spaces first.
txtrank_list = [Txtrank(" ".join(tokens), K_value) for tokens in segs]

In [911]:
# Show the TextRank phrases extracted for the first article.
print(txtrank_list[0])

['今年', '台灣', '單親', '林佳臻', '0', '12 年', '15 年', '20', '3', '500', '6', '87％', '三', '三成', '六', '四', '第二']


In [912]:
# Single-topic LDA (50 iterations) per article, keeping its K_value top words.
lda_list = [Lda(tokens, 1, K_value, 50) for tokens in segs]

In [913]:
# print(lda_list[0])

In [914]:
# Topic1_idx = [0, 1, 2, 3, 4]
# Topic2_idx = [5, 6, 7, 8, 9, 10, 11, 12]
# Topic3_idx = [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 29]
# Topic4_idx = [24, 25, 26, 27, 28]

In [915]:
# TF-IDF metrics: score the TF-IDF keywords of every article against its tags.
TFIDF_acc = accuracy_mine(tfidf_list, tags)
TFIDF_pc = precision_mine(tfidf_list, tags)
TFIDF_rc = recall_mine(tfidf_list, tags)
TFIDF_f1 = f1_score_mine(TFIDF_pc, TFIDF_rc)
# F-beta at increasing recall weight.
TFIDF_fb15, TFIDF_fb20, TFIDF_fb25, TFIDF_fb30 = (
    fbeta_score(TFIDF_pc, TFIDF_rc, beta_weight)
    for beta_weight in (1.5, 2.0, 2.5, 3.0)
)
for metric_values in (
    TFIDF_acc, TFIDF_pc, TFIDF_rc, TFIDF_f1,
    TFIDF_fb15, TFIDF_fb20, TFIDF_fb25, TFIDF_fb30,
):
    print(metric_values)

[0.0541, 0.0541, 0.0541, 0.0541, 0.0541, 0.0811, 0.0811, 0.1111, 0.0811, 0.1111, 0.0811, 0.0811, 0.0811, 0.0233, 0.0465, 0.0233, 0.0476, 0.075, 0.0476, 0.0233, 0.0513, 0.0233, 0.0513, 0.0476, 0.025, 0.0833, 0.025, 0.025, 0.0789, 0.0732]
[0.0571, 0.0571, 0.0571, 0.0571, 0.0571, 0.0857, 0.0857, 0.1143, 0.0857, 0.1143, 0.0857, 0.0857, 0.0857, 0.0286, 0.0571, 0.0286, 0.0571, 0.0857, 0.0571, 0.0286, 0.0571, 0.0286, 0.0571, 0.0571, 0.0286, 0.0857, 0.0286, 0.0286, 0.0857, 0.0857]
[0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.6, 0.8, 0.6, 0.8, 0.6, 0.6, 0.6, 0.1111, 0.2, 0.1111, 0.2222, 0.375, 0.2222, 0.1111, 0.3333, 0.1111, 0.3333, 0.2222, 0.1667, 0.75, 0.1667, 0.1667, 0.5, 0.3333]
[0.1025, 0.1025, 0.1025, 0.1025, 0.1025, 0.15, 0.15, 0.2, 0.15, 0.2, 0.15, 0.15, 0.15, 0.0455, 0.0888, 0.0455, 0.0909, 0.1395, 0.0909, 0.0455, 0.0975, 0.0455, 0.0975, 0.0909, 0.0488, 0.1538, 0.0488, 0.0488, 0.1463, 0.1363]
[0.1476, 0.1476, 0.1476, 0.1476, 0.1476, 0.2108, 0.2108, 0.2811, 0.2108, 0.2811, 0.2108, 0.2108, 0.2108, 0

In [916]:
# TextRank metrics: score the TextRank phrases of every article against its tags.
TextRank_acc = accuracy_mine(txtrank_list, tags)
TextRank_pc = precision_mine(txtrank_list, tags)
TextRank_rc = recall_mine(txtrank_list, tags)
TextRank_f1 = f1_score_mine(TextRank_pc, TextRank_rc)
# F-beta at increasing recall weight.
TextRank_fb15, TextRank_fb20, TextRank_fb25, TextRank_fb30 = (
    fbeta_score(TextRank_pc, TextRank_rc, beta_weight)
    for beta_weight in (1.5, 2.0, 2.5, 3.0)
)
for metric_values in (
    TextRank_acc, TextRank_pc, TextRank_rc, TextRank_f1,
    TextRank_fb15, TextRank_fb20, TextRank_fb25, TextRank_fb30,
):
    print(metric_values)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0625, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0263, 0.0278, 0.0345, 0.0278, 0.0, 0.0357, 0.0323, 0.0, 0.027, 0.0, 0.0417, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0294]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0833, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0333, 0.037, 0.0476, 0.0357, 0.0, 0.05, 0.0435, 0.0, 0.0345, 0.0, 0.0625, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0385]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1111, 0.1, 0.1111, 0.1111, 0.0, 0.1111, 0.1111, 0.0, 0.1111, 0.0, 0.1111, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1111]
[0, 0, 0, 0, 0, 0, 0.1176, 0, 0, 0, 0, 0, 0, 0.0512, 0.054, 0.0666, 0.054, 0, 0.069, 0.0625, 0, 0.0527, 0, 0.08, 0, 0, 0, 0, 0, 0.0572]
[0, 0, 0, 0, 0, 0, 0.1398, 0, 0, 0, 0, 0, 0, 0.0646, 0.0656, 0.0788, 0.0673, 0, 0.0807, 0.0752, 0, 0.066, 0, 0.0897, 0, 0, 0, 0, 0, 0.0703]
[0, 0, 0, 0, 0, 0, 0.1562, 0, 0, 0, 0, 0, 0, 0.0757, 0.0746, 0.0877, 0.0781, 0, 0.0893, 0.0848, 0, 0.0769, 0, 0.0961, 0, 0, 0, 0, 0, 0.0807]
[0, 0, 0, 0, 0, 0, 0.1676, 0, 0, 0, 0, 0, 0, 0.084

In [917]:
# LDA metrics: score the LDA keywords of every article against its tags.
LDA_acc = accuracy_mine(lda_list, tags)
LDA_pc = precision_mine(lda_list, tags)
LDA_rc = recall_mine(lda_list, tags)
LDA_f1 = f1_score_mine(LDA_pc, LDA_rc)
LDA_fb15 = fbeta_score(LDA_pc, LDA_rc, 1.5)
LDA_fb20 = fbeta_score(LDA_pc, LDA_rc, 2.0)
LDA_fb25 = fbeta_score(LDA_pc, LDA_rc, 2.5)
LDA_fb30 = fbeta_score(LDA_pc, LDA_rc, 3.0)
# Print every metric, including LDA_fb25, which was previously computed but
# never printed, unlike in the TF-IDF and TextRank cells.
for metric_values in (
    LDA_acc, LDA_pc, LDA_rc, LDA_f1,
    LDA_fb15, LDA_fb20, LDA_fb25, LDA_fb30,
):
    print(metric_values)

[0.1143, 0.1143, 0.1143, 0.1143, 0.0833, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.2571, 0.25, 0.2571, 0.2571, 0.1622, 0.2571, 0.2222, 0.0789, 0.2571, 0.0789, 0.2222, 0.1714, 0.1143, 0.1714, 0.1714, 0.1081, 0.2222]
[0.1143, 0.1143, 0.1143, 0.1143, 0.0857, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.2571, 0.2571, 0.2571, 0.2571, 0.1714, 0.2571, 0.2286, 0.0857, 0.2571, 0.0857, 0.2286, 0.1714, 0.1143, 0.1714, 0.1714, 0.1143, 0.2286]
[1.0, 1.0, 1.0, 1.0, 0.75, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 0.75, 1.0, 0.8889, 0.5, 1.0, 0.5, 0.8889, 1.0, 1.0, 1.0, 1.0, 0.6667, 0.8889]
[0.2052, 0.2052, 0.2052, 0.2052, 0.1538, 0.2501, 0.2501, 0.2501, 0.2501, 0.2501, 0.2501, 0.2501, 0.2501, 0.409, 0.3999, 0.409, 0.409, 0.279, 0.409, 0.3637, 0.1463, 0.409, 0.1463, 0.3637, 0.2926, 0.2052, 0.2926, 0.2926, 0.1951, 0.3637]
[0.2955, 0.2955, 0.2955, 0.2955, 0.2216, 0.3514, 0.3514, 0.3514, 0.3514, 0.3514, 0.3514, 0.3514, 0.3514, 0.5294, 0.5086,

In [918]:
# Article ids and topic labels as plain Python lists, printed for reference.
Text_Num = data_topic['Foreign_key'].tolist()
Topics = data_topic['Topic_test'].tolist()
for column_values in (Text_Num, Topics):
    print(column_values)

[96, 97, 98, 101, 107, 128, 129, 130, 131, 132, 133, 134, 135, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 256, 260, 262, 263, 264, 265, 270]
['Visually_impaired_mom', 'Visually_impaired_mom', 'Visually_impaired_mom', 'Visually_impaired_mom', 'Visually_impaired_mom', 'Okmart_charity_event', 'Okmart_charity_event', 'Okmart_charity_event', 'Okmart_charity_event', 'Okmart_charity_event', 'Okmart_charity_event', 'Okmart_charity_event', 'Okmart_charity_event', 'life_rebuild_event', 'life_rebuild_event', 'life_rebuild_event', 'life_rebuild_event', 'life_rebuild_event', 'life_rebuild_event', 'life_rebuild_event', 'life_rebuild_event', 'life_rebuild_event', 'life_rebuild_event', 'life_rebuild_event', 'dark experience', 'dark experience', 'dark experience', 'dark experience', 'dark experience', 'life_rebuild_event']


In [919]:
# Gather the id/topic columns and every per-article metric list into one table;
# the keyword names below become the column names, in this order.
Metrics = dict(
    Text_Num=data_topic['Foreign_key'],
    Topic=data_topic['Topic_test'],
    # TF-IDF
    TFIDF_acc=TFIDF_acc,
    TFIDF_pc=TFIDF_pc,
    TFIDF_rc=TFIDF_rc,
    TFIDF_f1=TFIDF_f1,
    TFIDF_fb15=TFIDF_fb15,
    TFIDF_fb20=TFIDF_fb20,
    TFIDF_fb25=TFIDF_fb25,
    TFIDF_fb30=TFIDF_fb30,
    # TextRank
    TextRank_acc=TextRank_acc,
    TextRank_pc=TextRank_pc,
    TextRank_rc=TextRank_rc,
    TextRank_f1=TextRank_f1,
    TextRank_fb15=TextRank_fb15,
    TextRank_fb20=TextRank_fb20,
    TextRank_fb25=TextRank_fb25,
    TextRank_fb30=TextRank_fb30,
    # LDA
    LDA_acc=LDA_acc,
    LDA_pc=LDA_pc,
    LDA_rc=LDA_rc,
    LDA_f1=LDA_f1,
    LDA_fb15=LDA_fb15,
    LDA_fb20=LDA_fb20,
    LDA_fb25=LDA_fb25,
    LDA_fb30=LDA_fb30,
)

keyword_metrics = pd.DataFrame(Metrics)

In [920]:
# Save the per-article metrics; the file name carries the current K_value.
keyword_metrics.to_csv(f'keyword_metrics_k{K_value}.csv')

In [921]:
def _rows_and_rounded_means(topic_label):
    """Return the keyword_metrics rows whose 'Topic' equals topic_label, plus their rounded column means."""
    topic_rows = keyword_metrics.loc[keyword_metrics['Topic'] == topic_label]
    # Columns 2..25 are the metric columns; columns 0 and 1 hold id and topic.
    metric_means = topic_rows.iloc[:, 2:26].mean(numeric_only=True).round(4).to_list()
    return topic_rows, metric_means


# Rows and arithmetic means of every metric, one pair per topic.
topic1, topic1_mean = _rows_and_rounded_means('Visually_impaired_mom')
topic2, topic2_mean = _rows_and_rounded_means('Okmart_charity_event')
topic3, topic3_mean = _rows_and_rounded_means('life_rebuild_event')
topic4, topic4_mean = _rows_and_rounded_means('dark experience')

# Row data for the comparison table: topic label followed by its means.
Line1 = ['Visually_impaired_mom'] + topic1_mean
Line2 = ['Okmart_charity_event'] + topic2_mean
Line3 = ['life_rebuild_event'] + topic3_mean
Line4 = ['dark experience'] + topic4_mean

print(topic1_mean)
print(Line1)
print(len(Line1))

[0.0541, 0.0571, 0.5, 0.1025, 0.1476, 0.196, 0.2416, 0.2816, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1081, 0.1086, 0.95, 0.1949, 0.2807, 0.3726, 0.4592, 0.5352]
['Visually_impaired_mom', 0.0541, 0.0571, 0.5, 0.1025, 0.1476, 0.196, 0.2416, 0.2816, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1081, 0.1086, 0.95, 0.1949, 0.2807, 0.3726, 0.4592, 0.5352]
25


In [922]:
# Build the Comparison DataFrame: one row per topic, one averaged column per
# (method, metric) pair, in the same order as the metric columns of
# keyword_metrics.
comparison_methods = ('TFIDF', 'TextRank', 'LDA')
comparison_stats = ('acc', 'pc', 'rc', 'f1', 'fb15', 'fb20', 'fb25', 'fb30')
cols = ['Topics'] + [
    f'{method}_{stat}_avg'
    for method in comparison_methods
    for stat in comparison_stats
]

# Create the frame from all rows at once instead of growing an empty frame
# with .loc; this lets pandas infer numeric dtypes for the metric columns
# and keeps the default 0..3 row index.
Comparison = pd.DataFrame([Line1, Line2, Line3, Line4], columns=cols)

In [923]:
# Display the per-topic comparison table.
Comparison

Unnamed: 0,Topics,TFIDF_acc_avg,TFIDF_pc_avg,TFIDF_rc_avg,TFIDF_f1_avg,TFIDF_fb15_avg,TFIDF_fb20_avg,TFIDF_fb25_avg,TFIDF_fb30_avg,TextRank_acc_avg,...,TextRank_fb25_avg,TextRank_fb30_avg,LDA_acc_avg,LDA_pc_avg,LDA_rc_avg,LDA_f1_avg,LDA_fb15_avg,LDA_fb20_avg,LDA_fb25_avg,LDA_fb30_avg
0,Visually_impaired_mom,0.0541,0.0571,0.5,0.1025,0.1476,0.196,0.2416,0.2816,0.0,...,0.0,0.0,0.1081,0.1086,0.95,0.1949,0.2807,0.3726,0.4592,0.5352
1,Okmart_charity_event,0.0886,0.0928,0.65,0.1625,0.2284,0.2954,0.3557,0.4062,0.0078,...,0.021,0.0219,0.1429,0.1429,1.0,0.2501,0.3514,0.4546,0.5473,0.6251
2,life_rebuild_event,0.0444,0.0524,0.2238,0.0845,0.1108,0.1343,0.1531,0.1675,0.0235,...,0.0671,0.0707,0.2102,0.2143,0.8597,0.3423,0.4448,0.5346,0.6052,0.6586
3,dark experience,0.0474,0.0514,0.35,0.0893,0.1248,0.1605,0.1926,0.2194,0.0,...,0.0,0.0,0.1473,0.1486,0.9333,0.2556,0.3539,0.4513,0.5367,0.607


In [924]:
# Save the per-topic comparison table; the file name carries the current K_value.
Comparison.to_csv(f'Comparisonk{K_value}.csv')

In [925]:
#製作統計表格的表頭
#Table to Pic
# fig = go.Figure(data=[go.Table(
#     header=dict(values=cols,
#                 line_color='darkslategray',
#                 fill_color='#C0C0C0',
#                 align='center',
#                 font=dict(color='#000000', size=12)),
#     cells=dict(values=[Comparison['Topics'],
#                        Comparison['TFIDF_acc_avg'], 
#                        Comparison['TFIDF_pc_avg'], 
#                        Comparison['TFIDF_rc_avg'], 
#                        Comparison['TFIDF_f1_avg'], 
#                        Comparison['TextRank_acc_avg'],
#                        Comparison['TextRank_pc_avg'],
#                        Comparison['TextRank_rc_avg'],
#                        Comparison['TextRank_f1_avg'],
#                        Comparison['LDA_acc_avg'],
#                        Comparison['LDA_pc_avg'],
#                        Comparison['LDA_rc_avg'],
#                        Comparison['LDA_f1_avg'],
#                        ],
#                line_color='darkslategray',
#                fill_color='#FFFFFF',
#                align='center',
#                font=dict(color='#000000', size=12)))
# ])
# fig.update_layout(height=140, width=1800, margin=dict(r=10, l=10, t=10, b=10))
# fig.write_image("Keyword_Metrics_K_10.png", scale=2)
# fig.show()

In [926]:
#製作統計表格的表頭 No acc
#Table to Pic
# fig = go.Figure(data=[go.Table(
#     header=dict(values=no_acc,
#                 line_color='darkslategray',
#                 fill_color='#C0C0C0',
#                 align='center',
#                 font=dict(color='#000000', size=12)),
#     cells=dict(values=[Comparison['Topics'],
#                        Comparison['TFIDF_pc_avg'], 
#                        Comparison['TFIDF_rc_avg'], 
#                        Comparison['TFIDF_f1_avg'], 
#                        Comparison['TextRank_pc_avg'],
#                        Comparison['TextRank_rc_avg'],
#                        Comparison['TextRank_f1_avg'],
#                        Comparison['LDA_pc_avg'],
#                        Comparison['LDA_rc_avg'],
#                        Comparison['LDA_f1_avg'],
#                        ],
#                line_color='darkslategray',
#                fill_color='#FFFFFF',
#                align='center',
#                font=dict(color='#000000', size=12)))
# ])
# fig.update_layout(height=140, width=1400, margin=dict(r=10, l=10, t=10, b=10))
# fig.write_image("Keyword_Metrics_K_15_noacc.png", scale=2)
# fig.show()