In [12]:
import glob
import json
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

import requests
from bs4 import BeautifulSoup


# Spider all GDPR

## URL

In [13]:

# def fetch_page_content(url):
#     response = requests.get(url)
#     return BeautifulSoup(response.content, 'html.parser')

# def extract_links_from_table(soup, table_id):
#     table = soup.find('table', {'id': table_id})
#     if table:
#         return table.find_all('a')
#     return []

# def extract_links_from_div(soup, div_class):
#     div = soup.find('div', {'class': div_class})
#     if div:
#         return div.find_all('a')
#     return []

# def extract_text_from_spans(soup, span_class):
#     spans = soup.find_all('span', {'class': span_class})
#     return [span.text for span in spans][0]

# def extract_list_items_from_ol(soup, div_class):
#     div = soup.find('div', {'class': div_class})
#     if div:
#         ols = div.find_all('ol')
#         return [li.text for ol in ols for li in ol.find_all('li')]
#     return []

# def extract_text_from_paragraphs(soup, div_class):
#     div = soup.find('div', {'class': div_class})
#     if div:
#         paragraphs = div.find_all('p')
#         return [p.text for p in paragraphs]
#     return []

# def prepare_data(links, recitals_links):
#     data = []
#     for link in links + recitals_links:
#         text = link.text.strip()
#         href = link['href']
#         if "chapter" not in text.lower() and 'recitals' not in text.lower():
#             current_temp = {'Text': text, 'Link': href}
#             if 'art' in href:
#                 current_temp['Type'] = 'Article'
#             elif 'recitals' in href:
#                 current_temp['Type'] = 'Recital'
#             data.append(current_temp)
#     return data

# def save_to_csv(data, filename):
#     df = pd.DataFrame(data)
#     df.to_csv(filename, index=False)
#     return df

# gdpr_url = "https://gdpr-info.eu/"
# recitals_url = "https://gdpr-info.eu/recitals/"

# gdpr_soup = fetch_page_content(gdpr_url)
# recitals_soup = fetch_page_content(recitals_url)

# gdpr_links = extract_links_from_table(gdpr_soup, 'tablepress-12')
# recitals_links = extract_links_from_div(recitals_soup, 'widget-area recital-widget-area')

# print("gdpr count:", len(gdpr_links))
# print("recitals count:", len(recitals_links))

# data = prepare_data(gdpr_links, recitals_links)
# link_df = pd.DataFrame(data)

# print("GDPR Articles and Recitals Links saved to GDPR_Articles_Recitals_Links.xlsx")
# print(f"應有 99(GDPR) + 173(Recitals) = {99+173}, 實際有：{len(link_df)}")

# link_df.head()

## Article

In [14]:
# articles_and_recitals_list = []
# for index, row in link_df.iterrows():
#     current_url = row['Link']
#     current_gdpr_article_content = fetch_page_content(current_url)
#     title = extract_text_from_spans(current_gdpr_article_content, span_class='dsgvo-title')
#     content_item_list = extract_list_items_from_ol(current_gdpr_article_content, div_class='entry-content')
#     content_paragraphs = extract_text_from_paragraphs(current_gdpr_article_content, div_class='entry-content')
    
#     articles_and_recitals_list.append({
#         'Text': row['Text'],
#         'Link': row['Link'],
#         'Type': row['Type'],
#         'Title': title,
#         'Content_Items': content_item_list if content_item_list else content_paragraphs
#     })

# articles_df = pd.DataFrame(articles_and_recitals_list)

In [15]:
# with pd.ExcelWriter("GDPR_Articles_Recitals_Links.xlsx", engine='xlsxwriter') as writer:
#     articles_df.to_excel(writer, index=False, sheet_name='Article & Recitals')

# Data Preprocessing

In [26]:
# 找到所有以 'report' 開頭的 .json 文件
source_data_folder_name = 'batch0716'
files = glob.glob(f'{source_data_folder_name}/*/report_*.json')

# 定義一個函數來讀取和處理單個 JSON 文件
def process_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    
    compliant = data.get('compliant', [])
    non_compliant = data.get('non_compliant', [])
    
    compliant_df = pd.DataFrame(compliant)
    non_compliant_df = pd.DataFrame(non_compliant)
    
    if 'article numbers' not in compliant_df.columns:
        compliant_df['article numbers'] = ''
    if 'article numbers' not in non_compliant_df.columns:
        non_compliant_df['article numbers'] = ''
    
    compliant_split_df = split_articles(compliant_df)
    non_compliant_split_df = split_articles(non_compliant_df)
    
    compliant_split_df['compliance_status'] = 'compliant'
    non_compliant_split_df['compliance_status'] = 'non-compliant'
    
    combined_df = pd.concat([compliant_split_df, non_compliant_split_df], ignore_index=True)
    
    # 增加一個欄位標示資料夾名稱
    folder_name = os.path.basename(os.path.dirname(file_path))
    combined_df['folder'] = folder_name
    
    return combined_df

# 定義一個函數來拆分條文
def split_articles(df):
    rows = []
    for _, row in df.iterrows():
        # 確保 'article numbers' 為字符串，並處理缺失值
        article_numbers = str(row['article numbers']) if pd.notnull(row['article numbers']) else ''
        articles = article_numbers.split(', ')
        for article in articles:
            new_row = row.copy()
            new_row['article numbers'] = article
            rows.append(new_row)
    return pd.DataFrame(rows)

# 處理所有文件並合併結果
all_data = pd.DataFrame()
for file in files:
    combined_df = process_file(file)
    all_data = pd.concat([all_data, combined_df], ignore_index=True)

# 保留原始的 article numbers 並移除贅詞的版本
all_data['clean_article numbers'] = (
    all_data['article numbers']
    .str.replace(r'\b(Article|Articles|Art|GDPR|recital|Recital|Rec)\b', '', regex=True)  # 移除指定的贅詞
    .str.replace(r'(?<!\d)\.(?!\d)', '', regex=True)  # 移除不在數字之間的點
    .str.strip()
)

# 將合併後的 DataFrame 存成 Excel 文件
excel_file_path = f'./{source_data_folder_name}_cleaned_compliance_data_with_original.xlsx'
all_data.to_excel(excel_file_path, index=False)

# 打印數量並顯示頭部數據
print("所有數量:", len(all_data))
print("clean_article numbers:", len(all_data['article numbers'].unique()))
all_data.head()

所有數量: 1776
clean_article numbers: 436


Unnamed: 0,section,article numbers,legal provisions,compliance_status,amend,folder,article_numbers,legal_provisions,article provisions,clean_article numbers
0,台灣大車隊非常重視您的隱私權。請您閱讀以下有關隱私權保護政策的更多內容。,13,Information provided to data subjects must be ...,compliant,,taiwantaxi_chunks,,,,13
1,台灣大車隊非常重視您的隱私權。請您閱讀以下有關隱私權保護政策的更多內容。,14,Information provided to data subjects must be ...,compliant,,taiwantaxi_chunks,,,,14
2,本政策涵蓋的內容包括：台灣大車隊如何處理蒐集或收到的個人資料 (包括與您過去使用 台灣大車隊...,12,Data controllers must provide information abou...,compliant,,taiwantaxi_chunks,,,,12
3,本政策涵蓋的內容包括：台灣大車隊如何處理蒐集或收到的個人資料 (包括與您過去使用 台灣大車隊...,13,Data controllers must provide information abou...,compliant,,taiwantaxi_chunks,,,,13
4,本政策涵蓋的內容包括：台灣大車隊如何處理蒐集或收到的個人資料 (包括與您過去使用 台灣大車隊...,14,Data controllers must provide information abou...,compliant,,taiwantaxi_chunks,,,,14


# Clustering

In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import numpy as np
import plotly.express as px

In [18]:
output_file_path = f'{source_data_folder_name}_cluster_compliance_data.xlsx' 
cluster_writer_excel = pd.ExcelWriter(output_file_path, engine='xlsxwriter')

In [19]:
# 讀取上傳的 Excel 文件
file_path = excel_file_path
compliance_data_df = pd.read_excel(file_path)

# 將各欄位分別進行 TF-IDF 向量化
def vectorize_column(data, column_name):
    vectorizer = TfidfVectorizer(stop_words='english')
    vectors = vectorizer.fit_transform(data[column_name].astype(str).tolist())
    return vectors

vector_section = vectorize_column(compliance_data_df, 'section')
vector_article_numbers = vectorize_column(compliance_data_df, 'clean_article numbers')
vector_legal_provisions = vectorize_column(compliance_data_df, 'legal provisions')

# 合併各欄位的向量表示
from scipy.sparse import hstack
X_combined = hstack([vector_section, vector_article_numbers, vector_legal_provisions])

## K_means

手肘法（Elbow Method）：  
通過計算不同k值下的SSE（Sum of Squared Errors）來找到最佳k值，SSE會隨著k的增加而減小，當SSE減小幅度變緩時，即所謂的“手肘”點，即為最佳k值。

輪廓係數（Silhouette Coefficient）：  
輪廓係數能同時考慮簇內和簇間距離，其值在-1到1之間，越接近1說明聚類效果越好。可以計算不同k值下的平均輪廓係數來選擇最佳k值。

Calinski-Harabasz Index：  
該指標基於簇內和簇間距離計算，值越大越好。

In [20]:
# 定義範圍
max_k = 20  # 限制最大k值
k_range = range(2, max_k)

# 保存不同指標的結果
sse = []
silhouette_scores = []
calinski_scores = []
davies_scores = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
    clusters = kmeans.fit_predict(X_combined)
    sse.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_combined, clusters))
    calinski_scores.append(calinski_harabasz_score(X_combined.toarray(), clusters))
    davies_scores.append(davies_bouldin_score(X_combined.toarray(), clusters))

# 找到手肘點 (SSE)
diff = np.diff(sse)
diff_r = diff[1:] / diff[:-1]
knee_point = np.argmin(diff_r) + 2

# 找到最大輪廓係數點
best_silhouette = np.argmax(silhouette_scores) + 2

# 找到最大Calinski-Harabasz Index點
best_calinski = np.argmax(calinski_scores) + 2

# 找到最小Davies-Bouldin Index點
best_davies = np.argmin(davies_scores) + 2

# 綜合考慮這些指標，選擇最常出現的分群數量
best_k_candidates = [knee_point, best_silhouette, best_calinski, best_davies]
best_k = max(set(best_k_candidates), key=best_k_candidates.count)

print(f"手肘法最佳分群數量: {knee_point}")
print(f"最大輪廓係數最佳分群數量: {best_silhouette}")
print(f"最大Calinski-Harabasz Index最佳分群數量: {best_calinski}")
print(f"最小Davies-Bouldin Index最佳分群數量: {best_davies}")
print(f"綜合考慮的最佳分群數量: {best_k}")

# 使用K-means進行文本分群
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init='auto')  
clusters = kmeans.fit_predict(X_combined)

# 將分群結果添加回數據框
compliance_data_df['cluster'] = clusters

# 使用PCA進行降維
pca = PCA(n_components=2)
X_pca_2d = pca.fit_transform(X_combined.toarray())

# 將降維結果添加回數據框
compliance_data_df['pca-2d-one'] = X_pca_2d[:,0]
compliance_data_df['pca-2d-two'] = X_pca_2d[:,1]

# 使用Plotly進行互動式視覺化
fig = px.scatter(
    compliance_data_df, x='pca-2d-one', y='pca-2d-two', color='cluster',
    hover_data=['section', 'clean_article numbers', 'legal provisions', 'compliance_status'],
    title="PCA Clustering of Legal Provisions"
)
fig.show()


compliance_data_df.to_excel(cluster_writer_excel, index=False, sheet_name='K_means')
print(f"分群結果已保存到 {output_file_path}")


手肘法最佳分群數量: 5
最大輪廓係數最佳分群數量: 6
最大Calinski-Harabasz Index最佳分群數量: 2
最小Davies-Bouldin Index最佳分群數量: 3
綜合考慮的最佳分群數量: 2


分群結果已保存到 cluster_compliance_data.xlsx


## AgglomerativeClustering

In [21]:
from sklearn.cluster import AgglomerativeClustering

# 定義範圍
max_k = 20  # 限制最大k值
k_range = range(2, max_k)

# 保存不同指標的結果
silhouette_scores = []
calinski_scores = []
davies_scores = []

for k in k_range:
    clustering = AgglomerativeClustering(n_clusters=k)
    clusters = clustering.fit_predict(X_combined.toarray())
    silhouette_scores.append(silhouette_score(X_combined, clusters))
    calinski_scores.append(calinski_harabasz_score(X_combined.toarray(), clusters))
    davies_scores.append(davies_bouldin_score(X_combined.toarray(), clusters))

# 找到最大輪廓係數點
best_silhouette = np.argmax(silhouette_scores) + 2

# 找到最大Calinski-Harabasz Index點
best_calinski = np.argmax(calinski_scores) + 2

# 找到最小Davies-Bouldin Index點
best_davies = np.argmin(davies_scores) + 2

# 綜合考慮這些指標，選擇最常出現的分群數量
best_k_candidates = [best_silhouette, best_calinski, best_davies]
best_k = max(set(best_k_candidates), key=best_k_candidates.count)

print(f"最大輪廓係數最佳分群數量: {best_silhouette}")
print(f"最大Calinski-Harabasz Index最佳分群數量: {best_calinski}")
print(f"最小Davies-Bouldin Index最佳分群數量: {best_davies}")
print(f"綜合考慮的最佳分群數量: {best_k}")

# 使用AgglomerativeClustering進行文本分群
clustering = AgglomerativeClustering(n_clusters=best_k)  
clusters = clustering.fit_predict(X_combined.toarray())

# 將分群結果添加回數據框
compliance_data_df['cluster'] = clusters

# 使用PCA進行降維
pca = PCA(n_components=2)
X_pca_2d = pca.fit_transform(X_combined.toarray())

# 將降維結果添加回數據框
compliance_data_df['pca-2d-one'] = X_pca_2d[:,0]
compliance_data_df['pca-2d-two'] = X_pca_2d[:,1]

# 使用Plotly進行互動式視覺化
fig = px.scatter(
    compliance_data_df, x='pca-2d-one', y='pca-2d-two', color='cluster',
    hover_data=['section', 'clean_article numbers', 'legal provisions', 'compliance_status'],
    title="PCA Clustering of Legal Provisions"
)
fig.show()

compliance_data_df.to_excel(cluster_writer_excel, index=False, sheet_name='Agglomerative')
print(f"分群結果已保存到 {output_file_path}")

In [22]:
cluster_writer_excel.close()