In [9]:
from collections import Counter
import json
import math
import random
import re

import jieba
import jieba.analyse
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [10]:
# Load the custom user dictionary file ./dict.txt into jieba.
jieba.load_userdict("./dict.txt")

In [76]:
class KMeans:
    """Minimal k-means clustering (Lloyd's algorithm) using Euclidean distance."""

    def cal_dist(self, p0, p1):
        """Return the Euclidean distance between two points.

        Args:
            p0: First point, a NumPy array.
            p1: Second point, a NumPy array broadcastable against ``p0``.

        Returns:
            The square root of the summed squared differences.
        """
        return np.sqrt(np.sum((p0 - p1) ** 2))

    def kmeans(self, datapoints, k=2, max_iterations=1000):
        """Partition ``datapoints`` into ``k`` clusters.

        Initial centers are drawn with the stdlib ``random`` module, so call
        ``random.seed(...)`` beforehand for reproducible results.

        Args:
            datapoints: 2-D array-like of shape (n_points, n_features).
            k: Number of clusters; must be at least 1.
            max_iterations: Upper bound on assign/update rounds; must be at
                least 1. The loop stops earlier once labels stop changing.

        Returns:
            A 1-D float64 array of length n_points holding the cluster index
            (0 .. k-1) of every point. Float dtype is kept for backward
            compatibility with callers that do ``int(label)``. An empty input
            yields an empty array.

        Raises:
            ValueError: If ``datapoints`` is not 2-D, ``k < 1`` or
                ``max_iterations < 1``.
        """
        datapoints = np.asarray(datapoints, dtype=float)
        if datapoints.ndim != 2:
            raise ValueError("datapoints must be 2-D: (n_points, n_features)")
        if k < 1:
            raise ValueError("k must be at least 1")
        if max_iterations < 1:
            raise ValueError("max_iterations must be at least 1")

        n_points = datapoints.shape[0]
        if n_points == 0:
            return np.zeros(0)

        # Pick distinct rows as starting centers whenever possible. Drawing
        # with replacement can yield identical centers, which leaves all but
        # the first of them with an empty cluster from the very first round.
        if k <= n_points:
            seed_rows = random.sample(range(n_points), k)
        else:
            seed_rows = [random.randrange(n_points) for _ in range(k)]
        # Fancy indexing copies, so updating centers never touches the input.
        cluster_centers = datapoints[seed_rows]

        # -1 is not a valid label, so the first round can never be mistaken
        # for convergence (an all-zero start could be, if every point landed
        # in cluster 0).
        cluster = np.full(n_points, -1.0)

        for _ in range(max_iterations):
            prev_cluster = cluster

            # Assignment step: distance of every point to every center.
            # argmin returns the first minimum, i.e. ties go to the
            # lowest-numbered cluster.
            distances = np.stack(
                [np.linalg.norm(datapoints - center, axis=1)
                 for center in cluster_centers],
                axis=1,
            )
            cluster = distances.argmin(axis=1).astype(float)

            # Update step: move each center to the mean of its members.
            # A cluster with no members keeps its previous center instead of
            # collapsing to the origin.
            for c in range(k):
                members = datapoints[cluster == c]
                if len(members) > 0:
                    cluster_centers[c] = members.mean(axis=0)

            if np.array_equal(cluster, prev_cluster):
                break

        return cluster

In [77]:
# Load the speech dataset from a JSON file.
# (Despite the *_DIR names, both constants below are file paths.)
DATASET_DIR = './speech.json'
# Pass the encoding explicitly: open() otherwise uses the platform default
# (for example a legacy code page on Windows), which fails on or corrupts
# non-ASCII text.
with open(DATASET_DIR, encoding='utf-8') as f:
    dataset = json.load(f)

# Load the stop-word file and split it into one entry per line.
STOP_WORDS_DIR = './stop_words.txt'
with open(STOP_WORDS_DIR, encoding='utf-8') as f:
    stop_words = f.read().splitlines()

In [78]:
# Pull the speech text and the region label out of every dataset record.
raw_speeches = [record['speech'] for record in dataset]
country_list = [record['country'] for record in dataset]

# Keep only characters in the range U+4E00-U+9FA5; everything else
# (Latin letters, digits, punctuation, whitespace) is removed before
# segmentation.
rule = re.compile(r"[^\u4e00-\u9fa5]")

# Membership tests against a set are O(1); checking every token against the
# stop-word list scanned the whole list each time.
stop_word_set = set(stop_words)

# Segment each cleaned speech with jieba, drop stop words, and join the
# surviving tokens with spaces so each speech becomes one string.
speech_list = [
    ' '.join(
        word
        for word in jieba.cut(rule.sub('', speech))
        if word not in stop_word_set
    )
    for speech in raw_speeches
]

In [89]:
# Compute TF-IDF weights for every speech and keep each speech's
# highest-weighted words.
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(speech_list))
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# and fall back only on old releases that do not provide it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    bag_of_words = list(vectorizer.get_feature_names_out())
else:
    bag_of_words = vectorizer.get_feature_names()
weight = tfidf.toarray()

# Maximum number of keywords kept per speech.
TOP_N = 10

news_most_related_words = {}
for i, row in enumerate(weight):
    # Stable sort on the negated weights: descending by weight, with ties
    # left in vocabulary order.
    ranked = np.argsort(-row, kind='stable')[:TOP_N]
    # Words with zero weight do not occur in this speech, so skip them;
    # a speech may therefore end up with fewer than TOP_N keywords.
    news_most_related_words[country_list[i]] = [
        bag_of_words[j] for j in ranked if row[j] > 0
    ]

In [90]:
# Display the keyword list collected for each key of the dict.
news_most_related_words

{'台北市': ['上工', '同仁', '工作'],
 '新北市': ['新北市', '市民', '侯友宜', '解決', '團隊', '政治', '希望', '便民', '朱市長', '正氣'],
 '桃園市': ['桃園', '程式', '建設', '發展', '政治', '好不好', '城市', '立委', '特別', '希望'],
 '高雄市': ['高雄市', '高雄', '春天', '轟動', '城市', '三十年', '世界', '全世界', '包容', '市府'],
 '新竹市': ['新竹市', '市民', '非常', '城市', '四年', '新竹', '市長', '加倍', '朋友', '參議'],
 '嘉義市': ['謝謝', '朋友', '力量', '善良', '能量', '告訴', '支持', '嘉義', '嘉義市', '過程'],
 '雲林縣': ['雲林', '鄉親', '發展', '上場', '失衡', '數位', '自我', '雲林良品', '面對', '品牌']}

In [81]:
# Turn each keyword list into a bag-of-words count vector.
# Every list is first joined into one space-separated string, then the
# resulting sparse count matrix is expanded to a dense array.
keyword_docs = [' '.join(words) for words in news_most_related_words.values()]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(keyword_docs).toarray()

In [82]:
# Show the dimensions of the dense bag-of-words array built from the keyword lists.
X.shape

(7, 107)

In [87]:
# Cluster the keyword vectors and print the members of each cluster.
k = 4
# KMeans draws its starting centers from the stdlib `random` module; seed it
# so the clustering is reproducible after a kernel restart.
SEED = 42
random.seed(SEED)

K = KMeans()
speech_cluster_result = K.kmeans(X, k)
cluster = [[] for _ in range(k)]

# Rows of X were built from news_most_related_words.values(), so take the
# labels from that same dict (same order). Indexing country_list instead
# would misalign if it contained a repeated entry, because the dict keeps
# only one row per key.
row_labels = list(news_most_related_words)
for idx, c in enumerate(speech_cluster_result):
    # Labels come back as floats; convert before using them as a list index.
    cluster[int(c)].append(row_labels[idx])

for c, result in enumerate(cluster):
    print('Cluster {}: {}'.format(c, ' '.join(result)))

Cluster 0: 新竹市
Cluster 1: 桃園市
Cluster 2: 台北市
Cluster 3: 新北市 高雄市 嘉義市 雲林縣
