In [36]:
import json
import random
import re

import jieba
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Register the custom dictionary at ./dict.txt with jieba before any text is segmented.
jieba.load_userdict("./dict.txt")

In [3]:
class KMeans:
    """Minimal K-means clustering with K-means++ seeding, built on NumPy.

    All randomness comes from the standard-library ``random`` module, so call
    ``random.seed(...)`` before clustering when reproducible results are needed.
    """

    def cal_dist(self, p0, p1):
        """Return the Euclidean distance between two points (NumPy arrays)."""
        return np.sqrt(np.sum((p0 - p1) ** 2))

    def nearest_cluster_center(self, point, cluster_centers):
        """Return the distance from ``point`` to its closest center.

        Despite the name, the value returned is the *distance*, not the center
        itself nor its index. Returns ``inf`` when ``cluster_centers`` is empty.
        """
        min_dist = float("inf")
        for center in cluster_centers:
            min_dist = min(min_dist, self.cal_dist(point, center))
        return min_dist

    def get_centroids(self, datapoints, k):
        """Choose ``k`` initial centers with K-means++ seeding.

        The first center is drawn uniformly at random. Every further center is
        drawn with probability proportional to the squared distance between a
        point and its closest already-chosen center.

        Args:
            datapoints: array-like of points, one point per row.
            k: number of centers to return.

        Returns:
            Float ndarray with exactly ``k`` rows.

        Raises:
            ValueError: if ``datapoints`` is empty or ``k`` is smaller than 1.
        """
        # Work in float so later mean updates are never truncated by an integer dtype.
        datapoints = np.asarray(datapoints, dtype=float)
        n_points = len(datapoints)
        if n_points == 0:
            raise ValueError("datapoints must contain at least one point")
        if k < 1:
            raise ValueError("k must be at least 1")

        centers = [datapoints[random.randrange(n_points)]]
        while len(centers) < k:
            chosen = np.array(centers)
            # K-means++ weights each candidate by its squared distance to the nearest center.
            weights = np.array(
                [self.nearest_cluster_center(point, chosen) ** 2 for point in datapoints]
            )
            total = weights.sum()

            if total <= 0:
                # Every point coincides with an existing center: fall back to a uniform draw.
                pick = random.randrange(n_points)
            else:
                threshold = total * random.random()
                pick = None
                for j, weight in enumerate(weights):
                    if weight <= 0:
                        # Never re-select a point that already sits on a center.
                        continue
                    pick = j
                    threshold -= weight
                    if threshold <= 0:
                        break
                # If round-off left ``threshold`` slightly positive, ``pick`` is the
                # last positively weighted point, so a center is always added.
            centers.append(datapoints[pick])

        return np.array(centers)

    def kmeans_plus_plus(self, datapoints, k=2, max_iterations=1000):
        """Cluster ``datapoints`` into ``k`` groups (K-means with K-means++ seeding).

        Args:
            datapoints: 2-D array-like of shape (n_points, n_features).
            k: number of clusters.
            max_iterations: upper bound on assignment/update rounds.

        Returns:
            Float ndarray of length n_points holding each point's cluster index
            (values ``0 .. k-1``).

        Raises:
            ValueError: if ``datapoints`` is not 2-D or is empty, if ``k`` < 1,
                or if ``max_iterations`` < 1.
        """
        datapoints = np.asarray(datapoints, dtype=float)
        if datapoints.ndim != 2:
            raise ValueError("datapoints must be a 2-D array of shape (n_points, n_features)")
        if max_iterations < 1:
            raise ValueError("max_iterations must be at least 1")

        cluster_centers = self.get_centroids(datapoints, k)
        cluster = None

        # Stop when assignments no longer change or when the iteration cap is reached.
        for _ in range(max_iterations):
            # distances[i, c] is the Euclidean distance from point i to center c.
            distances = np.stack(
                [np.sqrt(np.sum((datapoints - center) ** 2, axis=1)) for center in cluster_centers],
                axis=1,
            )
            # argmin keeps the lowest index on ties, i.e. the first closest center wins.
            new_cluster = np.argmin(distances, axis=1).astype(float)

            if cluster is not None and np.array_equal(new_cluster, cluster):
                break
            cluster = new_cluster

            # Move each center to the mean of its members; a cluster that lost
            # all of its members keeps its previous center.
            for c in range(len(cluster_centers)):
                members = datapoints[cluster == c]
                if len(members) > 0:
                    cluster_centers[c] = members.mean(axis=0)

        return cluster

In [4]:
# Load the speech dataset (a JSON document) from disk.
DATASET_DIR = './speech.json'
with open(DATASET_DIR, encoding='utf8') as dataset_file:
    dataset = json.loads(dataset_file.read())

# Load the stop-word list: one entry per line of the text file.
STOP_WORDS_DIR = './stop_words.txt'
with open(STOP_WORDS_DIR, encoding='utf8') as stop_words_file:
    stop_words = stop_words_file.read().splitlines()

In [27]:
# Pull the speech text and its county/city label out of every record.
raw_speech_list = [record['speech'] for record in dataset]
country_list = [record['country'] for record in dataset]

# Keep only characters in U+4E00..U+9FA5; Latin letters, digits, symbols
# and whitespace are removed before segmentation.
rule = re.compile(r"[^\u4e00-\u9fa5]")

# A set makes each stop-word lookup O(1) instead of scanning the whole list per token.
stop_word_set = set(stop_words)

# Segment each cleaned speech with jieba, drop stop words, and re-join the
# remaining tokens with single spaces.
speech_list = [
    ' '.join(
        word
        for word in jieba.cut(rule.sub('', speech))
        if word not in stop_word_set
    )
    for speech in raw_speech_list
]

In [28]:
# Turn the space-joined speeches into a dense TF-IDF matrix (one row per speech).
# NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens of two or
# more characters, so single-character words are presumably ignored — confirm this is intended.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(speech_list).toarray()

In [34]:
# KMeans draws from the stdlib ``random`` module; seed it so that
# Restart & Run All reproduces the same clustering.
SEED = 42
random.seed(SEED)

k = 4
Kmeans_cluster = KMeans()
speech_cluster_result = Kmeans_cluster.kmeans_plus_plus(tfidf, k)

# Group the county/city labels by the cluster index assigned to their speech.
cluster = [[] for _ in range(k)]
for region, label in zip(country_list, speech_cluster_result):
    # Labels come back as floats, so cast before using them as list indices.
    cluster[int(label)].append(region)

for c, result in enumerate(cluster):
    print('Cluster {}: {}'.format(c, ' '.join(result)))

Cluster 0: 桃園市 臺中市 高雄市 新竹市 嘉義市 南投縣
Cluster 1: 彰化縣 雲林縣 嘉義縣
Cluster 2: 台北市
Cluster 3: 新北市 台南市


In [39]:

# Find the highest-weighted TF-IDF terms of every speech.
TOP_N_WORDS = 5  # number of terms kept per speech

vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(speech_list))

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    bag_of_words = list(vectorizer.get_feature_names_out())
else:
    bag_of_words = vectorizer.get_feature_names()
weight = tfidf.toarray()

news_most_related_words = {}
for region, row in zip(country_list, weight):
    # A stable descending sort keeps vocabulary order among equally weighted terms.
    ranked = np.argsort(-row, kind="stable")[:TOP_N_WORDS]
    # Terms with zero weight do not occur in the speech, so they are left out.
    news_most_related_words[region] = [bag_of_words[j] for j in ranked if row[j] > 0]

In [40]:
# Display the top TF-IDF terms collected for each county/city.
news_most_related_words

{'南投縣': ['南投', '明溱', '縣長', '南投人', '老人'],
 '台北市': ['上工', '同仁', '工作'],
 '台南市': ['市民', '市府', '團隊', '市政', '台南'],
 '嘉義市': ['謝謝', '朋友', '力量', '善良', '告訴'],
 '嘉義縣': ['轉型', '農業', '發展', '人才', '產業'],
 '彰化縣': ['彰化', '惠美', '中心', '區域', '魏縣'],
 '新北市': ['新北市', '市民', '侯友宜', '解決', '團隊'],
 '新竹市': ['新竹市', '市民', '非常', '城市', '新竹'],
 '桃園市': ['桃園', '程式', '建設', '好不好', '城市'],
 '臺中市': ['秀燕', '市長', '拼經濟', '台中市', '戶外'],
 '雲林縣': ['雲林', '上場', '失衡', '數位', '雲林良品'],
 '高雄市': ['高雄市', '春天', '高雄', '轟動', '城市']}