In [89]:
from collections import Counter
import json
import math
import random
import re

import jieba
import jieba.analyse
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy import dot
from numpy.linalg import norm

In [90]:
jieba.load_userdict("./dict.txt")

In [103]:
class KMeans:
    def cal_dist(self, p0, p1):
        """
        比較兩點的距離
        """
        return dot(p0, p1)/(norm(p0)*norm(p1))

    def kmeans(self, datapoints, k=2):
        # 定義資料維度
        d = datapoints.shape[1]
        # 最大的迭代次數
        Max_Iterations = 1000

        cluster = np.zeros(datapoints.shape[0])
        prev_cluster = np.ones(datapoints.shape[0])

        cluster_centers = []
        for i in range(k):
            cluster_centers += [random.choice(datapoints)]

        iteration = 0
        while np.array_equal(cluster, prev_cluster) is False or iteration > Max_Iterations:
            iteration += 1
            prev_cluster = cluster.copy()

            # 將每一個點做分群
            for idx, point in enumerate(datapoints):
                min_dist = float("inf")
                for c, cluster_center in enumerate(cluster_centers):
                    dist = self.cal_dist(point, cluster_center)
                    if dist == 'nan':
                        print(point)
                    if dist < min_dist:
                        min_dist = dist  
                        cluster[idx] = c   # 指定該點屬於哪個分群

            # 更新分群的中心
            for k in range(len(cluster_centers)):
                new_center = np.zeros(d)
                members = 0
                for point, c in zip(datapoints, cluster):
                    if c == k:
                        new_center += point
                        members += 1
                if members > 0:
                    new_center = new_center / members
                cluster_centers[k] = new_center

        return cluster

In [104]:
# 讀取資料
DATASET_DIR = './speech.json'
with open(DATASET_DIR) as f:
    dataset = json.load(f)
    
# 讀取 stop words
STOP_WORDS_DIR = './stop_words.txt'
with open(STOP_WORDS_DIR) as f:
    stop_words = f.read().splitlines() 

In [105]:
# 讀取演講內容與縣市
speech_list = list(map(lambda d: d['speech'], dataset))
country_list = list(map(lambda d: d['country'], dataset))

# 去除繁體中文以外的英文、數字、符號
rule = re.compile(r"[^\u4e00-\u9fa5]")
speech_list = [list(jieba.cut(rule.sub('', speech))) for speech in speech_list]
for idx, speech in enumerate(speech_list):
    speech_list[idx] = ' '.join([word for word in speech if word not in stop_words])

In [106]:
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(speech_list)
tfidf = tfidf.toarray()

In [107]:
k = 3
Kmeans_cluster = KMeans()
speech_cluster_result = Kmeans_cluster.kmeans(tfidf, k)
cluster = [[] for _ in range(k)]

for idx, c in enumerate(speech_cluster_result):
    cluster[int(c)].append(country_list[idx])
    
for c, result in enumerate(cluster):
    print('Cluster {}: {}'.format(c, ' '.join(result)))

  


KeyboardInterrupt: 