In [1]:
import os
from time import time

import joblib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score
from tqdm import tqdm

In [2]:
# Language codes; each one is used as a sub-directory name under
# ./result/lda/<file>/ and ./result/kmean/<file>/ (presumably Chinese / English).
language = ['ch', 'en']

# Dataset keys used as the first directory level: '2020' followed by the twelve
# keys '202001' .. '202012' (presumably the whole year and each month).
files = ['2020'] + ['2020{:02d}'.format(month) for month in range(1, 13)]

### 文档聚类

In [3]:
# When True, KMeans is refit for every candidate cluster count and the
# best-scoring model is written to disk; when False, the previously saved
# model is loaded and only the downstream tables are regenerated.
TRAIN = True

# Candidate numbers of clusters to try: 10, 12, 14, 16, 18.
NC = list(range(10, 20, 2))

In [4]:
# Number of highest-scoring themes kept per category when building cate2word.
TOP_K_THEMES = 2


def _chi_square_scores(counts):
    """Score the association between every (row, column) pair of a count table.

    For each cell the table is collapsed to a 2x2 contingency table
    (a = the cell, b = rest of its row, c = rest of its column,
    d = everything else) and the score is
    ``(a*d - b*c)**2 / ((a+c)*(b+d)*(a+b)*(c+d))`` -- the 2x2 chi-square
    statistic without the leading N factor.  Cells for which any marginal
    total is zero get a score of 0.

    Parameters
    ----------
    counts : 2-D array-like of numbers.

    Returns
    -------
    numpy.ndarray of float64 with the same shape as ``counts``.
    """
    a = np.asarray(counts, dtype='float64')
    b = a.sum(axis=1, keepdims=True) - a   # rest of the row
    c = a.sum(axis=0, keepdims=True) - a   # rest of the column
    d = a.sum() - a - b - c                # everything outside row and column

    margins = [a + c, b + d, a + b, c + d]
    degenerate = np.minimum.reduce(margins) == 0
    denom = margins[0] * margins[1] * margins[2] * margins[3]

    scores = np.zeros_like(a)
    # Cells with a zero margin are skipped and therefore keep their 0.
    np.divide((a * d - b * c) ** 2, denom, out=scores, where=~degenerate)
    return scores


def _keep_top_k_per_row(scores, k):
    """Return a copy of ``scores`` where all but the ``k`` largest entries of
    each row (as ranked by ``argsort``) are set to 0."""
    top_cols = scores.argsort()[:, -k:]
    keep = np.zeros(scores.shape, dtype=bool)
    keep[np.arange(scores.shape[0])[:, None], top_cols] = True
    return np.where(keep, scores, 0.0)


for file in files:
    for lang in language:
        print(file+' | '+lang+':')
        lda_dir = './result/lda/'+file+'/'+lang+'/'
        kmean_dir = './result/kmean/'+file+'/'+lang+'/'

        doc2theme = pd.read_csv(lda_dir+'doc2theme.csv')
        # Every column except the last two is used as a clustering feature.
        # NOTE(review): presumably the last two columns are document metadata -- confirm.
        x = doc2theme.iloc[:, :-2].values

        if TRAIN:
            # Make sure the output directory exists before the first dump.
            os.makedirs(kmean_dir, exist_ok=True)

            # Try every candidate cluster count and keep (on disk) the model
            # with the highest Calinski-Harabasz score seen so far.
            score = -np.inf
            for n_cluster in NC:
                s = time()
                kmean = KMeans(n_clusters=n_cluster, random_state=42)
                kmean.fit(x)
                pred = kmean.predict(x)
                sc = calinski_harabasz_score(x, pred)
                e = time()

                if sc > score:
                    score = sc
                    print('Cost: {0:.4f} | Nclusters: {1} | Score: {2:.4f} | save model...'.format(e-s, n_cluster, sc))
                    joblib.dump(kmean, kmean_dir+'kmean.pkl')
                else:
                    print('Cost: {0:.4f} | Nclusters: {1} | Score: {2:.4f} |'.format(e-s, n_cluster, sc))

        # Always reload from disk so the TRAIN and non-TRAIN paths use the
        # same (best) model.  NOTE: joblib.load is pickle-based -- only load
        # model files produced by this notebook.
        kmean = joblib.load(kmean_dir+'kmean.pkl')
        pred = kmean.predict(x)

        # Attach the predicted cluster id to the hard theme assignment table.
        # NOTE(review): assumes doc2theme_hard.csv rows are in the same order
        # as doc2theme.csv -- confirm.
        df = pd.read_csv(lda_dir+'doc2theme_hard.csv')
        df['category'] = pred
        df.to_csv(kmean_dir+'doc2cate.csv', index=False)

        theme2word = pd.read_csv(lda_dir+'theme2word.csv')
        # The matmul below requires one theme column per theme2word row, so
        # take the theme count from there instead of hard-coding it.
        n_themes = theme2word.shape[0]
        A = df.groupby('category').sum().iloc[:, :n_themes].values.astype('float64')

        # Category x theme association, then keep only the strongest themes
        # of each category.
        CHI = _chi_square_scores(A)
        cate2theme = _keep_top_k_per_row(CHI, TOP_K_THEMES)

        # Project the retained theme scores onto words: category x word weights.
        weight = np.matmul(cate2theme, theme2word.values)
        df_c2w = pd.DataFrame(weight, columns=theme2word.columns.values)
        df_c2w.to_csv(kmean_dir+'cate2word.csv', index=False)

        print('==================================================================')

2020 | ch:
Cost: 4.7959 | Nclusters: 10 | Score: 81933.9836 | save model...
Cost: 5.0863 | Nclusters: 12 | Score: 94487.7619 | save model...
Cost: 5.8271 | Nclusters: 14 | Score: 115922.4820 | save model...
Cost: 5.8668 | Nclusters: 16 | Score: 141660.2615 | save model...
Cost: 6.5880 | Nclusters: 18 | Score: 189528.1772 | save model...
2020 | en:
Cost: 4.8732 | Nclusters: 10 | Score: 36017.8920 | save model...
Cost: 5.2088 | Nclusters: 12 | Score: 38311.7473 | save model...
Cost: 5.2688 | Nclusters: 14 | Score: 44418.2602 | save model...
Cost: 5.5671 | Nclusters: 16 | Score: 51831.2809 | save model...
Cost: 6.3743 | Nclusters: 18 | Score: 58350.4993 | save model...
202001 | ch:
Cost: 0.9536 | Nclusters: 10 | Score: 7617.8809 | save model...
Cost: 3.0934 | Nclusters: 12 | Score: 9382.7292 | save model...
Cost: 1.6599 | Nclusters: 14 | Score: 10446.6114 | save model...
Cost: 1.4179 | Nclusters: 16 | Score: 14267.4564 | save model...
Cost: 2.4620 | Nclusters: 18 | Score: 18867.1608 | sav