In [2]:
import re
import numpy as np
from typing import List

# Toy corpus used throughout the notebook
corpus = ['You say goodbye and I say hello.']

# A token is a run of letters/digits; punctuation, whitespace and "_" act as
# separators (keep or drop punctuation depending on the use case).
pattern = r'[^\W_]+'

# Lower-case every sentence and split it into tokens
processed_sentence = [re.findall(pattern, text.lower()) for text in corpus]

# Collect the vocabulary (unique tokens) across all sentences
word_dic = set()
for tokens in processed_sentence:
    word_dic |= set(tokens)

print(processed_sentence)

[['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello']]


In [3]:
# Build the word <-> ID lookup tables.
# IDs follow the iteration order of the set `word_dic`, which is not guaranteed
# to be the same in every interpreter session.
idx2word = dict(enumerate(word_dic))
word2idx = {word: idx for idx, word in idx2word.items()}


def id_mapping(x):
    """Return the integer ID assigned to word `x`."""
    return word2idx[x]


# Encode every tokenised sentence as a row of word IDs.
# np.array builds a regular 2-D array only when all sentences have the same
# number of tokens.
corpus = np.array([[id_mapping(token) for token in tokens] for tokens in processed_sentence])

print(corpus)
print(word2idx)
print(idx2word)

[[2 1 4 3 5 1 0]]
{'hello': 0, 'say': 1, 'you': 2, 'and': 3, 'goodbye': 4, 'i': 5}
{0: 'hello', 1: 'say', 2: 'you', 3: 'and', 4: 'goodbye', 5: 'i'}


In [4]:
# Co-occurrence matrix
# Turn the ID-encoded corpus into a co-occurrence matrix (word vectorisation),
# here with a context window of 1.

def build_co_matrix(corpus, vocab_size: int, window_size: int = 1) -> np.ndarray:
    """Count how often each pair of words appears within `window_size` positions.

    Args:
        corpus: iterable of sentences, each a sequence of integer word IDs.
        vocab_size: number of distinct word IDs; the result is
            (vocab_size, vocab_size).
        window_size: how many neighbours to the left and to the right of each
            word count as context.

    Returns:
        An int32 matrix whose entry [a, b] is the number of times word `b`
        occurred inside the window around word `a`.
    """
    matrix = np.zeros(shape=(vocab_size, vocab_size), dtype=np.int32)

    for sentence in corpus:
        sentence_size = len(sentence)

        for idx, word_id in enumerate(sentence):
            # Both bounds checks must run for EVERY offset in the window;
            # checking only after the loop would count just the farthest
            # neighbour when window_size > 1.
            for offset in range(1, window_size + 1):
                left_idx = idx - offset
                right_idx = idx + offset

                if left_idx >= 0:
                    matrix[word_id, sentence[left_idx]] += 1

                if right_idx < sentence_size:
                    matrix[word_id, sentence[right_idx]] += 1

    return matrix


vocab_size = len(word2idx)
window_size = 1

co_matrix = build_co_matrix(corpus, vocab_size, window_size)

print(co_matrix)

[[0 1 0 0 0 0]
 [1 0 1 0 1 1]
 [0 1 0 0 0 0]
 [0 0 0 0 1 1]
 [0 1 0 1 0 0]
 [0 1 0 1 0 0]]


In [5]:
# Cosine similarity, used below to compare the word vectors of 'i' and 'you'
def cos_similarity(x: np.ndarray, y: np.ndarray, eps: float=1e-8) -> float:
    """Return the cosine similarity of two 1-D vectors.

    Args:
        x: first vector.
        y: second vector, same length as `x`.
        eps: small constant added to each norm so that an all-zero vector
            yields 0 instead of a division by zero.

    Returns:
        The dot product of the two (approximately) unit-normalised vectors.
    """
    # Work in float64: squaring integer count vectors in their own dtype
    # (e.g. int32) can overflow silently for large counts.
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)

    nx = x / (np.sqrt(np.sum(x**2)) + eps)
    ny = y / (np.sqrt(np.sum(y**2)) + eps)

    return np.dot(nx, ny)

# Similarity between the co-occurrence rows of "i" and "you"
vec_i = co_matrix[word2idx['i']]
vec_you = co_matrix[word2idx['you']]
cos_similarity(vec_i, vec_you)

0.7071067691154799

In [6]:
# Positive pointwise mutual information (PPMI)
# PMI(i, j) = log2(C[i, j] * N / (S[i] * S[j])); PPMI clips negative values to 0.
M = np.zeros_like(co_matrix, dtype=np.float32)
N = np.sum(co_matrix)          # total number of co-occurrences
S = np.sum(co_matrix, axis=0)  # per-word co-occurrence totals (column sums)
total = co_matrix.shape[0]*co_matrix.shape[1]  # not used by the computation below

# Compute in float64 so the products cannot overflow an integer dtype.
counts = co_matrix.astype(np.float64)
marginals = S.astype(np.float64)
expected = marginals[:, None] * marginals[None, :]  # expected[i, j] == S[i] * S[j]

# Evaluate log2 only where it is defined. Pairs that never co-occur keep the
# initial 0, which is what clipping log2(0) = -inf would produce anyway, and
# no divide-by-zero warnings are raised.
valid = (counts > 0) & (expected > 0)
M[valid] = np.maximum(0, np.log2(counts[valid] * N / expected[valid]))

print(M)

[[0.        1.5849625 0.        0.        0.        0.       ]
 [1.5849625 0.        1.5849625 0.        0.5849625 0.5849625]
 [0.        1.5849625 0.        0.        0.        0.       ]
 [0.        0.        0.        0.        1.5849625 1.5849625]
 [0.        0.5849625 0.        1.5849625 0.        0.       ]
 [0.        0.5849625 0.        1.5849625 0.        0.       ]]


  if __name__ == '__main__':


In [8]:
# Factorise the PPMI matrix with NumPy's singular value decomposition

# SVD
# np.linalg.svd returns the singular values as a 1-D array and, as its third
# result, the already-transposed right singular vectors.
U, S, V = np.linalg.svd(M)

# SVD turns the original sparse vectors into dense vectors.
# Look the row up through word2idx: IDs come from iterating a set, so 'hello'
# is not guaranteed to be ID 0.
hello_id = word2idx['hello']

print(f'hello in co-occurrence matrix: {co_matrix[hello_id]}')
print(f"hello in PPMI: {M[hello_id]}")
print(f"hello in SVD: {U[hello_id]}")

hello in co-occurrence matrix: [0 1 0 0 0 0]
hello in PPMI: [0.        1.5849625 0.        0.        0.        0.       ]
hello in SVD: [-4.5236292e-01  0.0000000e+00 -5.4347748e-01  1.1102230e-16
 -7.0710677e-01  8.7396489e-17]
