In [1]:
import numpy as np

def generate_cooccurence_matrix(filepath):
    """Build a symmetric co-occurrence matrix from a whitespace-separated file.

    Every non-blank line of the file is read as ``word_a word_b count``; any
    further fields on the line are ignored. Both words are added to the
    vocabulary, so a word that only ever appears in the second column still
    receives an index.

    Parameters
    ----------
    filepath : str or path-like
        Path of the UTF-8 encoded input file.

    Returns
    -------
    Xc : numpy.ndarray
        Square float matrix where ``Xc[i, j] == Xc[j, i]`` is the count given
        for the pair of words with indices ``i`` and ``j``. If a pair occurs
        on several lines, the last line wins.
    word2idx : dict
        Maps each word to its row/column index. Indices follow the order in
        which words first appear in the file, so the result is reproducible.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three fields.
    ValueError
        If the third field of a line cannot be parsed as a number.
    """
    word2idx = {}
    entries = []

    with open(filepath, encoding="UTF-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) lines carry no pair; skip them.
                continue
            first, second = fields[0], fields[1]
            # Convert explicitly instead of relying on NumPy's implicit
            # string-to-float coercion during item assignment.
            count = float(fields[2])
            for word in (first, second):
                # First appearance decides the index (deterministic ordering).
                word2idx.setdefault(word, len(word2idx))
            entries.append((word2idx[first], word2idx[second], count))

    size = len(word2idx)
    Xc = np.zeros((size, size))

    for i, j, count in entries:
        # Fill both triangles so the matrix stays symmetric.
        Xc[i, j] = count
        Xc[j, i] = count

    return Xc, word2idx

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import brown
from scipy import spatial
import nltk

# Use the Brown corpus.
# The return value of nltk.download is intentionally not bound to brown_words:
# that name is reserved for the corpus word view assigned on the next line.
nltk.download('brown')
brown_words = brown.words()
brown_sentences = brown.sents()

# print(brown_words)

# Build co-occurrence frequency vectors
def create_cooccurance_vector(word, window_size=5, words=None):
    """Compute relative co-occurrence frequencies around every occurrence of `word`.

    For each position where `word` occurs, a window consisting of up to
    `window_size` tokens before it, the token itself, and up to `window_size`
    tokens after it is collected. Windows are clipped at the start and end of
    the corpus rather than wrapping around or being dropped. Tokens are
    counted across all windows (overlapping windows count a token more than
    once) and each count is divided by the total number of tokens in the corpus.

    Parameters
    ----------
    word : str
        Target word whose context is analysed.
    window_size : int, optional
        Number of tokens taken on each side of the target. Default is 5.
    words : sequence of str, optional
        Token sequence to analyse. When omitted, the module-level
        ``brown_words`` is used.

    Returns
    -------
    dict
        One key per distinct token of the corpus. Tokens that appear in at
        least one window map to ``count / len(words)``; all other tokens map
        to ``None``.
    """
    from collections import Counter

    if words is None:
        words = brown_words
    total_tokens = len(words)

    window_counts = Counter()
    for index, token in enumerate(words):
        if token != word:
            continue
        # Clamp the left edge: a negative slice start would count from the
        # end of the sequence instead of stopping at the first token.
        start = max(0, index - window_size)
        # +1 so that `window_size` tokens to the right of the target are included.
        stop = index + window_size + 1
        window_counts.update(words[start:stop])

    occurance_dict = dict.fromkeys(words)
    for token, count in window_counts.items():
        occurance_dict[token] = count / total_tokens
    return occurance_dict

# Build the word vectors
def _cooccurance_series(word):
    """Return the co-occurrence dict of `word` as a Series with missing values set to 0."""
    return pd.Series(create_cooccurance_vector(word)).fillna(0)


cooccurance_vector_think = _cooccurance_series('think')
cooccurance_vector_thinking = _cooccurance_series('thinking')

cooccurance_vector_read = _cooccurance_series('read')
cooccurance_vector_reading = _cooccurance_series('reading')

cooccurance_vector_possibly = _cooccurance_series('possibly')
cooccurance_vector_impossibly = _cooccurance_series('impossibly')

cooccurance_vector_good = _cooccurance_series('good')
cooccurance_vector_bad = _cooccurance_series('bad')

cooccurance_vector_bird = _cooccurance_series('bird')

# Note: the "german" vector is built from the token 'Germany'.
cooccurance_vector_german = _cooccurance_series('Germany')
cooccurance_vector_berlin = _cooccurance_series('Berlin')

cooccurance_vector_greece = _cooccurance_series('Greece')
cooccurance_vector_athens = _cooccurance_series('Athens')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\eCho2\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]


In [7]:
# Inspect the co-occurrence vector built for 'think' (shown as this cell's output).
cooccurance_vector_think

The              0.000009
Fulton           0.000000
County           0.000000
Grand            0.000000
Jury             0.000000
                   ...   
aviary           0.000000
olive-flushed    0.000000
coral-colored    0.000000
boucle           0.000000
stupefying       0.000000
Length: 56057, dtype: float64