# Word2Vec

### ライブラリのインポート

In [1]:
from gensim.models import Word2Vec

In [58]:
import MeCab
import re
import logging
from tqdm import tqdm
from multiprocessing import Pool

### wikiコーパスの読み込み

In [3]:
# Read the entire wiki corpus file into memory as one string.
with open("wiki.txt", "r", encoding='utf-8') as f:
    text = f.read()

In [4]:
type(text)

str

In [5]:
len(text)

1300774777

### wikiコーパスを加工

In [27]:
# Build the sentence list: split the corpus on newlines and on the full stop
# "。", skip fragments that are blank after stripping, and put a "。" back on
# the end of every fragment that is kept.
sentences = []
for s in tqdm(re.split("[\n。]", text)):
    s = s.strip()
    if not s:
        continue
    sentences.append(s + "。")

100%|██████████| 39534947/39534947 [00:19<00:00, 2004560.01it/s]


In [21]:
print(type(sentences))

<class 'list'>


In [22]:
print(len(sentences))

23187094


##### 空の要素を排除

In [28]:
# Keep only truthy (non-empty) entries, then report how many sentences remain.
sentences = [sentence for sentence in sentences if sentence]
print(len(sentences))

30102377


### 分かち書きを行う関数を定義

##### 分かち書きする関数

In [29]:
# Tagger options: wakati (space-separated) output using the NEologd dictionary.
_MECAB_ARGS = '-Owakati -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd'

# Per-process tagger cache. Constructing a MeCab.Tagger is expensive compared
# with parsing one sentence, so it is built on first use and then reused.
_tagger = None


def tokenize(text):
    """Split *text* into a list of tokens using MeCab's wakati output.

    The tagger is created lazily on the first call and reused by every later
    call in the same process, instead of being rebuilt for each sentence.
    Worker processes each hold their own instance.

    Args:
        text: The sentence to segment.

    Returns:
        The whitespace-separated tokens MeCab produced, as a list of strings.
    """
    global _tagger
    if _tagger is None:
        _tagger = MeCab.Tagger(_MECAB_ARGS)
    return _tagger.parse(text).strip().split()

##### リスト全体の文章に対して分かち書きを行う関数

In [39]:
def tokenize_list(text_list, chunksize=1000):
    """Tokenize every sentence in *text_list* using a pool of worker processes.

    Args:
        text_list: Sized sequence of sentences; its length is used as the
            progress-bar total.
        chunksize: Number of sentences handed to a worker per task. The
            ``imap`` default of 1 sends each sentence as its own task, which
            is slow for very long inputs; a larger value batches them.

    Returns:
        A list with one token list per input sentence, in input order.
    """
    with Pool() as pool:
        # imap yields results in input order regardless of chunksize.
        tokenized = pool.imap(tokenize, text_list, chunksize=chunksize)
        results = list(tqdm(tokenized, total=len(text_list)))
    return results

##### 実行例

In [40]:
print(tokenize(sentences[11111]))

['また', '現代', 'の', 'ピアノ', 'と', 'は', '黒鍵', 'と', '白', '鍵', 'の', '色', 'が', '逆', 'の', 'もの', 'も', 'ある', '。']


### コーパス全体の分かち書き(加工)

In [43]:
w2v_train_data = tokenize_list(sentences)

100%|██████████| 30102377/30102377 [12:46:10<00:00, 654.82it/s]    


In [46]:
type(w2v_train_data)

list

In [47]:
len(w2v_train_data)

30102377

### 分かち書きファイルの保存

In [53]:
def save_words_to_file(word_list, filename):
    """Write tokenized sentences to *filename*, one sentence per line.

    Args:
        word_list: Iterable of token sequences; each sequence becomes one
            line with its tokens joined by single spaces.
        filename: Path of the UTF-8 text file to create or overwrite.
    """
    with open(filename, "w", encoding='utf-8') as out:
        for tokens in tqdm(word_list):
            line = " ".join(tokens)
            out.write(line + "\n")

In [54]:
save_words_to_file(w2v_train_data, 'wiki_wakati.txt')

100%|██████████| 30102377/30102377 [05:31<00:00, 90904.14it/s] 


### 学習

In [59]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [60]:
# Train word vectors on the tokenized sentences: 200-dimensional vectors,
# a context window of 5, subsampling threshold 1e-3, and negative sampling
# with 5 noise words (hs=0 disables hierarchical softmax).
# NOTE(review): `size` is the keyword used by gensim < 4.0 (renamed to
# `vector_size` in 4.0) — confirm against the installed gensim version.
model = Word2Vec(w2v_train_data, size=200, window=5, sample=1e-3, negative=5, hs=0)

2023-03-12 13:40:51,003 : INFO : collecting all words and their counts
2023-03-12 13:40:51,005 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-03-12 13:40:51,036 : INFO : PROGRESS: at sentence #10000, processed 275011 words, keeping 24308 word types
2023-03-12 13:40:51,066 : INFO : PROGRESS: at sentence #20000, processed 531992 words, keeping 41922 word types
2023-03-12 13:40:51,095 : INFO : PROGRESS: at sentence #30000, processed 790028 words, keeping 56331 word types
2023-03-12 13:40:51,123 : INFO : PROGRESS: at sentence #40000, processed 1044343 words, keeping 66798 word types
2023-03-12 13:40:51,153 : INFO : PROGRESS: at sentence #50000, processed 1301608 words, keeping 76177 word types
2023-03-12 13:40:51,185 : INFO : PROGRESS: at sentence #60000, processed 1567149 words, keeping 84485 word types
2023-03-12 13:40:51,215 : INFO : PROGRESS: at sentence #70000, processed 1839730 words, keeping 93017 word types
2023-03-12 13:40:51,249 : INFO : PROGRESS:

### モデルの保存

In [61]:
model.save("2023-03-01-word2vec.model")

2023-03-12 15:13:41,090 : INFO : saving Word2Vec object under 2023-03-01-word2vec.model, separately None
2023-03-12 15:13:41,111 : INFO : storing np array 'vectors' to 2023-03-01-word2vec.model.wv.vectors.npy
2023-03-12 15:13:48,241 : INFO : not storing attribute vectors_norm
2023-03-12 15:13:48,254 : INFO : storing np array 'syn1neg' to 2023-03-01-word2vec.model.trainables.syn1neg.npy
2023-03-12 15:13:54,367 : INFO : not storing attribute cum_table
2023-03-12 15:14:16,171 : INFO : saved 2023-03-01-word2vec.model


### 類似単語の検索

In [74]:
outputs = model.wv.most_similar('ドラゴンボール', topn=10)

In [75]:
for word in outputs:
    print(word)

('ブロリー', 0.7255889773368835)
('ドラゴンボールZ', 0.6735256910324097)
('悟空', 0.6687308549880981)
('ベジータ', 0.6664971113204956)
('カービィ', 0.6645500659942627)
('セーラームーン', 0.654509961605072)
('孫悟空', 0.6476218104362488)
('ウルトラ兄弟', 0.646388828754425)
('アルセウス', 0.6420840620994568)
('フリーザ', 0.638883113861084)


In [81]:
outputs = model.wv.most_similar('ガンダム', topn=10)

In [82]:
for word in outputs:
    print(word)

('Ζガンダム', 0.8013923764228821)
('ΖΖガンダム', 0.7725387811660767)
('ザク', 0.7688357830047607)
('ユニコーンガンダム', 0.7505552172660828)
('モビルスーツ', 0.7493917346000671)
('メカ', 0.7353453040122986)
('ガンキャノン', 0.7345325946807861)
('百式', 0.7344613075256348)
('νガンダム', 0.7344272136688232)
('MS', 0.7329888939857483)


In [119]:
outputs = model.wv.most_similar('ウマ娘', topn=10)

In [120]:
for word in outputs:
    print(word)

('プリティーダービー', 0.7993226051330566)
('艦隊これくしょん', 0.6806005835533142)
('進撃の巨人', 0.6736159324645996)
('アイドルマスター', 0.6724649667739868)
('シンデレラガールズ', 0.672393262386322)
('けものフレンズ', 0.6692875623703003)
('ラブライブ!', 0.6602373123168945)
('幽☆遊☆白書', 0.6598060727119446)
('クイーンズブレイド', 0.6587121486663818)
('美少女戦士セーラームーン', 0.6581842303276062)


### 王様+女-男

In [89]:
# Analogy query (king + woman - man). `negative` is passed as a list, like
# `positive`: depending on the gensim version a bare string may be iterated
# character by character, which only happens to work for one-character words.
outputs = model.wv.most_similar(positive=['王様', '女'], negative=['男'], topn=10)

In [90]:
for word in outputs:
    print(word)

('お姫様', 0.6879671812057495)
('白雪姫', 0.6557631492614746)
('貴婦人', 0.6463907957077026)
('ワシリーサ', 0.6440788507461548)
('妖精', 0.6429163217544556)
('ラプンツェル', 0.6399041414260864)
('ご主人様', 0.6386705636978149)
('シンデレラ', 0.6336333751678467)
('王さま', 0.6280249357223511)
('魔法使い', 0.6224725842475891)
