In [None]:
#@title Using the library
# Clone the whiteGPT repository into the current working directory; the
# `from whiteGPT import ...` lines in the import cell rely on it being there.
!git clone https://github.com/Michi-123/whiteGPT

https://en.wikipedia.org/wiki/Word2vec

In [3]:
#@title ライブラリーの読み込み
import random
import re

import torch
import torch.nn as nn
import torch.optim as optim

from whiteGPT import word2vec
from whiteGPT import CBOW
from whiteGPT import Vocab

In [4]:
# @title Training text data
# Small English corpus (mostly sentences about cats and dogs) used to train
# the model. Each list element is one sentence; casing and trailing spaces
# are left exactly as written here.
corpus_list = ["this animal is cat.",
    "the quick brown fox jumps over the lazy dog.",
    "dog and cat are animal.",
    "I love dogs and cats.",
    "the dog sat on the rug.",
    "cats are independent animals. ",
    "foxes are wild animals.",
    "The cat prowled through the moonlit garden.",
    "Dogs eagerly awaited their owner's return at the doorstep.",
    "A cat's purr filled the room with comfort.",
    "The dog wagged its tail in excitement.",
    "Cats gracefully leaped from rooftop to rooftop.",

    "Walking down the street, I spotted a stray dog searching for scraps.",
    "The cat stretched lazily in the warmth of the sunbeam.",
    "Dogs barked joyfully in the park.",
    "A sleek black cat slinked along the fence.",
    "The old dog snoozed contentedly by the fireplace.",
    "Cats darted through the alleyways, chasing shadows.",
    "A fluffy white cat napped peacefully on the windowsill.",

    "Dogs are furry friends who love to play fetch and cuddle with you.",
    "Cats are soft and independent pets that enjoy lounging in sunny spots.",
    "Dogs wag their tails when they're happy and bark to say hello.",
    "Cats purr when they're content and love to curl up in your lap.",

    "Fido loves going for car rides, he always sticks his head out the window with a goofy grin.",
    "the old dog hobbled over to greet us, his tail thumping gently against the floor.",
    "the cat perched regally on the windowsill, surveying its outdoor kingdom. ",
    "the cat stalked a dust bunny across the floor, pouncing with laser focus. ",
    "the neighbor's cat, notorious for its thievery, snuck into our yard and made off with a shiny red ball of yarn. ",
    "curled up with a good book, I felt a soft nudge – my cat, wanting some attention, was rubbing against my leg. ",
    "the therapy dog, with its gentle demeanor, brought a wave of calm to the anxious patients in the waiting room.",
    "despite their different personalities, the dog and cat often napped curled up together. ",
    # NOTE(review): "bow" below is probably a typo for "bowl"; left untouched
    # because the string is training data.
    "we need to buy more cat food, Whiskers seems to be inhaling everything in the bow. ",
    "despite being a scaredy cat, Luna the ginger bravely explored every corner of the new house. ",
    "Max the dog spent all afternoon digging a hole in the backyard, much to the gardener's dismay. ",
    "the smell of freshly baked cookies lured the cat out from its hiding spot. ",
    "during thunderstorms, Milo the dog would huddle under the bed, trembling uncontrollably. ",
    "we adopted a pair of playful kittens, and now our living room is a whirlwind of fur and feathery toys. ",
    "every morning, the rooster crows and the dog barks, creating a chaotic symphony to wake up the household. "
]

In [5]:
# @title Test text data
# Short phrases handed to test_dataset.test_corpus() in the instantiation cell.
# Most are variations of training sentences with "cat" and "dog" swapped.
# Judging from the training log, the words before the last one act as the
# context and the last word is the expected prediction — confirm against
# whiteGPT.word2vec.
test_corpus_list = ["this animal is dog ",
    "the old cat hobbled ",
    "the dog perched regally ",
    "quick brown cat jumps ",
    "cat and dog often ",
    "the dog out from ",
    "Fido the cat would ",
    "Luna the cat spent ",
    "neighbor's dog ",
    "cat stalked a dust ",
    "buy more cat food",
    "cat and the dog barks",]

# CBOWモデルの学習

In [6]:
# @title Hyperparameter settings
# Every value a reader may want to tune lives in this cell; the cells below
# only read these names.
# @markdown Number of embedding dimensions
embedding_dim = 16 #@param{type:'integer'}
# @markdown Number of epochs
num_epoch = 300 #@param{type:'integer'}
# @markdown Learning rate (step size passed to the optimizer)
learning_rate = 0.001 #@param{type:'number'}
# @markdown Input window size
window_size = 3 #@param{type:'integer'}
# @markdown Random seed (fixed so that Restart & Run All reproduces the same run)
seed = 42 #@param{type:'integer'}

# Weight initialisation and batch shuffling are random, so seed the RNGs once,
# before any model or data loader is created.
# NOTE(review): assumes whiteGPT draws only from Python's `random` and/or
# torch's global generator — confirm against the library.
random.seed(seed)
torch.manual_seed(seed)

In [None]:
# @title Instantiation
# Builds everything the training cell needs: the normalised corpus, the
# vocabulary, the train/test datasets, the data loader, the loss function,
# the model and the optimizer.

# Turn the list of sentences into the training text. word2vec.modify() is
# defined in whiteGPT (not visible here); its result is treated as a single
# string below.
raw_corpus = word2vec.modify(corpus_list, window_size)

# Normalise and tokenise the text (added as a bug fix in the original notebook):
#   * lower-case everything, so "Dog" and "dog" become the same token text;
#   * r'\w+|[^\w\s]' matches either a run of word characters or one single
#     character that is neither a word character nor whitespace, so
#     punctuation becomes a token of its own, e.g.
#     'this animal is cat.' -> ['this', 'animal', 'is', 'cat', '.'];
#   * re-join the tokens with single spaces so that every token, including
#     punctuation, is separated by exactly one space.
# NOTE(review): if modify() inserts literal markers such as '<PAD>', this step
# would lower-case and split them — confirm against whiteGPT.word2vec.
tokens = re.findall(r'\w+|[^\w\s]', raw_corpus.lower())
train_corpus = ' '.join(tokens)

# Vocabulary built from the normalised training text.
vocab = Vocab(train_corpus)

# TextDataset converts the text into samples the model can learn from.
train_dataset = word2vec.TextDataset(vocab, train_corpus, window_size)

# A batch is a small group of samples processed together in one optimisation
# step; batch_size is the number of samples per batch. For example, 1000
# samples with a batch size of 10 are served as 100 batches.
batch_size = 4
train_dataloader = word2vec.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# A second dataset, created with empty text, is used to check the model on a
# few hand-written phrases while it trains; the phrases are loaded right after.
test_dataset = word2vec.TextDataset(vocab, '', window_size)
test_dataset.test_corpus(test_corpus_list)

# Loss function: cross-entropy between the model output and the target index.
criterion = nn.CrossEntropyLoss()

# Model. CBOW stands for Continuous Bag of Words: a model that predicts a word
# from the words in its context.
model = word2vec.CBOW(vocab.vocab_size, embedding_dim)

# Optimizer. Adam is a gradient-descent-based algorithm: it uses the gradient
# of the loss to update the parameters towards a lower loss.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Check the vocabulary size (the same value that is passed to the model
# constructor in the instantiation cell).
vocab.vocab_size


In [None]:
# Debug: inspect one mini-batch from the training loader.
# The labels are taken from the text itself (self-supervised learning), and
# the sentences have already been tokenised into integer indices: the batch is
# a dict with a 'source' tensor (inputs) and a 'target' tensor (labels).
train_iter = iter(train_dataloader)
first_batch = next(train_iter)
first_batch

{'source': tensor([[  0,   0,   0],
         [225,  48, 213],
         [ 86,   8,   0],
         [  0, 225,  48]]),
 'target': tensor([  0,  11,   0, 217])}

In [20]:
# @title Run training
def train_one_epoch(model, dataloader, criterion, optimizer):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Callable model; called with ``batch['source']``, and its output
            is passed to ``criterion``.
        dataloader: Iterable of batches. Each batch is indexed with the keys
            ``'source'`` (input token indices) and ``'target'`` (index of the
            word to predict).
        criterion: Loss function called as ``criterion(output, target)``.
        optimizer: Optimizer that updates the model parameters.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    running_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        # Inputs and labels for this batch.
        context_indices, target_index = batch['source'], batch['target']

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        # Forward pass.
        output = model(context_indices)
        # Loss between the model output and the index of the correct word.
        loss = criterion(output, target_index)
        # Backward pass, then update the learnable parameters (weights).
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError('dataloader yielded no batches')
    # Average over the batches actually processed.
    return running_loss / num_batches


# Report progress every `log_interval` epochs, and always on the final epoch
# so that the loss the model ends on is visible.
log_interval = 10

for epoch in range(num_epoch):
    total_loss = train_one_epoch(model, train_dataloader, criterion, optimizer)

    if epoch % log_interval == 0 or epoch == num_epoch - 1:
        # Log the mean loss of this epoch.
        print(f'Epoch: {epoch}, Loss: {total_loss:.4f}')
        # Spot-check the model with the test dataset.
        test_dataset.test(model)

Epoch: 0, Loss: 5.0856
cat and the : dog : <PAD>
Epoch: 10, Loss: 2.7699
quick brown cat : jumps : <PAD>
Epoch: 20, Loss: 2.1981
the dog perched : regally : .
Epoch: 30, Loss: 1.7241
cat stalked a : dust : dust
Epoch: 40, Loss: 1.3503
the dog out : from : .
Epoch: 50, Loss: 1.0661
the dog perched : regally : ,
Epoch: 60, Loss: 0.8565
buy more cat : food : food
Epoch: 70, Loss: 0.7014
this animal is : dog : cat
Epoch: 80, Loss: 0.5891
the old cat : hobbled : ,
Epoch: 90, Loss: 0.5078
the dog out : from : and
Epoch: 100, Loss: 0.4490
buy more cat : food : food
Epoch: 110, Loss: 0.4079
cat and dog : often : often
Epoch: 120, Loss: 0.3786
luna the cat : spent : regally
Epoch: 130, Loss: 0.3577
the dog out : from : and
Epoch: 140, Loss: 0.3438
the old cat : hobbled : dog
Epoch: 150, Loss: 0.3334
cat stalked a : dust : dust
Epoch: 160, Loss: 0.3261
cat and dog : often : often
Epoch: 170, Loss: 0.3222
the old cat : hobbled : dog
Epoch: 180, Loss: 0.3181
cat and the : dog : lazy
Epoch: 190, Lo

# 評価

In [41]:
# Predict the next word with the trained model.
# Judging from the printed line, the format appears to be
# "context : expected word : predicted word" — confirm in whiteGPT.word2vec.
test_dataset.test(model)

the old cat : hobbled : dog


In [None]:
# Display the embedding weights.
# Each row of the output represents one word, and each element within a row
# represents some learned feature of that word.
model.embeddings.weight

Parameter containing:
tensor([[-3.0857,  0.7395, -0.4817,  ..., -0.6561, -0.5184,  1.4529],
        [-1.3531,  0.4013,  0.6837,  ..., -0.0287, -0.5208,  0.3029],
        [-0.7319, -0.3926, -0.2762,  ..., -1.3833, -0.4664, -0.1379],
        ...,
        [ 3.7932,  1.7007, -3.4961,  ...,  3.5891,  2.2103,  1.8853],
        [ 4.4782,  4.3708,  1.0824,  ..., -0.9107,  2.8521,  3.8996],
        [-0.0449, -3.7678, -0.5762,  ...,  1.3904, -2.9881,  4.0387]],
       requires_grad=True)