# Проверка работы FastText

In [1]:
import first_script

In [2]:
from tokenizer import FastTextTokenizer
from model import FastText
import os

## Основная особенность FastText — токенизатор, делящий слова на n-граммы

In [3]:
# Tokenizer configured with n_gram=3; the _tokenize demo further down shows 3-character pieces.
tokenizer = FastTextTokenizer(n_gram=3)

In [4]:
# Train the tokenizer on a one-line sample so the demo cells below have a small vocabulary.
text_train = 'Привет, меня зовут Миша. Как тебя зовут? миша'
tokenizer.train(text_train)

In [5]:
# Encode a test phrase and then the training sample itself, printing the id list for each.
text_test = 'Привет Миша, меня зовут'
print(tokenizer.encode(text_test))
print(tokenizer.encode(text_train))

[34, 19, 24, 13, 21, 35, 29, 32, 17, 36, 31, 12, 14, 27, 33, 20, 22, 8, 28, 15]
[34, 19, 24, 13, 21, 35, 31, 12, 14, 27, 33, 20, 22, 8, 28, 15, 29, 32, 17, 36, 23, 25, 11, 26, 7, 10, 16, 9, 20, 22, 8, 28, 15, 18, 37, 30, 17, 36]


In [6]:
# Round trip: encode the test phrase to ids, then decode the ids back.
text_test_idx = tokenizer.encode(text_test)
# Left as the last expression so the notebook displays the decoded list.
tokenizer.decode(text_test_idx)

['Привет', 'Миша', ',', 'меня', 'зовут']

In [7]:
# Call the private helper directly to inspect the raw pieces; in the recorded output
# each word is wrapped in '<' and '>' before being cut into 3-character n-grams.
tokenizer._tokenize(text_test)

['<Пр',
 'При',
 'рив',
 'иве',
 'вет',
 'ет>',
 '<Ми',
 'Миш',
 'иша',
 'ша>',
 '<,>',
 '<ме',
 'мен',
 'еня',
 'ня>',
 '<зо',
 'зов',
 'ову',
 'вут',
 'ут>']

## Обучаем FastText

In [8]:
# Rebind `tokenizer` to a fresh instance; the one trained on the sample sentence is discarded.
tokenizer = FastTextTokenizer(n_gram=3)

In [9]:
# Load the training corpus and train the tokenizer on it.
# The encoding is pinned so the Cyrillic text is decoded the same way on every
# platform instead of depending on the locale's default encoding.
# NOTE(review): assumes the corpus file is stored as UTF-8 — confirm.
with open('../data/voina_i_mir.txt', 'r', encoding='utf-8') as f:
    text = f.read()
tokenizer.train(text)

In [10]:
# Encode the whole corpus; the resulting ids are what model.fit receives below.
text_idxs = tokenizer.encode(text)

### Параметры модели

In [11]:
# Model hyperparameters, passed to the FastText constructor below.
embed_dim: int = 256  # embedding size; model[...] later reports torch.Size([256])
window_size: int = 20  # presumably the context window width — TODO confirm in model.FastText
device: str = 'cuda'  # NOTE(review): hard-coded; presumably fails without a CUDA GPU — confirm

### Параметры обучения

In [12]:
# Training settings handed to model.fit below.
num_epochs = 10
batch_size = 10000
# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to a single worker so num_workers is always a positive int.
num_workers = os.cpu_count() or 1

### Загружаем модель и обучаем

In [13]:
# Build the model; the third positional argument is len(tokenizer.words).
model = FastText(embed_dim, window_size, len(tokenizer.words), device=device)
# `res` holds whatever fit returns; it is unpacked into FastText.plot_metrics in the next cell.
res = model.fit(text_idxs, batch_size, num_epochs, num_workers=num_workers)

Epoch [1/10, time: 1.222 minutes]:
                            Train Loss 12.456216, Train Perplexity 256841.963
                            Test Loss 11.910940, Test Perplexity 148886.559
Epoch [2/10, time: 2.524 minutes]:
                            Train Loss 11.321015, Train Perplexity 82538.047
                            Test Loss 10.805449, Test Perplexity 49288.625
Epoch [3/10, time: 3.825 minutes]:
                            Train Loss 10.252003, Train Perplexity 28339.236
                            Test Loss 9.855062, Test Perplexity 19054.568
Epoch [4/10, time: 5.129 minutes]:
                            Train Loss 9.468284, Train Perplexity 12942.654
                            Test Loss 9.253078, Test Perplexity 10436.643
Epoch [5/10, time: 6.434 minutes]:
                            Train Loss 8.962794, Train Perplexity 7807.144
                            Test Loss 8.845350, Test Perplexity 6942.034
Epoch [6/10, time: 7.738 minutes]:
                            Train L

### Визуализируем метрики

In [14]:
# Plot the training results; `res` from fit is unpacked into positional arguments.
FastText.plot_metrics(*res)

### Слово разделяется на n-граммы

In [15]:
# Encode a single word into n-gram ids and decode it back; the recorded output
# shows six ids that decode to the one original word.
test_text = 'Добрый'
test_text_idx = tokenizer.encode(test_text)
print(test_text_idx)
dec_test_text_idx = tokenizer.decode(test_text_idx)
print(dec_test_text_idx)

[11504, 525, 5096, 13099, 5052, 12708]
['Добрый']


In [16]:
# Index the model with the word's n-gram ids to get its embedding.
embed = model[test_text_idx]

In [17]:
# Display the embedding's shape.
embed.shape

torch.Size([256])

### Благодаря разделению на n-граммы можно получить эмбеддинг любой фразы

In [18]:
# Same round trip for a three-word phrase: all n-gram ids come back as one flat list,
# and decoding restores the three words.
test_text = 'Добрый и позитивный'
test_text_idx = tokenizer.encode(test_text)
print(test_text_idx)
dec_test_text_idx = tokenizer.decode(test_text_idx)
print(dec_test_text_idx)

[11504, 525, 5096, 13099, 5052, 12708, 9379, 2413, 10600, 217, 12615, 8715, 1027, 6900, 12595, 6450, 12708]
['Добрый', 'и', 'позитивный']


In [19]:
# The phrase's id list is passed to the model exactly like a single word's ids.
embed = model[test_text_idx]

In [20]:
# Display the shape of the phrase embedding.
embed.shape

torch.Size([256])

### Почти любой...

In [21]:
# A made-up word: in the recorded output the n-grams missing from the vocabulary
# are encoded as id 0, which decodes to '<UNK>'.
test_text = 'кмлум'
test_text_idx = tokenizer.encode(test_text)
print(test_text_idx)
dec_test_text_idx = tokenizer.decode(test_text_idx)
print(dec_test_text_idx)

[0, 0, 0, 11952, 9411]
['<UNK>', '<UNK>', '<UNK>', 'м']


In [22]:
# Index the model with an id list that contains id 0 entries.
embed = model[test_text_idx]
# Display the resulting shape.
embed.shape

torch.Size([256])

### Чтобы вычислить ближайшие слова к данному, нужно немного магии (а именно добавить индексы каждого слова)

In [23]:
# Encode the query word, then every entry of tokenizer.real_words, and ask the
# model for the query's nearest neighbours among those encoded words.
word1 = 'Андрей'
word1_idx = tokenizer.encode(word1)
all_words_idx = list(map(tokenizer.encode, tokenizer.real_words))
near1 = model.k_Nearest(all_words_idx, word1_idx, k=10, use_cosine=False)

In [24]:
# Translate the returned positions back into words via tokenizer.real_words.
# NOTE(review): the recorded output lists 11 words for k=10 — presumably the query
# word itself is included; confirm in k_Nearest.
print([tokenizer.real_words[pos] for pos in near1])

['Андрей', 'Андреем', 'Андрею', 'Андрее', 'Андреевичу', 'Андреевича', 'Андрея', 'Андреевич', 'Андреи', 'Aндрей', 'Андреевской']


In [25]:
# Repeat the neighbour search with k=100 and use_cosine=True, this time also
# requesting the embeddings so the neighbours can be plotted.
near1, embeds = model.k_Nearest(
    all_words_idx,
    word1_idx,
    k=100,
    use_cosine=True,
    return_embed=True,
)
# Left as the last expression so any returned value is still displayed by the notebook.
model.plot_embeddings([tokenizer.real_words[pos] for pos in near1], embeds)