In [1]:
!pip install datasets
!pip install zstandard jsonlines
!pip install pymorphy2

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.1

In [3]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import re
from datasets import load_dataset
from tqdm import tqdm
import string
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
import pymorphy2

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
dataset = load_dataset("IlyaGusev/ru_turbo_alpaca")
dataset

Downloading builder script:   0%|          | 0.00/2.57k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.31k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29822 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'alternative_output', 'label', 'all_labels', 'agreement', 'overlap'],
        num_rows: 29822
    })
})

In [5]:
def tokenize_ru(text, morph):
    # firstly let's apply nltk tokenization
    tokens = word_tokenize(text)

    # let's delete punctuation symbols
    tokens = [i for i in tokens if (i not in string.punctuation)]

    # deleting stop_words
    stop_words = stopwords.words('russian')
    stop_words.extend(['—', '–'])
    tokens = [i for i in tokens if (i not in stop_words)]

    # cleaning words
    tokens = [i.replace("«", "").replace("»", "") for i in tokens]
    tokens = [morph.parse(token)[0].normal_form for token in tokens]
    return tokens

In [6]:
docs = dataset['train']['output']
morph = pymorphy2.MorphAnalyzer()
sentences = [tokenize_ru(sent, morph) for doc in docs for sent in sent_tokenize(doc,'russian') ]

In [7]:
sentences = [sent for sent in sentences if len(sent)!=0]

In [8]:
word_model = Word2Vec(
    sentences,
    window=5,
    workers=4,
    min_count=10,
    negative=10,
    alpha=0.03,
    min_alpha=0.0007,
    sample=6e-5)

In [11]:
vocab_size = len(word_model.wv)
emb_size = 100

In [12]:
def word2ind(word):
    try:
        ind = word_model.wv.key_to_index[word]
        return ind
    except:
        return 0
def ind2word(ind):
    return word_model.wv.index_to_key[ind]

In [13]:
max_sent_len = 0
for sent in sentences:
    max_sent_len = max(len(sent), max_sent_len)
max_sent_len

212

In [14]:
train_x = np.zeros([len(sentences), max_sent_len], dtype=np.int32)
train_y = np.zeros([len(sentences)], dtype=np.int32)
for i, sentence in enumerate(sentences):
    for j, word in enumerate(sentence[:-1]):
        train_x[i, j] = word2ind(word)
    train_y[i] = word2ind(sentence[-1])

In [15]:
import torch.nn as nn
import torch
from torch.nn import LSTM
from torch.utils.data import DataLoader, Dataset
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
data = [torch.tensor(sent, dtype=torch.int64) for sent in train_x]
target = [torch.tensor(word, dtype=torch.int64) for word in train_y]


In [17]:
class MyDataSet(Dataset):
    def __init__(self, data, target):
        self.inputs = data
        self.labels = target
    def __len__(self):
        return len(self.inputs)
    def __getitem__(self, ind):
        input = self.inputs[ind]
        label = self.labels[ind]
        return {
            'input':input,
            'label':label
        }

In [18]:
dataset = MyDataSet(data, target)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [19]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super(LSTMModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, emb_size)
        self.lstm = nn.LSTM(emb_size, emb_size, batch_first=True)
        self.fc1 = nn.Linear(emb_size, vocab_size)
        self.softmax = nn.Softmax(dim=0)

    def forward(self, x):
        x = self.embeddings(x)
        x, _ = self.lstm(x)
        x = self.fc1(x)
        x = self.softmax(x)
        x = x[:,-1,:]
        return x

In [20]:
model = LSTMModel(vocab_size, emb_size).to(DEVICE)
# model = torch.load('complete_model.pth').to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [39]:
def train(model, criterion, ortimize, epoches, dataloader):
    losses = []
    for epoch in tqdm(range(epoches)):
        running_loss = 0.0
        for i, batch in enumerate(dataloader):
            X = batch['input'].to(DEVICE)
            y = batch['label'].to(DEVICE)
            optimizer.zero_grad()
            output = model(X)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        epoch_loss = running_loss / len(dataloader)
        losses.append(epoch_loss)
        if epoch % 5 == 0:
            print(f"Эпоха {epoch + 1}, Значение функции потерь: {epoch_loss}")
            if epoch_loss < min_loss:
                torch.save(model, 'complete_model.pth')
        min_loss = min(epoch_loss, min_loss)
    return model, losses

In [None]:
epoches = 100
model, losses = train(model, criterion, optimizer, epoches, dataloader)

In [37]:
def sample(preds, temp=1.0):
    if temp<0:
        return np.argmax(preds)
    preds = np.log(preds)/temp
    exp_preds = np.exp(preds)
    preds = exp_preds/np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate_next(text, num_gen=10):
    word_idxs = [[word2ind(word) for word in tokenize_ru(text, morph)]]
    pred_idxs = []
    gen_model = model.to("cpu")
    for i in range(num_gen):
        pred = gen_model(torch.tensor(word_idxs)).detach().numpy()
        idx = sample(pred[-1], temp=0.7)
        word_idxs[0].append(idx)
    return " ".join(ind2word(idx) for idx in word_idxs[0])
text = 'Привет'
generate_next(text)

'привет am связаться прохождение что-либо быстрота поддерживать измерение снятие руль предприятие'