In [2]:
import nltk
import tqdm
import datasets
from collections import Counter
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\fsvuu\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\fsvuu\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\traitlets\config\application.py", line 1075, in launch

In [3]:
# Download the NLTK "punkt" package; the value shown below the cell is the call's return value.
# NOTE(review): the sentence splitter used in the next cell presumably relies on this data, and
# newer NLTK releases may look for a "punkt_tab" resource instead — confirm for the installed version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fsvuu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load the "IlyaGusev/gazeta" dataset; the cells below read only its "train" split and the "text" field.
# NOTE(review): trust_remote_code=True presumably lets the dataset repository's own loading code
# run locally — confirm this is intended for this source.
dataset = datasets.load_dataset("IlyaGusev/gazeta", trust_remote_code=True)

In [5]:
# Split every training article into sentences and keep the lower-cased form of
# those shorter than 256 characters (the length check is made before lower-casing).
sentences = []
for article in tqdm.tqdm(dataset["train"]):
    for raw_sentence in nltk.tokenize.sent_tokenize(article["text"], language="russian"):
        if len(raw_sentence) < 256:
            sentences.append(raw_sentence.lower())


100%|██████████| 60964/60964 [00:38<00:00, 1585.73it/s]


In [6]:
# Character frequencies over all collected sentences.
char_cnt = Counter()
for sentence in tqdm.tqdm(sentences):
    # Feeding a string to Counter.update counts each of its characters once per occurrence.
    char_cnt.update(sentence)



100%|██████████| 2137624/2137624 [00:47<00:00, 44573.77it/s]


In [7]:

# Vocabulary = four special tokens plus every character seen more than
# cnt_lower_threshold times in the corpus.
# TODO: the special tokens may need to be wrapped in angle brackets.
vocab = set(['unk', 'bos', 'eos', 'pad'])
cnt_lower_threshold = 1000
for char, cnt in char_cnt.items():
    # Strict comparison: a character seen exactly cnt_lower_threshold times is left out.
    if cnt > cnt_lower_threshold:
        vocab.add(char)
print(len(vocab))

101


In [8]:
# Assign ids over the *sorted* vocabulary. Enumerating the set directly gives an
# order that is not guaranteed to repeat after a kernel restart, which would make
# any saved model weights or ids unusable in a later session.
c2id = {symbol: idx for idx, symbol in enumerate(sorted(vocab))}
# Inverse lookup, derived from c2id so the two mappings can never disagree.
id2c = {idx: symbol for symbol, idx in c2id.items()}

In [9]:
class CDataset:
    """Map-style dataset that encodes sentences as lists of character ids.

    Every item is the sentence's characters looked up in ``c2id`` (characters
    missing from ``c2id`` become ``unk_id``), preceded by ``bos_id`` and
    followed by ``eos_id``.
    """

    # Ids of the special tokens, resolved once when the class body runs.
    unk_id = c2id['unk']
    bos_id = c2id['bos']
    eos_id = c2id['eos']
    pad_id = c2id['pad']

    def __init__(self, sent) -> None:
        # `sent` must support len() and integer indexing, yielding strings.
        self.data = sent

    def __getitem__(self, idx):
        """Return a new list ``[bos_id, <character ids>, eos_id]`` for sentence ``idx``."""
        text = self.data[idx]
        char_ids = [c2id.get(symbol, self.unk_id) for symbol in text]
        return [self.bos_id, *char_ids, self.eos_id]

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.data)

In [10]:
def trans_func(input_batch, pad_id = c2id["pad"], device="cuda"):
    """Collate a batch of id lists into padded input/target tensors.

    Every sequence is right-padded with ``pad_id`` to the length of the longest
    sequence in the batch. The padded matrix is then split into a model input
    (all positions but the last) and a target shifted one step to the left
    (all positions but the first).

    Args:
        input_batch: non-empty list of lists of integer ids.
        pad_id: id used for padding; defaults to the id of ``'pad'`` in ``c2id``.
        device: device the resulting tensor is moved to. Defaults to ``"cuda"``,
            which is what this function always used before the parameter existed.

    Returns:
        dict with ``"input_ids"`` and ``"target_ids"``, both of shape
        ``(len(input_batch), max_len - 1)``.

    Raises:
        ValueError: if ``input_batch`` is empty (there is no longest sequence).
    """
    max_sent_len = max(len(sentence) for sentence in input_batch)

    # Build padded copies instead of extending the caller's lists in place, so
    # collating the same items twice cannot accumulate padding.
    padded = [
        list(sentence) + [pad_id] * (max_sent_len - len(sentence))
        for sentence in input_batch
    ]

    tensor = torch.LongTensor(padded).to(device)
    return {
        "input_ids": tensor[:, :-1],
        "target_ids": tensor[:, 1:],
    }

In [11]:
# Hold out 20% of the sentences for evaluation. A fixed random_state makes the
# shuffled split identical on every run, so metrics stay comparable between runs.
train_sentences, eval_sentences = train_test_split(sentences, test_size=0.2, random_state=42)

In [12]:
# Wrap both splits so that indexing them yields encoded sentences.
train_dataset, eval_dataset = (
    CDataset(split) for split in (train_sentences, eval_sentences)
)

In [13]:
batch_sz = 250

# Reshuffle the training sentences at the start of every epoch; without
# shuffle=True each epoch would replay exactly the same batches in the same order.
train_dataloader = DataLoader(
    train_dataset, collate_fn=trans_func, batch_size=batch_sz, shuffle=True
)

# The evaluation loader keeps the default sequential order.
eval_dataloader = DataLoader(
    eval_dataset, collate_fn=trans_func, batch_size=batch_sz
)

In [14]:
class CLM(torch.nn.Module):
    """Character-level language model: embedding -> GRU -> two linear layers.

    ``forward`` takes integer ids and returns one unnormalised score per
    vocabulary entry for every input position.
    """

    def __init__(self, hidden_dim, vocab_size):
        super().__init__()
        nn = torch.nn
        # Sub-modules are created in a fixed order: that order defines the
        # printed layout and the sequence in which initial weights are drawn.
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.rnn = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.non_lin = nn.Tanh()
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, input_batch):
        embedded = self.embedding(input_batch)
        # Only the per-position outputs are used; the final hidden state is dropped.
        rnn_states, _last_state = self.rnn(embedded)
        hidden = self.linear(self.non_lin(rnn_states))
        hidden = self.dropout(hidden)
        return self.linear2(self.non_lin(hidden))





In [15]:
def evaluate(model, crit, dataloader=None):
    """Compute the perplexity of ``model`` over a data loader.

    Perplexity is ``exp`` of the mean per-token loss over *all* evaluated
    tokens. Each batch loss is therefore weighted by the number of target
    tokens it was averaged over before the exponent is taken. Averaging
    per-batch ``exp(loss)`` values instead would over-weight short batches and
    would not equal the corpus perplexity.

    Args:
        model: module mapping ``batch["input_ids"]`` to per-position scores.
            It is switched to eval mode and left in that mode.
        crit: loss returning the mean over the counted target tokens. If it
            has an ``ignore_index`` attribute, targets equal to it are not
            counted as tokens.
        dataloader: iterable of dicts with ``"input_ids"`` and ``"target_ids"``
            tensors. Defaults to the notebook-level ``eval_dataloader``.

    Returns:
        float: the perplexity, or NaN when there are no tokens to evaluate.
    """
    import math

    if dataloader is None:
        # Looked up at call time so a re-created loader is picked up.
        dataloader = eval_dataloader

    ignore_index = getattr(crit, "ignore_index", None)
    total_nll = 0.0
    total_tokens = 0

    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            targets = batch["target_ids"].flatten()
            if ignore_index is None:
                n_tokens = targets.numel()
            else:
                n_tokens = int((targets != ignore_index).sum().item())
            if n_tokens == 0:
                # Nothing to score here; a mean over zero tokens is undefined.
                continue

            logs = model(batch["input_ids"]).flatten(start_dim=0, end_dim=1)
            loss = crit(logs, targets)
            # Turn the batch mean back into a sum so batches are weighted by size.
            total_nll += loss.item() * n_tokens
            total_tokens += n_tokens

    if total_tokens == 0:
        return float("nan")
    return math.exp(total_nll / total_tokens)

In [16]:
# Build the network first, then move its parameters to the GPU.
model = CLM(hidden_dim=200, vocab_size=len(vocab))
model = model.to("cuda")

In [17]:
# Adam over all model parameters, with the optimizer's default settings.
optimizer = torch.optim.Adam(params=model.parameters())
# Targets equal to the padding id are excluded from the loss.
crit = torch.nn.CrossEntropyLoss(ignore_index=c2id["pad"])

In [22]:
num_ep = 10
losses = []        # mean training loss of each epoch
perplexities = []  # evaluate() result after each epoch

for ep in range(num_ep):
    # evaluate() leaves the model in eval mode, so training mode is re-enabled every epoch.
    model.train()
    ep_losses = []

    progress = tqdm.tqdm(train_dataloader, desc=f'Training ep {ep}')
    for batch in progress:
        # Flatten (batch, position) into one axis so each position is one sample for the loss.
        logits = model(batch["input_ids"]).flatten(start_dim=0, end_dim=1)
        targets = batch["target_ids"].flatten()

        optimizer.zero_grad()
        loss = crit(logits, targets)
        loss.backward()
        optimizer.step()

        ep_losses.append(loss.item())

    losses.append(sum(ep_losses) / len(ep_losses))
    perplexities.append(evaluate(model, crit))

Training ep 0: 100%|██████████| 6841/6841 [09:26<00:00, 12.08it/s]
Training ep 1: 100%|██████████| 6841/6841 [10:09<00:00, 11.23it/s]
Training ep 2: 100%|██████████| 6841/6841 [10:18<00:00, 11.07it/s]
Training ep 3: 100%|██████████| 6841/6841 [10:17<00:00, 11.07it/s]
Training ep 4: 100%|██████████| 6841/6841 [10:25<00:00, 10.94it/s]
Training ep 5: 100%|██████████| 6841/6841 [10:30<00:00, 10.85it/s]
Training ep 6: 100%|██████████| 6841/6841 [10:30<00:00, 10.84it/s]
Training ep 7: 100%|██████████| 6841/6841 [10:42<00:00, 10.65it/s]
Training ep 8: 100%|██████████| 6841/6841 [10:46<00:00, 10.58it/s]
Training ep 9: 100%|██████████| 6841/6841 [10:57<00:00, 10.41it/s]


In [81]:
# Put the model on the GPU and switch it to eval mode. Both calls return the
# module itself, so the cell still displays the model layout.
model.to("cuda").eval()

CLM(
  (embedding): Embedding(101, 200)
  (rnn): GRU(200, 200, batch_first=True)
  (linear): Linear(in_features=200, out_features=200, bias=True)
  (linear2): Linear(in_features=200, out_features=101, bias=True)
  (non_lin): Tanh()
  (dropout): Dropout(p=0.1, inplace=False)
)

In [98]:
def generate(input, gen_len=200):
    """Greedily continue ``input`` using the notebook-level ``model``.

    The prompt is encoded like a training sentence start (``bos`` followed by
    character ids, with unknown characters mapped to ``unk``). At every step
    the highest-scoring next id is appended, until ``eos`` is predicted or
    ``gen_len`` ids have been added.

    Args:
        input: prompt string.
        gen_len: maximum number of ids to generate.

    Returns:
        str: the prompt followed by the generated continuation. Ids are mapped
        back through ``id2c``, so a special token, if one appears in the
        sequence, shows up as its literal name.
    """
    # Follow the model's device instead of assuming a GPU.
    device = next(model.parameters()).device
    prompt_ids = [CDataset.bos_id] + [c2id.get(c, CDataset.unk_id) for c in input]
    input_ids = torch.LongTensor(prompt_ids).to(device)

    # Generation must run in eval mode (no dropout); the previous mode is
    # restored afterwards so calling this mid-training has no lasting effect.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed, so autograd graphs are not built per step.
        with torch.no_grad():
            for _ in range(gen_len):
                # Scores at the last position decide the next id.
                next_id = model(input_ids)[-1].argmax().unsqueeze(0)
                if next_id.item() == CDataset.eos_id:
                    break
                input_ids = torch.cat((input_ids, next_id))
    finally:
        model.train(was_training)

    # Skip the leading bos id when decoding.
    return ''.join(id2c[token_id] for token_id in input_ids[1:].tolist())

In [106]:
# Sample continuation for a verse prompt, using the default limit of 200 generated characters.
generate("у лукоморья дуб зелёный, златая цепь на дубе том: и днём и ночью")

'у лукоморья дуб зелёный, златая цепь на дубе том: и днём и ночью в составе собственной политики в своем телефонного продаж в своем телефонного продаж в своем телефонного продаж в своем телефонного продаж в своем телефонного продаж в своем телефонного продаж в свое'