In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1

In [4]:
# !pip install nltk

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
import nltk

from collections import Counter
from typing import List

import seaborn
seaborn.set(palette= 'summer')

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [8]:
dataset = load_dataset('IlyaGusev/gazeta', revision='v1.0')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/2.98k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

The repository for IlyaGusev/gazeta contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/IlyaGusev/gazeta.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/48.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52400 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5770 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5265 [00:00<?, ? examples/s]

In [9]:
sentences = []

for text in tqdm(dataset['validation']['text']):
  sentences.extend(
      [x.lower() for x in sent_tokenize(text, language='russian') if len(x) < 256]
  )

  0%|          | 0/5265 [00:00<?, ?it/s]

In [10]:
len(sentences)

191894

In [11]:
chars = Counter()
for sentence in tqdm(sentences):
  for char in sentence:
    chars[char] += 1

  0%|          | 0/191894 [00:00<?, ?it/s]

In [12]:
len(chars.items())

153

In [13]:
vocab = set(['<unk>', '<bos>', '<eos>', '<pad>'])
counter_threshold = 1000

for char, cnt, in tqdm(chars.items()):
  if cnt > counter_threshold:
    vocab.add(char)

  0%|          | 0/153 [00:00<?, ?it/s]

In [14]:
len(vocab)

86

In [15]:
char2ind = {char: i for i, char in enumerate(vocab)}
ind2char = {i: char for char, i in char2ind.items()}

In [16]:
class CharDataset:
  def __init__(self, sentences):
    self.data = sentences
    self.unk_id = char2ind['<unk>']
    self.bos_id = char2ind['<bos>']
    self.eos_id = char2ind['<eos>']
    self.pad_id = char2ind['<pad>']

  def __getitem__(self, idx: int) -> List[int]:
    tokenized_sentence = [self.bos_id]
    tokenized_sentence += [char2ind.get(char, self.unk_id) for char in self.data[idx]]
    tokenized_sentence += [self.eos_id]
    return tokenized_sentence

  def __len__(self) -> int:
    return len(self.data)


In [17]:
def collate_fn_with_padding(
    input_batch: List[List[int]], pad_id=char2ind['<pad>']) -> torch.Tensor:
    max_seq_len = max(len(sent) for sent in input_batch)
    new_batch = []
    for sequence in input_batch:
      for _ in range(max_seq_len - len(sequence)):
        sequence.append(pad_id)
      new_batch.append(sequence)
    sequences = torch.LongTensor(new_batch).to(device)

    new_batch = {
        'input_ids': sequences[:, :-1],
        'target_ids': sequences[:, 1:]
    }
    return new_batch

In [18]:
train_sentences, eval_sentences = train_test_split(sentences, test_size=0.2)

train_dataset = CharDataset(train_sentences)
eval_dataset = CharDataset(eval_sentences)

train_dataloader = DataLoader(
    train_dataset, collate_fn=collate_fn_with_padding, batch_size=256)

eval_dataloader = DataLoader(
    eval_dataset, collate_fn=collate_fn_with_padding, batch_size=256)


In [32]:
class CharLM(nn.Module):
  def __init__(self, hidden_dim: int, vocab_size: int):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, hidden_dim)
    self.rnn = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
    self.Linear = nn.Linear(hidden_dim, hidden_dim)
    self.projection = nn.Linear(hidden_dim, vocab_size)

    self.non_lin = nn.Tanh()
    self.dropout = nn.Dropout(p=0.1)

  def forward(self, input_batch) -> torch.Tensor:
    embeddings = self.embedding(input_batch)
    output, _ = self.rnn(embeddings)
    output = self.dropout(self.Linear(self.non_lin(output)))
    projection = self.projection(self.non_lin(output))

    return projection



In [33]:
def evaluate(model, criterion) -> float:
  model.eval()
  perplexity = []
  with torch.no_grad():
    for batch in eval_dataloader:
      logits = model(batch['input_ids']).flatten(star_dim=0, end_dim=1)
      loss = criterion(
          logits,
          batch['target_ids'].flatten()
      )
      perplexity.append(torch.exp(loss).item())

  perplexity = sum(perplexity) / len(perplexity)
  return perplexity

In [34]:
model = CharLM(hidden_dim=256, vocab_size=len(vocab)).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=char2ind['<pad>'])
optimizer = torch.optim.Adam(model.parameters())

In [36]:
num_epoch = 10
losses = []
perplexities = []

for epoch in range(num_epoch):
  epoch_losses = []
  model.train()
  for batch in tqdm(train_dataloader, desc=f'Training epoch {epoch}:'):
    optimizer.zero_grad()
    logits = model(batch['input_ids']).flatten(start_dim=0, end_dim=1)
    loss = criterion(
        logits, batch['target_ids'].flatten())
    loss.backward()
    optimizer.step()

    epoch_losses.append(loss.item())

  losses.append(sum(epoch_losses) / len(epoch_losses))
  perplexities.append(evaluate(model, criterion))



Training epoch 0::   0%|          | 0/600 [00:00<?, ?it/s]

KeyboardInterrupt: 