In [None]:
# !pip install navec
# !pip install slovnet

In [4]:
from slovnet.model.emb import NavecEmbedding
import torch
import navec

In [6]:
# Import the class under its own name. The original `navec = navec.Navec.load(path)`
# rebinds the module name `navec` to the loaded instance, so running this cell a
# second time failed with AttributeError (the instance has no `Navec` attribute).
from navec import Navec

path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
# `navec` stays the name of the loaded embeddings object because later cells use it.
navec = Navec.load(path)

In [11]:
from utils import read_file

# Words to predict stresses for. Later passed to TestDataset, which indexes it and
# calls len() on it, so read_file presumably returns a list of strings — TODO confirm.
test = read_file("private_test_stresses.txt")

In [55]:
from torch.utils.data import Dataset, DataLoader

class TestDataset(Dataset):
    """Inference dataset: yields the vocabulary id of each word plus the word itself."""

    def __init__(self, navec, data):
        self.navec = navec
        self.vocab = navec.vocab
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(id_tensor, word)`` for the word stored at ``index``."""
        token = self.data[index]
        # Words missing from the vocabulary all fall back to the shared unk id.
        fallback_id = self.navec.vocab.unk_id
        token_id = self.vocab.get(token, fallback_id)
        return torch.tensor(token_id, dtype=torch.long), token


class TrainDataset(Dataset):
    """Training dataset: yields the vocabulary id of each word and its label."""

    def __init__(self, navec, data, labels):
        self.navec = navec
        self.vocab = navec.vocab
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(id_tensor, label_tensor)``; the label tensor has shape ``(1,)``."""
        token = self.data[index]
        # Words missing from the vocabulary all fall back to the shared unk id.
        fallback_id = self.navec.vocab.unk_id
        token_id = self.vocab.get(token, fallback_id)

        features = torch.tensor(token_id, dtype=torch.long)
        target = torch.tensor([self.labels[index]], dtype=torch.long)
        return features, target

In [47]:
from torch import nn

class Model(nn.Module):
    """Word-level stress classifier on top of frozen-vocabulary Navec embeddings.

    The embedded word goes through a small MLP, then two linear heads produce the
    mean and log standard deviation of a diagonal Normal over ``output_dim`` scores.

    Args:
        navec: loaded Navec embeddings object, passed to ``NavecEmbedding``.
        output_dim: number of scores produced per word (one per candidate
            character position, as used by the training and inference cells).
    """

    def __init__(self,
                 navec,
                 output_dim=37) -> None:
        super().__init__()
        self.navec = navec
        self.output_dim = output_dim

        self.model = nn.Sequential(
            NavecEmbedding(navec),
            # 300 presumably matches the embedding width ("300d" in the
            # embeddings file name) — TODO confirm if another file is used.
            nn.Linear(300, 100),
            nn.ReLU(),
            nn.Linear(100, 50),
            nn.ReLU()
        )

        self.mu = nn.Linear(50, output_dim)
        self.log_std = nn.Linear(50, output_dim)

    def forward(self, x):
        """Return ``output_dim`` scores for each input word id.

        In training mode the scores are a reparameterized sample from
        ``Normal(mean, exp(log_std))``. In eval mode the mean is returned, so
        predictions are deterministic instead of changing from call to call.
        """
        hidden = self.model(x)
        mean = self.mu(hidden)

        if not self.training:
            # Sampling noise at inference would make argmax predictions random.
            return mean

        # Clamp keeps the standard deviation within [exp(-20), exp(2)].
        log_std = self.log_std(hidden).clamp(-20, 2)
        dist = torch.distributions.Normal(mean, log_std.exp())
        return dist.rsample()

In [48]:
# Build the network on top of the loaded embeddings.
# NOTE(review): `model` is assigned a fresh Model(navec) again in the cell that
# sets up the loss and optimizer, so this instance appears to be unused.
model = Model(navec)

In [56]:
train = read_file("stresses/train_stresses_labels.txt")

# Each training entry carries a "^" marker; the label is the index of the
# character immediately before the first "^", and the input is the entry
# with the markers removed.
words = [x.replace("^", "") for x in train]
labels = [x.index("^") - 1 for x in train]

train_dataset = TrainDataset(navec, words, labels)

# Shuffle every epoch: without it each epoch sees exactly the same batches in
# file order, which correlates the examples inside a batch and hurts SGD.
train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)

In [79]:
# Fresh network, objective and optimizer for a training run.
model = Model(navec)
# Note: `loss` holds the criterion module (a callable), not a computed loss value.
loss = nn.CrossEntropyLoss()
# AdamW over all parameters returned by model.parameters().
optim = torch.optim.AdamW(model.parameters(), lr=3e-4)

In [None]:
num_epochs = 200

# Put the model in training mode explicitly, so re-running this cell after a
# cell that called model.eval() still trains in the right mode.
model.train()

for epoch in range(num_epochs):
    total_loss = 0

    for word_tensor, stress in train_loader:
        optim.zero_grad()

        logits = model(word_tensor)
        # The dataset yields labels of shape (1,), so a batch is (batch, 1).
        # CrossEntropyLoss takes class indices of shape (batch,) directly; this
        # gives the same mean loss as a float one-hot target without building a
        # (batch, 37) tensor on every step.
        batch_loss = loss(logits, stress.squeeze(1))

        total_loss += batch_loss.item()

        batch_loss.backward()
        optim.step()

    # Print epoch loss (sum of per-batch mean losses)
    print('Epoch:', epoch, ' Loss:', total_loss)



In [42]:
def make_stress(word, index):
    """Return ``word`` with a "^" inserted right after the character at ``index``.

    If no character position equals ``index`` (for example it is negative or
    past the end), the word is returned unchanged.
    """
    return "".join(
        char + "^" if position == index else char
        for position, char in enumerate(word)
    )

In [82]:
from tqdm import tqdm

model.eval()
vowels = "аяуюоеёэиы"
output = []

test_dataset = TestDataset(navec, test)

# Inference only: skip building the autograd graph.
with torch.no_grad():
    for i in tqdm(range(len(test_dataset))):
        inp_tensor, word = test_dataset[i]

        # Candidate stress positions: vowels (case-insensitive) at positions the
        # model actually produces a score for (0 .. output_dim - 1).
        indexes = [
            pos
            for pos, char in enumerate(word[:model.output_dim])
            if char.lower() in vowels
        ]

        if not indexes:
            # No vowel to stress: keep the word as is so the output stays
            # aligned line-by-line with the input (argmax on an empty tensor
            # would raise).
            output.append(word)
            continue

        pred = model(inp_tensor)[indexes]
        # argmax() is relative to the vowel-restricted slice; map it back to the
        # character position in the word before placing the stress mark.
        stress = indexes[pred.argmax().item()]

        output.append(make_stress(word, stress))


100%|██████████| 294252/294252 [00:32<00:00, 8926.64it/s]


In [83]:
# Write one predicted word per line. The words are Cyrillic, so the encoding is
# set explicitly: the platform default (e.g. a legacy code page on Windows)
# could otherwise fail or produce a file that is not UTF-8.
with open('try2.txt', 'w', encoding='utf-8') as file:
    file.writelines(item + '\n' for item in output)

In [84]:
# Persist the model's state_dict (not the whole module object) to model.pt.
torch.save(model.state_dict(), "model.pt")