Задание 1. Загрузите текст из произведений Ницше ('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt').

In [None]:
import os
import re
from urllib.request import urlretrieve

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Remote location of the corpus and the local path it is cached under.
URL = r"https://s3.amazonaws.com/text-datasets/nietzsche.txt"
asset_path = os.path.join(os.getcwd(), "nietzsche.txt")

# Download only when no cached copy exists, so re-running the cell is cheap.
if not os.path.exists(asset_path):
    print("Downloading assets....", end="")
    # Fetch into a temporary name first: an interrupted transfer must not leave
    # a truncated file that the existence check above would accept next time.
    partial_path = asset_path + ".part"
    urlretrieve(URL, partial_path)
    os.replace(partial_path, asset_path)
    print(" done")

Downloading and extracting assests....

In [None]:
# Load the whole corpus into memory; the handle is released before the preview.
with open("nietzsche.txt", mode='r', encoding='UTF-8') as file:
  text = file.read()

# Show the first 100 characters as a sanity check.
print(text[:100])

PREFACE


SUPPOSING that Truth is a woman--what then? Is there not ground
for suspecting that all ph


In [None]:
def text_preprocessing(text: str, deep_preproc: int = 2) -> str:
  """Normalize raw text before tokenization.

  Args:
    text: Text to clean.
    deep_preproc: Cleaning depth. ``1`` only collapses every run of
      whitespace (newlines and tabs included) into a single space. Any other
      value first replaces every character that is not a Latin letter, a
      Cyrillic letter or a digit with a space, then collapses whitespace.

  Returns:
    The cleaned text. Leading and trailing spaces are not stripped.
  """
  if deep_preproc != 1:
    # "Ё"/"ё" lie outside the contiguous А-я code-point range, so they must be
    # listed explicitly or they would be erased from Russian text.
    text = re.sub(r"[^A-Za-zА-Яа-яЁё0-9]", " ", text)
  # \s already matches "\n" and "\t", so a single pass is sufficient.
  return re.sub(r"\s+", " ", text)

In [None]:
# Length of the whole text before preprocessing
print(f"Text length without preprocessing: {len(text)}")

Text length without preprocessing: 600893


In [None]:
# Whitespace-only cleaning (deep_preproc=1), then preview the first 100 characters.
modern_text = text_preprocessing(text, 1)
modern_text[:100]

'PREFACE SUPPOSING that Truth is a woman--what then? Is there not ground for suspecting that all phil'

In [None]:
# Split the cleaned text into sentences; keep only sentences containing at
# least one space, apply the default (deep) preprocessing to each of them and
# strip the surrounding whitespace.
sentences = sent_tokenize(modern_text)
treated_sentences = [text_preprocessing(sentence).strip() for sentence in sentences if sentence.count(" ") >= 1]
treated_sentences[:5]

['PREFACE SUPPOSING that Truth is a woman what then',
 'Is there not ground for suspecting that all philosophers in so far as they have been dogmatists have failed to understand women that the terrible seriousness and clumsy importunity with which they have usually paid their addresses to Truth have been unskilled and unseemly methods for winning a woman',
 'Certainly she has never allowed herself to be won and at present every kind of dogma stands with sad and discouraged mien IF indeed it stands at all',
 'For there are scoffers who maintain that it has fallen that all dogma lies on the ground nay more that it is at its last gasp',
 'But to speak seriously there are good grounds for hoping that all dogmatizing in philosophy whatever solemn whatever conclusive and decided airs it has assumed may have been only a noble puerilism and tyronism and probably the time is at hand when it will be once and again understood WHAT has actually sufficed for the basis of such imposing and absolute 

In [None]:
# Number of sentences
print(f"Number of sentences: {len(treated_sentences)}")

Number of sentences: 2847


In [None]:
# Number of words: total and distinct (case-sensitive, split on single spaces)
words = ' '.join(treated_sentences).split(" ")
unique_words = set(words)
print(f"Count of (unique words)/(words): {len(unique_words)}/{len(words)}")

Count of (unique words)/(words): 11363/101251


In [None]:
# Total number of characters after preprocessing (sentences re-joined with "."):
new_text = '.'.join(treated_sentences)
print(f"Text length past processing: {len(new_text)}")

Text length past processing: 579860


Задание 2. Сократите текст наполовину избыточными последовательностями символов maxlen

In [None]:
def cut_text(text: str, maxlen: int) -> str:
  """Shorten ``text`` to roughly ``maxlen`` characters without splitting a word.

  Text that already fits within ``maxlen`` is returned unchanged. Otherwise
  the text is cut right before the first space located at or after position
  ``maxlen``; when no such space exists the whole text is returned.
  """
  if len(text) <= maxlen:
    return text
  # A sentinel space guarantees that the search below always succeeds.
  padded = text + " "
  return padded[:padded.index(" ", maxlen)]

In [None]:
# Shorten every sentence that contains at least one space: each is cut at the
# first space found at or after its midpoint (maxlen = half of its length).
cut_sentences = [cut_text(sentence, maxlen=len(sentence) // 2) for sentence in sentences if sentence.count(" ") >= 1 ]
cut_sentences[:10]

['PREFACE SUPPOSING that Truth',
 'Is there not ground for suspecting that all philosophers, in so far as they have been dogmatists, have failed to understand women--that the terrible seriousness',
 'Certainly she has never allowed herself to be won; and at present every kind',
 'For there are scoffers who maintain that it has fallen, that all',
 'But to speak seriously, there are good grounds for hoping that all dogmatizing in philosophy, whatever solemn, whatever conclusive and decided airs it has assumed, may have been only a noble puerilism and tyronism; and probably the time is at hand when it will be once and again understood WHAT has actually sufficed for the basis of such imposing and absolute philosophical edifices',
 'The philosophy of the dogmatists, it is to be hoped, was only a promise for thousands of years afterwards, as was astrology in still earlier times, in the service of which probably more labour,',
 'It seems that in order to inscribe themselves upon the heart of 

Задание 3. Создайте модель LSTM для генерации текста

In [None]:
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [None]:
import torch

# Prefer the GPU when PyTorch can see one, and report which device was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA доступна" if device.type == "cuda" else "CUDA недоступна, используется CPU")

CUDA доступна


In [None]:
def text_to_seq(text_sample):
    """Encode a text as an array of character indices.

    Characters are numbered by descending frequency, so index 0 is the most
    common character; equally frequent characters keep the order in which
    they first appear in the text.

    Args:
        text_sample: The text to encode.

    Returns:
        A tuple ``(sequence, char_to_idx, idx_to_char)``: the NumPy array of
        per-character indices and the two lookup dictionaries
        (character -> index and index -> character).
    """
    ranked_chars = [symbol for symbol, _ in Counter(text_sample).most_common()]

    idx_to_char = dict(enumerate(ranked_chars))
    char_to_idx = {symbol: position for position, symbol in idx_to_char.items()}

    sequence = np.array([char_to_idx[symbol] for symbol in text_sample])
    return sequence, char_to_idx, idx_to_char

In [None]:
# Encode the shortened corpus (cut sentences joined with ". ") as character
# indices and keep both lookup tables for training and generation.
sequence, char_to_idx, idx_to_char = text_to_seq('. '.join(cut_sentences))

In [None]:
# Build training inputs and targets (targets are the inputs shifted by one character)
SEQ_LEN = 256
BATCH_SIZE = 16

def get_batch(sequence, seq_len=None, batch_size=None):
    """Sample a random mini-batch of character windows from ``sequence``.

    Args:
        sequence: 1-D array-like of integer character indices.
        seq_len: Length of each sampled window; defaults to the module-level
            ``SEQ_LEN`` (looked up at call time).
        batch_size: Number of windows to sample; defaults to ``BATCH_SIZE``.

    Returns:
        A tuple ``(trains, targets)`` of ``torch.long`` tensors, each of shape
        ``(batch_size, seq_len - 1, 1)``. ``targets`` holds the same windows as
        ``trains`` shifted one position ahead.

    Raises:
        ValueError: If ``sequence`` is shorter than ``seq_len``.
    """
    seq_len = SEQ_LEN if seq_len is None else seq_len
    batch_size = BATCH_SIZE if batch_size is None else batch_size

    # Number of valid window start positions: 0 .. len(sequence) - seq_len.
    n_starts = len(sequence) - seq_len + 1
    if n_starts < 1:
        raise ValueError(
            f"sequence has {len(sequence)} items, need at least {seq_len}"
        )

    trains = []
    targets = []
    for _ in range(batch_size):
        # randint's upper bound is exclusive; passing n_starts keeps the last
        # full window (and therefore the final character) reachable.
        batch_start = np.random.randint(0, n_starts)
        chunk = torch.as_tensor(
            sequence[batch_start: batch_start + seq_len], dtype=torch.long
        )
        trains.append(chunk[:-1].view(-1, 1))
        targets.append(chunk[1:].view(-1, 1))
    return torch.stack(trains, dim=0), torch.stack(targets, dim=0)

In [None]:
# Generates text with a trained network
def evaluate(model, char_to_idx, idx_to_char, start_text=' ', prediction_len=200, temp=0.3):
    """Sample ``prediction_len`` characters from ``model`` after ``start_text``.

    Args:
        model: Network exposing ``init_hidden()`` and a forward call
            ``model(indices, hidden) -> (logits, hidden)``.
        char_to_idx: Mapping from character to vocabulary index.
        idx_to_char: Mapping from vocabulary index to character.
        start_text: Non-empty prompt; every character must be in ``char_to_idx``.
        prediction_len: Number of characters to generate.
        temp: Softmax temperature; logits are divided by it before sampling.

    Returns:
        ``start_text`` followed by the generated characters.

    Raises:
        ValueError: If ``start_text`` is empty.
        KeyError: If ``start_text`` contains a character missing from
            ``char_to_idx``.
    """
    if not start_text:
        raise ValueError("start_text must contain at least one character")

    # Put the inputs on the model's own device instead of relying on a global.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device("cpu")

    idx_input = [char_to_idx[char] for char in start_text]
    train = torch.LongTensor(idx_input).view(-1, 1, 1).to(model_device)
    predicted_text = start_text

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        hidden = model.init_hidden()

        # Warm up the state on everything except the last prompt character.
        # That character is the first input of the loop below, so including
        # it here as well would feed it to the network twice.
        if len(idx_input) > 1:
            _, hidden = model(train[:-1], hidden)

        inp = train[-1].view(-1, 1, 1)

        for _ in range(prediction_len):
            output, hidden = model(inp, hidden)
            output_logits = output.detach().cpu().view(-1)
            # A lower temperature sharpens the distribution.
            p_next = F.softmax(output_logits / temp, dim=-1).numpy()
            top_index = np.random.choice(len(char_to_idx), p=p_next)
            inp = torch.LongTensor([top_index]).view(-1, 1, 1).to(model_device)
            predicted_text += idx_to_char[top_index]

    return predicted_text

In [None]:
class TextRNN(nn.Module):
    """Character-level language model: embedding -> LSTM -> dropout -> linear.

    Args:
        input_size: Vocabulary size; also the number of output classes.
        hidden_size: Number of features in the LSTM hidden state.
        embedding_size: Dimension of the character embeddings.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the LSTM outputs.
    """

    def __init__(self, input_size, hidden_size, embedding_size, n_layers=1, dropout=0.2):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(self.input_size, self.embedding_size)
        self.lstm = nn.LSTM(self.embedding_size, self.hidden_size, self.n_layers)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.hidden_size, self.input_size)

    def forward(self, x, hidden):
        """Run one forward pass.

        Args:
            x: Integer index tensor whose dim 2 has size 1 (callers in this
                notebook pass ``(seq_len, batch, 1)``).
            hidden: ``(h, c)`` LSTM state, e.g. from ``init_hidden``.

        Returns:
            ``(logits, (h, c))`` where ``logits`` has ``input_size`` scores per
            position and ``(h, c)`` is the updated LSTM state.
        """
        # The embedding appends a feature dim; drop the singleton dim 2.
        x = self.encoder(x).squeeze(2)
        out, (ht1, ct1) = self.lstm(x, hidden)
        out = self.dropout(out)
        x = self.fc(out)
        return x, (ht1, ct1)

    def init_hidden(self, batch_size=1):
        """Return a zero ``(h, c)`` state for ``batch_size`` sequences.

        The tensors are created with the dtype and device of the model's own
        weights, so the method works wherever the model has been moved to and
        does not depend on any global variable.
        """
        reference = self.fc.weight
        shape = (self.n_layers, batch_size, self.hidden_size)
        return (reference.new_zeros(shape), reference.new_zeros(shape))

In [None]:
# Build the network on the selected device, plus its loss, optimizer and LR scheduler.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = TextRNN(input_size=len(idx_to_char), hidden_size=128, embedding_size=128, n_layers=2)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, amsgrad=True)
# NOTE(review): `verbose` is deprecated in recent PyTorch releases - confirm
# that the installed version still accepts it.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=5,
    verbose=True,
    factor=0.5
)

# An "epoch" here is one optimisation step on a single random batch.
n_epochs = 5000
loss_avg = []

for epoch in range(n_epochs):
    model.train()
    train, target = get_batch(sequence)
    # get_batch stacks samples along dim 0; swap the first two dims before the forward pass.
    train = train.permute(1, 0, 2).to(device)
    target = target.permute(1, 0, 2).to(device)
    # Every batch starts from a fresh zero state.
    hidden = model.init_hidden(BATCH_SIZE)

    output, hidden = model(train, hidden)
    # Reorder both tensors so the class scores sit on dim 1 of the prediction
    # and the target has the matching remaining dims.
    loss = criterion(output.permute(1, 2, 0), target.squeeze(-1).permute(1, 0))

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    loss_avg.append(loss.item())
    # Every 500th step (once at least 50 losses are collected): report the mean
    # loss since the last report, pass it to the scheduler, reset the window
    # and print a generated sample.
    if len(loss_avg) >= 50 and epoch % 500 == 0:
        mean_loss = np.mean(loss_avg)
        print(f'Loss: {mean_loss}')
        scheduler.step(mean_loss)
        loss_avg = []
        model.eval()
        predicted_text = evaluate(model, char_to_idx, idx_to_char)
        print(predicted_text)

Loss: 1.9653057395341154
 a precise to the sense of its and the according of the present and is a prificient the can the more are the self--the present of the propent the here the according thought as the self who precisely an
Loss: 1.515013620376587
 and and result of the morality of a souls and more the proposite in the action of a sense and his precisely and strong the world in the sense of the sense and and and puritation of the action of the a
Loss: 1.4209165790081024
 has the same present the present the tradition of the sense of the old constraint that the service and present serious the problem of the soul the constitute and personal intercourse of the fact of th
Loss: 1.3768407685756683
 the consequences and sense of the state that is a conscience of the most present the subject of the present the most possible of the "the other desire the contemples of the contrary and every men of t
Loss: 1.3470187165737153
 and sense of the problem of the desire the more the same and sense 

In [None]:
# Switch to inference mode and generate 100 characters continuing the prompt "Hello".
model.eval()
print(evaluate(
    model,
    char_to_idx,
    idx_to_char,
    start_text='Hello',
    temp=0.3,
    prediction_len=100,
    )
)

Hello in the desire to the problem of the sense of the sense of the highest and a man of the self-experien


Задание 4. Создайте самостоятельно генерацию текста для РУССКОЯЗЫЧНОГО НАБОРА глав Wikibooks.
Полный текст Wikibooks содержит более 270000 глав на 12 языках https://www.kaggle.com/datasets/dhruvildave/wikibooks-dataset/data


In [None]:
# Load the corpus for this task into `text` (this replaces the text loaded earlier).
# NOTE(review): the task statement refers to the Wikibooks dataset, while this
# cell reads "dostoevsky.txt", which no visible cell downloads or creates -
# confirm the file's provenance.
with open("dostoevsky.txt", 'r', encoding='UTF-8') as file:
  text = file.read()

In [None]:
# Whitespace-only cleaning: punctuation is kept because the sentence tokenizer
# relies on it.
modern_text = text_preprocessing(text, 1)

# The corpus is Russian, so use the Russian Punkt model instead of the English
# default.
sentences = sent_tokenize(modern_text, language="russian")

# Tokenized sentences keep their own end punctuation, so join them with a plain
# space; joining with '. ' would add an extra period after every sentence.
sequence, char_to_idx, idx_to_char = text_to_seq(' '.join(sentences))

In [None]:
# Build a fresh network for the Russian corpus, plus its loss, optimizer and LR scheduler.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = TextRNN(input_size=len(idx_to_char), hidden_size=128, embedding_size=128, n_layers=2)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, amsgrad=True)
# NOTE(review): `verbose` is deprecated in recent PyTorch releases - confirm
# that the installed version still accepts it.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=5,
    verbose=True,
    factor=0.5
)

# An "epoch" here is one optimisation step on a single random batch.
n_epochs = 50000
loss_avg = []

for epoch in range(n_epochs):
    model.train()
    train, target = get_batch(sequence)
    # get_batch stacks samples along dim 0; swap the first two dims before the forward pass.
    train = train.permute(1, 0, 2).to(device)
    target = target.permute(1, 0, 2).to(device)
    # Every batch starts from a fresh zero state.
    hidden = model.init_hidden(BATCH_SIZE)

    output, hidden = model(train, hidden)
    # Reorder both tensors so the class scores sit on dim 1 of the prediction
    # and the target has the matching remaining dims.
    loss = criterion(output.permute(1, 2, 0), target.squeeze(-1).permute(1, 0))

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    loss_avg.append(loss.item())
    # Every 5000th step (once at least 50 losses are collected): report the mean
    # loss since the last report, pass it to the scheduler, reset the window
    # and print a generated sample.
    if len(loss_avg) >= 50 and epoch % 5000 == 0:
        mean_loss = np.mean(loss_avg)
        print(f'Loss: {mean_loss}')
        scheduler.step(mean_loss)
        loss_avg = []
        model.eval()
        predicted_text = evaluate(model, char_to_idx, idx_to_char)
        print(predicted_text)

Loss: 1.9209595162542885
 наше совсем не присороткой столь, что все раз все совсем не уж сказать на потому что он все все собой в состоянно просто на притом и все это не потому сказал и случает от положительно старика не вывор
Loss: 1.6677124805688859
 от себя совсем после просил с самому и в ответил он от вам в нем гораздо не могу, что и все это сказала на вас только в собой и в просто все не в совершенно совершенно просто и не по не обо мной в сов
Loss: 1.6304359251737595
 вас сердце своих дело старик, и не вы только столько подставила и пристально и все проститься с своего объяснить с ней пример и обратился в которой с себя с ним старик и столько было совершенно себя и
Loss: 1.6124306474685668
 под весь в себя с нем стороны, а вот все сами совершенно совсем даже по своей никогда не странности его не способность и не подумал и одно словом он тогда стало быть, но вы не совершенно себя в столе 
Loss: 1.603707464814186
 в совершенно подле с ней происходить в себя обращение и совсем не з

In [None]:
# Switch to inference mode and generate 100 characters continuing a Russian prompt.
model.eval()
print(evaluate(
    model,
    char_to_idx,
    idx_to_char,
    start_text = 'А по темным улицам гуляет дождь',
    temp=0.3,
    prediction_len=100,
    )
)

А по темным улицам гуляет дождь, и он стало быть, не в том, что уже совсем ничего не вы смеялись с своей последнее столько стороны 


Удалим стоп-слова

In [None]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Drop Russian stop words from the corpus. Tokens come from a plain whitespace
# split and are compared lower-cased, so a stop word with punctuation attached
# (e.g. "не,") does not match the list and is kept.
stop_words = set(stopwords.words('russian'))
words = text.split()
filtered_words = [word for word in words if word.lower() not in stop_words]
filtered_text = ' '.join(filtered_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Whitespace-only cleaning: punctuation is kept because the sentence tokenizer
# relies on it.
modern_text = text_preprocessing(filtered_text, 1)

# The corpus is Russian, so use the Russian Punkt model instead of the English
# default.
sentences = sent_tokenize(modern_text, language="russian")

# Tokenized sentences keep their own end punctuation, so join them with a plain
# space; joining with '. ' would add an extra period after every sentence.
sequence, char_to_idx, idx_to_char = text_to_seq(' '.join(sentences))

In [None]:
# Build a fresh network for the stop-word-filtered corpus, plus its loss, optimizer and LR scheduler.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = TextRNN(input_size=len(idx_to_char), hidden_size=128, embedding_size=128, n_layers=2)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, amsgrad=True)
# NOTE(review): `verbose` is deprecated in recent PyTorch releases - confirm
# that the installed version still accepts it.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=5,
    verbose=True,
    factor=0.5
)

# An "epoch" here is one optimisation step on a single random batch.
n_epochs = 50000
loss_avg = []

for epoch in range(n_epochs):
    model.train()
    train, target = get_batch(sequence)
    # get_batch stacks samples along dim 0; swap the first two dims before the forward pass.
    train = train.permute(1, 0, 2).to(device)
    target = target.permute(1, 0, 2).to(device)
    # Every batch starts from a fresh zero state.
    hidden = model.init_hidden(BATCH_SIZE)

    output, hidden = model(train, hidden)
    # Reorder both tensors so the class scores sit on dim 1 of the prediction
    # and the target has the matching remaining dims.
    loss = criterion(output.permute(1, 2, 0), target.squeeze(-1).permute(1, 0))

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    loss_avg.append(loss.item())
    # Every 5000th step (once at least 50 losses are collected): report the mean
    # loss since the last report, pass it to the scheduler, reset the window
    # and print a generated sample.
    if len(loss_avg) >= 50 and epoch % 5000 == 0:
        mean_loss = np.mean(loss_avg)
        print(f'Loss: {mean_loss}')
        scheduler.step(mean_loss)
        loss_avg = []
        model.eval()
        predicted_text = evaluate(model, char_to_idx, idx_to_char)
        print(predicted_text)

Loss: 1.7559614603673428
 давно бередность всем старик совершенно встречал всем старика ваше своей старик возможно стал стороны поднял подполь говорить совершенно поставила совершенно возьмите мог приведовал возможности смерти
Loss: 1.5974770366191864
 дально просто поступиться совершенно подумал поставил случайно подумал всем слово только старик постороннейший дела поставляют половина самого слова свои половине последний столько того, всем какой-то
Loss: 1.575976710820198
 ответил совершенно понимаете самом случае повернулся несколько собою столько принимает какой-то поставил столь собой подобного словах стола стала отвечал стороны своей воспоминания своим того, подобны
Loss: 1.5650899735450745
 нашем странно просто странно совершенно весьма серьезно, видел всем воротился просто великое стороны привел наш всем покойно просто странно сказать, признаться собою моей последнего старик несколько с
Loss: 1.5615136182785034
 нашей подле немедленно несколько положительно странно собой разгово

In [None]:
# Switch to inference mode and generate 100 characters from the model trained
# on the stop-word-filtered text, using a higher sampling temperature (0.5).
model.eval()
print(evaluate(
    model,
    char_to_idx,
    idx_to_char,
    start_text = 'А по темным улицам гуляет дождь',
    temp=0.5,
    prediction_len=100,
    )
)

А по темным улицам гуляет дождь подозревала спрашивать вашим слабено мог могу разумеется, совершенно она, писали возможно подозрева
