In [26]:
pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------- ----------------------------- 262.1/981.5 kB ? eta -:--:--
     ---------- ----------------------------- 262.1/981.5 kB ? eta -:--:--
     ---------- ----------------------------- 262.1/981.5 kB ? eta -:--:--
     ---------- ----------------------------- 262.1/981.5 kB ? eta -:--:--
     ------------------- ---------------- 524.3/981.5 kB 311.0 kB/s eta 0:00:02
     ---------------------------- ------- 786.4/981.5 kB 493.7 kB/s eta 0:00:01
     ---------------------------- ------- 786.4/981.5 kB 493.7 kB/s eta 0:00:01
     ------------------------------------ 981.5/981.5 kB 474.8 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with stat


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
import numpy as np
import random
from langdetect import detect
from nltk.corpus import stopwords
from pymorphy3 import MorphAnalyzer
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Thunderobot\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Достаём текст

In [30]:
file = open('text1.txt','r')
text1 = file.read()
print(text1)
file.close()

There lived a squirrel in the old forest. The squirrel had a daughter, a squirrel, in the spring.

Once, a squirrel and a squirrel collected mushrooms for the winter. Suddenly, a marten appeared on a nearby Christmas tree. She got ready to grab the squirrel. The mother squirrel jumped towards the marten and shouted to her daughter: "Run!"

The squirrel took off running. Finally, she stopped. I looked around, but the places were unfamiliar! There is no squirrel mom. What to do?

A squirrel saw a hollow in a pine tree, hid and fell asleep. And in the morning, mom found her daughter.


In [31]:
file = open('text.txt','r')
text2 = file.read()
print(text2)
file.close()

Жила в старом лесу белка. У белки весной появилась дочка белочка.

Один раз белка с белочкой собирали грибы на зиму. Вдруг на соседней ёлке появилась куница. Она приготовилась схватить белочку. Мама – белка прыгнула навстречу кунице и крикнула дочке: «Беги!»

Белочка бросилась наутёк. Наконец она остановилась. Посмотрела по сторонам, а места незнакомые! Мамы – белки нет. Что делать?

Увидела белочка дупло на сосне, спряталась и заснула. А утром мама дочку нашла.


#  Делаем GPT

In [32]:
class MiniGPT:
    def __init__(self, embedding_dim=16, hidden_dim=32):
        # подготовка тектса
        self.morph_ru = MorphAnalyzer()
        self.stopwords_ru = set(stopwords.words('russian'))
        self.stopwords_en = set(stopwords.words('english'))

        # задаём параметры модели
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        # заготовки для будущих данных
        self.vocab = {}
        self.idx2token = {}
        self.embeddings = None
        self.W_q = None
        self.W_k = None
        self.W_v = None
        self.W_out = None

    def detect_language(self, text): # функция для определения языка
        try:
            return detect(text)
        except:
            return 'unknown'

    def preprocess_text(self, text):
        lang = self.detect_language(text)
        tokens = text.lower().split()
        result = []

        if lang == 'ru':
            for token in tokens:
                if token.isalpha() and token not in self.stopwords_ru:
                    norm = self.morph_ru.parse(token)[0].normal_form
                    result.append(norm)
        elif lang == 'en':
            for token in tokens:
                if token.isalpha() and token not in self.stopwords_en:
                    result.append(token)
        else:
            result = [t for t in tokens if t.isalpha()]

        return result

    def build_vocab(self, tokens):
        self.vocab = {token: i for i, token in enumerate(set(tokens))}
        self.idx2token = {i: token for token, i in self.vocab.items()}

        vocab_size = len(self.vocab)
        self.embeddings = np.random.randn(vocab_size, self.embedding_dim)
        self.W_q = np.random.randn(self.embedding_dim, self.hidden_dim)
        self.W_k = np.random.randn(self.embedding_dim, self.hidden_dim)
        self.W_v = np.random.randn(self.embedding_dim, self.hidden_dim)
        self.W_out = np.random.randn(self.hidden_dim, vocab_size)

    def tokenize(self, tokens):
        return [self.vocab[token] for token in tokens if token in self.vocab]

    def embed(self, token_ids):
        return self.embeddings[token_ids]

    def softmax(self, x):
        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return e_x / np.sum(e_x, axis=-1, keepdims=True)

    def attention(self, x):
        Q = x @ self.W_q
        K = x @ self.W_k
        V = x @ self.W_v

        seq_len = Q.shape[0]
        scores = Q @ K.T / np.sqrt(self.hidden_dim)

        # Маскирование будущих токенов
        mask = np.triu(np.ones((seq_len, seq_len)), k=1).astype(bool)
        scores[mask] = -np.inf

        weights = self.softmax(scores)
        attended = weights @ V
        return attended

    def forward(self, token_ids):
        x = self.embed(token_ids)
        attended = self.attention(x)
        logits = attended @ self.W_out
        return logits

    def predict_next(self, token_ids):
        logits = self.forward(token_ids)
        probs = self.softmax(logits[-1])
        return np.argmax(probs)

    def train(self, text, epochs=10, lr=0.01):
        tokens = self.preprocess_text(text)
        self.build_vocab(tokens)
        token_ids = self.tokenize(tokens)

        for epoch in range(epochs):
            total_loss = 0
            for i in range(1, len(token_ids)):
                context = token_ids[:i]
                target = token_ids[i]

                logits = self.forward(context)
                pred = self.softmax(logits[-1])
                loss = -np.log(pred[target] + 1e-9)
                total_loss += loss

                # Простейшее обновление — только выходной слой
                grad = pred
                grad[target] -= 1

                attended = self.attention(self.embed(context))
                self.W_out -= lr * np.outer(attended[-1], grad)

            print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

    def generate(self, prompt, length=5):
        tokens = self.preprocess_text(prompt)
        token_ids = self.tokenize(tokens)
        if not token_ids:
            return "Невозможно сгенерировать текст: нет известных токенов."

        result = token_ids[:]
        for _ in range(length):
            next_id = self.predict_next(result)
            result.append(next_id)

        return ' '.join(self.idx2token[i] for i in result)


## Тестрирование на коротком тексте

In [42]:
gpt = MiniGPT(embedding_dim=16, hidden_dim=32)

print("\n== English ==")
text = "There lived a squirrel in the old forest. The squirrel had a daughter, a squirrel, in the spring."
gpt.train(text, epochs=20, lr=0.01)
print(gpt.generate("squirrel", length=5))

print("\n== Russian ==")
text = "Жила в старом лесу белка. У белки весной появилась дочка белочка."
gpt.train(text, epochs=20, lr=0.01)
print(gpt.generate("белка", length=5))


== English ==
Epoch 1/20, Loss: 39.2672
Epoch 2/20, Loss: 33.1694
Epoch 3/20, Loss: 20.9369
Epoch 4/20, Loss: 5.5441
Epoch 5/20, Loss: 9.8044
Epoch 6/20, Loss: 9.7925
Epoch 7/20, Loss: 9.7862
Epoch 8/20, Loss: 9.7826
Epoch 9/20, Loss: 9.7805
Epoch 10/20, Loss: 9.7792
Epoch 11/20, Loss: 9.7786
Epoch 12/20, Loss: 9.7783
Epoch 13/20, Loss: 9.7783
Epoch 14/20, Loss: 9.7784
Epoch 15/20, Loss: 9.7787
Epoch 16/20, Loss: 9.7790
Epoch 17/20, Loss: 9.7793
Epoch 18/20, Loss: 9.7797
Epoch 19/20, Loss: 9.7801
Epoch 20/20, Loss: 9.7805
squirrel old squirrel squirrel squirrel squirrel

== Russian ==
Epoch 1/20, Loss: 91.0209
Epoch 2/20, Loss: 51.5281
Epoch 3/20, Loss: 43.8600
Epoch 4/20, Loss: 33.9989
Epoch 5/20, Loss: 25.7807
Epoch 6/20, Loss: 18.5912
Epoch 7/20, Loss: 13.6671
Epoch 8/20, Loss: 12.2512
Epoch 9/20, Loss: 12.2006
Epoch 10/20, Loss: 12.1739
Epoch 11/20, Loss: 12.1581
Epoch 12/20, Loss: 12.1481
Epoch 13/20, Loss: 12.1413
Epoch 14/20, Loss: 12.1365
Epoch 15/20, Loss: 12.1328
Epoch 16/20

## Тестрирование на длинном  тексте

In [43]:
print("\n== English ==")
gpt.train(text1, epochs=20, lr=0.01)
print(gpt.generate("mother", length=5))

print("\n== Russian ==")
text = "Жила в старом лесу белка. У белки весной появилась дочка белочка."
gpt.train(text2, epochs=20, lr=0.01)
print(gpt.generate("белка", length=5))


== English ==
Epoch 1/20, Loss: 540.6926
Epoch 2/20, Loss: 453.4227
Epoch 3/20, Loss: 383.6287
Epoch 4/20, Loss: 328.9074
Epoch 5/20, Loss: 293.4612
Epoch 6/20, Loss: 276.5800
Epoch 7/20, Loss: 253.4770
Epoch 8/20, Loss: 228.8104
Epoch 9/20, Loss: 211.5231
Epoch 10/20, Loss: 199.6715
Epoch 11/20, Loss: 191.2193
Epoch 12/20, Loss: 185.5043
Epoch 13/20, Loss: 181.4667
Epoch 14/20, Loss: 180.8881
Epoch 15/20, Loss: 179.5662
Epoch 16/20, Loss: 179.2170
Epoch 17/20, Loss: 178.7917
Epoch 18/20, Loss: 178.2960
Epoch 19/20, Loss: 177.4264
Epoch 20/20, Loss: 176.0495
mother mom mom mom mom mom

== Russian ==
Epoch 1/20, Loss: 653.3766
Epoch 2/20, Loss: 542.2244
Epoch 3/20, Loss: 478.5261
Epoch 4/20, Loss: 432.5617
Epoch 5/20, Loss: 392.0607
Epoch 6/20, Loss: 343.9078
Epoch 7/20, Loss: 322.8858
Epoch 8/20, Loss: 313.9514
Epoch 9/20, Loss: 295.8622
Epoch 10/20, Loss: 279.1888
Epoch 11/20, Loss: 263.0227
Epoch 12/20, Loss: 247.3556
Epoch 13/20, Loss: 233.5054
Epoch 14/20, Loss: 217.3333
Epoch 15/