In [26]:
pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------- ----------------------------- 262.1/981.5 kB ? eta -:--:--
     ---------- ----------------------------- 262.1/981.5 kB ? eta -:--:--
     ---------- ----------------------------- 262.1/981.5 kB ? eta -:--:--
     ---------- ----------------------------- 262.1/981.5 kB ? eta -:--:--
     ------------------- ---------------- 524.3/981.5 kB 311.0 kB/s eta 0:00:02
     ---------------------------- ------- 786.4/981.5 kB 493.7 kB/s eta 0:00:01
     ---------------------------- ------- 786.4/981.5 kB 493.7 kB/s eta 0:00:01
     ------------------------------------ 981.5/981.5 kB 474.8 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with stat


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
import numpy as np
import random
from langdetect import detect
from nltk.corpus import stopwords
from pymorphy3 import MorphAnalyzer
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Thunderobot\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Достаём текст

In [30]:
file = open('text1.txt','r')
text1 = file.read()
print(text1)
file.close()

There lived a squirrel in the old forest. The squirrel had a daughter, a squirrel, in the spring.

Once, a squirrel and a squirrel collected mushrooms for the winter. Suddenly, a marten appeared on a nearby Christmas tree. She got ready to grab the squirrel. The mother squirrel jumped towards the marten and shouted to her daughter: "Run!"

The squirrel took off running. Finally, she stopped. I looked around, but the places were unfamiliar! There is no squirrel mom. What to do?

A squirrel saw a hollow in a pine tree, hid and fell asleep. And in the morning, mom found her daughter.


In [31]:
file = open('text.txt','r')
text2 = file.read()
print(text2)
file.close()

Жила в старом лесу белка. У белки весной появилась дочка белочка.

Один раз белка с белочкой собирали грибы на зиму. Вдруг на соседней ёлке появилась куница. Она приготовилась схватить белочку. Мама – белка прыгнула навстречу кунице и крикнула дочке: «Беги!»

Белочка бросилась наутёк. Наконец она остановилась. Посмотрела по сторонам, а места незнакомые! Мамы – белки нет. Что делать?

Увидела белочка дупло на сосне, спряталась и заснула. А утром мама дочку нашла.


#  Делаем GPT

In [32]:
class MiniGPT:
    def __init__(self, embedding_dim=16, hidden_dim=32):
        # подготовка тектса
        self.morph_ru = MorphAnalyzer()
        self.stopwords_ru = set(stopwords.words('russian'))
        self.stopwords_en = set(stopwords.words('english'))

        # задаём параметры модели
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        # заготовки для будущих данных
        self.vocab = {}
        self.idx2token = {}
        self.embeddings = None
        self.W_q = None
        self.W_k = None
        self.W_v = None
        self.W_out = None

    def detect_language(self, text): # функция для определения языка
        try:
            return detect(text)
        except:
            return 'unknown'

    def preprocess_text(self, text):
        lang = self.detect_language(text)
        tokens = text.lower().split()
        result = []

        if lang == 'ru':
            for token in tokens:
                if token.isalpha() and token not in self.stopwords_ru:
                    norm = self.morph_ru.parse(token)[0].normal_form
                    result.append(norm)
        elif lang == 'en':
            for token in tokens:
                if token.isalpha() and token not in self.stopwords_en:
                    result.append(token)
        else:
            result = [t for t in tokens if t.isalpha()]

        return result

    def build_vocab(self, tokens):
        self.vocab = {token: i for i, token in enumerate(set(tokens))}
        self.idx2token = {i: token for token, i in self.vocab.items()}

        vocab_size = len(self.vocab)
        self.embeddings = np.random.randn(vocab_size, self.embedding_dim)
        self.W_q = np.random.randn(self.embedding_dim, self.hidden_dim)
        self.W_k = np.random.randn(self.embedding_dim, self.hidden_dim)
        self.W_v = np.random.randn(self.embedding_dim, self.hidden_dim)
        self.W_out = np.random.randn(self.hidden_dim, vocab_size)

    def tokenize(self, tokens):
        return [self.vocab[token] for token in tokens if token in self.vocab]

    def embed(self, token_ids):
        return self.embeddings[token_ids]

    def softmax(self, x):
        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return e_x / np.sum(e_x, axis=-1, keepdims=True)

    def attention(self, x):
        Q = x @ self.W_q
        K = x @ self.W_k
        V = x @ self.W_v

        seq_len = Q.shape[0]
        scores = Q @ K.T / np.sqrt(self.hidden_dim)

        # Маскирование будущих токенов
        mask = np.triu(np.ones((seq_len, seq_len)), k=1).astype(bool)
        scores[mask] = -np.inf

        weights = self.softmax(scores)
        attended = weights @ V
        return attended

    def forward(self, token_ids):
        x = self.embed(token_ids)
        attended = self.attention(x)
        logits = attended @ self.W_out
        return logits

    def predict_next(self, token_ids):
        logits = self.forward(token_ids)
        probs = self.softmax(logits[-1])
        return np.argmax(probs)

    def train(self, text, epochs=10, lr=0.01):
        tokens = self.preprocess_text(text)
        self.build_vocab(tokens)
        token_ids = self.tokenize(tokens)

        for epoch in range(epochs):
            total_loss = 0
            for i in range(1, len(token_ids)):
                context = token_ids[:i]
                target = token_ids[i]

                logits = self.forward(context)
                pred = self.softmax(logits[-1])
                loss = -np.log(pred[target] + 1e-9)
                total_loss += loss

                # Простейшее обновление — только выходной слой
                grad = pred
                grad[target] -= 1

                attended = self.attention(self.embed(context))
                self.W_out -= lr * np.outer(attended[-1], grad)

            print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

    def generate(self, prompt, length=5):
        tokens = self.preprocess_text(prompt)
        token_ids = self.tokenize(tokens)
        if not token_ids:
            return "Невозможно сгенерировать текст: нет известных токенов."

        result = token_ids[:]
        for _ in range(length):
            next_id = self.predict_next(result)
            result.append(next_id)

        return ' '.join(self.idx2token[i] for i in result)


## Тестрирование на коротком тексте

In [None]:
gpt = MiniGPT(embedding_dim=16, hidden_dim=32)

print("\n== English ==")
text = "There lived a squirrel in the old forest. The squirrel had a daughter, a squirrel, in the spring."
gpt.train(text, epochs=20, lr=0.01)
print(gpt.generate("daughter", length=5))

print("\n== Russian ==")
text = "Жила в старом лесу белка. У белки весной появилась дочка белочка."
gpt.train(text, epochs=20, lr=0.01)
print(gpt.generate("белка", length=5))

## Тестрирование на длинном  тексте

In [37]:
print("\n== English ==")
gpt.train(text1, epochs=20, lr=0.01)
print(gpt.generate("mother", length=5))

print("\n== Russian ==")
text = "Жила в старом лесу белка. У белки весной появилась дочка белочка."
gpt.train(text2, epochs=20, lr=0.01)
print(gpt.generate("белка", length=5))


== English ==
Epoch 1/20, Loss: 548.8243
Epoch 2/20, Loss: 432.9416
Epoch 3/20, Loss: 373.1243
Epoch 4/20, Loss: 328.1320
Epoch 5/20, Loss: 281.7741
Epoch 6/20, Loss: 248.1675
Epoch 7/20, Loss: 214.4786
Epoch 8/20, Loss: 182.2440
Epoch 9/20, Loss: 156.4731
Epoch 10/20, Loss: 145.6014
Epoch 11/20, Loss: 136.8998
Epoch 12/20, Loss: 134.1483
Epoch 13/20, Loss: 127.6996
Epoch 14/20, Loss: 125.6631
Epoch 15/20, Loss: 121.8354
Epoch 16/20, Loss: 119.0474
Epoch 17/20, Loss: 115.6667
Epoch 18/20, Loss: 112.8146
Epoch 19/20, Loss: 109.8517
Epoch 20/20, Loss: 107.1506
mother jumped shouted ready shouted mom

== Russian ==
Epoch 1/20, Loss: 665.0287
Epoch 2/20, Loss: 646.0088
Epoch 3/20, Loss: 582.7609
Epoch 4/20, Loss: 512.5512
Epoch 5/20, Loss: 457.0765
Epoch 6/20, Loss: 392.6667
Epoch 7/20, Loss: 335.5190
Epoch 8/20, Loss: 293.5051
Epoch 9/20, Loss: 251.9168
Epoch 10/20, Loss: 217.4576
Epoch 11/20, Loss: 185.1186
Epoch 12/20, Loss: 161.0613
Epoch 13/20, Loss: 146.7477
Epoch 14/20, Loss: 137.3