<a href="https://colab.research.google.com/github/Alina-Telnova/personal_github/blob/main/machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# загружаем наши библиотеки
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

In [None]:
# Install the Hugging Face `datasets` package and load the "CogComp/trec" dataset
!pip install datasets -q

from datasets import load_dataset
dataset = load_dataset("CogComp/trec")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Inspect the dataset structure (splits, features and row counts)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'coarse_label', 'fine_label'],
        num_rows: 5452
    })
    test: Dataset({
        features: ['text', 'coarse_label', 'fine_label'],
        num_rows: 500
    })
})

In [None]:
# Peek at the first ten training texts
dataset['train']['text'][:10]

['How did serfdom develop in and then leave Russia ?',
 'What films featured the character Popeye Doyle ?',
 "How can I find a list of celebrities ' real names ?",
 'What fowl grabs the spotlight after the Chinese Year of the Monkey ?',
 'What is the full form of .com ?',
 'What contemptible scoundrel stole the cork from my lunch ?',
 "What team did baseball 's St. Louis Browns become ?",
 'What is the oldest profession ?',
 'What are liver enzymes ?',
 'Name the scar-faced bounty hunter of The Old West .']

In [None]:
# Working sample: the first 5000 texts of the training split
data = dataset['train']['text'][:5000]

In [None]:
# Text cleaning: imports, NLTK resources and the English stop-word set
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download the NLTK resources required by word_tokenize and stopwords
nltk.download('punkt_tab')
nltk.download('stopwords')
# Set (not list) so that membership checks during cleaning are cheap
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one question for the word-level language model.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, the result is tokenised with
    ``word_tokenize`` and tokens found in ``stop_words`` are dropped.

    Args:
        text: Raw question string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())

    kept_tokens = []
    for token in word_tokenize(without_punctuation):
        if token in stop_words:
            continue
        kept_tokens.append(token)

    # Split and re-join once more so any whitespace run collapses to one space.
    return ' '.join(' '.join(kept_tokens).split()).strip()


# Clean every question in the working sample.
cleaned_data = list(map(clean_text, data))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Fit a word-level tokenizer on the cleaned questions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(cleaned_data)

# Encode every question as a list of integer word ids
sequences = tokenizer.texts_to_sequences(cleaned_data)

# Build next-word samples: every proper prefix of a question is an input
# and the word that follows the prefix is its target.
X, y = [], []
for seq in sequences:
    for i, next_id in enumerate(seq[1:], start=1):
        X.append(seq[:i])
        y.append(next_id)

In [None]:
# Left-pad the variable-length prefixes (pad_sequences default) into one
# rectangular integer matrix.
X = pad_sequences(np.asarray(X, dtype="object"))

# One-hot encode the targets; the +1 accounts for index 0, which the
# tokenizer never assigns to a word.
# NOTE(review): this cell overwrites X and y in place, so it should only be
# run once after the cell that builds them.
y = tf.keras.utils.to_categorical(
    np.array(y),
    num_classes=len(tokenizer.word_index) + 1,
)

In [None]:
# Baseline model: embedding -> single LSTM -> softmax over the vocabulary.
# The layers are passed to the constructor in the order they are applied.
model = Sequential([
    # Learned 100-dimensional vector per word id
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=X.shape[1]),
    # Only the final LSTM state is kept (return_sequences=False)
    LSTM(150, return_sequences=False),
    # One output unit per word id
    Dense(len(tokenizer.word_index) + 1, activation='softmax'),
])

# Targets are one-hot vectors, hence categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the layer overview
model.summary()



In [None]:
# Train for 50 epochs with 20% of the samples held out for validation
history = model.fit(X, y, epochs=50, batch_size=64, validation_split=0.2)

Epoch 1/50
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.0956 - loss: 7.5877 - val_accuracy: 0.1191 - val_loss: 7.0986
Epoch 2/50
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.1270 - loss: 6.6106 - val_accuracy: 0.1489 - val_loss: 6.9484
Epoch 3/50
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.1521 - loss: 6.1955 - val_accuracy: 0.1620 - val_loss: 6.9478
Epoch 4/50
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.1666 - loss: 5.8809 - val_accuracy: 0.1687 - val_loss: 6.9906
Epoch 5/50
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.1821 - loss: 5.5869 - val_accuracy: 0.1750 - val_loss: 7.0441
Epoch 6/50
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.1911 - loss: 5.3439 - val_accuracy: 0.1837 - val_loss: 7.1346
Epoch 7/50
[1m449/449[0

In [None]:
# Show the layer summary again after training
model.summary()

In [None]:
def generate_text(seed_text, next_words, max_sequence_len):
    """Greedily extend ``seed_text`` with up to ``next_words`` predicted words.

    On every step the current text is tokenised, left-padded to the model's
    input width and fed to ``model``; the most probable word id is appended.

    Args:
        seed_text: Text the generation starts from.
        next_words: Maximum number of words to append.
        max_sequence_len: Width of the model input. The caller passes
            ``X.shape[1]``, i.e. the width of the training matrix.

    Returns:
        The seed text followed by the generated words, space separated.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # Pad to exactly the training width; the previous `max_sequence_len-1`
        # fed the model one step fewer than it was trained on.
        token_list = pad_sequences([token_list], maxlen=max_sequence_len, padding='pre')

        probabilities = model.predict(token_list, verbose=0)[0]
        predicted = int(np.argmax(probabilities))

        # index_word is the tokenizer's id -> word map, so no scan over
        # word_index is needed.
        output_word = tokenizer.index_word.get(predicted)
        if output_word is None:
            # The prediction has no word (e.g. the padding id 0): stop instead
            # of appending empty strings.
            break
        seed_text += " " + output_word
    return seed_text

# Generate a new question
generated_text = generate_text("name", 5, X.shape[1])
print(generated_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
name the first private citizen to


Хотя мы и получили более-менее адекватный результат, по показателям видно, что модель переобучилась. Кроме того, сгенерированное предложение не похоже на вопрос из датасета.

In [None]:
#Тут добавляем кучу всего, а именно токенизацию с частотным фильтром, learning rate, Callbacks и т.д.
# плюс изменила функцию генерации и не стала удалять стоп слова, т.к. мне было важно сохранить структуру вопроса

from tensorflow.keras.layers import GRU, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from collections import Counter


def clean_text(text):
    """Lower-case ``text`` and strip punctuation while keeping question marks.

    Every character that is not a word character, whitespace or ``?`` is
    removed; stop words are left in place. Leading and trailing whitespace
    is trimmed from the result.
    """
    return re.sub(r'[^\w\s\?]', '', text.lower()).strip()

# Keep only raw questions with more than three whitespace-separated words,
# then normalise them.
cleaned_questions = [clean_text(question) for question in data if len(question.split()) > 3]

# Word frequencies over the cleaned questions
word_counts = Counter(
    word
    for question in cleaned_questions
    for word in question.split()
)

vocab_size = 8000
# NOTE(review): top_words is not used anywhere below; the vocabulary cap is
# applied through the tokenizer's num_words argument.
top_words = [word for word, _ in word_counts.most_common(vocab_size-1)]

# filters='' keeps every character, so "?" survives as a token
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    oov_token="<OOV>",
    filters=''
)
tokenizer.fit_on_texts(cleaned_questions)

# Encode the questions and keep those with 3..max_len tokens
max_len = 20
sequences = [
    encoded
    for encoded in tokenizer.texts_to_sequences(cleaned_questions)
    if 2 < len(encoded) <= max_len
]

# Next-word samples: prefixes of at least two tokens are inputs,
# the following token is the target.
X, y = [], []
for seq in sequences:
    for i, next_id in enumerate(seq[2:], start=2):
        X.append(seq[:i])
        y.append(next_id)

# Zero-pad on the right to a fixed width and one-hot encode the targets
X = pad_sequences(X, maxlen=max_len-1, padding='post')
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

# Model: embedding -> two stacked GRUs -> batch norm -> dense head -> softmax
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_len-1))
model.add(Dropout(0.2))
# The first GRU returns the whole sequence so the second GRU can consume it
model.add(GRU(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model.add(GRU(128, dropout=0.2, recurrent_dropout=0.2))
model.add(BatchNormalization())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(vocab_size, activation='softmax'))

# Adam with an explicit learning rate
optimizer = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Stop once val_loss has not improved for 5 epochs (restoring the best
# weights) and keep only the best checkpoint on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True)
callbacks = [early_stopping, checkpoint]

# Training with 10% of the samples held out for validation
history = model.fit(
    X,
    y,
    epochs=30,
    batch_size=256,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=1,
)

# Text generation with temperature sampling
def generate_text(seed_text, next_words=8, temperature=0.7):
    """Sample a continuation of ``seed_text`` from the trained model.

    Args:
        seed_text: Text the generation starts from.
        next_words: Maximum number of words to append.
        temperature: Positive softmax temperature; values below 1 sharpen
            the predicted distribution, values above 1 flatten it.

    Returns:
        ``seed_text`` followed by the sampled words, space separated.
        Generation stops early after a ``"?"`` token or when the sampled id
        has no word in the tokenizer.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Model context: at most the last max_len-1 seed tokens.
    context = tokenizer.texts_to_sequences([seed_text])[0][-(max_len - 1):]
    # Generated words are collected separately so the result does not depend
    # on how many seed tokens were truncated from the context.
    generated_words = []

    for _ in range(next_words):
        padded_seq = pad_sequences([context], maxlen=max_len-1, padding='post')
        preds = model.predict(padded_seq, verbose=0)[0]

        # Apply the temperature in float64; clipping avoids log(0) and
        # subtracting the max keeps exp() well-behaved.
        logits = np.log(np.clip(np.asarray(preds, dtype="float64"), 1e-12, None)) / temperature
        scaled = np.exp(logits - logits.max())
        probs = scaled / scaled.sum()

        # Sample the next word id
        next_word_id = int(np.random.choice(len(probs), p=probs))
        next_word = tokenizer.index_word.get(next_word_id)
        if next_word is None:
            # Sampled an id without a word (e.g. padding id 0): stop here
            # rather than emitting an empty token.
            break

        context.append(next_word_id)
        generated_words.append(next_word)
        if next_word == "?":
            break

    return " ".join([seed_text] + generated_words)

# Smoke test on a few question openings
print(generate_text("what is", 8))
print(generate_text("how to", 8))
print(generate_text("why does", 8))
print(generate_text("where can", 8))

Epoch 1/30
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 144ms/step - accuracy: 0.1130 - loss: 7.6341 - val_accuracy: 0.1650 - val_loss: 6.2544
Epoch 2/30
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 129ms/step - accuracy: 0.1645 - loss: 6.4450 - val_accuracy: 0.1715 - val_loss: 6.5418
Epoch 3/30
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 132ms/step - accuracy: 0.1791 - loss: 6.2143 - val_accuracy: 0.2026 - val_loss: 5.7278
Epoch 4/30
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 125ms/step - accuracy: 0.2066 - loss: 5.9274 - val_accuracy: 0.2141 - val_loss: 5.6457
Epoch 5/30
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 125ms/step - accuracy: 0.2276 - loss: 5.6345 - val_accuracy: 0.2249 - val_loss: 5.6694
Epoch 6/30
[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 131ms/step - accuracy: 0.2392 - loss: 5.4639 - val_accuracy: 0.2264 - val_loss: 5.5799
Epoch 7/30

Здесь уже лучше: структура вопроса сохраняется, но если смотреть на лексику, то все равно остается «бред с проблесками». Кроме того, модель все равно начинает переобучаться.
Поэтому меняем архитектуру и добавляем слой внимания.

In [None]:
from tensorflow.keras.layers import Input, Layer, Dropout, Bidirectional

# Rebuild the next-word samples from `sequences`, this time starting with
# one-token prefixes.
X, y = [], []
for seq in sequences:
    for i, next_id in enumerate(seq[1:], start=1):
        X.append(seq[:i])
        y.append(next_id)

X = np.array(X, dtype="object")
y = np.array(y)

# Left-pad every prefix to the length of the longest one
max_sequence_length = max(map(len, X))
X = pad_sequences(X, maxlen=max_sequence_length, padding='pre')

# One-hot encode the targets over the tokenizer's full word index
y = tf.keras.utils.to_categorical(y, num_classes=len(tokenizer.word_index) + 1)

# Attention layer
# Registered with Keras so that a saved model containing this custom layer
# can be deserialised without passing `custom_objects`.
@tf.keras.utils.register_keras_serializable()
class AttentionLayer(Layer):
    """Collapse a ``(batch, time, features)`` tensor to ``(batch, features)``.

    Each time step gets a scalar score (dot product of its feature vector
    with ``W`` plus a per-position bias ``b``); the scores are soft-maxed
    over the time axis and used as weights for a sum of the time steps.

    The bias has one entry per time step, so the layer must be built on an
    input whose time dimension is known.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring vector ``W`` and the per-position bias ``b``."""
        # One weight per feature: projects each time step to a scalar score
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1],),
                                 initializer='random_normal', trainable=True)
        # One bias per time step
        self.b = self.add_weight(name='attention_bias', shape=(input_shape[1],),
                                 initializer='zeros', trainable=True)
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # Scores: contract the feature axis with W -> (batch, time)
        e = tf.tensordot(x, self.W, axes=1) + self.b
        attention_weights = tf.nn.softmax(e, axis=1)

        # Broadcast the weights over the feature axis and sum over time
        context_vector = attention_weights[:, :, None] * x
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector

# Model: embedding -> two bidirectional LSTMs -> attention pooling -> softmax
model = Sequential([
    # 300-dimensional word vectors
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=300, input_length=max_sequence_length),
    # Both recurrent layers return full sequences so that the attention
    # layer can weight every time step
    Bidirectional(LSTM(512, return_sequences=True)),
    Bidirectional(LSTM(256, return_sequences=True)),
    Dropout(0.3),
    # Attention pooling over the time axis
    AttentionLayer(),
    # Dropout against overfitting
    Dropout(0.3),
    # One output unit per word id
    Dense(len(tokenizer.word_index) + 1, activation='softmax'),
])

# Compile for one-hot targets
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the layer overview
model.summary()

# Train; stop once val_loss has not improved for 5 epochs and restore the
# best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)
history = model.fit(
    X,
    y,
    epochs=25,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping],
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1/25
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 83ms/step - accuracy: 0.0900 - loss: 7.3839 - val_accuracy: 0.1185 - val_loss: 6.7974
Epoch 2/25
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 78ms/step - accuracy: 0.1194 - loss: 6.5815 - val_accuracy: 0.1454 - val_loss: 6.6860
Epoch 3/25
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 75ms/step - accuracy: 0.1499 - loss: 6.2243 - val_accuracy: 0.1677 - val_loss: 6.5751
Epoch 4/25
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 76ms/step - accuracy: 0.1658 - loss: 5.9826 - val_accuracy: 0.1748 - val_loss: 6.5639
Epoch 5/25
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 74ms/step - accuracy: 0.1777 - loss: 5.7808 - val_accuracy: 0.1790 - val_loss: 6.5752
Epoch 6/25
[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 76ms/step - accuracy: 0.1944 - loss: 5.5197 - val_accuracy: 0.1857 - val_loss: 6.5614
Epoch 7/25
[1m2

In [None]:
def generate_text(seed_text, next_words, max_sequence_len, temperature=1.0):
    """Sample up to ``next_words`` words that continue ``seed_text``.

    Args:
        seed_text: Text the generation starts from.
        next_words: Number of sampling steps to run.
        max_sequence_len: Width the tokenised text is left-padded/truncated
            to before it is fed to ``model``.
        temperature: Positive softmax temperature controlling diversity.

    Returns:
        The seed text followed by the sampled words, space separated. Steps
        whose sampled id has no word in the tokenizer add nothing.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    for _ in range(next_words):
        # Encode the text generated so far
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len, padding='pre')

        # Next-word probabilities
        predicted_probs = model.predict(token_list, verbose=0)[0]

        # Apply the temperature in float64; clipping avoids log(0) and
        # subtracting the max keeps exp() well-behaved.
        logits = np.log(np.clip(np.asarray(predicted_probs, dtype="float64"), 1e-12, None)) / temperature
        exp_probs = np.exp(logits - logits.max())
        predicted_probs = exp_probs / exp_probs.sum()

        # Sample the next word id
        predicted_index = int(np.random.choice(len(predicted_probs), p=predicted_probs))

        # Direct id -> word lookup instead of scanning word_index
        output_word = tokenizer.index_word.get(predicted_index)
        if output_word is None:
            # No word for this id (e.g. padding id 0): skip this step
            # instead of appending an empty token.
            continue

        # Append the sampled word to the running text
        seed_text += " " + output_word

    return seed_text

# Generate a new question
generated_text = generate_text("name", 5, X.shape[1], temperature=0.7)
print(generated_text)

name what is the term of


In [None]:
# Save the trained model to disk.
# NOTE(review): the model contains the custom AttentionLayer, so loading it
# back presumably needs that class to be available to Keras — TODO confirm.
model.save('questions_generator.keras')