In [1]:
import pickle
import warnings
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Dropout, Bidirectional, Flatten
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


warnings.filterwarnings('ignore')

In [2]:
EMBEDDING_DIM = 50  # length of each word-embedding vector (passed as embedding_dim to the model builders)
EPOCHS = 10  # upper bound on training epochs for every fit() call (early stopping may end a run sooner)
BATCH_SIZE = 64  # mini-batch size for every fit() call

In [3]:
def load_data(path):
    """Read the pickled text structures stored at ``path``.

    The pickle must contain a mapping with the keys ``"BoW"``, ``"bigrams"``
    and ``"fourgrams"``; a missing key raises ``KeyError``.

    Returns:
        The tuple ``(bag_of_words, bigrams, fourgrams)``, in that order.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file,
    # so this must only be pointed at pickles from a trusted source.
    with open(path, "rb") as handle:
        payload = pickle.load(handle)
    return tuple(payload[key] for key in ("BoW", "bigrams", "fourgrams"))

In [4]:
def create_vocab(bag_of_words):
    """Map every word of ``bag_of_words`` to a unique integer index.

    Indices follow the iteration order of ``bag_of_words.items()`` and start
    at 1, so index 0 is never assigned to a word.

    Returns:
        dict of ``word -> index``.
    """
    words = [word for word, _value in bag_of_words.items()]
    return dict(zip(words, range(1, len(words) + 1)))


def convert_to_sequences(ngrams, vocab):
    """Translate n-grams of words into n-grams of vocabulary indices.

    An n-gram is kept only if every one of its words is present in ``vocab``;
    n-grams containing an unknown word are dropped entirely.

    Returns:
        list of lists of indices, in the original n-gram order.
    """
    return [
        [vocab[token] for token in ngram]
        for ngram in ngrams
        if all(token in vocab for token in ngram)
    ]


def prepare_data(ngrams_sequences, n):
    """Split index n-grams into model inputs and targets.

    X holds every element of an n-gram except the last one;
    Y holds the last element of the n-gram.

    ``n`` is accepted for call-site symmetry but is not used by the body.

    Returns:
        ``(X, Y)`` as NumPy arrays.
    """
    grams = list(ngrams_sequences)
    contexts = np.array([gram[:-1] for gram in grams])
    targets = np.array([gram[-1] for gram in grams])
    return contexts, targets

In [5]:
# optimizer=['rmsprop', 'adam', 'adagrad', 'adadelta', 'ftrl', 'sgd' (SGD(learning_rate=0.01, momentum=0.9))]
# loss=['sparse_categorical_crossentropy']


def _build_recurrent_model(recurrent_cls, vocab_size, embedding_dim, n,
                           units, l2_factor, dropout_rate, optimizer):
    """Assemble and compile Embedding -> Bidirectional(recurrent) -> Dropout -> softmax.

    Shared by create_rnn_model / create_lstm_model / create_gru_model, which
    differ only in the recurrent layer class they pass in.
    """
    model = Sequential([
        # Embedding turns every token index into a dense vector:
        #   input_dim    - number of distinct token indices (vocabulary size),
        #   output_dim   - length of each embedding vector,
        #   input_length - length of the input sequence (n-1 context tokens).
        # NOTE(review): newer Keras releases appear to deprecate input_length;
        # confirm against the installed version before removing it.
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=n - 1
        ),
        # return_sequences=False: only the final state of each direction is kept.
        Bidirectional(recurrent_cls(
            units,
            activation="tanh",
            return_sequences=False,
            kernel_regularizer=l2(l2_factor)
        )),
        # Dropout zeroes the given fraction of activations during training to
        # reduce overfitting.
        Dropout(dropout_rate),
        # Fully connected output layer: one probability per vocabulary index.
        Dense(vocab_size, activation='softmax'),
    ])
    # Targets are integer class ids, hence the sparse variant of the loss.
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


# NOTE(review): the defaults below reproduce the original hard-coded settings.
# dropout_rate=0.9 discards 90% of the recurrent output during training, and
# the training logs recorded in this notebook show accuracy staying near 1e-4
# with these settings; a smaller dropout rate and/or another optimizer
# (e.g. 'adam') is worth trying via the keyword arguments.


def create_rnn_model(vocab_size, embedding_dim, n, units=256, l2_factor=0.01,
                     dropout_rate=0.9, optimizer='adadelta'):
    """Build and compile a bidirectional SimpleRNN next-word classifier.

    Args:
        vocab_size: size of the Embedding input and of the softmax output.
        embedding_dim: length of the embedding vectors.
        n: n-gram order; the model consumes the first n-1 tokens.
        units: number of recurrent units per direction.
        l2_factor: L2 penalty applied to the recurrent layer's kernel.
        dropout_rate: fraction of activations dropped before the output layer.
        optimizer: optimizer name or instance passed to ``compile``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    return _build_recurrent_model(SimpleRNN, vocab_size, embedding_dim, n,
                                  units, l2_factor, dropout_rate, optimizer)


def create_lstm_model(vocab_size, embedding_dim, n, units=256, l2_factor=0.01,
                      dropout_rate=0.9, optimizer='adadelta'):
    """Build and compile a bidirectional LSTM next-word classifier.

    Takes the same arguments and returns the same kind of model as
    ``create_rnn_model``; only the recurrent layer class differs.
    """
    return _build_recurrent_model(LSTM, vocab_size, embedding_dim, n,
                                  units, l2_factor, dropout_rate, optimizer)


def create_gru_model(vocab_size, embedding_dim, n, units=256, l2_factor=0.01,
                     dropout_rate=0.9, optimizer='adadelta'):
    """Build and compile a bidirectional GRU next-word classifier.

    Takes the same arguments and returns the same kind of model as
    ``create_rnn_model``; only the recurrent layer class differs.
    """
    return _build_recurrent_model(GRU, vocab_size, embedding_dim, n,
                                  units, l2_factor, dropout_rate, optimizer)


def create_dense_model(vocab_size):
    """Build and compile a small fully connected classifier.

    The network takes a single scalar feature (``input_dim=1``), passes it
    through a 512-unit tanh layer and ends in a softmax over ``vocab_size``
    classes. It is compiled with ``categorical_crossentropy``, so targets must
    be one-hot encoded.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    network = Sequential()
    network.add(Dense(512, input_dim=1, activation='tanh'))
    network.add(Dense(vocab_size, activation='softmax'))
    network.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [6]:
def evaluate_model(model, X_test, Y_test, vocab=None, threshold=None, bow=False):
    """Run ``model`` on ``X_test`` and return generated text or classification metrics.

    Args:
        model: object exposing ``predict(X)`` that returns per-class scores.
        X_test: inputs forwarded to ``model.predict``.
        Y_test: true labels; read only when metrics are computed.
        vocab: optional ``word -> index`` mapping. When given (and non-empty),
            every prediction is decoded to a word and the joined text is
            returned instead of metrics.
        threshold: optional cut-off. When not None, metric predictions are the
            0/1 matrix ``preds > threshold`` instead of the arg-max class id.
            Ignored for text generation.
        bow: when True, ``Y_test`` is one-hot and is reduced with arg-max
            before scoring. Ignored for text generation.

    Returns:
        ``{"Generated Text": str}`` when ``vocab`` is given, otherwise a dict
        with ``Accuracy``, ``Precision``, ``Recall`` and ``F1-Score``
        (weighted averages).
    """
    preds = model.predict(X_test)

    if vocab:  # a vocabulary is available: decode predictions into text
        inv_vocab = {idx: word for word, idx in vocab.items()}
        # Text needs exactly one token per sample, so always decode the most
        # probable class; a thresholded 0/1 matrix cannot be used as dict keys.
        class_ids = preds.argmax(axis=-1)
        tokens = [inv_vocab.get(idx, "<UNK>") for idx in class_ids]
        return {"Generated Text": " ".join(tokens)}

    # Classification metrics.
    # `is not None` so that an explicit threshold of 0 / 0.0 is honoured.
    if threshold is not None:
        preds_binary = (preds > threshold).astype(np.int32)
    else:
        preds_binary = preds.argmax(axis=-1)

    if bow:
        Y_test = Y_test.argmax(axis=-1)

    # zero_division=0 makes the score for never-predicted classes explicit
    # instead of relying on warnings being silenced globally.
    return {
        "Accuracy": accuracy_score(Y_test, preds_binary),
        "Precision": precision_score(Y_test, preds_binary, average='weighted', zero_division=0),
        "Recall": recall_score(Y_test, preds_binary, average='weighted', zero_division=0),
        "F1-Score": f1_score(Y_test, preds_binary, average='weighted', zero_division=0),
    }

In [7]:
def save_text(path, text):
    """Write ``text`` to ``path`` as UTF-8, overwriting any existing file.

    Missing parent directories (e.g. ``generated_texts/``) are created first,
    so the call does not fail on a fresh checkout.
    """
    import os  # local import: keeps this block self-contained

    parent = os.path.dirname(path)
    if parent:  # empty when `path` is a bare file name in the current directory
        os.makedirs(parent, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as file:
        file.write(text)

In [8]:
# Load the pre-computed text structures; the path is relative to the
# notebook's working directory.
path = "result_lab2/text_structures1.pkl"
bag_of_words, bigrams, fourgrams = load_data(path)

# Preview the first ten entries of each structure.
print("BoW:", list(bag_of_words.items())[:10])
print("\n2-граммы:", bigrams[:10])
print("\n4-граммы:", fourgrams[:10])

BoW: [('человек', 369), ('время', 266), ('корабль', 254), ('сторона', 208), ('новый', 205), ('рука', 196), ('планета', 193), ('два', 191), ('сказать', 190), ('дело', 187)]

2-граммы: [('алекс', 'каменев'), ('каменев', 'макс'), ('макс', 'вольф'), ('вольф', 'наёмник'), ('наёмник', 'глава'), ('глава', 'станция'), ('станция', 'технический'), ('технический', 'обслуживание'), ('обслуживание', 'лиманский'), ('лиманский', 'союз')]

4-граммы: [('алекс', 'каменев', 'макс', 'вольф'), ('каменев', 'макс', 'вольф', 'наёмник'), ('макс', 'вольф', 'наёмник', 'глава'), ('вольф', 'наёмник', 'глава', 'станция'), ('наёмник', 'глава', 'станция', 'технический'), ('глава', 'станция', 'технический', 'обслуживание'), ('станция', 'технический', 'обслуживание', 'лиманский'), ('технический', 'обслуживание', 'лиманский', 'союз'), ('обслуживание', 'лиманский', 'союз', 'приграничный'), ('лиманский', 'союз', 'приграничный', 'территория')]


In [9]:
vocab = create_vocab(bag_of_words)
# Word indices start at 1, so the largest index equals len(vocab); the +1
# makes room for it in the Embedding input and the softmax output.
vocab_size = len(vocab) + 1
print("\nСловарь:", list(vocab.items())[:10])


Словарь: [('человек', 1), ('время', 2), ('корабль', 3), ('сторона', 4), ('новый', 5), ('рука', 6), ('планета', 7), ('два', 8), ('сказать', 9), ('дело', 10)]


In [10]:
# Encode the word n-grams as index n-grams; n-grams containing a word that is
# missing from vocab are dropped by convert_to_sequences.
bigrams_sequences = convert_to_sequences(bigrams, vocab)
fourgrams_sequences = convert_to_sequences(fourgrams, vocab)

In [11]:
# Early-stopping callback; this single instance is passed to every fit() call below.
early_stopping = EarlyStopping(
    monitor='val_loss', 
    patience=5, 
    restore_best_weights=True, # restore the weights from the epoch with the best monitored value
    verbose=1
)

### <center>Bag of Words</center>

##### BoW (мешок слов) подходит для задач анализа текста:
- Классификация текста (например, спам/не спам);
- Анализ тональности;
- Вычисление сходства между текстами.

##### Для генерации же текста требуются модели, которые могут учитывать последовательность слов, чтобы сохранить смысл:
- Рекуррентные нейронные сети (RNN);
- LSTM/GRU;
- Трансформеры (например, GPT).

In [12]:
# Single input feature per sample: the value stored for each word in
# bag_of_words (a count, judging by the sample printed above).
X_bow = np.array(list(bag_of_words.values())).reshape(-1, 1)
# Target: the position of the word in bag_of_words (0-based), one class per word.
Y_bow = np.arange(len(bag_of_words))
lb = LabelBinarizer() # one-hot encoding of Y_bow
Y_bow = lb.fit_transform(Y_bow)

# NOTE(review): every class occurs exactly once, so the classes in the test
# split never appear in the training split; the 0.0 accuracy recorded below
# follows from this setup rather than from the model.
X_bow_train, X_bow_test, Y_bow_train, Y_bow_test = train_test_split(X_bow, Y_bow, test_size=0.2, random_state=42)

In [13]:
# One output class per bag-of-words entry (no +1 here: Y_bow classes are 0-based).
dense_bow = create_dense_model(len(bag_of_words)) # vocab_size=len(bag_of_words)
# 20% of the training split is held out for validation; the trailing `;` hides the History repr.
dense_bow.fit(X_bow_train, Y_bow_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks=[early_stopping]);
# dense_bow.summary()

Epoch 1/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 62ms/step - accuracy: 0.0000e+00 - loss: 9.4888 - val_accuracy: 0.0000e+00 - val_loss: 9.4869
Epoch 2/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 56ms/step - accuracy: 0.0000e+00 - loss: 9.3983 - val_accuracy: 0.0000e+00 - val_loss: 9.5335
Epoch 3/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 57ms/step - accuracy: 0.0000e+00 - loss: 9.3506 - val_accuracy: 0.0000e+00 - val_loss: 9.5554
Epoch 4/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 59ms/step - accuracy: 0.0000e+00 - loss: 9.3355 - val_accuracy: 0.0000e+00 - val_loss: 9.5640
Epoch 5/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 60ms/step - accuracy: 0.0000e+00 - loss: 9.3172 - val_accuracy: 0.0000e+00 - val_loss: 9.5814
Epoch 6/10
[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 57ms/step - accuracy: 0.0000e+00 - loss: 9.2884 - val_accuracy: 0.00

In [14]:
# bow=True: Y_bow_test is one-hot, so evaluate_model reduces it with argmax before scoring.
dense_bow_metrics = evaluate_model(dense_bow, X_bow_test, Y_bow_test, bow=True) # pass vocab=vocab instead to get generated text
print("Classification Metrics:", dense_bow_metrics)

[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Classification Metrics: {'Accuracy': 0.0, 'Precision': 0.0, 'Recall': 0.0, 'F1-Score': 0.0}


In [15]:
# The dense model's class k is the k-th word of bag_of_words (Y_bow = arange,
# 0-based), whereas vocab indices start at 1. Decoding with vocab directly
# would map class k to the previous word and class 0 to "<UNK>", so build a
# 0-based word -> class mapping for this model.
bow_vocab = {word: idx - 1 for word, idx in vocab.items()}
dense_generated_text = evaluate_model(dense_bow, X_bow_test, Y_bow_test, vocab=bow_vocab, bow=True)
save_text("generated_texts/dense1.txt", dense_generated_text['Generated Text'])

[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


### <center>2-grams</center>

##### Данная задача - задача классификации с множественными классами, где классами являются все возможные слова в словаре, и задача сводится к предсказанию одного из этих классов (слова) на основе входной последовательности.

In [16]:
n_bigrams = 2  # n-gram order: one context token predicts the next token

# X: the first n-1 indices of every bigram, Y: the last index.
X_bigrams, Y_bigrams = prepare_data(bigrams_sequences, n_bigrams)
# Force every context to exactly n-1 tokens (left-padded if shorter).
# Presumably a shape no-op, since each bigram already yields one context token - TODO confirm.
X_bigrams = pad_sequences(X_bigrams, maxlen=n_bigrams-1, padding='pre')

X_bigrams_train, X_bigrams_test, Y_bigrams_train, Y_bigrams_test = train_test_split(X_bigrams, Y_bigrams, test_size=0.2, random_state=42)

In [17]:
# Bidirectional SimpleRNN on bigrams; 20% of the training split is used for validation.
rnn_bigrams = create_rnn_model(vocab_size, EMBEDDING_DIM, n_bigrams)
rnn_bigrams.fit(X_bigrams_train, Y_bigrams_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks=[early_stopping]);
# rnn_bigrams.summary()

Epoch 1/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 92ms/step - accuracy: 8.0077e-05 - loss: 11.1244 - val_accuracy: 9.7761e-05 - val_loss: 11.1006
Epoch 2/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 91ms/step - accuracy: 2.6737e-04 - loss: 11.0920 - val_accuracy: 9.7761e-05 - val_loss: 11.0647
Epoch 3/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 88ms/step - accuracy: 1.4258e-04 - loss: 11.0554 - val_accuracy: 9.7761e-05 - val_loss: 11.0273
Epoch 4/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 92ms/step - accuracy: 7.7216e-06 - loss: 11.0178 - val_accuracy: 9.7761e-05 - val_loss: 10.9895
Epoch 5/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 95ms/step - accuracy: 1.8565e-04 - loss: 10.9804 - val_accuracy: 9.7761e-05 - val_loss: 10.9520
Epoch 6/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 94ms/step - accuracy: 2.9540e-04 - loss: 10.9427 - v

In [18]:
# Weighted classification metrics on the held-out bigrams (targets are integer class ids).
rnn_bigrams_metrics = evaluate_model(rnn_bigrams, X_bigrams_test, Y_bigrams_test) # pass vocab=vocab instead to get generated text
rnn_bigrams_metrics

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


{'Accuracy': 0.00015642108556233381,
 'Precision': 7.821054278116691e-05,
 'Recall': 0.00015642108556233381,
 'F1-Score': 0.00010428072370822252}

In [19]:
# Decode the predicted index for every test context into a word and save the joined text.
# bow=True has no effect here: evaluate_model does not read it when vocab is given.
rnn_bigrams_generated_text = evaluate_model(rnn_bigrams, X_bigrams_test, Y_bigrams_test, vocab=vocab, bow=True) # vocab=vocab switches to text generation
save_text("generated_texts/rnn_bigrams1.txt", rnn_bigrams_generated_text['Generated Text'])

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


In [20]:
# Bidirectional LSTM on bigrams; 20% of the training split is used for validation.
lstm_bigrams = create_lstm_model(vocab_size, EMBEDDING_DIM, n_bigrams)
lstm_bigrams.fit(X_bigrams_train, Y_bigrams_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks=[early_stopping]);
# lstm_bigrams.summary()

Epoch 1/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 105ms/step - accuracy: 2.7847e-05 - loss: 11.3561 - val_accuracy: 9.7761e-05 - val_loss: 11.3221
Epoch 2/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 104ms/step - accuracy: 1.8796e-04 - loss: 11.3104 - val_accuracy: 0.0000e+00 - val_loss: 11.2753
Epoch 3/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 101ms/step - accuracy: 8.7564e-05 - loss: 11.2637 - val_accuracy: 9.7761e-05 - val_loss: 11.2293
Epoch 4/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 100ms/step - accuracy: 9.7955e-05 - loss: 11.2180 - val_accuracy: 6.8433e-04 - val_loss: 11.1844
Epoch 5/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 106ms/step - accuracy: 8.9128e-05 - loss: 11.1734 - val_accuracy: 9.7761e-04 - val_loss: 11.1406
Epoch 6/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 107ms/step - accuracy: 4.2318e-04 - loss: 11.12

In [21]:
# Weighted classification metrics on the held-out bigrams (targets are integer class ids).
lstm_bigrams_metrics = evaluate_model(lstm_bigrams, X_bigrams_test, Y_bigrams_test) # pass vocab=vocab instead to get generated text
lstm_bigrams_metrics

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


{'Accuracy': 0.0035976849679336775,
 'Precision': 0.00024365699624623495,
 'Recall': 0.0035976849679336775,
 'F1-Score': 0.000347552749453942}

In [22]:
# Decode the predicted index for every test context into a word and save the joined text.
# bow=True has no effect here: evaluate_model does not read it when vocab is given.
lstm_bigrams_generated_text = evaluate_model(lstm_bigrams, X_bigrams_test, Y_bigrams_test, vocab=vocab, bow=True) # vocab=vocab switches to text generation
save_text("generated_texts/lstm_bigrams1.txt", lstm_bigrams_generated_text['Generated Text'])

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


In [23]:
# Bidirectional GRU on bigrams; 20% of the training split is used for validation.
gru_bigrams = create_gru_model(vocab_size, EMBEDDING_DIM, n_bigrams)
gru_bigrams.fit(X_bigrams_train, Y_bigrams_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks=[early_stopping]);
# gru_bigrams.summary()

Epoch 1/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 104ms/step - accuracy: 8.8286e-05 - loss: 11.3123 - val_accuracy: 9.7761e-05 - val_loss: 11.2800
Epoch 2/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 97ms/step - accuracy: 2.1685e-04 - loss: 11.2688 - val_accuracy: 9.7761e-05 - val_loss: 11.2349
Epoch 3/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 99ms/step - accuracy: 2.5850e-04 - loss: 11.2237 - val_accuracy: 9.7761e-05 - val_loss: 11.1901
Epoch 4/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 96ms/step - accuracy: 1.3357e-04 - loss: 11.1791 - val_accuracy: 3.9105e-04 - val_loss: 11.1463
Epoch 5/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 93ms/step - accuracy: 9.3539e-05 - loss: 11.1355 - val_accuracy: 3.9105e-04 - val_loss: 11.1035
Epoch 6/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 96ms/step - accuracy: 2.9442e-05 - loss: 11.0930 - 

In [24]:
# Weighted classification metrics on the held-out bigrams (targets are integer class ids).
gru_bigrams_metrics = evaluate_model(gru_bigrams, X_bigrams_test, Y_bigrams_test) # pass vocab=vocab instead to get generated text
gru_bigrams_metrics

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step


{'Accuracy': 0.0014860003128421711,
 'Precision': 0.001123339616004915,
 'Recall': 0.0014860003128421711,
 'F1-Score': 0.0006895544119675449}

In [25]:
# Decode the predicted index for every test context into a word and save the joined text.
# bow=True has no effect here: evaluate_model does not read it when vocab is given.
gru_bigrams_generated_text = evaluate_model(gru_bigrams, X_bigrams_test, Y_bigrams_test, vocab=vocab, bow=True) # vocab=vocab switches to text generation
save_text("generated_texts/gru_bigrams1.txt", gru_bigrams_generated_text['Generated Text'])

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step


### <center>4-grams</center>

In [26]:
n_fougrams = 4  # n-gram order: three context tokens predict the next token (name is spelled "fougrams" throughout)

# X: the first n-1 indices of every 4-gram, Y: the last index.
X_fourgrams, Y_fourgrams = prepare_data(fourgrams_sequences, n_fougrams)
# Force every context to exactly n-1 tokens (left-padded if shorter).
# Presumably a shape no-op, since each 4-gram already yields three context tokens - TODO confirm.
X_fourgrams = pad_sequences(X_fourgrams, maxlen=n_fougrams-1, padding='pre')

X_fourgrams_train, X_fourgrams_test, Y_fourgrams_train, Y_fourgrams_test = train_test_split(X_fourgrams, Y_fourgrams, test_size=0.2, random_state=42)

In [27]:
# Bidirectional SimpleRNN on 4-grams; 20% of the training split is used for validation.
rnn_fourgrams = create_rnn_model(vocab_size, EMBEDDING_DIM, n_fougrams)
rnn_fourgrams.fit(X_fourgrams_train, Y_fourgrams_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks=[early_stopping]);
# rnn_fourgrams.summary()

Epoch 1/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 103ms/step - accuracy: 8.6354e-05 - loss: 11.1094 - val_accuracy: 9.7761e-05 - val_loss: 11.0862
Epoch 2/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 97ms/step - accuracy: 9.0674e-05 - loss: 11.0780 - val_accuracy: 9.7761e-05 - val_loss: 11.0514
Epoch 3/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 94ms/step - accuracy: 2.4497e-05 - loss: 11.0428 - val_accuracy: 9.7761e-05 - val_loss: 11.0151
Epoch 4/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 99ms/step - accuracy: 1.3396e-04 - loss: 11.0064 - val_accuracy: 9.7761e-05 - val_loss: 10.9787
Epoch 5/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 102ms/step - accuracy: 3.5482e-05 - loss: 10.9701 - val_accuracy: 9.7761e-05 - val_loss: 10.9426
Epoch 6/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 100ms/step - accuracy: 0.0000e+00 - loss: 10.9339 

In [28]:
# Weighted classification metrics on the held-out 4-grams (targets are integer class ids).
rnn_fourgrams_metrics = evaluate_model(rnn_fourgrams, X_fourgrams_test, Y_fourgrams_test) # pass vocab=vocab instead to get generated text
rnn_fourgrams_metrics

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step


{'Accuracy': 0.00015642108556233381,
 'Precision': 0.0006061317065540435,
 'Recall': 0.00015642108556233381,
 'F1-Score': 0.0002144482624644899}

In [29]:
# Decode the predicted index for every test context into a word and save the joined text.
# bow=True has no effect here: evaluate_model does not read it when vocab is given.
rnn_fourgrams_generated_text = evaluate_model(rnn_fourgrams, X_fourgrams_test, Y_fourgrams_test, vocab=vocab, bow=True) # vocab=vocab switches to text generation
save_text("generated_texts/rnn_fourgrams1.txt", rnn_fourgrams_generated_text['Generated Text'])

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step


In [30]:
# Bidirectional LSTM on 4-grams; 20% of the training split is used for validation.
lstm_fourgrams = create_lstm_model(vocab_size, EMBEDDING_DIM, n_fougrams)
lstm_fourgrams.fit(X_fourgrams_train, Y_fourgrams_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks=[early_stopping]);
# lstm_fourgrams.summary()

Epoch 1/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 117ms/step - accuracy: 1.2102e-04 - loss: 11.3401 - val_accuracy: 1.9552e-04 - val_loss: 11.3063
Epoch 2/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 115ms/step - accuracy: 2.8107e-05 - loss: 11.2947 - val_accuracy: 2.9328e-04 - val_loss: 11.2600
Epoch 3/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 118ms/step - accuracy: 7.2861e-05 - loss: 11.2485 - val_accuracy: 2.9328e-04 - val_loss: 11.2144
Epoch 4/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 119ms/step - accuracy: 2.1739e-04 - loss: 11.2031 - val_accuracy: 3.9105e-04 - val_loss: 11.1698
Epoch 5/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 117ms/step - accuracy: 4.8011e-05 - loss: 11.1589 - val_accuracy: 4.8881e-04 - val_loss: 11.1264
Epoch 6/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 115ms/step - accuracy: 1.6542e-04 - loss: 11.11

In [31]:
# Weighted classification metrics on the held-out 4-grams (targets are integer class ids).
lstm_fourgrams_metrics = evaluate_model(lstm_fourgrams, X_fourgrams_test, Y_fourgrams_test) # pass vocab=vocab instead to get generated text
lstm_fourgrams_metrics

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step


{'Accuracy': 0.00328484279680901,
 'Precision': 0.00018336088323292945,
 'Recall': 0.00328484279680901,
 'F1-Score': 0.00032340798588227453}

In [32]:
# Decode the predicted index for every test context into a word and save the joined text.
# bow=True has no effect here: evaluate_model does not read it when vocab is given.
lstm_fourgrams_generated_text = evaluate_model(lstm_fourgrams, X_fourgrams_test, Y_fourgrams_test, vocab=vocab, bow=True) # vocab=vocab switches to text generation
save_text("generated_texts/lstm_fourgrams1.txt", lstm_fourgrams_generated_text['Generated Text'])

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step


In [33]:
# Bidirectional GRU on 4-grams; 20% of the training split is used for validation.
gru_fourgrams = create_gru_model(vocab_size, EMBEDDING_DIM, n_fougrams)
gru_fourgrams.fit(X_fourgrams_train, Y_fourgrams_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks=[early_stopping]);
# gru_fourgrams.summary()

Epoch 1/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 100ms/step - accuracy: 2.3892e-05 - loss: 11.3271 - val_accuracy: 0.0000e+00 - val_loss: 11.2946
Epoch 2/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 99ms/step - accuracy: 7.4407e-05 - loss: 11.2833 - val_accuracy: 0.0000e+00 - val_loss: 11.2491
Epoch 3/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 102ms/step - accuracy: 7.8792e-05 - loss: 11.2378 - val_accuracy: 0.0000e+00 - val_loss: 11.2040
Epoch 4/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 103ms/step - accuracy: 6.7165e-05 - loss: 11.1929 - val_accuracy: 0.0000e+00 - val_loss: 11.1599
Epoch 5/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 99ms/step - accuracy: 7.4020e-05 - loss: 11.1490 - val_accuracy: 0.0000e+00 - val_loss: 11.1168
Epoch 6/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 99ms/step - accuracy: 2.3519e-05 - loss: 11.1062 

In [34]:
# Weighted classification metrics on the held-out 4-grams (targets are integer class ids).
gru_fourgrams_metrics = evaluate_model(gru_fourgrams, X_fourgrams_test, Y_fourgrams_test) # pass vocab=vocab instead to get generated text
gru_fourgrams_metrics

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step


{'Accuracy': 0.0009385265133740028,
 'Precision': 0.00022495106164470955,
 'Recall': 0.0009385265133740028,
 'F1-Score': 0.0003123731889811509}

In [35]:
# Decode the predicted index for every test context into a word and save the joined text.
# bow=True has no effect here: evaluate_model does not read it when vocab is given.
gru_fourgrams_generated_text = evaluate_model(gru_fourgrams, X_fourgrams_test, Y_fourgrams_test, vocab=vocab, bow=True) # vocab=vocab switches to text generation
save_text("generated_texts/gru_fourgrams1.txt", gru_fourgrams_generated_text['Generated Text'])

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
