In [1]:
import pickle
import warnings
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Bidirectional, Flatten
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


warnings.filterwarnings('ignore')

In [2]:
EMBEDDING_DIM = 50
EPOCHS = 10
BATCH_SIZE = 64

In [3]:
def load_data(path):
    with open(path, "rb") as f:
        data = pickle.load(f)
    bag_of_words = data["BoW"]
    bigrams = data["bigrams"]
    fourgrams = data["fourgrams"]
    return bag_of_words, bigrams, fourgrams

In [4]:
def create_vocab(bag_of_words):
    vocab = {word: idx + 1 for idx, (word, _) in enumerate(bag_of_words.items())}
    return vocab


def convert_to_sequences(ngrams, vocab):
    sequences = []
    for gram in ngrams:
        if all(word in vocab for word in gram):
            sequences.append([vocab[word] for word in gram])
    return sequences


def prepare_data(ngrams_sequences, n):
    X, Y = [], []
    for gram in ngrams_sequences:
        X.append(gram[:-1])
        Y.append(gram[-1])
    X = np.array(X)
    Y = np.array(Y)
    return X, Y

In [5]:
def create_rnn_model(vocab_size, embedding_dim, n):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=n-1),
        SimpleRNN(256, activation="tanh", return_sequences=False),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(optimizer='adadelta', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


def create_lstm_model(vocab_size, embedding_dim, n):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=n-1),
        LSTM(256, activation="tanh", return_sequences=False),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(optimizer='adadelta', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


def create_gru_model(vocab_size, embedding_dim, n):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=n-1),
        GRU(256, activation="tanh", return_sequences=False),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(optimizer='adadelta', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


def create_dense_model(vocab_size):
    model = Sequential([
        Dense(512, input_dim=1, activation='tanh'),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [6]:
def evaluate_model(model, X_test, Y_test, vocab=None, threshold=None, bow=False):
    preds = model.predict(X_test)
    if threshold:
        preds_binary = (preds > threshold).astype(np.int32)
    else:
        preds_binary = preds.argmax(axis=-1)

    metrics = {}
    if vocab:
        inv_vocab = {idx: word for word, idx in vocab.items()}
        preds_text = [inv_vocab.get(idx, "<UNK>") for idx in preds_binary]
        metrics["Generated Text"] = " ".join(preds_text)
    else:
        if bow: Y_test = Y_test.argmax(axis=-1)
        metrics.update({
            "Accuracy": accuracy_score(Y_test, preds_binary),
            "Precision": precision_score(Y_test, preds_binary, average='weighted'),
            "Recall": recall_score(Y_test, preds_binary, average='weighted'),
            "F1-Score": f1_score(Y_test, preds_binary, average='weighted'),
        })
    return metrics

In [7]:
def save_text(path, text):
    with open(path, 'w', encoding='utf-8') as file:
        file.write(text)

In [8]:
path = "result_lab2/text_structures2.pkl"
bag_of_words, bigrams, fourgrams = load_data(path)

print("BoW:", list(bag_of_words.items())[:10])
print("\n2-граммы:", bigrams[:10])
print("\n4-граммы:", fourgrams[:10])

BoW: [('корабль', 255), ('человек', 242), ('планета', 215), ('сторона', 206), ('мой', 189), ('канваля', 189), ('рука', 180), ('новый', 170), ('баронство', 169), ('тисара', 166)]

2-граммы: [('алекс', 'каменеть'), ('каменеть', 'макс'), ('макс', 'вольф'), ('вольф', 'наёмник'), ('наёмник', 'глава'), ('глава', 'станция'), ('станция', 'технический'), ('технический', 'обслуживание'), ('обслуживание', 'бринг'), ('бринг', 'лиманский')]

4-граммы: [('алекс', 'каменеть', 'макс', 'вольф'), ('каменеть', 'макс', 'вольф', 'наёмник'), ('макс', 'вольф', 'наёмник', 'глава'), ('вольф', 'наёмник', 'глава', 'станция'), ('наёмник', 'глава', 'станция', 'технический'), ('глава', 'станция', 'технический', 'обслуживание'), ('станция', 'технический', 'обслуживание', 'бринг'), ('технический', 'обслуживание', 'бринг', 'лиманский'), ('обслуживание', 'бринг', 'лиманский', 'союз'), ('бринг', 'лиманский', 'союз', 'приграничный')]


In [9]:
vocab = create_vocab(bag_of_words)
vocab_size = len(vocab) + 1
print("\nСловарь:", list(vocab.items())[:10])


Словарь: [('корабль', 1), ('человек', 2), ('планета', 3), ('сторона', 4), ('мой', 5), ('канваля', 6), ('рука', 7), ('новый', 8), ('баронство', 9), ('тисара', 10)]


In [10]:
bigrams_sequences = convert_to_sequences(bigrams, vocab)
fourgrams_sequences = convert_to_sequences(fourgrams, vocab)

In [11]:
early_stopping = EarlyStopping(
    monitor='val_loss', 
    patience=5, 
    restore_best_weights=True,
    verbose=1
)

### <center>Bag of Words

##### BoW (мешок слов) подходит для задач анализа текста:
- Классификация текста (например, спам/не спам);
- Анализ тональности;
- Вычисление сходства между текстами.

##### Для генерации же текста требуются модели, которые могут учитывать последовательность слов, чтобы сохранить смысл:
- Рекуррентные нейронные сети (RNN);
- LSTM/GRU;
- Трансформеры (например, GPT).

In [12]:
X_bow = np.array(list(bag_of_words.values())).reshape(-1, 1)
Y_bow = np.arange(len(bag_of_words))
lb = LabelBinarizer()
Y_bow = lb.fit_transform(Y_bow)

X_bow_train, X_bow_test, Y_bow_train, Y_bow_test = train_test_split(X_bow, Y_bow, test_size=0.2, random_state=42)

In [13]:
dense_bow = create_dense_model(len(bag_of_words))
dense_bow.fit(X_bow_train, Y_bow_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks=[early_stopping]);
# dense_bow.summary()

Epoch 1/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 68ms/step - accuracy: 0.0000e+00 - loss: 9.5485 - val_accuracy: 0.0000e+00 - val_loss: 9.3475
Epoch 2/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 67ms/step - accuracy: 0.0000e+00 - loss: 9.2204 - val_accuracy: 0.0000e+00 - val_loss: 10.4675
Epoch 3/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 67ms/step - accuracy: 0.0000e+00 - loss: 8.8226 - val_accuracy: 0.0000e+00 - val_loss: 11.1296
Epoch 4/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 68ms/step - accuracy: 0.0000e+00 - loss: 8.6826 - val_accuracy: 0.0000e+00 - val_loss: 11.5205
Epoch 5/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 67ms/step - accuracy: 0.0000e+00 - loss: 8.5696 - val_accuracy: 0.0000e+00 - val_loss: 11.3328
Epoch 6/10
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 65ms/step - accuracy: 0.0000e+00 - loss: 8.4908 - val_accuracy: 

In [14]:
dense_bow_metrics = evaluate_model(dense_bow, X_bow_test, Y_bow_test, bow=True) # vocab=vocab for text_generation
print("Classification Metrics:", dense_bow_metrics)

[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Classification Metrics: {'Accuracy': 0.0, 'Precision': 0.0, 'Recall': 0.0, 'F1-Score': 0.0}


In [15]:
dense_generated_text = evaluate_model(dense_bow, X_bow_test, Y_bow_test, vocab=vocab, bow=True) # vocab=vocab for text_generation
save_text("generated_texts/dense2.txt", dense_generated_text['Generated Text'])

[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


### <center>2-grams

##### Данная задача - задача классификации с множественными классами, где классами являются все возможные слова в словаре, и задача сводится к предсказанию одного из этих классов (слова) на основе входной последовательности.

In [16]:
n_bigrams = 2

X_bigrams, Y_bigrams = prepare_data(bigrams_sequences, n_bigrams)
X_bigrams = pad_sequences(X_bigrams, maxlen=n_bigrams-1, padding='pre')

X_bigrams_train, X_bigrams_test, Y_bigrams_train, Y_bigrams_test = train_test_split(X_bigrams, Y_bigrams, test_size=0.2, random_state=42)

In [17]:
rnn_bigrams = create_rnn_model(vocab_size, EMBEDDING_DIM, n_bigrams)
rnn_bigrams.fit(X_bigrams_train, Y_bigrams_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks=[early_stopping]);
# rnn_bigrams.summary()

Epoch 1/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 55ms/step - accuracy: 1.0764e-05 - loss: 9.2728 - val_accuracy: 0.0000e+00 - val_loss: 9.2727
Epoch 2/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 55ms/step - accuracy: 2.1349e-05 - loss: 9.2727 - val_accuracy: 0.0000e+00 - val_loss: 9.2726
Epoch 3/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 52ms/step - accuracy: 1.2246e-04 - loss: 9.2727 - val_accuracy: 0.0000e+00 - val_loss: 9.2726
Epoch 4/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 52ms/step - accuracy: 3.0303e-05 - loss: 9.2726 - val_accuracy: 0.0000e+00 - val_loss: 9.2725
Epoch 5/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 51ms/step - accuracy: 3.2376e-05 - loss: 9.2725 - val_accuracy: 9.7714e-05 - val_loss: 9.2724
Epoch 6/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 51ms/step - accuracy: 5.9304e-05 - loss: 9.2725 - val_accuracy

In [18]:
rnn_bigrams_metrics = evaluate_model(rnn_bigrams, X_bigrams_test, Y_bigrams_test) # vocab=vocab for text_generation
rnn_bigrams_metrics

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step


{'Accuracy': 0.000156335495974361,
 'Precision': 0.0003936812034990727,
 'Recall': 0.000156335495974361,
 'F1-Score': 0.00014859821380071463}

In [19]:
rnn_bigrams_generated_text = evaluate_model(rnn_bigrams, X_bigrams_test, Y_bigrams_test, vocab=vocab, bow=True) # vocab=vocab for text_generation
save_text("generated_texts/rnn_bigrams2.txt", rnn_bigrams_generated_text['Generated Text'])

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


In [20]:
lstm_bigrams = create_lstm_model(vocab_size, EMBEDDING_DIM, n_bigrams)
lstm_bigrams.fit(X_bigrams_train, Y_bigrams_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks=[early_stopping]);
# lstm_bigrams.summary()

Epoch 1/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 67ms/step - accuracy: 2.5541e-04 - loss: 9.2727 - val_accuracy: 0.0000e+00 - val_loss: 9.2727
Epoch 2/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 66ms/step - accuracy: 2.7618e-04 - loss: 9.2727 - val_accuracy: 0.0000e+00 - val_loss: 9.2727
Epoch 3/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 80ms/step - accuracy: 5.1781e-04 - loss: 9.2726 - val_accuracy: 1.9543e-04 - val_loss: 9.2726
Epoch 4/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 66ms/step - accuracy: 7.2950e-04 - loss: 9.2726 - val_accuracy: 5.8628e-04 - val_loss: 9.2726
Epoch 5/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 66ms/step - accuracy: 9.8164e-04 - loss: 9.2725 - val_accuracy: 0.0012 - val_loss: 9.2725
Epoch 6/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 68ms/step - accuracy: 0.0016 - loss: 9.2725 - val_accuracy: 0.0014

In [21]:
lstm_bigrams_metrics = evaluate_model(lstm_bigrams, X_bigrams_test, Y_bigrams_test) # vocab=vocab for text_generation
lstm_bigrams_metrics

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


{'Accuracy': 0.002657703431564137,
 'Precision': 0.00011909709346354223,
 'Recall': 0.002657703431564137,
 'F1-Score': 0.0002105985612532636}

In [22]:
lstm_bigrams_generated_text = evaluate_model(lstm_bigrams, X_bigrams_test, Y_bigrams_test, vocab=vocab, bow=True) # vocab=vocab for text_generation
save_text("generated_texts/lstm_bigrams2.txt", lstm_bigrams_generated_text['Generated Text'])

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


In [23]:
gru_bigrams = create_gru_model(vocab_size, EMBEDDING_DIM, n_bigrams)
gru_bigrams.fit(X_bigrams_train, Y_bigrams_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks=[early_stopping]);
# gru_bigrams.summary()

Epoch 1/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 66ms/step - accuracy: 1.2792e-04 - loss: 9.2727 - val_accuracy: 9.7714e-05 - val_loss: 9.2727
Epoch 2/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 62ms/step - accuracy: 1.5709e-04 - loss: 9.2727 - val_accuracy: 0.0000e+00 - val_loss: 9.2727
Epoch 3/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 52ms/step - accuracy: 2.9898e-04 - loss: 9.2726 - val_accuracy: 9.7714e-05 - val_loss: 9.2726
Epoch 4/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 52ms/step - accuracy: 2.8418e-04 - loss: 9.2726 - val_accuracy: 1.9543e-04 - val_loss: 9.2726
Epoch 5/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 52ms/step - accuracy: 5.2977e-04 - loss: 9.2725 - val_accuracy: 2.9314e-04 - val_loss: 9.2725
Epoch 6/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 52ms/step - accuracy: 3.9628e-04 - loss: 9.2725 - val_accuracy

In [24]:
gru_bigrams_metrics = evaluate_model(gru_bigrams, X_bigrams_test, Y_bigrams_test) # vocab=vocab for text_generation
gru_bigrams_metrics

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step


{'Accuracy': 0.0014851872117564292,
 'Precision': 0.0003633506674284358,
 'Recall': 0.0014851872117564292,
 'F1-Score': 0.0003994968006347388}

In [25]:
gru_bigrams_generated_text = evaluate_model(gru_bigrams, X_bigrams_test, Y_bigrams_test, vocab=vocab, bow=True) # vocab=vocab for text_generation
save_text("generated_texts/gru_bigrams2.txt", gru_bigrams_generated_text['Generated Text'])

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


### <center>4-grams

In [26]:
n_fougrams = 4

X_fourgrams, Y_fourgrams = prepare_data(fourgrams_sequences, n_fougrams)
X_fourgrams = pad_sequences(X_fourgrams, maxlen=n_fougrams-1, padding='pre')

X_fourgrams_train, X_fourgrams_test, Y_fourgrams_train, Y_fourgrams_test = train_test_split(X_fourgrams, Y_fourgrams, test_size=0.2, random_state=42)

In [27]:
rnn_fourgrams = create_rnn_model(vocab_size, EMBEDDING_DIM, n_fougrams)
rnn_fourgrams.fit(X_fourgrams_train, Y_fourgrams_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks=[early_stopping]);
# rnn_fourgrams.summary()

Epoch 1/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 47ms/step - accuracy: 4.5575e-05 - loss: 9.2728 - val_accuracy: 9.7714e-05 - val_loss: 9.2728
Epoch 2/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 49ms/step - accuracy: 1.3178e-04 - loss: 9.2727 - val_accuracy: 9.7714e-05 - val_loss: 9.2727
Epoch 3/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 48ms/step - accuracy: 5.7720e-05 - loss: 9.2726 - val_accuracy: 9.7714e-05 - val_loss: 9.2726
Epoch 4/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 48ms/step - accuracy: 9.7796e-05 - loss: 9.2725 - val_accuracy: 9.7714e-05 - val_loss: 9.2726
Epoch 5/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 48ms/step - accuracy: 1.4434e-04 - loss: 9.2725 - val_accuracy: 9.7714e-05 - val_loss: 9.2725
Epoch 6/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 48ms/step - accuracy: 1.5980e-04 - loss: 9.2724 - val_accuracy

In [28]:
rnn_fourgrams_metrics = evaluate_model(rnn_fourgrams, X_fourgrams_test, Y_fourgrams_test) # vocab=vocab for text_generation
rnn_fourgrams_metrics

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step


{'Accuracy': 0.00031269543464665416,
 'Precision': 0.001027917667840646,
 'Recall': 0.00031269543464665416,
 'F1-Score': 0.00044506983531373775}

In [29]:
rnn_fourgrams_generated_text = evaluate_model(rnn_fourgrams, X_fourgrams_test, Y_fourgrams_test, vocab=vocab, bow=True) # vocab=vocab for text_generation
save_text("generated_texts/rnn_fourgrams2.txt", rnn_fourgrams_generated_text['Generated Text'])

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


In [30]:
lstm_fourgrams = create_lstm_model(vocab_size, EMBEDDING_DIM, n_fougrams)
lstm_fourgrams.fit(X_fourgrams_train, Y_fourgrams_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks=[early_stopping]);
# lstm_fourgrams.summary()

Epoch 1/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 51ms/step - accuracy: 1.0345e-04 - loss: 9.2727 - val_accuracy: 0.0000e+00 - val_loss: 9.2727
Epoch 2/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 49ms/step - accuracy: 7.2191e-05 - loss: 9.2727 - val_accuracy: 0.0000e+00 - val_loss: 9.2727
Epoch 3/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 50ms/step - accuracy: 1.3457e-04 - loss: 9.2726 - val_accuracy: 1.9543e-04 - val_loss: 9.2726
Epoch 4/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 50ms/step - accuracy: 2.6222e-04 - loss: 9.2726 - val_accuracy: 6.8399e-04 - val_loss: 9.2726
Epoch 5/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 49ms/step - accuracy: 6.0570e-04 - loss: 9.2725 - val_accuracy: 9.7714e-04 - val_loss: 9.2725
Epoch 6/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 50ms/step - accuracy: 9.5154e-04 - loss: 9.2725 - val_accuracy

In [31]:
lstm_fourgrams_metrics = evaluate_model(lstm_fourgrams, X_fourgrams_test, Y_fourgrams_test) # vocab=vocab for text_generation
lstm_fourgrams_metrics

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step


{'Accuracy': 0.002892432770481551,
 'Precision': 0.0005741623735609098,
 'Recall': 0.002892432770481551,
 'F1-Score': 0.0005618836955201098}

In [32]:
lstm_fourgrams_generated_text = evaluate_model(lstm_fourgrams, X_fourgrams_test, Y_fourgrams_test, vocab=vocab, bow=True) # vocab=vocab for text_generation
save_text("generated_texts/lstm_fourgrams2.txt", lstm_fourgrams_generated_text['Generated Text'])

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [33]:
gru_fourgrams = create_gru_model(vocab_size, EMBEDDING_DIM, n_fougrams)
gru_fourgrams.fit(X_fourgrams_train, Y_fourgrams_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks=[early_stopping]);
# gru_fourgrams.summary()

Epoch 1/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 49ms/step - accuracy: 8.4824e-05 - loss: 9.2728 - val_accuracy: 9.7714e-05 - val_loss: 9.2727
Epoch 2/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 58ms/step - accuracy: 6.4699e-05 - loss: 9.2727 - val_accuracy: 1.9543e-04 - val_loss: 9.2727
Epoch 3/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 57ms/step - accuracy: 1.3870e-04 - loss: 9.2726 - val_accuracy: 1.9543e-04 - val_loss: 9.2726
Epoch 4/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 58ms/step - accuracy: 1.8639e-04 - loss: 9.2726 - val_accuracy: 1.9543e-04 - val_loss: 9.2725
Epoch 5/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 58ms/step - accuracy: 2.6536e-04 - loss: 9.2725 - val_accuracy: 3.9085e-04 - val_loss: 9.2725
Epoch 6/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 58ms/step - accuracy: 2.7259e-04 - loss: 9.2725 - val_accuracy

In [34]:
gru_fourgrams_metrics = evaluate_model(gru_fourgrams, X_fourgrams_test, Y_fourgrams_test) # vocab=vocab for text_generation
gru_fourgrams_metrics

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step


{'Accuracy': 0.0013289555972482803,
 'Precision': 0.0004056128002203461,
 'Recall': 0.0013289555972482803,
 'F1-Score': 0.00046798474193111416}

In [35]:
gru_fourgrams_generated_text = evaluate_model(gru_fourgrams, X_fourgrams_test, Y_fourgrams_test, vocab=vocab, bow=True) # vocab=vocab for text_generation
save_text("generated_texts/gru_fourgrams2.txt", gru_fourgrams_generated_text['Generated Text'])

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
