Настройка зависимостей

In [5]:
!pip install keras-nlp --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-nlp
  Downloading keras_nlp-0.5.2-py3-none-any.whl (527 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.7/527.7 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-text (from keras-nlp)
  Downloading tensorflow_text-2.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m101.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-text, keras-nlp
Successfully installed keras-nlp-0.5.2 tensorflow-text-2.12.1


In [9]:
import os
import keras_nlp
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import re
import random

In [33]:
# Number of training epochs; passed to model.fit.
EPOCHS = 6 #@param {type:"slider", min:1, max:50, step:1}
# Data
# Number of text lines per batch in the tf.data pipelines.
BATCH_SIZE = 64
# sequence_length handed to the tokenizer, the start packer and the
# position embedding.
SEQ_LEN = 50
# Exclusive bounds compared against len() of each subtitle string, so they are
# measured in characters, not tokens.
MIN_TRAINING_SEQ_LEN = 50
MAX_TRAINING_SEQ_LEN = 450

# Model
# Embedding width of the TokenAndPositionEmbedding layer.
EMBED_DIM = 256
# intermediate_dim of every TransformerDecoder block.
FEED_FORWARD_DIM = 256
# NOTE(review): EMBED_DIM (256) is not a multiple of NUM_HEADS (3) - confirm
# this is intended.
NUM_HEADS = 3
# Number of stacked TransformerDecoder blocks.
NUM_LAYERS = 2
# Requested WordPiece vocabulary size; also the embedding input size and the
# width of the final Dense layer.
VOCAB_SIZE = 5000

# Inference
# NOTE(review): not referenced anywhere in the visible cells - confirm whether
# it is still needed.
NUM_TOKENS_TO_GENERATE = 30

Тренировочные данные

In [10]:
# Mount Google Drive at /content/drive; the subtitles CSV is read from there
# in a later cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Read the subtitles table from the mounted Drive. The path is absolute
# (matching the '/content/drive' mount point) so the load does not depend on
# the kernel's current working directory.
data = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/movies_subtitles.csv",
    engine="python",
)
data.head()

Unnamed: 0,start_time,end_time,text,imdb_id
0,58.559,61.602,"BOY: All right, everyone!\nThis... is a stick-up!",tt0114709
1,61.687,63.354,Don't anybody move!,tt0114709
2,64.398,66.482,"Now, empty that safe!",tt0114709
3,68.318,71.612,"Ooh-hoo-hoo!\nMoney, money, money! (KISSING)",tt0114709
4,71.697,74.031,"Stop it! Stop it,\nyou mean, old potato!",tt0114709


In [13]:
# Peek at the rows whose subtitle text is missing.
missing_text = data["text"].isna()
data.loc[missing_text].head()

Unnamed: 0,start_time,end_time,text,imdb_id
7452,6898.0,6901.134,,tt0112637
32877,5495.0,5498.084,,tt0112495
41861,5087.0,5090.095,,tt0112508
83269,7205.0,7208.084,,tt0111742
102811,5603.0,5606.136,,tt0107756


In [14]:
# Discard rows without subtitle text, then display the (now empty) selection
# of missing-text rows as a sanity check.
data = data.dropna(subset=["text"])
data.loc[data["text"].isna()]

Unnamed: 0,start_time,end_time,text,imdb_id


In [16]:
def get_text(x):
    """Return whatever ``x`` stores under the ``"text"`` key."""
    text_field = x["text"]
    return text_field

def transform(x):
    """Normalise one subtitle entry to single-line ASCII text.

    The value is converted with ``str()``, every newline becomes a space,
    and all non-ASCII characters are dropped.
    """
    single_line = " ".join(str(x).split("\n"))
    # Round-trip through UTF-8 bytes; the lenient ASCII decode discards every
    # byte of a multi-byte (non-ASCII) character.
    ascii_only = single_line.encode("utf8").decode("ascii", "ignore")
    return ascii_only

# Clean every subtitle line with transform() and preview the result.
cleaned_text = data["text"].apply(transform)
data = data.assign(text=cleaned_text)
data.head()

Unnamed: 0,start_time,end_time,text,imdb_id
0,58.559,61.602,"BOY: All right, everyone! This... is a stick-up!",tt0114709
1,61.687,63.354,Don't anybody move!,tt0114709
2,64.398,66.482,"Now, empty that safe!",tt0114709
3,68.318,71.612,"Ooh-hoo-hoo! Money, money, money! (KISSING)",tt0114709
4,71.697,74.031,"Stop it! Stop it, you mean, old potato!",tt0114709


In [18]:
# Collect the subtitle strings of all rows whose index label is <= 1_200_000
# (label-based .loc slice, end inclusive) into a plain Python list.
texts = data.loc[:1200000, "text"].tolist()

In [22]:
# Fixed seed and hold-out size so the train/validation split is identical on
# every run.
SPLIT_SEED = 42
VALID_SIZE = 20000

# Keep only lines whose character count lies strictly between the two bounds.
dataset = [
    line
    for line in texts
    if MIN_TRAINING_SEQ_LEN < len(line) < MAX_TRAINING_SEQ_LEN
]
assert len(dataset) > VALID_SIZE, "not enough lines left to form a training split"

# A private, seeded generator keeps the shuffle reproducible without touching
# the global `random` state.
random.Random(SPLIT_SEED).shuffle(dataset)
valid, train = dataset[:VALID_SIZE], dataset[VALID_SIZE:]


print("train: ", len(train))
print("valid: ", len(valid))


# Persist each split as a text file with one example per line, so that
# tf.data.TextLineDataset can stream it back.
for path, lines in (("train.txt", train), ("valid.txt", valid)):
    with open(path, 'w') as f:
        f.writelines(f"{line}\n" for line in lines)


# Training stream: lines -> batches of BATCH_SIZE -> shuffle. The shuffle comes
# after batching, so it reorders whole batches using a 256-element buffer.
train_lines = tf.data.TextLineDataset("train.txt")
raw_train_ds = train_lines.batch(BATCH_SIZE).shuffle(buffer_size=256)


# Validation stream: batched only, file order preserved.
valid_lines = tf.data.TextLineDataset("valid.txt")
raw_val_ds = valid_lines.batch(BATCH_SIZE)

train:  238319
valid:  20000


Тренировка токенайзера

In [34]:
# Learn a lower-cased WordPiece vocabulary from the training batches, with the
# three special tokens reserved.
vocab_options = {
    "vocabulary_size": VOCAB_SIZE,
    "lowercase": True,
    "reserved_tokens": ["[PAD]", "[UNK]", "[BOS]"],
}
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    raw_train_ds, **vocab_options
)

Загрузка токенайзера

In [35]:
# WordPiece tokenizer over the learned vocabulary, configured with
# sequence_length=SEQ_LEN and lower-casing.
tokenizer_options = {
    "vocabulary": vocab,
    "sequence_length": SEQ_LEN,
    "lowercase": True,
}
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(**tokenizer_options)

Токенизация

In [36]:
# Packer configured with the "[BOS]" id as start value and SEQ_LEN as the
# output length.
bos_token_id = tokenizer.token_to_id("[BOS]")
start_packer = keras_nlp.layers.StartEndPacker(
    start_value=bos_token_id,
    sequence_length=SEQ_LEN,
)


def preprocess(inputs):
    """Turn a batch of raw text lines into ``(features, labels)``.

    ``labels`` is the tokenizer output for ``inputs``; ``features`` is that
    same output passed through ``start_packer``.
    """
    token_ids = tokenizer(inputs)
    return start_packer(token_ids), token_ids


def _tokenized(raw_ds):
    """Map ``preprocess`` over ``raw_ds`` in parallel and prefetch the result."""
    mapped = raw_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    return mapped.prefetch(tf.data.AUTOTUNE)


train_ds = _tokenized(raw_train_ds)
val_ds = _tokenized(raw_val_ds)

Создание модели

In [37]:
# Language model: integer token ids -> token+position embeddings ->
# NUM_LAYERS TransformerDecoder blocks -> one logit per vocabulary entry at
# every position.
inputs = keras.layers.Input(shape=(None,), dtype=tf.int32)
# mask_zero=True: id 0 is treated as masked input (presumably "[PAD]", the
# first reserved token - confirm).
embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=SEQ_LEN,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding_layer(inputs)
for _ in range(NUM_LAYERS):
    # Each block is called on the running sequence alone (no second input).
    # NOTE(review): EMBED_DIM=256 is not divisible by NUM_HEADS=3 - confirm
    # how the layer sizes its heads.
    decoder_layer = keras_nlp.layers.TransformerDecoder(
        num_heads=NUM_HEADS,
        intermediate_dim=FEED_FORWARD_DIM,
    )
    x = decoder_layer(x)
# No activation here: the model emits raw logits, matching from_logits=True in
# both the loss and the metric below.
outputs = keras.layers.Dense(VOCAB_SIZE)(x)
model = keras.Model(inputs=inputs, outputs=outputs)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# The perplexity metric is told to skip token id 0.
perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)
model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity])
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 token_and_position_embeddin  (None, None, 256)        1292800   
 g_1 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 transformer_decoder_2 (Tran  (None, None, 256)        394749    
 sformerDecoder)                                                 
                                                                 
 transformer_decoder_3 (Tran  (None, None, 256)        394749    
 sformerDecoder)                                                 
                                                                 
 dense_1 (Dense)             (None, None, 5000)        1285

Тренировка модели

In [38]:
# Train for EPOCHS epochs, evaluating on the validation set after each one;
# verbose=2 logs a single summary line per epoch.
model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
    verbose=2,
)

Epoch 1/6
3724/3724 - 139s - loss: 3.8927 - perplexity: 59.0710 - val_loss: 3.5644 - val_perplexity: 41.9689 - 139s/epoch - 37ms/step
Epoch 2/6
3724/3724 - 118s - loss: 3.4161 - perplexity: 35.9656 - val_loss: 3.3970 - val_perplexity: 35.3273 - 118s/epoch - 32ms/step
Epoch 3/6
3724/3724 - 118s - loss: 3.2258 - perplexity: 29.4619 - val_loss: 3.2986 - val_perplexity: 31.8033 - 118s/epoch - 32ms/step
Epoch 4/6
3724/3724 - 118s - loss: 3.0904 - perplexity: 25.5630 - val_loss: 3.2361 - val_perplexity: 29.7371 - 118s/epoch - 32ms/step
Epoch 5/6
3724/3724 - 113s - loss: 2.9843 - perplexity: 22.8704 - val_loss: 3.1899 - val_perplexity: 28.3740 - 113s/epoch - 30ms/step
Epoch 6/6
3724/3724 - 118s - loss: 2.8988 - perplexity: 20.9082 - val_loss: 3.1547 - val_perplexity: 27.1507 - 118s/epoch - 32ms/step


<keras.callbacks.History at 0x7f96e29db6a0>

Проверка модели

In [40]:
# Encode the prompt the same way as the training features: tokenize, then pass
# through the start packer.
prompt_tokens = start_packer(tokenizer(["hello friend"]))
# Position just past the last non-zero token of the prompt; generation starts
# writing there.
indices = tf.where(tf.not_equal(prompt_tokens[0], tf.constant(0, dtype=tf.int32)))
max_idx = tf.squeeze(indices[-1][0]).numpy()+1


def next_token_logits(prompt, cache, index):
    """Sampler callback: logits for the token to be written at ``index``.

    Runs the full model on ``prompt`` and takes the output at position
    ``index - 1``. No hidden states are produced (``None``) and ``cache`` is
    returned unchanged.

    Deliberately not named ``next`` so the builtin ``next`` stays usable in the
    rest of the notebook.
    """
    logits = model(prompt)[:, index - 1, :]
    hidden_states = None
    return logits, hidden_states, cache


# (label, sampler class, constructor kwargs), run in this order.
SAMPLER_SPECS = [
    ("Greedy search", keras_nlp.samplers.GreedySampler, {}),
    ("Beam search", keras_nlp.samplers.BeamSampler, {"num_beams": 10}),
    ("Random search", keras_nlp.samplers.RandomSampler, {}),
    ("Top-K search", keras_nlp.samplers.TopKSampler, {"k": 10}),
    ("Top-P search", keras_nlp.samplers.TopPSampler, {"p": 0.5}),
]

# NOTE(review): the recorded output shows "[PAD]" pieces inside the generated
# text - presumably because padded label positions are trained on and the
# samplers are given no stop token; confirm before tuning further.
generated = {}
for label, sampler_cls, sampler_kwargs in SAMPLER_SPECS:
    sampler = sampler_cls(**sampler_kwargs)
    output_tokens = sampler(
        next=next_token_logits,
        prompt=prompt_tokens,
        index=max_idx,
    )
    generated[label] = tokenizer.detokenize(output_tokens)
    print(f"{label}: \n{generated[label]}\n")

# Keep the Top-K text under its original name.
txt = generated["Top-K search"]


Greedy search: 
[b"[BOS] hello friend , i ' m glad you ' re here . i ' m here . i ' m here . [PAD] ' s a man . [PAD] ' [PAD] melkey . [PAD] you . . . . . [PAD] of what ? [PAD] . ."]

Beam search: 
[b"[BOS] hello friend , this is mr lewis . i ' m glad to see you . . . [PAD] . [PAD] . [PAD] . [PAD] ? [PAD] . [PAD] . [PAD] . [PAD] . [PAD] . [PAD] . [PAD] . [PAD] . [PAD] of this . [PAD] . [PAD]"]

Random search: 
[b"[BOS] hello friend , and welcome home this day i ' m owners are having a new surrence . [PAD]l . . . [PAD] . . [PAD] to a britrating , that ' s going along a fox ? [PAD] [PAD] 8"]

Top-K search: 
[b"[BOS] hello friend , i ' m glad you ' ve been here , but i ' ve forgotten everything , [PAD] ? [PAD] me too . [PAD] . . . [PAD] . [PAD] [PAD] . [PAD] ' [PAD] and we got a chance , we doning ? [PAD]"]

Top-P search: 
[b"[BOS] hello friend , this is detectives . mr . haverstock . are you going ? [PAD] [PAD] ? [PAD] [PAD] ? [PAD] ? [PAD] [PAD] . [PAD] ' cause a day . [PAD] ' s the othe

Эх...