In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import numpy as np
import os
import re
import string
import random
from transformers import AutoTokenizer
import pandas as pd
import cProfile
import pstats
import multiprocessing
import sqlite3

2023-03-25 14:17:30.742391: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-25 14:17:31.603075: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-12.0/lib64::/home/drake/Documents/projects/IRIS/.conda/lib/
2023-03-25 14:17:31.603134: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-12.0/lib64::/home/drake/Documents/proj

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
tf.config.list_physical_devices('GPU')

2023-03-25 14:17:34.156416: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-25 14:17:34.178401: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-25 14:17:34.178568: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Mask the upper half of the dot product matrix in self attention.
    This prevents flow of information from future tokens to current token.
    1's in the lower triangle, counting from the lower right corner.
    """
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(mask, mult)


class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads, embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        attention_output = self.att(inputs, inputs, attention_mask=causal_mask)
        attention_output = self.dropout1(attention_output)
        out1 = self.layernorm1(inputs + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)


In [4]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


In [5]:
vocab_size = 30000  # Only consider the top 20k words
maxlen = 100  # Max sequence size
embed_dim = 256  # Embedding size for each token
num_heads = 2  # Number of attention heads
feed_forward_dim = 256  # Hidden layer size in feed forward network inside transformer


def create_model():
    inputs = layers.Input(shape=(maxlen,), dtype=tf.int32)
    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, feed_forward_dim)
    x = transformer_block(x)
    outputs = layers.Dense(vocab_size)(x)
    model = keras.Model(inputs=inputs, outputs=[outputs, x])
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    adam = tf.keras.optimizers.Adam(learning_rate=0.0001, clipvalue=1.0)

    model.compile(
        'adam', loss=[loss_fn, None],
    )  # No loss and optimization based on word embeddings from transformer block
    return model


In [6]:
from transformers import AutoTokenizer, BertTokenizer
tokenizer =  AutoTokenizer.from_pretrained("bert-base-uncased")

In [11]:
def split_data(data):
    data_chunks = []
    for i in range(0, len(data), 512):
        data_chunks.append(data[i:i+512])

    return data_chunks

def encode_data(data):
    encoded_chunks = []
    for chunk in data:
        encoded = tokenizer.encode(chunk, max_length=512, truncation=True, add_special_tokens=False)
        encoded_chunks += encoded

    return encoded_chunks

def encode_chunk(chunk):

    encoded = tokenizer.encode(chunk, max_length=512, truncation=True, add_special_tokens=False)

    return encoded

def encode_data(data):
    with multiprocessing.Pool() as pool:
        encoded_chunks = pool.map(encode_chunk, data)
    
    return [encoded for chunk in encoded_chunks for encoded in chunk]

def dataset():

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", max_vocab_size=vocab_size)

    path = os.getcwd()
    dir = path.replace('Transformer', 'WikipediaScraper')
    os.chdir(dir)

    df = pd.read_csv('data.csv')

    os.chdir(path)

    columns = df['article'].tolist()
    joined_data = '\n'.join(str(columns))
    
    splited = split_data(joined_data)
    
    encoded = encode_data(splited)
    encoded_chunks = encoded

    print(len(encoded_chunks))

    sequenced_data = []
    for i in range(0, len(encoded_chunks) - maxlen, 3):
        sequenced_data.append(encoded_chunks[i : i + maxlen + 1])

    sequenced_data = tf.random.shuffle(sequenced_data)
    print(sequenced_data)
    
    #get token vocabs
    vocab = tokenizer.get_vocab()

    subword_list = [None] * len(vocab)

    for subword, index in vocab.items():
        subword_list[index] = subword
    vocab = subword_list

    def prepare_lm_inputs_labels(text):

        text = tf.expand_dims(text, -1)
        text = tf.convert_to_tensor(text, dtype=tf.int32)
        x = text[:, :-1]
        y = text[:, 1:]
        return x, y

    batch_size = 128

    text_ds = tf.data.Dataset.from_tensor_slices(sequenced_data)
    text_ds = text_ds.shuffle(buffer_size=200000)
    text_ds = text_ds.batch(batch_size)
    text_ds = text_ds.map(prepare_lm_inputs_labels)
    text_ds = text_ds.prefetch(tf.data.AUTOTUNE)

    return df, text_ds, vocab

In [12]:
df, text_ds, vocab = dataset()

136800
tf.Tensor(
[[1012 1044 1041 ... 1057 1038 1059]
 [1041 1040 1051 ... 1054 1045 1048]
 [1051 1049 1056 ... 1041 1045 1055]
 ...
 [1018 1016 1050 ... 1061 1042 1051]
 [1052 1041 1050 ... 1042 1045 1054]
 [1057 1037 1061 ... 1054 1048 1040]], shape=(45567, 101), dtype=int32)


In [76]:
def split_data(data):
    data_chunks = []
    for i in range(0, len(data) - maxlen, 512):
        data_chunks.append(data[i:i+512])

    return data_chunks

def encode_data(data):
    encoded_chunks = []
    
    for chunk in data:
        encoded = tokenizer.encode(chunk, add_special_tokens=False, max_length=512, truncation=True)
        encoded_chunks += encoded

    return encoded_chunks

def save_data(data):
    conn = sqlite3.connect('data.db')
    c = conn.cursor()
    c.execute('''CREATE TABLE random_numbers
                 (id INTEGER PRIMARY KEY,
                 number INTEGER)''')
    for num in data:
        c.execute(f"INSERT INTO random_numbers (number) VALUES ({num})")
    conn.commit()
    conn.close()

def dataset():

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", max_vocab_size=vocab_size)

    path = os.getcwd()
    dir = path.replace('Transformer', 'WikipediaScraper')
    os.chdir(dir)

    df = pd.read_csv('data.csv')

    os.chdir(path)

    columns = df['article'].tolist()
    new_columns = list(map(str, columns))
    joined_data = '\n'.join(new_columns)
    
    splited = split_data(joined_data)

    encoded = encode_data(splited)
    encoded_chunks = encoded

    vocab = tokenizer.get_vocab()
    subword_list = [None] * len(vocab)
    for subword, index in vocab.items():
        subword_list[index] = subword
    vocab = subword_list

    save_data(encoded_chunks)

    return df, vocab, encoded_chunks, joined_data

In [77]:
df, vocab, encoded_chunks, joined_data = dataset()

In [78]:
class DataGenerator():

    def __init__(self,batch_size, maxlen, threadsafe=True, vocab_size=30000):

        self.batch_size = batch_size
        self.maxlen = maxlen
        
        if threadsafe:
            self.conn = sqlite3.connect('data.db', check_same_thread=False)
        else:
            self.conn = sqlite3.connect('data.db')
        cursor = self.conn.execute("SELECT COUNT(*) FROM random_numbers")
        self.count = cursor.fetchone()[0]
        print(self.count)

    def random_index(self):
        return random.randint(1, self.count - self.maxlen - 1)

    def get_data(self):
        c = self.conn.cursor()
        index = self.random_index()
        c.execute("SELECT * FROM random_numbers WHERE id BETWEEN ? AND ?", (index, index + self.maxlen+1))
        sequence_data = [row[1] for row in c]
        return sequence_data
    
    def label_data(self, data):

        #data = tf.cast(data, dtype=tf.int32)
        #data = tf.convert_to_tensor(data, dtype=tf.int32)
        x = data[:-1]
        y = data[1:]
        return x, y

    def generate(self):
        while True:
            sequence = []
            sequences = []
            x, y = [], []
            x_list, y_list = [], []
            for _ in range(self.batch_size):
                sequence = self.get_data()
                x, y = self.label_data(sequence)
                x_list.append(x)
                y_list.append(y)

            if len(x_list) == self.batch_size:
                try:
                    x_list = np.array(x_list)
                    y_list = np.array(y_list)
                    yield x_list, y_list
                except ValueError:
                    print(x_list)
                    continue

In [18]:
class TextGenerator(keras.callbacks.Callback):
    """A callback to generate text from a trained model.
    1. Feed some starting prompt to the model
    2. Predict probabilities for the next token
    3. Sample the next token and add it to the next input

    Arguments:
        max_tokens: Integer, the number of tokens to be generated after prompt.
        start_tokens: List of integers, the token indices for the starting prompt.
        index_to_word: List of strings, obtained from the TextVectorization layer.
        top_k: Integer, sample from the `top_k` token predictions.
        print_every: Integer, print after this many epochs.
    """

    def __init__(
        self, max_tokens, start_tokens, index_to_word, top_k=10, print_every=10
    ):
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.print_every = print_every
        self.k = top_k

    def sample_from(self, logits):
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        return self.index_to_word[number]

    def on_epoch_end(self, epoch, logs=None):
        start_tokens = [_ for _ in self.start_tokens]
        if (epoch + 1) % self.print_every != 0:
            return
        num_tokens_generated = 0
        tokens_generated = []
        while num_tokens_generated <= self.max_tokens:
            pad_len = maxlen - len(start_tokens)
            sample_index = len(start_tokens) - 1
            if pad_len < 0:
                x = start_tokens[:maxlen]
                sample_index = maxlen - 1
            elif pad_len > 0:
                x = start_tokens + [0] * pad_len
            else:
                x = start_tokens
            x = np.array([x])
            y, _ = self.model.predict(x, verbose=0)
            sample_token = self.sample_from(y[0][sample_index])
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)
            num_tokens_generated = len(tokens_generated)
        txt = " ".join(
            [self.detokenize(_) for _ in self.start_tokens + tokens_generated]
        )
        print(f"generated text:\n{txt}\n")


# Tokenize starting prompt
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index

start_prompt = "Chess is"
start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()]
num_tokens_generated = 40

text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab)


In [79]:
data_generator = DataGenerator(64, maxlen)
data_gen = data_generator.generate()

439108


In [28]:
tf.debugging.disable_traceback_filtering()

In [70]:
def train():
    model = create_model()

    steps_size = 400

    if os.path.exists('transformer.h5'):
        model.load_weights('transformer.h5')

    transformer = model.fit(data_gen, steps_per_epoch = steps_size, verbose=1, epochs=5, callbacks=[text_gen_callback])

    model.save_weights('transformer.h5')
def main():
    with cProfile.Profile() as pr:
        train()
    stats = pstats.Stats(pr)
    stats.sort_stats(pstats.SortKey.TIME)
    stats.dump_stats('transformer.prof')

main()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [73]:
model = create_model()
model.load_weights('transformer.h5')

In [None]:
os.chdir('/home/drake/Documents/projects/IRIS/Transformer')

In [74]:
class generate_text():

    x = 'suck my dikc'

    def __init__(self, model, max_tokens, start_tokens, index_to_word, top_k=20, print_every=1):
        self.model = model
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.print_every = print_every
        self.k = top_k
    
    def sample_from(self, logits):
        logits /= 0.5
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)
    
    def detokenize(self, number):
        return self.index_to_word[number]
    
    def generate(self):
        start_tokens = [_ for _ in self.start_tokens]
        num_tokens_generated = 0
        tokens_generated = []
        while num_tokens_generated <= self.max_tokens:
            pad_len = maxlen - len(start_tokens)
            sample_index = len(start_tokens) - 1
            if pad_len < 0:
                x = start_tokens[-maxlen:]
                sample_index = maxlen - 1
            elif pad_len > 0:
                x = start_tokens + [0] * pad_len
            else:
                x = start_tokens
            x = np.array([x])
            y, _ = self.model.predict(x, verbose=0)
            sample_token = self.sample_from(y[0][sample_index])
            if sample_token == 102:
                break
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)
            num_tokens_generated = len(tokens_generated)
        txt = "".join(
            tokenizer.decode(self.start_tokens + tokens_generated)
        )
        print(f"generated text:\n{txt}\n")
        print(self.start_tokens + tokens_generated)

In [75]:
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index

sentence = "Japan was "
start_tokens = tokenizer.encode(sentence, add_special_tokens=False)
print(start_tokens)
max_tokens = 200
text_gen = generate_text(model, max_tokens, start_tokens, vocab)
for i in range(5):
    text_gen.generate()


[2900, 2001]
generated text:
japan was also a stub. you can help wikipedia by expanding it. this article about a library - related building or structure is a stub. you can help wikipedia by expanding it. the 1950 east tenness ee state buccaneers football team was an american football team that represented east tennessee state college ( etsc ) — now known as east tennessee state university — as a member of the smoky mountain conference and the volunteer state athletic conference ( vsac ) during the 1950 college football season. led by fourth - year head coach loyd roberts, the buccaneers compiled an overall a record of 3 – 5 – 1, with marks of 1 – 2 – 1 against smoky mountain opponents and 0 – 1 – 1 in vsac play. this was the program's first losing re cord under roberts and the first losing season since 1941. the team's co - captains were mark sutherland and bob " snake " evans. the 1950 squad beat local rival tusculum. they also tied milligan in the final meeting between the

[2900, 20

KeyboardInterrupt: 