In [1]:
import tensorflow as tf
import torch as tr

import numpy as np
import requests
import pathlib
import tqdm
from tqdm import trange

## Download the text file

In [2]:
# Source corpus; it is saved locally so that the following cells read it from disk.
tinyshakespeare = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url=tinyshakespeare, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the corpus.
response.raise_for_status()

# open() does not create missing directories, so make sure ./data exists first.
pathlib.Path('./data').mkdir(parents=True, exist_ok=True)
with open('./data/tinyshakespeare.txt', mode='w') as txt:
    txt.write(response.text)

In [3]:
# Load the whole corpus into memory as one string and preview its beginning.
text_file = pathlib.Path('./data/tinyshakespeare.txt').read_text()

print(text_file[:300])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us


In [4]:
# The vocabulary is the sorted set of distinct characters found in the corpus.
all_chars = sorted(set(text_file))
vocab_size = len(all_chars)
print('Characters :', ''.join(all_chars))
print('Vocab size', vocab_size)

Characters : 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab size 65


# Tensorflow

#### Create a mapping from characters to integers and vice-versa

In [5]:
# Create a mapping from characters to integer ids and vice-versa.
# NOTE: the recorded outputs further down (66 logits, an '[UNK]' character in a sample)
# show that the lookup vocabulary has one more entry than the 65 characters in `all_chars`.

char_to_id = tf.keras.layers.StringLookup(vocabulary=all_chars, mask_token=None)
id_to_char = tf.keras.layers.StringLookup(vocabulary=char_to_id.get_vocabulary(), invert=True, mask_token=None)
# Maps ids back to characters and joins them along the last axis into one string per row.
id_to_str = lambda id : tf.strings.reduce_join(inputs=id_to_char(id), axis=-1, separator='')

#### Encode the text

In [6]:
# Split the corpus into single unicode characters and map each one to its integer id.
all_ids = char_to_id(tf.strings.unicode_split(text_file, input_encoding='UTF-8'))

In [7]:
# Inspect the encoded corpus: a 1-D tensor holding one id per character.
all_ids

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([19, 48, 57, ..., 46,  9,  1], dtype=int64)>

In [8]:
def input_and_target(text):
    """Split a sequence into next-token training pairs.

    Returns ``(text[:-1], text[1:])``: the inputs are everything but the last
    element, the targets are the same sequence shifted one step ahead.
    """
    return text[:-1], text[1:]

# Chunks of 101 ids: after the one-step shift in input_and_target each chunk
# yields 100 input ids and 100 target ids.
seq_len = 101
text_ds = tf.data.Dataset.from_tensor_slices(all_ids)
text_ds = text_ds.batch(batch_size=seq_len, drop_remainder=True)
text_ds = text_ds.map(input_and_target)

In [9]:
# Pull the first (input, target) pair out of the un-batched dataset for inspection.
input_batch, target_batch = next(iter(text_ds))

In [10]:
# Decode the input ids back to text.
print(id_to_str(input_batch).numpy().decode('utf-8'))

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [11]:
# Decode the target ids: the same text shifted one character ahead of the input.
print(id_to_str(target_batch).numpy().decode('utf-8'))

irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


In [12]:
# Number of sequences per training batch
BATCH_SIZE = 64
# Number of sequences held in the shuffle buffer
BUFFER_SIZE = 10000

# Shuffle the sequences, group them into full batches and prefetch in the background.
dataset = text_ds.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

In [13]:
# One shuffled batch, reused by the cells below to smoke-test the individual layers.
(sample_inputs, sample_targets) = next(iter(dataset))

#### Character Model

In [14]:
class Attention(tf.keras.layers.Layer):
    """Scaled dot-product attention.

    ``call`` computes ``softmax(query @ keys^T / sqrt(d_k)) @ values`` where
    ``d_k`` is the size of the last axis of ``keys``. When ``mask`` is given,
    ``mask * -1e9`` is added to the scaled scores before the softmax, so entries
    equal to 1 in the mask are effectively removed from the attention weights.
    """

    def __init__(self):
        super().__init__()

    def call(self, query, keys, values, mask=None):
        depth = tf.cast(keys.shape[-1], dtype=tf.float32)

        # [..., query, dims] @ [..., keys, dims]^T --> [..., query, keys], then scale
        scores = tf.matmul(query, keys, transpose_b=True) / tf.math.sqrt(depth)

        # push masked positions towards -inf so that the softmax ignores them
        if mask is not None:
            scores = scores + (mask * -1e9)

        # [..., query, keys] @ [..., keys, dims] --> [..., query, dims]
        return tf.matmul(tf.nn.softmax(scores, axis=-1), values)

In [15]:
# Let's test the attention layer on a freshly initialised embedding of the sample batch.
emb = tf.keras.layers.Embedding(input_dim=char_to_id.vocabulary_size(), output_dim=512)
sample_emb = emb(sample_inputs)
# Self-attention: query, keys and values are all the same tensor.
q, k, v = [sample_emb]*3

attention = Attention()
attention_output = attention(q, k, v, mask=None)
print('Shape of input tokens :', sample_inputs.shape)
# Each tensor gets its own label (all three used to be printed as "query").
print('Shape of query Embedding :', q.shape)
print('Shape of key Embedding :', k.shape)
print('Shape of value Embedding :', v.shape)
print('Shape of Attention output :', attention_output.shape)


Shape of input tokens : (64, 100)
Shape of query Embedding : (64, 100, 512)
Shape of query Embedding : (64, 100, 512)
Shape of query Embedding : (64, 100, 512)
Shape of Attention output : (64, 100, 512)


In [16]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on top of the ``Attention`` layer.

    Queries, keys and values are each projected with their own Dense layer,
    split into ``num_heads`` heads of ``dims // num_heads`` features, attended
    per head, concatenated again and passed through a final Dense projection.

    Args:
        dims: model width; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
    """

    def __init__(self, dims, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Validate before deriving head_dims so a bad configuration fails with a clear message.
        tf.assert_equal(x=dims%num_heads, y=0,
                        message='MultiHeadAttention: dims must be divisible by num_heads')

        self.dims=dims
        self.num_heads=num_heads
        self.head_dims=dims//num_heads

        self.dq = tf.keras.layers.Dense(dims)
        self.dk = tf.keras.layers.Dense(dims)
        self.dv = tf.keras.layers.Dense(dims)

        self.dense = tf.keras.layers.Dense(dims)

        self.attention = Attention()

    def split_head(self, vector):
        """[batch, seq, dim] --> [batch, num_head, seq, head_dims]."""
        # The batch size is read from the tensor itself instead of from state
        # stashed on the layer by call(), so the helper is usable on its own.
        batch = tf.shape(vector)[0]
        # [batch, seq, dim] --> [batch, seq, num_head, head_dims]
        vector = tf.reshape(vector, shape=[batch, -1, self.num_heads, self.head_dims])
        # [batch, seq, num_head, head_dims] --> [batch, num_head, seq, head_dims]
        return tf.transpose(vector, perm=[0, 2, 1, 3])

    def concat_heads(self, vector):
        """[batch, num_head, seq, head_dims] --> [batch, seq, dim]."""
        batch = tf.shape(vector)[0]
        # [batch, num_head, seq, head_dims] --> [batch, seq, num_head, head_dims]
        vector = tf.transpose(vector, perm=[0, 2, 1, 3])
        # [batch, seq, num_head, head_dims] --> [batch, seq, dim]
        return tf.reshape(vector, shape=[batch, -1, self.num_heads*self.head_dims])

    def call(self, q, k, v, mask):
        """Attend from ``q`` to ``k``/``v``; ``mask`` is forwarded to ``Attention`` (may be None)."""
        q = self.dq(q)
        k = self.dk(k)
        v = self.dv(v)

        # multi head --> [batch, num_head, seq, head_dims]
        q = self.split_head(q)
        k = self.split_head(k)
        v = self.split_head(v)

        # ATTENTION --> [batch, num_head, seq, head_dims]
        self_attention = self.attention(q, k, v, mask)

        # concat heads [batch, num_head, seq, head_dims] --> [batch, seq, dim]
        self_attention = self.concat_heads(self_attention)

        # Projection --> [batch, seq, dim]
        return self.dense(self_attention)



In [17]:
# Let's test Multi Head Attention on the same q, k, v as above.
mha = MultiHeadAttention(dims=512, num_heads=8)

multi_head_attention_output = mha(q, k, v, mask=None)

print('Shape of input tokens :', sample_inputs.shape)
# Each tensor gets its own label (all three used to be printed as "query").
print('Shape of query Embedding :', q.shape)
print('Shape of key Embedding :', k.shape)
print('Shape of value Embedding :', v.shape)
print('Shape of Multi Head Attention output :', multi_head_attention_output.shape)

Shape of input tokens : (64, 100)
Shape of query Embedding : (64, 100, 512)
Shape of query Embedding : (64, 100, 512)
Shape of query Embedding : (64, 100, 512)
Shape of Multi Head Attention output : (64, 100, 512)


In [18]:
class FFNN(tf.keras.layers.Layer):
    """Feed-forward block: Dense(ffnn_units, relu) followed by Dense(dims)."""

    def __init__(self, ffnn_units, dims):
        super().__init__()
        expand = tf.keras.layers.Dense(units=ffnn_units, activation='relu')
        project = tf.keras.layers.Dense(units=dims)
        self.ffnn = tf.keras.Sequential([expand, project])

    def call(self, inputs):
        return self.ffnn(inputs)

In [19]:
# Let's test the FFNN on the multi-head attention output.
ffnn = FFNN(ffnn_units=2048, dims=512)
ffnn_output = ffnn(multi_head_attention_output)

print('Shape of input tokens :', sample_inputs.shape)
# Each tensor gets its own label (all three used to be printed as "query").
print('Shape of query Embedding :', q.shape)
print('Shape of key Embedding :', k.shape)
print('Shape of value Embedding :', v.shape)
print('Shape of Multi Head Attention output :', multi_head_attention_output.shape)
print('Shape of Feed Forward Neural Nets output :', ffnn_output.shape)

Shape of input tokens : (64, 100)
Shape of query Embedding : (64, 100, 512)
Shape of query Embedding : (64, 100, 512)
Shape of query Embedding : (64, 100, 512)
Shape of Multi Head Attention output : (64, 100, 512)
Shape of Feed Forward Neural Nets output : (64, 100, 512)


In [20]:
def causal_mask(size):
    """Return a [size, size] float mask with 1 strictly above the diagonal and 0 elsewhere.

    Row i therefore marks every later position j > i with 1.
    """
    # keep the diagonal and everything below it ...
    lower_triangle = tf.linalg.band_part(tf.ones(shape=[size, size]), -1, 0)
    # ... and invert, so that only the strictly upper triangle is set
    return 1 - lower_triangle

# Let's test causal_mask: the printed matrix should be 1 strictly above the diagonal.
mask = causal_mask(8)
print('CAUSAL MASK')
print(mask)

CAUSAL MASK
tf.Tensor(
[[0. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0.]], shape=(8, 8), dtype=float32)


In [21]:
class DecoderLayer(tf.keras.layers.Layer):
    """One pre-norm decoder block: masked self-attention followed by a feed-forward net.

    Both sub-layers follow the same pattern: LayerNorm -> sub-layer -> Dropout,
    with the sub-layer's input added back as a skip connection.
    """

    def __init__(self, dims, num_heads, ffnn_units, dropout_rate):
        super().__init__()
        self.causal_multi_head_attention = MultiHeadAttention(dims=dims, num_heads=num_heads)
        self.ffnn = FFNN(dims=dims, ffnn_units=ffnn_units)

        # Despite the names, these are plain LayerNormalization layers applied
        # *before* each sub-layer; the residual add happens in call().
        self.add_and_norm_1 = tf.keras.layers.LayerNormalization()
        self.add_and_norm_2 = tf.keras.layers.LayerNormalization()

        self.drop_out_1 = tf.keras.layers.Dropout(rate=dropout_rate)
        self.drop_out_2 = tf.keras.layers.Dropout(rate=dropout_rate)

    def call(self, inputs, mask, training):
        # --- self-attention sub-layer ---
        normed_inputs = self.add_and_norm_1(inputs)
        # query, keys and values are all the normalised inputs --> [batch, seq, dims]
        attended = self.causal_multi_head_attention(normed_inputs, normed_inputs, normed_inputs, mask)
        # dropout, then skip connection
        after_attention = self.drop_out_1(attended, training=training) + inputs

        # --- feed-forward sub-layer ---
        normed_attention = self.add_and_norm_2(after_attention)
        transformed = self.ffnn(normed_attention)  # --> [batch, seq, dims]
        # dropout, then skip connection
        return self.drop_out_2(transformed, training=training) + after_attention

In [22]:
# Let's test a single decoder layer on the sample embedding; the mask size (100)
# matches the sequence length of the sample batch.
decoder_layer = DecoderLayer(dims=512, num_heads=8, ffnn_units=2048, dropout_rate=0.2)
mask = causal_mask(size=100)
decoder_layer_output = decoder_layer(sample_emb, mask, training=True)


print('Shape of input tokens :',sample_inputs.shape)
print('Shape of input Embedding :',sample_emb.shape)
print('Decoder Layer Output ',decoder_layer_output.shape)


Shape of input tokens : (64, 100)
Shape of input Embedding : (64, 100, 512)
Decoder Layer Output  (64, 100, 512)


In [23]:
class Decoder(tf.keras.layers.Layer):
    """A stack of ``n_decoder_layers`` DecoderLayer blocks applied one after another."""

    def __init__(self, dims, num_heads, ffnn_units, dropout_rate, n_decoder_layers):
        super().__init__()

        stack = []
        for _ in range(n_decoder_layers):
            stack.append(DecoderLayer(dims, num_heads, ffnn_units, dropout_rate))
        self.decoder_layers = stack

    def call(self, vectors, mask, training):
        # feed the output of each block into the next one
        hidden = vectors
        for block in self.decoder_layers:
            hidden = block(hidden, mask, training)
        return hidden

In [27]:
# Let's test the full decoder stack (6 layers) on the sample embedding.
decoder = Decoder(dims=512, num_heads=8, ffnn_units=2048, dropout_rate=0.2, n_decoder_layers=6)
mask = causal_mask(size=100)
decoder_output = decoder(sample_emb, mask, training=True)


print('Shape of input tokens :',sample_inputs.shape)
print('Shape of input Embedding :',sample_emb.shape)
print('Decoder Layer Output ',decoder_output.shape)

Shape of input tokens : (64, 100)
Shape of input Embedding : (64, 100, 512)
Decoder Layer Output  (64, 100, 512)


$$Positional \: Embedding$$

$$ PE_{(position, 2i)} = \sin\left(\frac{position}{10000^{\frac{2i}{dims}}}\right)$$
$$ PE_{(position, 2i+1)} = \cos\left(\frac{position}{10000^{\frac{2i}{dims}}}\right)$$


In [24]:
def get_angle(pos, i, dims):
    """Angle fed to sin/cos for position ``pos`` and feature index ``i``.

    Computes ``pos / 10000 ** (2 * (i // 2) / dims)``; consecutive even/odd
    feature indices share the same frequency. Works on scalars and on
    broadcastable numpy arrays alike.
    """
    exponent = 2*(i//2) / dims
    return pos / 10000**exponent
    
def position_embedding(positions, dims):
    """Build the sinusoidal position table.

    Returns a float32 tensor of shape [1, positions, dims] in which even
    feature columns hold the sine and odd feature columns the cosine of the
    angles produced by ``get_angle``.
    """
    position_ids = np.arange(positions)[:, np.newaxis]   # [positions, 1]
    feature_ids = np.arange(dims)[np.newaxis, :]         # [1, dims]
    table = get_angle(position_ids, feature_ids, dims)   # [positions, dims]

    # even columns --> sin, odd columns --> cos (in place)
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # add a leading axis so the table broadcasts over the batch
    return tf.cast(table[np.newaxis, :], dtype=tf.float32)

In [25]:
# Length of the vocabulary in StringLookup Layer
tf_vocab_size = char_to_id.vocabulary_size()

# The embedding dimension
tf_dim = 256

# Hidden units of the feed-forward network, judging by the name
# (the model in this notebook has no RNN layers)
tf_ffnn_units = 1024

# Number of positions covered by the positional embedding
tf_position = 100

# Drop out rate
tf_dropout_rate = 0.2

# Attention heads
tf_num_heads = 8

# Decoder Layers
tf_n_decoder_layers = 6


The positional embeddings take values in the range -1.0 to +1.0, but the word embeddings are initialized with a mean of 0.0 and a standard deviation of `embedding_dim ** -0.5`, i.e. with much smaller values.

Without rescaling, the positional embeddings would overwhelm any signal coming from the word embeddings; the cell below compares the two value ranges.

In [28]:

# Compare the value range of the positional table with that of a freshly
# initialised embedding layer (sample_emb comes from the attention test above).
pe = position_embedding(100, 512)

print('Mean of PE',tf.math.reduce_mean(pe).numpy(),
       '\nStd of PE',tf.math.reduce_std(pe).numpy(),
       '\nMin value of PE', np.min(pe),
       '\nMax value of PE', np.max(pe))


print('\nMean of Embedding layer',tf.math.reduce_mean(sample_emb).numpy(),
       '\nStd of embedding layer',tf.math.reduce_std(sample_emb).numpy(),
       '\nMin of embedding layer',tf.math.reduce_min(sample_emb).numpy(),
       '\nMax of embedding layer',tf.math.reduce_max(sample_emb).numpy(),)

print('\nThe word embeddings are scaled by math.sqrt(embed_dim) (22.6 for 512, 32 for 1024)')



Mean of PE 0.3573661 
Std of PE 0.6101553 
Min value of PE -1.0 
Max value of PE 1.0

Mean of Embedding layer -9.220097e-05 
Std of embedding layer 0.028716143 
Min of embedding layer -0.049995292 
Max of embedding layer 0.04999883

The word embeddings are scaled by math.sqrt(embed_dim) (22.6 for 512, 32 for 1024)


In [31]:
class CharacterModel(tf.keras.Model):
    """Decoder-only character-level Transformer language model.

    Pipeline: token embedding (scaled by sqrt(dims)) + sinusoidal positional
    embedding -> stack of causal decoder layers -> LayerNorm -> Dropout ->
    Dense projection to ``vocab_size`` logits.

    Args:
        positions: longest sequence length the positional table and causal mask cover.
        dims: embedding / model width.
        vocab_size: number of token ids (size of the output logits).
        ffnn_units: hidden units of the feed-forward network in every decoder layer.
        dropout_rate: dropout rate used throughout the model.
        num_heads: attention heads per decoder layer.
        n_decoder_layers: number of stacked decoder layers.
    """

    def __init__(self, positions, dims, vocab_size, ffnn_units, dropout_rate, num_heads, n_decoder_layers):
        super(CharacterModel, self).__init__()
        self.dims = dims
        self.embeddings = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=dims)
        # fixed (non-trainable) tables, sliced to the actual sequence length in call()
        self.pe = position_embedding(positions, dims)
        self.causal_mask = causal_mask(size=positions)
        self.dropout = tf.keras.layers.Dropout(rate=dropout_rate)
        self.layer_norm = tf.keras.layers.LayerNormalization()
        self.dense = tf.keras.layers.Dense(units=vocab_size)

        self.decoder = Decoder(dims, num_heads, ffnn_units, dropout_rate, n_decoder_layers)

    def call(self, inputs, training=False):
        """Map token ids [batch, sequence] to logits [batch, sequence, vocab_size]."""

        # [batch, sequence] --> [batch, sequence, embedding_dims]
        embed = self.embeddings(inputs, training=training)
        # The positional table takes values in [-1, 1] while the token embeddings
        # start out much smaller, so the positional signal would dominate the sum.
        # To prevent that the token embeddings are scaled by sqrt(dims).
        embed *= tf.math.sqrt(tf.cast(self.dims, tf.float32))

        # static sequence length of this call (None when it is not known statically)
        sequence = embed.shape[1]

        # Always cut the positional table to the sequence length. Previously this was
        # only done when not training, so training on sequences shorter than
        # `positions` failed when the full-length table was added to the embeddings.
        # Slicing with None keeps the full table, as before.
        pe = self.pe[:, :sequence, :]

        # embeddings + positional embedding
        # [batch, seq, dims]
        meaning_and_order = embed + pe

        # Causal mask, cut to the same length
        mask = self.causal_mask[:sequence, :sequence]
        # decoder block
        context_meaning_order = self.decoder(meaning_and_order, mask, training=training)

        # layer norm
        vector = self.layer_norm(context_meaning_order)

        # dropout
        vector = self.dropout(vector, training=training)

        # logits  # un-normalized probabilities
        logits = self.dense(vector)

        return logits

    @tf.function
    def train_step(self, inputs):
        """One optimisation step on a batch ``(X, y)``; returns the current metric values."""
        X, y = inputs
        with tf.GradientTape() as tape:
            logits = self(inputs=X, training=True)
            loss =  self.compiled_loss(y, logits, regularization_losses=self.losses)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        self.compiled_metrics.update_state(y, logits)
        return {m.name: m.result() for m in self.metrics}


In [32]:
# Build an untrained model and run the sample batch through it as a shape check.
char_model = CharacterModel(
    positions=100, dims=512, vocab_size=id_to_char.vocabulary_size(), 
    ffnn_units=2048, dropout_rate=0.2, num_heads=8, n_decoder_layers=6)


sample_logits = char_model(sample_inputs)

In [33]:
# The logits carry one score per vocabulary entry for every input position.
print('Shape Input shape:', sample_inputs.shape)
print('Shape logits output:', sample_logits.shape)

Shape Input shape: (64, 100)
Shape logits output: (64, 100, 66)


In [34]:
# Sample one next character per sequence from the logits of the last time step.
# The model is untrained here, so the sampled characters are expected to look random.
sample_predictions = tf.random.categorical(logits=sample_logits[:,-1,:], num_samples=1)
print('Sample Prediction shape',sample_predictions.shape)
print(id_to_str(tf.squeeze(sample_predictions)).numpy())

Sample Prediction shape (64, 1)
b'?VWdLhW[UNK]?U;LxE;OJl,!xcrRJDhtd;CICIWjYRCw;O?j?CcC:WzCCqzxpAzZ;;t;'


#### Let's train the model

In [39]:

# A much smaller model than the one used for the shape check above; this is the one trained below.
char_model = CharacterModel(
    positions=100, dims=32, vocab_size=id_to_char.vocabulary_size(), 
    ffnn_units=32, dropout_rate=0.2, num_heads=4, n_decoder_layers=2)


# from_logits=True because the model outputs un-normalised scores (no softmax layer).
char_model.compile(
    optimizer='adam', 
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
    metrics=['accuracy'])

In [40]:
# cb = tf.keras.callbacks.ModelCheckpoint(filepath='./checkpoint/{epoch}_ckpt', save_weights_only=True)

# Split BEFORE shuffling. `dataset` is reshuffled on every iteration, so
# `dataset.take(22)` / `dataset.skip(22)` selected different sequences each epoch
# and the validation batches overlapped with the training data. Taking the split
# from the un-shuffled `text_ds` keeps the two sets disjoint and stable.
N_VAL_BATCHES = 22
n_val_sequences = N_VAL_BATCHES * BATCH_SIZE

tf_val_ds = (text_ds
             .take(n_val_sequences)
             .batch(BATCH_SIZE, drop_remainder=True)
             .prefetch(tf.data.AUTOTUNE))
tf_train_ds = (text_ds
               .skip(n_val_sequences)
               .shuffle(BUFFER_SIZE)
               .batch(BATCH_SIZE, drop_remainder=True)
               .prefetch(tf.data.AUTOTUNE))

char_model.fit(x=tf_train_ds, epochs=10, validation_data=tf_val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2017a53ef20>

In [41]:
class GenerateText(tf.Module):
    """Autoregressive sampler around a trained character model.

    Args:
        model: callable mapping token ids [batch, window] to logits [batch, window, vocab].
        char_to_id: lookup from characters to ids.
        id_to_char: lookup from ids to characters.
        id_to_str: callable joining ids along the last axis into strings.
        seed_window_size: maximum number of most recent tokens fed to the model.
    """

    def __init__(self, model, char_to_id, id_to_char, id_to_str, seed_window_size):
        super(GenerateText, self).__init__()
        self.model = model
        self.char_to_id = char_to_id
        self.id_to_char = id_to_char
        self.id_to_str = id_to_str
        # Positional embedding input dims = seq_len
        self.seed_window_size = seed_window_size

    def process_inputs(self, inputs):
        """Encode a batch of prompt strings into ids and keep the last ``seed_window_size`` columns."""
        chars = tf.strings.unicode_split(inputs, input_encoding='UTF-8')
        # NOTE(review): to_tensor() right-pads shorter prompts with 0, which appears to be
        # the [UNK] id of this lookup - confirm. In a batch of prompts with different
        # lengths the shorter ones would then be continued from padding rather than from
        # their own last character.
        padded = self.char_to_id(chars).to_tensor()
        # A single slice replaces the former row-by-row copy through a TensorArray.
        return padded[:, -self.seed_window_size:]

    def sample(self, logits):
        """Draw one token id per batch row from the logits of the last time step."""
        last = logits[:,-1,:]
        return tf.random.categorical(logits=last, num_samples=1)

    def __call__(self, inputs, n_iter=1000):
        """Generate ``n_iter`` characters per prompt; returns only the newly generated text."""
        # [batch of text] --> [batch, window]
        input_tokens = self.process_inputs(inputs)
        output_tokens = tf.TensorArray(dtype=tf.int64, dynamic_size=True, size=1)

        for i in tf.range(n_iter):
            # [batch, window] --> [batch, window, vocab_size]
            logits = self.model(inputs=input_tokens, training=False)
            # [batch, window, vocab_size] --> [batch, 1]
            sample = self.sample(logits)

            # write sampled tokens
            output_tokens = output_tokens.write(i, sample)

            # [batch, window] --> [batch, window+1], then keep only the most recent window
            input_tokens = tf.concat(values=[input_tokens, sample], axis=1)
            input_tokens = input_tokens[:, -self.seed_window_size:]

        # [n_iter, batch, 1] --> [n_iter, batch]
        output_tokens = output_tokens.stack()
        output_tokens = tf.squeeze(output_tokens, axis=-1)
        # [n_iter, batch] --> [batch, n_iter]
        output_tokens = tf.transpose(output_tokens, perm=(1,0))
        # text from ids
        outputs = self.id_to_str(output_tokens)

        return outputs



#### Let's test generating some text

In [42]:
# Sampler around the small model trained above; the window of 100 matches the `positions` it was built with.
generate_text = GenerateText(char_model, char_to_id, id_to_char, id_to_str, seed_window_size=100)

In [43]:
# Four prompts of different lengths, used further down to seed batched generation.
tf_sample_inputs = [
"""ROMEO:
Why, sir, what think you, sir?,""",

"""Caius Marcius is chief enemy to the people.""" ,

"""All:
No more talking on't; let it be done: away, away!"""  ,

"""
First Citizen:
Before we proceed any further, hear me speak.
"""
]

In [44]:
# Generate 300 characters continuing a single prompt.
generated_outputs = generate_text(tf.constant(['ROMEO:']), n_iter=300 )

In [45]:
# Show the continuation; the prompt itself is not part of the returned text.
print(generated_outputs.numpy()[0].decode('utf-8'))


Why will hes, gor! I not yous thous rar mom nou waker! sins,

Codelavels ayicpd
Pasthe ogyses brimff mencerast bat erind,
Whik to y Gre all cof gistis
Wie le''gh walds hom cooce grounche, nine
Mugh ushert shigharst ooond palle at. Coh!

ARNCEBOFAN fak blos, busnfors, the youl berm,
Ar ivotind iss d


## Loading a trained model

In [30]:
# Architecture for the checkpoint loaded in the next cell
# (presumably it has to match the configuration the weights were saved with - confirm).
char_model_new= CharacterModel(
    positions=100, dims=256, vocab_size=id_to_char.vocabulary_size(), 
    ffnn_units=64, dropout_rate=0.2, num_heads=4, n_decoder_layers=6)

In [46]:
# Restore previously trained weights from a local checkpoint.
char_model_new.load_weights('./tf_saved_model/char_model_d256_h4_n6')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1ba382a6ce0>

In [47]:
# Rebind the sampler to the restored model (replaces the sampler built around char_model).
generate_text = GenerateText(char_model_new, char_to_id, id_to_char, id_to_str, seed_window_size=100)


In [49]:
# Generate 500 characters for each of the four prompts in one batch.
generated_outputs = generate_text(tf_sample_inputs, n_iter=500)

In [51]:
# Continuation generated for the first prompt.
print(generated_outputs.numpy()[0].decode('utf-8'))

 small play them?

JULIET:
Why, I charge thee? my mother to thee:
Scut ase the land's lady earth, whose vigours.

KING EDWARD IV:

Nurse:
Nod the duke that his night, grant make so for Tybalt
Before I come, take my liege and clear.

SICINIUS:
The king crown's the pount of my hearts
Of him feelow win helm takes him trouble
Aid leave us fearful to means a deceived:
And he that I pleased hear me do and from your found,
Yet here we are in his land? Aumerled, come
If he would have mirs the weary more


In [52]:
# Continuation generated for the second prompt.
print(generated_outputs.numpy()[1].decode('utf-8'))

,
At wry bids and vow: and Lucio,
Whether I will in true black the Iam wretch.
O Lord of Trust
of little Master Daughter
That thus Earl of York, the duke's bosom of Critizens:
What you, I say, remember your father?
Therefore no doubt a dame uncle, you may shake
And yet with you, I'll be the royal prevented's
The twomen of my chances feasting of blesh.
What the timer of the days of Nathant's swift?
I'll withur love as for comper; the capefore ashius
obey followers goass of the second which should


In [53]:
# Continuation generated for the third prompt.
print(generated_outputs.numpy()[2].decode('utf-8'))

,
The ground: that's in the night of the music that's
else slipe; these comes of the curness is
Will not perceive it in the heavy tlouckle,
To mine executioners, the worms of such as kindred.
Fearing striket a fellow honour of the rude,
Go to they are and gotten Claudio; and therefore,
Incle, as thou hast not still be rack:
God's them heavy offlicts our patch'd part
With all patient she winks, and pleasesure you
Fair acted from me your wears welcome.
Now, beseech you may not, more walkon.

CLARE


In [54]:
# Continuation generated for the fourth prompt.
print(generated_outputs.numpy()[3].decode('utf-8'))


RICHARD:
Here wench in Brespare, quarter, by are the telling!

PARIS:
The son the duest of telling imour tree, and thou only
fair mercy before than of thy world. They can in home.

LUCIO:

GRUMIO:
Great Sanday, be a knee purpose: if fill not the bell,
nothing pat we have commended him lived and so
shuccess at should have ase done, have I,
Have 'pearls;' forget, can you love Just me to cause the trumpet!
And, after you have plain'd with her service,
Which you are, mother mouth: Eter of the night


# Torch

In [46]:
# Run on the GPU when one is available, otherwise on the CPU.
device = "cuda" if tr.cuda.is_available() else "cpu"

# hyperparameters
tr_batch_size = 64 # how many independent sequences will we process in parallel?
tr_block_size = 256 # what is the maximum context length for predictions?

# training-loop settings (names suggest: total steps, evaluation cadence, learning
# rate and number of batches averaged per loss estimate)
tr_max_iters = 5000
tr_eval_interval = 500
tr_learning_rate = 3e-4
tr_eval_iters = 200

tr_n_layer = 6
tr_drop_out_rate = 0.2

# NOTE: the torch FFNN below multiplies ffn_units by 4 for its hidden layer.
tr_ffn_units = 32
tr_n_embed = 384
tr_n_heads = 6
tr_head_dims = tr_n_embed // tr_n_heads  


# the heads must tile the embedding dimension exactly
assert(tr_head_dims * tr_n_heads == tr_n_embed)

#### Create a mapping from characters to integers and vice-versa

In [40]:
# Plain-dict mappings between characters and integer ids (no OOV entry here).
tr_vocab_size = len(all_chars)

c_to_i = {char: index for index, char in enumerate(all_chars)}
i_to_c = dict(enumerate(all_chars))

# encode: string -> list of ids
tr_str_encode = lambda s : [c_to_i[c] for c in s ]
# decode: iterable of ids -> string
tr_str_decode = lambda id : ''.join([i_to_c[i] for i in id])

#### Encode the text

In [44]:
# Encode the whole corpus as one 1-D tensor of int64 character ids.
tr_data = tr.tensor(tr_str_encode(text_file), dtype=tr.long)

#### Let's now split up the data into train and validation sets

In [42]:
# first 90% will be train, rest val (contiguous split, no shuffling)
n = int(0.9*len(tr_data)) 
tr_train_data = tr_data[:n]
tr_val_data = tr_data[n:]

In [30]:
# Seed torch's global RNG; get_batch below draws its random offsets from it.
tr.manual_seed(1337)
# how many independent sequences will we process in parallel?
# tr_batch_size = 32 
# what is the maximum context length for predictions?
# tr_block_size = 101 

def get_batch(split='train', batch_size=tr_batch_size, block_size=tr_block_size):
    """Sample a random batch of (inputs, targets) windows from the training or validation data.

    ``split == 'train'`` reads from ``tr_train_data``; any other value reads from
    ``tr_val_data``. Both returned tensors have shape [batch_size, block_size] and
    live on ``device``; the targets are the inputs shifted one position ahead.
    """
    source = tr_train_data if split == 'train' else tr_val_data
    # random window starts; the upper bound leaves room for the shifted target window
    starts = tr.randint(len(source) - block_size, (batch_size,))
    inputs = tr.stack([source[start:start+block_size] for start in starts])
    targets = tr.stack([source[start+1:start+block_size+1] for start in starts])

    # move the batch to the device the model runs on
    return inputs.to(device), targets.to(device)

@tr.no_grad()
def estimate_loss(model):
    """Average the loss over ``tr_eval_iters`` random batches for each data split.

    Returns a dict with the keys ``'train'`` and ``'eval'`` ('eval' is served
    from the validation data by ``get_batch``). The model is switched to eval
    mode while measuring and afterwards put back into the mode it was in
    before the call, also when an exception interrupts the evaluation.
    Previously it was unconditionally left in training mode.
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'eval']:
            losses = tr.zeros(tr_eval_iters)

            for k in range(tr_eval_iters):
                X, Y = get_batch(split=split, batch_size=tr_batch_size, block_size=tr_block_size)
                _, loss = model(inputs=X, targets=Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train(was_training)
    return out


In [31]:
# Draw a tiny batch (4 sequences of 8 ids) to eyeball inputs against the shifted targets.
xb, yb = get_batch('train', batch_size=4, block_size=8)
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

# for b in range(batch_size): # batch dimension
#     for t in range(block_size): # time dimension
#         context = xb[b, :t+1]
#         target = yb[b,t]
#         print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----


In [149]:
class Head(tr.nn.Module):
    """One head of causal self-attention.

    Args:
        embedding_dim: size of the incoming feature axis.
        head_dims: size of this head's key/query/value projections (and of its output).
        block_size: longest sequence length the causal mask covers.
        drop_out_rate: dropout probability applied to the attention weights.
    """

    def __init__(self, embedding_dim, head_dims, block_size, drop_out_rate):
        super(Head, self).__init__()
        self.keys = tr.nn.Linear(in_features=embedding_dim, out_features=head_dims, bias=False)
        self.query = tr.nn.Linear(in_features=embedding_dim, out_features=head_dims, bias=False)
        self.values = tr.nn.Linear(in_features=embedding_dim, out_features=head_dims, bias=False)
        # The lower-triangular mask is registered as a buffer: it is part of the module's
        # state (moves with .to(device), is saved with the module) but is not a parameter.
        self.register_buffer(name='lt_matrix', tensor=tr.tril(tr.ones(block_size, block_size)))
        self.dropout = tr.nn.Dropout(drop_out_rate)

    def self_attention(self, key, query, values):
        """Masked, scaled dot-product attention over [batch, time, head_dims] tensors."""
        B, T, C = key.shape
        # [batch, time, head_dims] @ [batch, head_dims, time] --> [batch, time, time]
        # Row t holds the scores of position t against every position; note that the
        # rows come from `key` and the columns from `query`, i.e. the two projections
        # play each other's usual role. Both are learned, so this is only a naming matter.
        dot_product = key @ tr.permute(query, dims=[0, 2, 1])
        mask = self.lt_matrix[:T, :T] # causality constraints
        dot_product = tr.masked_fill(input=dot_product, mask=mask==0, value=-tr.inf)
        scaled_dot_product = dot_product * C**-0.5
        weights = tr.nn.functional.softmax(scaled_dot_product, dim=-1)
        # Dropout on the attention weights: the layer was created in __init__ but never
        # applied before. It is a no-op in eval mode.
        weights = self.dropout(weights)
        # [batch, time, time] @ [batch, time, head_dims] -- > [batch, time, head_dims]
        attention = weights @ values
        return attention

    def forward(self, inputs):
        """[batch, time, embedding_dims] --> [batch, time, head_dims]."""
        k = self.keys(inputs)
        q = self.query(inputs)
        v = self.values(inputs)

        attention = self.self_attention(k, q, v) # -- > [batch, time, head_dims]

        return attention


In [150]:
class MultiHeadAttention(tr.nn.Module):
    """Multiple heads of self attention in parallel.

    NOTE: this rebinds the name ``MultiHeadAttention`` that the TensorFlow part of
    the notebook uses for its Keras layer; TensorFlow cells re-run after this one
    will pick up this torch class instead.

    Args:
        embedding_dim: size of the input and output feature axis.
        num_heads: number of parallel ``Head`` modules.
        head_dims: output size of every head.
        block_size: longest sequence length supported by the heads' causal masks.
        drop_out_rate: dropout probability (passed to the heads and applied to the output).
    """
    def __init__(self, embedding_dim, num_heads, head_dims, block_size, drop_out_rate):
        super(MultiHeadAttention, self).__init__()
        self.multi_heads = tr.nn.ModuleList(
            [Head(embedding_dim, head_dims, block_size, drop_out_rate) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width is
        # num_heads*head_dims. This equals embedding_dim in the usual configuration,
        # but sizing it from the heads keeps the module valid when it does not.
        self.linear = tr.nn.Linear(in_features=num_heads*head_dims, out_features=embedding_dim)
        self.dropout = tr.nn.Dropout(drop_out_rate)

    def forward(self, inputs):
        # eg: a list containing 4 heads of 8 dims self attention
        # [batch, time, embedding_dim] --> [ .. [batch, time, head_dims] ..  ]
        mha = [head(inputs) for head in self.multi_heads]
        # concatenate the head outputs along the feature axis
        out = tr.cat(tensors=mha, dim=-1)  # --> [batch, time, num_heads*head_dims]
        out = self.linear(out)             # --> [batch, time, embedding_dim]
        return self.dropout(out)

In [151]:
class FFNN(tr.nn.Module):
    """Feed-forward block: Linear -> ReLU -> Linear -> Dropout.

    The hidden layer has ``4*ffn_units`` features; input and output both have
    ``n_embed`` features.

    NOTE: this rebinds the name ``FFNN`` that the TensorFlow part of the notebook
    uses for its Keras layer.
    """

    def __init__(self, ffn_units, n_embed, drop_out_rate):
        super().__init__()

        hidden_features = 4*ffn_units
        stages = [
            tr.nn.Linear(in_features=n_embed, out_features=hidden_features),
            tr.nn.ReLU(),
            tr.nn.Linear(in_features=hidden_features, out_features=n_embed),
            tr.nn.Dropout(drop_out_rate),
        ]
        self.ffn = tr.nn.Sequential(*stages)

    def forward(self, inputs):
        return self.ffn(inputs)

In [152]:
class Block(tr.nn.Module):
    """Pre-norm Transformer block: multi-head self-attention followed by a feed-forward net.

    Args:
        embedding_dim: size of the feature axis (input and output).
        num_heads: number of attention heads.
        head_dims: feature size of each head; ``num_heads * head_dims`` must equal ``embedding_dim``.
        block_size: longest supported sequence length.
        ffn_units: passed to ``FFNN`` to size its hidden layer.
        drop_out_rate: dropout probability used inside the sub-layers.

    Raises:
        ValueError: if the heads do not tile the embedding dimension exactly.
    """
    def __init__(self, embedding_dim, num_heads, head_dims, block_size, ffn_units, drop_out_rate) -> None:
        super(Block, self).__init__()
        # Check this block's own configuration once, at construction time. The former
        # assert in forward() looked at the notebook-level tr_* globals, which say
        # nothing about the arguments this particular block was built with.
        if num_heads * head_dims != embedding_dim:
            raise ValueError(
                f'num_heads * head_dims ({num_heads} * {head_dims}) must equal '
                f'embedding_dim ({embedding_dim})')
        self.mha = MultiHeadAttention(embedding_dim, num_heads, head_dims, block_size, drop_out_rate)
        self.ffnn = FFNN(ffn_units, embedding_dim, drop_out_rate)
        self.layer_norm_1 = tr.nn.LayerNorm(normalized_shape=embedding_dim)
        self.layer_norm_2 = tr.nn.LayerNorm(normalized_shape=embedding_dim)

    def forward(self, inputs):
        """[batch, time, embedding_dim] --> [batch, time, embedding_dim]."""
        # multi head attention block --> [batch, time, embedding_dim]
        ln1 = self.layer_norm_1(inputs) # pre-norm # Layer normalization
        x = inputs + self.mha(ln1)  # skip connection + Multi-head-attention

        # Feed forward neural network
        ln2 = self.layer_norm_2(x) # pre-norm # Layer normalization
        x = x + self.ffnn(ln2) # skip connection + FFNN --> [batch, time, embedding_dims]

        return x


In [92]:
# Sanity check on the notebook-level hyperparameters: the heads must tile the embedding dimension exactly.
assert(tr_n_heads*tr_head_dims == tr_n_embed)

In [153]:
class BigramLanguageModel(tr.nn.Module):
    """Token-level language model built from stacked ``Block`` modules.

    Token embeddings and learned position embeddings are summed, passed through
    ``n_layer`` blocks, layer-normalised and projected to vocabulary logits.
    (The class keeps its historical name even though it is no longer a bigram model.)

    Args:
        vocab_size: number of distinct token ids.
        block_size: maximum context length; also the number of rows in the position table.
        embedding_dim: width of the token and position embeddings.
        num_heads, head_dims, ffn_units, drop_out_rate: forwarded unchanged to every ``Block``.
        n_layer: number of stacked ``Block`` modules.
    """

    def __init__(self, vocab_size, block_size, embedding_dim, num_heads, head_dims, ffn_units, n_layer, drop_out_rate):
        super().__init__()
        self.block_size = block_size

        self.token_embedding_table = tr.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.position_embedding_table = tr.nn.Embedding(num_embeddings=block_size, embedding_dim=embedding_dim)
        self.linear = tr.nn.Linear(in_features=embedding_dim, out_features=vocab_size)
        self.layer_norm = tr.nn.LayerNorm(normalized_shape=embedding_dim)

        self.mha_ffnn_block = tr.nn.Sequential(
            *[Block(embedding_dim, num_heads, head_dims, block_size, ffn_units, drop_out_rate) for _ in range(n_layer)])

    def forward(self, inputs, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            inputs: integer tensor of token ids, shape [batch, time]; ``time`` must not
                exceed ``block_size`` because positions index the position table.
            targets: optional integer tensor of token ids with the same shape as ``inputs``.

        Returns:
            ``(logits, loss)``. Without targets, logits are [batch, time, vocab_size] and
            loss is ``None``. With targets, logits are flattened to [batch*time, vocab_size]
            and loss is the cross-entropy over all positions.
        """
        (B, T) = inputs.shape
        # [T] --> [T, embedding_dim]
        # Positions are created on the device of the inputs instead of a notebook-level
        # global, so the model works wherever its inputs live.
        pe = self.position_embedding_table(tr.arange(T, device=inputs.device))
        # [batch, time] --> [batch, time, embedding_dim]
        embed = self.token_embedding_table(inputs)

        # [batch, time, embedding_dim] + [time, embedding_dim] --> [batch, time, embedding_dim]
        x = embed + pe

        # stacked attention + feed-forward blocks --> [batch, time, embedding_dim]
        x = self.mha_ffnn_block(x)

        # final layer normalization
        x = self.layer_norm(x)

        # [batch, time, embedding_dim] --> [batch, time, vocab_size]
        logits = self.linear(x)

        if targets is None:
            loss = None
        else:
            # flatten batch and time so cross_entropy sees one row per position
            (batch, time, vocab) = logits.shape
            logits = logits.view(batch * time, vocab)
            # reshape (not view) so non-contiguous targets are accepted as well
            targets = targets.reshape(-1)

            loss = tr.nn.functional.cross_entropy(input=logits, target=targets)

        return logits, loss

    @tr.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` additional token ids after the context ``idx``.

        Args:
            idx: integer tensor [batch, time] holding the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            Integer tensor [batch, time + max_new_tokens].

        Sampling runs without gradient tracking and in eval mode (dropout off);
        the module's previous train/eval mode is restored afterwards.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop to the last block_size tokens: the position table has only block_size rows
                idx_in = idx[:, -self.block_size:]
                # get the predictions
                (logits, _) = self(idx_in)
                # keep only the last time step --> (B, C)
                logits = logits[:, -1, :]
                # turn logits into probabilities --> (B, C)
                probs = tr.nn.functional.softmax(logits, dim=-1)
                # sample the next token --> (B, 1)
                idx_next = tr.multinomial(probs, num_samples=1)
                # append the sampled token to the running sequence --> (B, T+1)
                idx = tr.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

    

# Build the model from the notebook hyperparameters and move its parameters to the target device
tr_model = BigramLanguageModel(
    vocab_size=tr_vocab_size,
    block_size=tr_block_size,
    embedding_dim=tr_n_embed,
    num_heads=tr_n_heads,
    head_dims=tr_head_dims,
    ffn_units=tr_ffn_units,
    n_layer=tr_n_layer,
    drop_out_rate=tr_drop_out_rate,
).to(device)

# Smoke test: a single forward pass on the batch (xb, yb) prepared in an earlier cell
(tr_sample_logits, tr_sample_loss) = tr_model(xb, yb)

print('Shape of input sequence', xb.shape)
print('Shape of output logits', tr_sample_logits.shape)
print('Output loss', tr_sample_loss)

Shape of input sequence torch.Size([32, 8])
Shape of output logits torch.Size([256, 65])
Output loss tensor(4.4517, grad_fn=<NllLossBackward0>)


#### Let's test generating some text

In [106]:
# make sure to add the context to the device
context = tr.zeros(size=(1,1),dtype=tr.long, device=device)
tr_model.generate(context, max_new_tokens=100)

tensor([[ 0, 22, 41, 47, 64, 57, 45, 28,  0, 44, 17, 12, 51, 54, 57, 60, 17, 36,
         47, 56, 17, 13, 31, 57, 26, 37,  0, 52, 31, 16, 38, 35, 30, 44, 62, 50,
         32, 26, 34, 11, 22, 50, 53,  4, 12, 63,  2, 16, 14, 12, 15, 62, 62, 52,
         39, 22, 31, 36, 41, 43, 37, 57, 40, 44, 22, 20, 53, 33, 24,  5, 33, 12,
         27, 10, 36,  8, 47, 36, 46, 26, 34, 52, 49, 28, 47, 53, 30, 63, 48, 18,
         45, 39, 24, 27, 40, 32,  3, 41, 14, 51, 14]])

In [61]:
# The start token must be created on the model's device; a default (CPU) tensor fails once the model is elsewhere
tr_model.generate(tr.zeros(size=(1,1), dtype=tr.long, device=device), max_new_tokens=100).squeeze().shape

torch.Size([101])

In [102]:
# Generate 100 tokens from a single zero token (created on the model's device) and decode them to text
print(tr_str_decode(tr_model.generate(tr.zeros(size=(1,1), dtype=tr.long, device=device), max_new_tokens=100).squeeze().tolist()))


IN Nock ter cill rine kinsoutsme wiy, os.

VIE:
O
Tosned;
Coond;
I bone the fora Gomase.

BOMEIZIUS:


#### Create a PyTorch optimizer

In [125]:
# AdamW over every parameter of the model, using the notebook-level learning rate
optimizer = tr.optim.AdamW(tr_model.parameters(), lr=tr_learning_rate)

#### Model training

In [126]:

for i in trange(tr_max_iters, ncols=100, colour='green', desc='Model training'):

    # every `tr_eval_iters` steps, report the losses returned by estimate_loss
    if not i % tr_eval_iters:
        losses = estimate_loss(model=tr_model)
        print(f'Train loss { losses["train"] :0.2f} Validation loss {losses["eval"] :0.2f}')

    # clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # draw a fresh training batch and run the forward pass
    (xb, yb) = get_batch(split='train', batch_size=tr_batch_size, block_size=tr_block_size)
    (logits, loss) = tr_model(inputs=xb, targets=yb)

    # back-propagate, then let the optimizer update the parameters
    loss.backward()
    optimizer.step()

Model training:   0%|[32m                                            [0m| 1/5000 [00:04<5:50:07,  4.20s/it][0m

Train loss 4.51 Validation loss 4.50


Model training:   4%|[32m█▊                                          [0m| 203/5000 [00:14<24:03,  3.32it/s][0m

Train loss 2.60 Validation loss 2.60


Model training:   8%|[32m███▌                                        [0m| 403/5000 [00:24<25:41,  2.98it/s][0m

Train loss 2.45 Validation loss 2.44


Model training:  12%|[32m█████▎                                      [0m| 605/5000 [00:34<17:59,  4.07it/s][0m

Train loss 2.34 Validation loss 2.36


Model training:  16%|[32m███████                                     [0m| 803/5000 [00:44<21:13,  3.30it/s][0m

Train loss 2.30 Validation loss 2.31


Model training:  20%|[32m████████▋                                  [0m| 1005/5000 [00:54<17:14,  3.86it/s][0m

Train loss 2.27 Validation loss 2.28


Model training:  24%|[32m██████████▎                                [0m| 1203/5000 [01:04<19:35,  3.23it/s][0m

Train loss 2.22 Validation loss 2.24


Model training:  28%|[32m████████████                               [0m| 1402/5000 [01:14<17:52,  3.35it/s][0m

Train loss 2.21 Validation loss 2.24


Model training:  32%|[32m█████████████▊                             [0m| 1605/5000 [01:24<12:33,  4.50it/s][0m

Train loss 2.18 Validation loss 2.21


Model training:  36%|[32m███████████████▌                           [0m| 1805/5000 [01:34<12:10,  4.37it/s][0m

Train loss 2.15 Validation loss 2.19


Model training:  40%|[32m█████████████████▏                         [0m| 2005/5000 [01:44<11:19,  4.41it/s][0m

Train loss 2.13 Validation loss 2.18


Model training:  44%|[32m██████████████████▉                        [0m| 2203/5000 [01:53<13:39,  3.41it/s][0m

Train loss 2.12 Validation loss 2.15


Model training:  48%|[32m████████████████████▋                      [0m| 2405/5000 [02:03<09:20,  4.63it/s][0m

Train loss 2.11 Validation loss 2.16


Model training:  52%|[32m██████████████████████▍                    [0m| 2605/5000 [02:13<08:45,  4.56it/s][0m

Train loss 2.10 Validation loss 2.15


Model training:  56%|[32m████████████████████████                   [0m| 2803/5000 [02:23<10:36,  3.45it/s][0m

Train loss 2.07 Validation loss 2.14


Model training:  60%|[32m█████████████████████████▊                 [0m| 3004/5000 [02:32<09:43,  3.42it/s][0m

Train loss 2.07 Validation loss 2.13


Model training:  64%|[32m███████████████████████████▌               [0m| 3203/5000 [02:42<08:38,  3.47it/s][0m

Train loss 2.06 Validation loss 2.11


Model training:  68%|[32m█████████████████████████████▎             [0m| 3405/5000 [02:52<05:42,  4.66it/s][0m

Train loss 2.06 Validation loss 2.12


Model training:  72%|[32m██████████████████████████████▉            [0m| 3604/5000 [03:01<06:58,  3.33it/s][0m

Train loss 2.05 Validation loss 2.12


Model training:  76%|[32m████████████████████████████████▋          [0m| 3806/5000 [03:11<04:16,  4.66it/s][0m

Train loss 2.04 Validation loss 2.11


Model training:  80%|[32m██████████████████████████████████▍        [0m| 4006/5000 [03:21<03:29,  4.75it/s][0m

Train loss 2.03 Validation loss 2.09


Model training:  84%|[32m████████████████████████████████████▏      [0m| 4204/5000 [03:30<03:54,  3.39it/s][0m

Train loss 2.02 Validation loss 2.12


Model training:  88%|[32m█████████████████████████████████████▉     [0m| 4406/5000 [03:40<02:06,  4.70it/s][0m

Train loss 2.00 Validation loss 2.10


Model training:  92%|[32m███████████████████████████████████████▌   [0m| 4603/5000 [03:50<02:06,  3.14it/s][0m

Train loss 2.01 Validation loss 2.11


Model training:  96%|[32m█████████████████████████████████████████▎ [0m| 4802/5000 [03:59<00:58,  3.38it/s][0m

Train loss 2.01 Validation loss 2.07


Model training: 100%|[32m███████████████████████████████████████████[0m| 5000/5000 [04:05<00:00, 20.35it/s][0m


In [127]:
# Loss of the last training mini-batch only (`loss` holds whatever the final loop iteration assigned)
loss.item()

2.069420099258423

In [128]:
# Prompt the trained model with a full block of zero tokens, created on the model's device, and decode 200 new tokens
print(tr_str_decode(tr_model.generate(tr.zeros(size=(1,tr_block_size), dtype=tr.long, device=device), max_new_tokens=200).squeeze().tolist()))









TRLA:
Your my conver have, and lije, I me in in it hay,
Und dumnepes Weoul the:
Waled, and of dear here hath beer uls, you my vee meme's me rither sitely batce,
I rale house of itue are crithioer, tul
