<a href="https://colab.research.google.com/github/Ahtesham519/Genrative_Deep_learning_v2_2023/blob/main/GPT_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Building my own GPT model on the wine reviews dataset

In [None]:
%load_ext autoreload
%autoreload 2
import numpy as np
import json
import re
import string
from IPython.display import display , HTML

import tensorflow as tf
from tensorflow.keras import layers, models , losses , callbacks

#0. Parameters

In [None]:
# Hyperparameters and run settings shared by the cells below.
VOCAB_SIZE = 10000  # passed as max_tokens to the TextVectorization layer
MAX_LEN = 80  # the vectorizer emits MAX_LEN + 1 tokens; the one-token input/target shift leaves MAX_LEN each
EMBEDDING_DIM = 256  # presumably the embedding width (embed_dim) -- not used in the visible cells, confirm
KEY_DIM = 256  # presumably key_dim for the attention heads -- confirm against the model-building cell
N_HEADS = 2  # presumably num_heads for TransformerBlock -- confirm against the model-building cell
FEED_FORWARD_DIM = 256  # presumably ff_dim for TransformerBlock -- confirm against the model-building cell
VALIDATION_SPLIT = 0.2  # NOTE(review): not referenced in the visible cells
SEED = 42  # NOTE(review): not referenced in the visible cells
LOAD_MODEL = False  # NOTE(review): not referenced in the visible cells; presumably toggles loading a saved model
BATCH_SIZE = 32  # batch size applied when building text_ds
EPOCHS = 5  # presumably the number of training epochs -- the training cell is not visible here


#1. Load the Data

In [None]:
#Load the full dataset
# Decode explicitly as UTF-8 so the load does not depend on the platform's
# default locale encoding (which is not UTF-8 on every system).
with open(
    "./app/data/wine-reviews/winemag-data-130k-v2.json", encoding="utf-8"
) as json_data:
  wine_data = json.load(json_data)

In [None]:
wine_data[10]

In [None]:
#Keep only complete records and flatten each one into a single review string
# A record is used only when country, province, variety and description are
# all present (not None); the pieces are then glued together in that order.
filtered_data = [
    "".join(
        (
            "wine review :",
            review["country"],
            ": ",
            review["province"],
            " : ",
            review["variety"],
            ":",
            review["description"],
        )
    )
    for review in wine_data
    if all(
        review[field] is not None
        for field in ("country", "province", "variety", "description")
    )
]

In [None]:
#Count the wine reviews that survived filtering
n_wines = len(filtered_data)
# The dataset holds wine reviews, not recipes, so report it as such.
print(f"{n_wines} wine reviews loaded")

In [None]:
example = filtered_data[25]
print(example)

#2. Tokenize the data

In [None]:
#Pad the punctuation, to treat each mark as a separate 'word'
def pad_punctuation(s):
  """Surround every punctuation mark and newline in `s` with spaces.

  Args:
    s: The text to pad.

  Returns:
    The text with a space on each side of every character from
    `string.punctuation` (and of every newline), with runs of spaces
    collapsed to a single space. Leading/trailing spaces are not stripped.
  """
  # The characters must sit inside a [...] class and be escaped; otherwise
  # they are parsed as regex syntax and the pattern fails to compile.
  # The replacement adds the spaces that make each mark its own token.
  s = re.sub(f"([{re.escape(string.punctuation)}\n])", r" \1 ", s)
  # Collapse the doubled spaces introduced by the padding above.
  s = re.sub(" +", " ", s)
  return s

text_data = [pad_punctuation(x) for x in filtered_data]

In [None]:
#Display an example of a wine review
example_data = text_data[25]
example_data

In [None]:
#Wrap the review strings in a tf.data pipeline
# Built step by step: slice into elements, group into batches of BATCH_SIZE,
# then shuffle with a buffer of 1000 (applied after batching, so whole
# batches are what gets shuffled).
text_ds = tf.data.Dataset.from_tensor_slices(text_data)
text_ds = text_ds.batch(BATCH_SIZE)
text_ds = text_ds.shuffle(1000)

In [None]:
#Set up the text-to-integer vectorization layer
# Sequences come out one token longer than MAX_LEN because prepare_inputs
# later drops one token from each end to form the input/target pair.
vectorize_layer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="lower",
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

In [None]:
#Adapt the layer to the training set
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

In [None]:
#Display some token:word mappings
for i , word in enumerate(vocab[:10]):
  print(f"{i} : {word}")

In [None]:
#Display the same example converted to ints
# Use one spelling for the variable: the original assigned `example_tokenised`
# but printed `example_tokenized`, which raised a NameError.
example_tokenized = vectorize_layer(example_data)
print(example_tokenized.numpy())


#3. Create the training set

In [None]:
#Build (input, target) pairs: the target is the same text shifted by one token

def prepare_inputs(text):
  """Vectorize a batch of strings and split it into next-token training pairs.

  Args:
    text: A batch of raw strings (a trailing axis is added before the
      vectorization layer is applied).

  Returns:
    A tuple `(x, y)` where `x` is every token except the last and `y` is
    every token except the first, so `y` is `x` shifted by one position.
  """
  column = tf.expand_dims(text, -1)
  token_ids = vectorize_layer(column)
  inputs, targets = token_ids[:, :-1], token_ids[:, 1:]
  return inputs, targets

train_ds = text_ds.map(prepare_inputs)

In [None]:
example_input_output = train_ds.take(1).get_single_element()

In [None]:
#example Input
example_input_output[0][0]

In [None]:
#Example output (shifted by one token)
example_input_output[1][0]

#5. Create the causal attention mask function

In [None]:
def casual_attention_mask(batch_size, n_dest, n_src, dtype):
  """Build a causal (look-back only) attention mask, repeated per batch item.

  Args:
    batch_size: Number of copies of the mask to stack along the first axis.
    n_dest: Number of destination (query) positions.
    n_src: Number of source (key) positions.
    dtype: dtype the boolean mask is cast to.

  Returns:
    A tensor of shape `(batch_size, n_dest, n_src)` whose entry `[b, i, j]`
    is truthy when `i >= j - n_src + n_dest`; for `n_dest == n_src` that is
    the lower triangle, i.e. position `i` may only see positions `j <= i`.
  """
  dest_idx = tf.range(n_dest)[:, None]
  src_idx = tf.range(n_src)
  visible = dest_idx >= src_idx - n_src + n_dest
  single_mask = tf.reshape(tf.cast(visible, dtype), [1, n_dest, n_src])
  # Tile counts: `batch_size` copies along axis 0, one along the other two.
  tile_counts = tf.concat(
      [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
  )
  return tf.tile(single_mask, tile_counts)

np.transpose(casual_attention_mask(1, 10, 10, dtype = tf.int32)[0])

#6. Create a Transformer Block layer

In [None]:
class TransformerBlock(layers.Layer):
  """Transformer block: masked self-attention followed by a feed-forward net.

  Each sub-layer is followed by dropout, a residual connection and layer
  normalization. Self-attention uses a causal mask so a position cannot
  attend to later positions.

  Args:
    num_heads: Number of attention heads.
    key_dim: Size of each attention head for query and key.
    embed_dim: Output width of the attention layer and of the block; must
      match the last dimension of the inputs because of the residual adds.
    ff_dim: Width of the hidden feed-forward layer.
    dropout_rate: Dropout rate applied after attention and after the
      feed-forward net.
    **kwargs: Standard Keras layer arguments (e.g. `name`, `trainable`,
      `dtype`), forwarded to the base class so that a config produced by
      `get_config()` can be passed back to the constructor.
  """

  def __init__(self, num_heads, key_dim, embed_dim, ff_dim, dropout_rate=0.1,
               **kwargs):
    super().__init__(**kwargs)
    self.num_heads = num_heads
    self.key_dim = key_dim
    self.embed_dim = embed_dim
    self.ff_dim = ff_dim
    self.dropout_rate = dropout_rate
    self.attn = layers.MultiHeadAttention(
        num_heads, key_dim, output_shape=embed_dim
    )
    self.dropout_1 = layers.Dropout(self.dropout_rate)
    self.ln_1 = layers.LayerNormalization(epsilon=1e-6)
    self.ffn_1 = layers.Dense(self.ff_dim, activation="relu")
    self.ffn_2 = layers.Dense(self.embed_dim)
    self.dropout_2 = layers.Dropout(self.dropout_rate)
    self.ln_2 = layers.LayerNormalization(epsilon=1e-6)

  def call(self, inputs):
    """Apply the block to `inputs`.

    Args:
      inputs: Tensor whose first two axes are batch and sequence position
        (presumably `(batch, seq_len, embed_dim)` -- the last axis must be
        `embed_dim` for the residual additions to be valid).

    Returns:
      A tuple `(output, attention_scores)`: the block output and the
      attention scores returned by the attention layer.
    """
    input_shape = tf.shape(inputs)
    batch_size = input_shape[0]
    seq_len = input_shape[1]
    # Square mask over the sequence: each position sees itself and earlier ones.
    casual_mask = casual_attention_mask(
        batch_size, seq_len, seq_len, tf.bool
    )
    # Self-attention: the same tensor is used as query and value.
    attention_output, attention_scores = self.attn(
        inputs,
        inputs,
        attention_mask=casual_mask,
        return_attention_scores=True,
    )
    attention_output = self.dropout_1(attention_output)
    out1 = self.ln_1(inputs + attention_output)
    ffn_1 = self.ffn_1(out1)
    ffn_2 = self.ffn_2(ffn_1)
    ffn_output = self.dropout_2(ffn_2)
    return (self.ln_2(out1 + ffn_output), attention_scores)

  def get_config(self):
    """Return the layer config, including this block's constructor arguments."""
    config = super().get_config()
    config.update(
        {
            "key_dim": self.key_dim,
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "dropout_rate": self.dropout_rate,
        }
    )
    # This return must be inside the method: at class level it is a
    # SyntaxError and the config would never reach the caller.
    return config