# 😊This is an example notebook for GPT-2. If you feel that the structure or content of this notebook is not clear enough, you can go to tensorflow/examples/models to download the code of the GPT-2 model and run it on your computer.

# Download the original repository code and training data

In [None]:
!git clone https://github.com/starxsky/gpt-2

# GPT-2 Example

In [1]:
# Pinned dependencies for this notebook, installed through the shell.
!pip install setuptools==41.0.1
!pip install ftfy==5.6
!pip install tqdm==4.32.1
!pip install Click==7.0
!pip install sentencepiece==0.1.83
!pip install tensorflow==2.7.0
!pip install numpy==1.16.4

Collecting setuptools==41.0.1
  Downloading setuptools-41.0.1-py2.py3-none-any.whl (575 kB)
[K     |████████████████████████████████| 575 kB 5.2 MB/s 
[?25hInstalling collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 57.4.0
    Uninstalling setuptools-57.4.0:
      Successfully uninstalled setuptools-57.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.8.0 requires tf-estimator-nightly==2.8.0.dev2021122109, which is not installed.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Successfully installed setuptools-41.0.1


Collecting ftfy==5.6
  Downloading ftfy-5.6.tar.gz (58 kB)
[?25l[K     |█████▋                          | 10 kB 18.3 MB/s eta 0:00:01[K     |███████████▏                    | 20 kB 14.3 MB/s eta 0:00:01[K     |████████████████▊               | 30 kB 4.5 MB/s eta 0:00:01[K     |██████████████████████▎         | 40 kB 5.6 MB/s eta 0:00:01[K     |███████████████████████████▉    | 51 kB 5.4 MB/s eta 0:00:01[K     |████████████████████████████████| 58 kB 2.8 MB/s 
Building wheels for collected packages: ftfy
  Building wheel for ftfy (setup.py) ... [?25l[?25hdone
  Created wheel for ftfy: filename=ftfy-5.6-py3-none-any.whl size=44553 sha256=0cc2a2f7a37982917561b314d8db6d91701cdb652971d9aa24c598f70d2c096c
  Stored in directory: /root/.cache/pip/wheels/61/d0/cf/87de309cf05388523a6416562904c9dc556d98057c706cbc6e
Successfully built ftfy
Installing collected packages: ftfy
Successfully installed ftfy-5.6
Collecting tqdm==4.32.1
  Downloading tqdm-4.32.1-py2.py3-none-any.whl (49 kB

# Small plugin for writing GPT

In [2]:
import tensorflow as tf
import numpy as np
import os 
import math
import sys


### INPUT CONFIG

In [3]:

def shape_as_list_2(x):
    """Return the dimensions reported by ``tf.shape(x)`` as a list of Python ints."""
    dims = tf.shape(x)
    return list(map(int, dims))


def gelu(x):
    """Tanh-based approximation of the GELU activation.

    Evaluates ``x * 0.5 * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    inside the ``"gelu"`` name scope.
    """
    with tf.name_scope("gelu"):
        cubic_term = 0.044715 * tf.pow(x, 3)
        inner = np.sqrt(2 / np.pi) * (x + cubic_term)
        cdf = 0.5 * (1.0 + tf.tanh(inner))
        return x * cdf


def get_padding_mask(seq):
    """Return a float32 mask that is 1.0 wherever ``seq`` equals 0.

    Two singleton axes are inserted after the first axis so the mask can be
    broadcast onto the attention logits.
    """
    with tf.name_scope("Padding_Mask"):
        is_pad = tf.math.equal(seq, 0)
        pad_mask = tf.cast(is_pad, tf.float32)

        # (batch_size, seq_len) -> (batch_size, 1, 1, seq_len)
        return pad_mask[:, tf.newaxis, tf.newaxis, :]


def attention_mask(size):
    """Build a ``(size, size)`` mask with ones strictly above the diagonal.

    For ``size == 4`` the result is::

        [[0., 1., 1., 1.],
         [0., 0., 1., 1.],
         [0., 0., 0., 1.],
         [0., 0., 0., 0.]]
    """
    with tf.name_scope("attention_mask"):
        # band_part(-1, 0) keeps the lower triangle including the diagonal.
        lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return 1 - lower_triangle  # (seq_len, seq_len)


def create_masks(inp):
    """Combine the look-ahead mask and the padding mask for ``inp``.

    The look-ahead mask is sized from axis 1 of ``inp``; the two masks are
    merged with an element-wise maximum, so a position is masked when either
    mask marks it.
    """
    with tf.name_scope("att_masking"):
        seq_len = tf.shape(inp)[1]
        look_ahead = attention_mask(seq_len)
        padding = get_padding_mask(inp)
        return tf.maximum(padding, look_ahead)

# Build GPT layers

In [4]:

class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head self-attention built on a fused query/key/value projection.

    ``c_attn`` maps the input to ``3 * d_model`` features that are split into
    query, key and value; ``c_proj`` maps the merged heads back to ``d_model``.

    Args:
        d_model: Feature size; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        att_dropout: Dropout rate applied to the attention weights when training.
        residual_dropout: Dropout rate applied to the projected output when training.
        scale: When true, the logits are divided by the square root of the key depth.
    """

    def __init__(self, d_model, num_heads, att_dropout=0.1, residual_dropout=0.1, scale=True):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.scale = scale
        self.att_dropout = att_dropout
        self.residual_dropout = residual_dropout

        assert d_model % self.num_heads == 0

        # Feature size handled by each individual head.
        self.depth = d_model // self.num_heads

        self.c_attn = Conv1d(self.d_model, self.d_model * 3)
        self.c_proj = Conv1d(self.d_model, self.d_model)

    def multihead_attention(self, q, k, v, training, mask=None):
        """Scaled dot-product attention; returns ``(output, attention_weights)``."""
        scores = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
        if self.scale:
            key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
            scores = scores / tf.math.sqrt(key_depth)

        if mask is not None:
            # Masked positions receive a large negative logit before the softmax.
            scores += (mask * -1e9)

        weights = tf.nn.softmax(scores, axis=-1)  # (..., seq_len_q, seq_len_k)
        if training:
            weights = tf.nn.dropout(weights, rate=self.att_dropout, name="attn_dropout")

        # (..., seq_len_q, depth_v)
        return tf.matmul(weights, v), weights

    def split_heads(self, x):
        """Reshape to ``(batch, -1, num_heads, depth)`` and move the head axis to position 1."""
        batch = tf.shape(x)[0]
        per_head = tf.reshape(x, (batch, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def merge_heads(self, x):
        """Inverse of ``split_heads``: return a ``(batch, seq_len_q, d_model)`` tensor."""
        batch = tf.shape(x)[0]
        # (batch_size, seq_len_q, num_heads, depth)
        heads_last = tf.transpose(x, perm=[0, 2, 1, 3])
        # (batch_size, seq_len_q, d_model)
        return tf.reshape(heads_last, (batch, -1, self.d_model))

    def call(self, x, mask=None, past_layer=None, training=True):
        """Run attention over ``x``.

        Args:
            x: Input passed to the fused projection.
            mask: Optional mask added (times ``-1e9``) to the attention logits.
            past_layer: Optional cached keys/values stacked on axis 1; when
                given they are prepended to the new keys/values.
            training: Enables both dropouts when truthy.

        Returns:
            ``(output, present)`` where ``present`` stacks the full key and
            value tensors on axis 1 for reuse as ``past_layer``.
        """
        fused = self.c_attn(x)
        query, key, value = tf.split(fused, 3, axis=2)

        query = self.split_heads(query)
        key = self.split_heads(key)
        value = self.split_heads(value)

        if past_layer is not None:
            cached_key, cached_value = tf.unstack(past_layer, axis=1)
            key = tf.concat([cached_key, key], axis=-2)
            value = tf.concat([cached_value, value], axis=-2)

        present = tf.stack([key, value], axis=1)

        context, _ = self.multihead_attention(query, key, value, training, mask)

        # (batch_size, seq_len_q, d_model)
        projected = self.c_proj(self.merge_heads(context))
        if training:
            projected = tf.nn.dropout(projected, rate=self.residual_dropout, name="resid_dropout")

        return projected, present








class Conv1d(tf.keras.layers.Layer):
    """Position-wise affine projection from ``hidden_size`` to ``filter_size`` features.

    Args:
        hidden_size: Size of the last input axis.
        filter_size: Size of the last output axis.
        weights_init_stdev: Standard deviation of the normal weight initializer.
        weights_mean: Mean of the normal weight initializer.
        bias_init: Constant used to initialize the bias.
    """

    def __init__(self, hidden_size,
                 filter_size,
                 weights_init_stdev=0.02,
                 weights_mean=0.0,
                 bias_init=0.0):
        super().__init__()

        self.hidden_size = hidden_size
        self.filter_size = filter_size
        self.weights_init_stdev = weights_init_stdev
        self.weights_mean = weights_mean
        self.bias_init = bias_init

    def build(self, input_shape):
        """Create the ``[hidden_size, filter_size]`` weight and ``[filter_size]`` bias."""
        kernel_initializer = tf.random_normal_initializer(
            stddev=self.weights_init_stdev,
            mean=self.weights_mean)
        self.weight = self.add_weight(
            "cov1d_weights",
            shape=[self.hidden_size, self.filter_size],
            dtype=tf.float32,
            initializer=kernel_initializer)

        bias_initializer = tf.constant_initializer(self.bias_init)
        self.bias = self.add_weight(
            "conv1d_biases",
            shape=[self.filter_size],
            initializer=bias_initializer)
        super().build(input_shape)

    def call(self, inputs):
        """Apply ``inputs @ weight + bias`` over the last axis of a 3-D input."""
        dynamic_shape = tf.shape(inputs)
        output_shape = [dynamic_shape[0], dynamic_shape[1], self.filter_size]
        # [batch, seq, features] => [batch*seq, features]
        flat = tf.reshape(inputs, [-1, self.hidden_size])
        projected = tf.matmul(flat, self.weight) + self.bias
        # Back to [batch, seq, filter_size]
        return tf.reshape(projected, output_shape)


class FeedForward(tf.keras.layers.Layer):
    """Two-layer position-wise feed-forward block.

    Projects ``hidden_size -> filter_size``, applies ``activation``, then
    projects back ``filter_size -> hidden_size``; dropout is applied to the
    result only when training.
    """

    def __init__(self, hidden_size, filter_size, dropout_rate=0.1, activation=tf.nn.relu):
        super().__init__()
        self.hidden_size = hidden_size
        self.filter_size = filter_size
        self.dropout_rate = dropout_rate
        self.activation = activation

        self.dense_layer = Conv1d(self.hidden_size, self.filter_size)
        self.output_dense_layer = Conv1d(self.filter_size, self.hidden_size)

    def call(self, x, training=False):
        """Return the transformed ``x``, with dropout when ``training`` is truthy."""
        expanded = self.activation(self.dense_layer(x))
        output = self.output_dense_layer(expanded)

        if not training:
            return output
        return tf.nn.dropout(output, rate=self.dropout_rate, name="feed_forward_dropout")









class LayerNormalization(tf.keras.layers.Layer):
    """Layer normalization over the last axis with a learned scale and bias."""

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

    def build(self, input_shape):
        """Create ``gamma`` (initialized to ones) and ``beta`` (initialized to zeros)."""
        # Both variables are float32 vectors that opt out of autocasting.
        shared_kwargs = dict(
            shape=[self.hidden_size],
            dtype="float32",
            experimental_autocast=False)
        self.gamma = self.add_weight(
            "layer_norm_scale",
            initializer=tf.ones_initializer(),
            **shared_kwargs)
        self.beta = self.add_weight(
            "layer_norm_bias",
            initializer=tf.zeros_initializer(),
            **shared_kwargs)
        super().build(input_shape)

    def call(self, x, epsilon=1e-6, input_dtype=tf.float32):
        """Normalize ``x`` along its last axis and cast the result to ``input_dtype``."""
        mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
        centered = x - mean
        variance = tf.reduce_mean(tf.square(centered), axis=[-1], keepdims=True)
        normalized = centered * tf.math.rsqrt(variance + epsilon)
        return tf.cast(normalized * self.gamma + self.beta, input_dtype)

















class EmbeddingLayer(tf.keras.layers.Layer):
    """Embedding table usable both for lookup and as a tied output projection.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Number of features per row.
        initializer: Optional initializer; a normal initializer built from
            ``mean`` and ``stddev`` is used when omitted.
        stddev: Standard deviation for the default initializer.
        mean: Mean for the default initializer.
    """

    def __init__(self, vocab_size, embedding_size, initializer=None, stddev=0.01, mean=0.0):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.stddev = stddev
        self.mean = mean
        if initializer is None:
            initializer = tf.random_normal_initializer(mean=self.mean,
                                                       stddev=self.stddev)
        self.initializer = initializer

    def build(self, input_shape):
        """Create the ``[vocab_size, embedding_size]`` table."""
        with tf.name_scope("embedding_weights"):
            self.embedding_weights = self.add_weight(
                "weights",
                shape=[self.vocab_size, self.embedding_size],
                dtype="float32",
                initializer=self.initializer
            )
        super().build(input_shape)

    def call(self, inputs, mode="embedding", scale=False):
        """Dispatch to ``embedding`` or ``projection`` depending on ``mode``.

        Raises:
            ValueError: If ``mode`` is neither ``"embedding"`` nor ``"projection"``.
        """
        if mode == "embedding":
            return self.embedding(inputs, scale=scale)
        if mode == "projection":
            return self.projection(inputs)
        raise ValueError("mode {} is not valid.".format(mode))

    def embedding(self, inputs, scale=False):
        """Look up ``inputs`` in the table; rows for id 0 are zeroed out."""
        with tf.name_scope("embedding"):
            # 1.0 for every non-zero id, 0.0 for id 0.
            nonzero_mask = tf.cast(tf.not_equal(inputs, 0), tf.float32)
            token_ids = tf.cast(inputs, tf.int32)
            embeddings = tf.nn.embedding_lookup(self.embedding_weights, token_ids)
            embeddings *= tf.expand_dims(nonzero_mask, -1)
            if scale:
                # Optional scaling by sqrt(embedding_size).
                embeddings *= self.embedding_size ** 0.5

            return embeddings

    def projection(self, inputs):
        """Multiply ``inputs`` by the transposed table to get ``vocab_size`` logits."""
        with tf.name_scope("output_layer"):
            dynamic_shape = tf.shape(inputs)
            batch_size, seq_len = dynamic_shape[0], dynamic_shape[1]

            flat = tf.reshape(inputs, [-1, self.embedding_size])
            logits = tf.matmul(flat, self.embedding_weights, transpose_b=True)

            return tf.reshape(logits, [batch_size, seq_len, self.vocab_size])


class PositionEmbeddingLayer(tf.keras.layers.Layer):
    """Position embeddings, either learned or sinusoidal.

    Args:
        position_seq: Number of positions (rows of the learned table, or
            length of the sinusoid signal).
        pos_embedding_size: Feature size of each position vector.
        trainable: When true a learned ``EmbeddingLayer`` is used; otherwise
            ``call`` returns the fixed sinusoid signal.
        stddev: Standard deviation for the learned table's initializer.
        mean: Mean for the learned table's initializer.
    """

    def __init__(self, position_seq, pos_embedding_size, trainable=True, stddev=0.02, mean=0.0):
        super(PositionEmbeddingLayer, self).__init__()
        self.position_seq = position_seq
        self.hidden_size = pos_embedding_size
        self.trainable = trainable
        self.stddev = stddev
        self.mean = mean

        if trainable:
            self.position_embedding = EmbeddingLayer(self.position_seq, self.hidden_size,
                                                     stddev=self.stddev, mean=self.mean)

    def call(self, inputs, start=1):
        """Return position embeddings for ``inputs``.

        Args:
            inputs: 2-D id tensor; entries equal to 0 get position 0.
            start: Position assigned to the first column of ``inputs``.

        Returns:
            In trainable mode, the looked-up embeddings for positions
            ``start .. start + seq_len - 1``. Otherwise the sinusoid signal
            for ``position_seq`` positions.
        """
        with tf.name_scope("pos_embedding"):
            if self.trainable:
                batch_size = tf.shape(inputs)[0]
                batch_seq = tf.shape(inputs)[1]

                # Repeat the same position row for every batch element.
                positions = tf.reshape(tf.tile(tf.range(start, batch_seq + start), [batch_size]),
                                       [batch_size, batch_seq])

                positions = tf.cast(positions, tf.int32)
                # Zero the position wherever the input id is 0.
                position_mask = tf.cast(tf.not_equal(inputs, 0), tf.int32)
                positions *= position_mask

                return self.position_embedding(positions)
            else:
                # get_position_sinusoid requires the feature size as well;
                # omitting it raised a TypeError.
                return self.get_position_sinusoid(self.position_seq, self.hidden_size)

    @staticmethod
    def get_position_sinusoid(seq_len, hidden_size, min_timescale=1.0, max_timescale=1.0e4):
        """Build a ``(seq_len, 2 * (hidden_size // 2))`` sine/cosine position signal.

        The first half of the last axis holds sines and the second half
        cosines, using geometrically spaced timescales between
        ``min_timescale`` and ``max_timescale``.
        """
        position = tf.cast(tf.range(seq_len), tf.float32)
        num_timescales = hidden_size // 2
        log_timescale_increment = (
                math.log(float(max_timescale) / float(min_timescale)) /
                (tf.cast(num_timescales, tf.float32) - 1))
        inv_timescales = min_timescale * tf.exp(
            tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
        scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
        signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
        return signal











# Data pipeline

In [5]:
import collections
import tensorflow as tf

# Special token ids. The masking helpers in this notebook compare against the
# literal 0 rather than PAD_ID, so PAD_ID must stay 0.
PAD_ID = 0
UNKNOWN_ID = 1  # presumably the unknown-token id; confirm against the vocab file
# NOTE(review): id 2 is not assigned here; confirm what the vocab uses it for.
START_ID = 3  # matches the default `bos` of SequenceGenerator.sample_sequence
END_ID = 4  # matches the default `eos` of SequenceGenerator.sample_sequence

# Load the vocabulary
def load_vocab(vocab_path):
    """Read a vocabulary file and build token<->id lookup tables.

    The first whitespace-separated field of every non-blank line is taken as
    the token; ids are assigned consecutively from 0 in file order.

    Args:
        vocab_path: Path of a UTF-8 text file with one token per line.

    Returns:
        ``(vocab, inv_vocab)``: an ``OrderedDict`` mapping token -> id and a
        plain dict mapping id -> token.
    """
    vocab = collections.OrderedDict()
    index = 0
    # The context manager closes the handle, which the bare open() leaked.
    with open(vocab_path, 'r', encoding='utf-8') as vocab_file:
        for line in vocab_file.read().splitlines():
            fields = line.split()
            if not fields:
                # Blank lines carry no token; indexing them raised IndexError.
                continue
            vocab[fields[0]] = index
            index += 1
    inv_vocab = {v: k for k, v in vocab.items()}
    return vocab, inv_vocab

# Convert items through a vocabulary mapping
def convert_by_vocab(vocab, items):
    """Map every element of ``items`` through ``vocab`` and return the results as a list.

    A ``KeyError`` propagates for any item missing from ``vocab``.
    """
    return [vocab[item] for item in items]


def convert_tokens_to_ids(vocab, tokens):
    """Translate a sequence of tokens into ids using the token -> id mapping ``vocab``."""
    ids = convert_by_vocab(vocab, tokens)
    return ids


def convert_ids_to_tokens(inv_vocab, ids):
    """Translate a sequence of ids into tokens using the id -> token mapping ``inv_vocab``."""
    tokens = convert_by_vocab(inv_vocab, ids)
    return tokens




def parse_example(serialized_example):
    """Decode one serialized example into dense int32 ``(inputs, targets)`` tensors.

    Both features are read as variable-length int64 lists, densified and then
    cast to int32.
    """
    feature_spec = {
        "inputs": tf.io.VarLenFeature(tf.int64),  # input ids
        "targets": tf.io.VarLenFeature(tf.int64)  # target ids
    }
    parsed = tf.io.parse_single_example(serialized_example, feature_spec)

    dense_inputs = tf.sparse.to_dense(parsed["inputs"])
    dense_targets = tf.sparse.to_dense(parsed["targets"])

    return tf.cast(dense_inputs, tf.int32), tf.cast(dense_targets, tf.int32)


def input_fn(tf_records,
             batch_size=32,
             padded_shapes=([-1], [-1]),
             epoch=10,
             buffer_size=10000):
    """Build the input dataset from one or more TFRecord files.

    Pipeline order: read -> shuffle -> parse -> padded batch -> repeat -> prefetch.

    Args:
        tf_records: A single path string or a collection of paths.
        batch_size: Number of examples per padded batch.
        padded_shapes: Shapes handed to ``padded_batch`` for (inputs, targets).
        epoch: Number of times the batched dataset is repeated.
        buffer_size: Shuffle buffer size (the record reader uses a fixed 10000).

    Returns:
        The resulting ``tf.data`` dataset.
    """
    # A lone path is wrapped so the reader always receives a list.
    record_files = [tf_records] if type(tf_records) is str else tf_records
    autotune = tf.data.experimental.AUTOTUNE

    dataset = (
        tf.data.TFRecordDataset(record_files, buffer_size=10000)
        .shuffle(buffer_size=buffer_size)
        .map(parse_example, num_parallel_calls=autotune)
        .padded_batch(batch_size, padded_shapes=padded_shapes)
        .repeat(epoch)
        .prefetch(buffer_size=autotune)
    )
    return dataset


# Build GPT-2 Model

In [None]:
import os
from tensorflow.python.framework import tensor_shape


# Notebook kernels do not define ``__file__``; fall back to the current
# working directory so this cell runs both in a notebook and as a module.
try:
	_ROOT = os.path.abspath(os.path.dirname(__file__))
except NameError:
	_ROOT = os.getcwd()
# Directory intended for log output, located next to the code.
LOG_DIR = _ROOT + "/log"

# Input signature shared by the graph-mode train/test steps: two int32
# tensors of rank 2 with unconstrained dimensions.
train_step_signature = [
	tf.TensorSpec(shape=(None, None), dtype=tf.int32, name=spec_name)
	for spec_name in ("Inputs", "Targets")
]


class Gpt2(tf.keras.Model):
	"""GPT-2 style decoder-only language model with its own training loop.

	Args:
		num_layers: Number of stacked ``DecoderLayer`` blocks.
		d_model: Hidden size shared by embeddings and decoder layers.
		num_heads: Attention heads per decoder layer.
		dff: Inner size of each feed-forward block.
		max_seq_len: Number of rows in the position embedding table.
		vocab_size: Number of rows in the token embedding table.
		optimizer: Name used by ``create_optimizer`` ("adam", "adadelta",
			"rms"; anything else selects SGD).
		learning_rate: Learning rate handed to the optimizer.
		rev_embedding_projection: When true the token embedding table is
			reused (transposed) to produce logits; otherwise a separate
			``OutputLayer`` is created.
		grad_clip: When true gradients are clipped element-wise.
		clip_value: Absolute bound used for gradient clipping.

	Notes:
		``create_optimizer`` must be called before ``create_checkpoint_manager``
		or any train step, since ``self.optimizer`` is only assigned there.
		``fit`` also uses ``self.train_writer``/``self.test_writer`` (set by
		``create_summary_writer``) and ``self.ckpt_manager``.
		NOTE(review): the distributed step reads ``self.global_batch_size``,
		which this class never assigns; presumably the training script sets it
		together with ``mirrored_strategy``.
	"""

	def __init__(self, num_layers,
	             d_model,
	             num_heads,
	             dff,
	             max_seq_len,
	             vocab_size,
	             optimizer="adam",
	             learning_rate=1e-3,
	             rev_embedding_projection=True,
	             grad_clip=False,
	             clip_value=1.0):
		super(Gpt2, self).__init__()

		self.rev_embedding_projection = rev_embedding_projection
		self.num_layers = num_layers
		self.num_heads = num_heads
		self.dff = dff
		self.max_seq_len = max_seq_len
		self.vocab_size = vocab_size
		self.d_model = d_model
		self.learning_rate = learning_rate
		self.optimizer_t = optimizer
		self.mirrored_strategy = None
		self.grad_clip = grad_clip
		self.clip_value = clip_value

		self.embedding = EmbeddingLayer(
			self.vocab_size, self.d_model)

		self.pos_embedding = PositionEmbeddingLayer(
			self.max_seq_len, self.d_model)

		self.decoder_layers = [DecoderLayer(self.d_model, self.num_heads, self.dff)
		                       for _ in range(self.num_layers)]
		self.layer_norm = LayerNormalization(self.d_model)

		if not self.rev_embedding_projection:
			self.output_layer = OutputLayer(self.vocab_size)

		# reduction='none' keeps one loss value per token so padding can be masked.
		self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
			from_logits=True, reduction='none')

		self.accuracy_object = tf.keras.metrics.SparseCategoricalAccuracy(
			name='accuracy')

		self.train_step_signature = [
			tf.TensorSpec(shape=(None, None), dtype=tf.int32)]

	def call(self, x, training=True, past=None):
		"""Run the model on token ids ``x``.

		Args:
			x: 2-D tensor of token ids; id 0 is treated as padding.
			training: Enables dropout inside the decoder layers.
			past: Optional list with one cached key/value tensor per layer
				(the ``presents`` returned by a previous call).

		Returns:
			``(logits, presents)`` where ``presents`` holds each layer's
			key/value cache.
		"""
		x = tf.cast(x, tf.int32)
		if past is None:
			pasts = [None] * self.num_layers
		else:
			pasts = past

		assert len(pasts) == self.num_layers

		att_mask = create_masks(x)
		# Positions are 1-based (0 is reserved for padding). With a cache of
		# length L the cached tokens own positions 1..L, so new tokens must
		# start at L + 1; starting at L reused the last context position.
		if past is None:
			position_start = 1
		else:
			position_start = tf.shape(past)[-2] + 1
		with tf.name_scope("embeddings"):
			embedded_x = self.embedding(x)
			hidden_states = embedded_x + self.pos_embedding(x, start=position_start)

		presents = []
		for decoder_layer, layer_past in zip(self.decoder_layers, pasts):
			hidden_states, present = decoder_layer(hidden_states, training, att_mask, past=layer_past)
			presents.append(present)

		hidden_states = self.layer_norm(hidden_states)

		if self.rev_embedding_projection:
			logits = self.embedding(hidden_states, mode="projection")
		else:
			logits = self.output_layer(hidden_states)

		return logits, presents

	@staticmethod
	def get_padded_accuracy(labels, logits):
		"""Accuracy of ``argmax(logits)`` against ``labels`` over non-zero labels only."""
		with tf.name_scope("padded_accuracy"):
			weights = tf.cast(tf.not_equal(labels, 0), tf.float32)

			outputs = tf.cast(tf.argmax(logits, axis=-1), tf.int32)
			padded_labels = tf.cast(labels, tf.int32)

			nonpad_seq = tf.math.count_nonzero(weights, dtype=tf.dtypes.float32, )
			acc = tf.cast(tf.equal(outputs, padded_labels), tf.float32)

			# divide_no_nan yields 0 instead of NaN when every label is padding.
			accuracy = tf.math.divide_no_nan(
				tf.reduce_sum(tf.cast(acc * weights, tf.float32)), nonpad_seq)
			return tf.cast(accuracy, tf.float32)

	def create_optimizer(self):
		"""Instantiate ``self.optimizer`` from the configured name and return it."""
		optimizer = self.optimizer_t.lower()
		with tf.name_scope("optimizer"):
			if optimizer == "adam":
				self.optimizer = tf.keras.optimizers.Adam(self.learning_rate, beta_1=0.9, beta_2=0.98,
				                                          epsilon=1e-9)
			elif optimizer == "adadelta":
				self.optimizer = tf.keras.optimizers.Adadelta(self.learning_rate)
			elif optimizer == "rms":
				self.optimizer = tf.keras.optimizers.RMSprop(self.learning_rate)
			else:
				self.optimizer = tf.keras.optimizers.SGD(self.learning_rate)
			return self.optimizer

	def get_loss(self, real, pred):
		"""Per-sequence mean cross-entropy over tokens whose target is non-zero.

		Returns:
			A 1-D tensor with one averaged loss per sequence in the batch.
		"""
		with tf.name_scope("loss_layer"):
			mask = tf.math.logical_not(tf.math.equal(real, 0))
			loss_ = self.loss_object(real, pred)

			with tf.name_scope("loss_masking"):
				mask = tf.cast(mask, dtype=loss_.dtype)
				loss_ *= mask
			loss_ = tf.reduce_sum(loss_, axis=1)
			# divide_no_nan returns 0 for a sequence made only of padding
			# rather than letting a NaN propagate into the batch mean.
			sequence_avg_loss = tf.math.divide_no_nan(loss_, tf.reduce_sum(mask, axis=1))
			return sequence_avg_loss

	@staticmethod
	def get_perplexity(cross_entropy):
		"""Return ``exp(cross_entropy)``."""
		perplexity = tf.exp(cross_entropy)
		return perplexity

	def create_checkpoint_manager(self, checkpoint_path, max_to_keep=5, load_model=True):
		"""Create ``self.ckpt_manager`` and optionally restore the latest checkpoint.

		Requires ``self.optimizer`` to exist (see ``create_optimizer``).
		"""
		with tf.name_scope('checkpoint_manager'):
			ckpt = tf.train.Checkpoint(optimizer=self.optimizer, model=self)
			self.ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=max_to_keep)

			latest_checkpoint = self.ckpt_manager.latest_checkpoint
			# Only report a restore when a checkpoint was actually found.
			if load_model and latest_checkpoint:
				ckpt.restore(latest_checkpoint)
				print('Latest checkpoint restored...............')
			else:
				print("Initializing model from scratch..........")

	def load_model(self, filepath):
		"""Restore model weights (without optimizer state) from the latest checkpoint in ``filepath``."""
		ckpt = tf.train.Checkpoint(model=self)
		ckpt_manager = tf.train.CheckpointManager(ckpt, filepath)
		ckpt.restore(ckpt_manager.latest_checkpoint)
		print("Model Restored..........................")

	def create_summary_writer(self, summary_path):
		"""Create train/test summary writers under ``summary_path`` and return them."""
		train_summary_path = summary_path + "/train"
		test_summary_path = summary_path + "/test"

		with tf.name_scope('summary'):
			self.train_writer = tf.summary.create_file_writer(train_summary_path)
			self.test_writer = tf.summary.create_file_writer(test_summary_path)

			return self.train_writer, self.test_writer

	def _train_step(self, inputs, targets):
		"""One optimization step; returns ``(step, loss, perplexity)``."""
		with tf.GradientTape() as tape:
			predictions, _ = self(inputs, training=True)
			loss = tf.reduce_mean(self.get_loss(targets, predictions))

		with tf.name_scope("gradients"):
			gradients = tape.gradient(loss, self.trainable_variables)
			if self.grad_clip:
				gradients = [(tf.clip_by_value(grad, -self.clip_value, self.clip_value))
				             for grad in gradients]
			self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

		perplexity = self.get_perplexity(loss)
		step = self.optimizer.iterations

		return step, loss, perplexity

	def _test_step(self, inputs, targets):
		"""Evaluate one batch; returns per-sequence ``(loss, perplexity)``."""
		pred, _ = self(inputs, training=False)
		loss = self.get_loss(targets, pred)
		perplexity = self.get_perplexity(loss)
		return loss, perplexity

	@tf.function(input_signature=train_step_signature)
	def train_step(self, inputs, targets):
		"""Graph-mode wrapper around ``_train_step``."""
		return self._train_step(inputs, targets)

	@tf.function(input_signature=train_step_signature)
	def test_step(self, inputs, targets):
		"""Graph-mode wrapper around ``_test_step``."""
		return self._test_step(inputs, targets)

	def _distributed_train_step(self, inputs, targets):
		"""One optimization step run through ``self.mirrored_strategy``."""

		def step_fn(inp, tar):
			with tf.GradientTape() as tape:
				logits, _ = self(inp, training=True)
				cross_entropy = self.get_loss(tar, logits)
				loss = tf.reduce_sum(cross_entropy) * (1.0 / self.global_batch_size)  # Divided By Global Batch Size

			with tf.name_scope("gradients"):
				gradients = tape.gradient(loss, self.trainable_variables)
				if self.grad_clip:
					gradients = [(tf.clip_by_value(grad, -self.clip_value, self.clip_value))
					             for grad in gradients]
				self.optimizer.apply_gradients(list(zip(gradients, self.trainable_variables)))
			return cross_entropy

		per_example_losses = self.mirrored_strategy.run(
			step_fn, args=(inputs, targets))

		mean_loss = self.mirrored_strategy.reduce(
			tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0)
		# If you get error in distributed mode try using SUM instead of MEAN.

		perplexity = self.get_perplexity(mean_loss)
		step = self.optimizer.iterations

		return step, mean_loss, perplexity

	def _distributed_test_step(self, inputs, targets):
		"""Evaluate one batch through ``self.mirrored_strategy``."""
		def step_fn(inp, tar):
			logits, _ = self(inp, training=False)
			cross_entropy = self.get_loss(tar, logits)
			return cross_entropy

		per_example_losses = self.mirrored_strategy.run(
			step_fn, args=(inputs, targets))

		mean_loss = self.mirrored_strategy.reduce(
			tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0)
		# If you get error in distributed mode try using SUM instead of MEAN.
		perplexity = self.get_perplexity(mean_loss)

		return mean_loss, perplexity

	@tf.function(experimental_relax_shapes=True)
	def distributed_train_step(self, inputs, targets):
		"""Graph-mode wrapper around ``_distributed_train_step``."""
		return self._distributed_train_step(inputs, targets)

	@tf.function(experimental_relax_shapes=True)
	def distributed_test_step(self, inputs, targets):
		"""Graph-mode wrapper around ``_distributed_test_step``."""
		return self._distributed_test_step(inputs, targets)

	def get_train_test_function(self, graph_mode=False):
		"""Return the ``(train, test)`` step callables for single-device training."""
		if graph_mode:
			print("Running in graph mode.............")
			train_fuc = self.train_step
			test_fuc = self.test_step
		else:
			print("Running in eager mode.............")
			train_fuc = self._train_step
			test_fuc = self._test_step
		return train_fuc, test_fuc

	def get_distributed_train_test_function(self, graph_mode=False):
		"""Return the ``(train, test)`` step callables for distributed training."""
		if graph_mode:
			print("Running in graph mode.............")
			train_fuc = self.distributed_train_step
			test_fuc = self.distributed_test_step
		else:
			print("Running in eager mode.............")
			train_fuc = self._distributed_train_step
			test_fuc = self._distributed_test_step
		return train_fuc, test_fuc

	@staticmethod
	def _mean_test_metrics(test_func, test_dataset):
		"""Average loss and perplexity over at most 101 test batches.

		Values are flattened and concatenated before averaging, so a smaller
		final batch no longer produces a ragged array.
		"""
		losses = []
		perplexities = []
		for (test_step, (test_inputs, test_targets)) in enumerate(test_dataset):
			test_loss, test_perplexity = test_func(test_inputs, test_targets)
			losses.append(np.ravel(test_loss))
			perplexities.append(np.ravel(test_perplexity))

			if test_step == 100:
				break

		if not losses:
			# Empty test dataset: nothing to average.
			return float("nan"), float("nan")
		return np.mean(np.concatenate(losses)), np.mean(np.concatenate(perplexities))

	def fit(self, train_dataset, graph_mode):
		"""Train over ``train_dataset``, logging, evaluating and checkpointing periodically.

		Args:
			train_dataset: A ``(train_dataset, test_dataset)`` pair.
			graph_mode: Selects the ``tf.function`` step variants when true.

		Training metrics are logged when the step is a multiple of 100;
		evaluation and checkpointing happen when it is a multiple of 500.
		"""
		if self.mirrored_strategy is None:
			train_dataset, test_dataset = train_dataset
			train_func, test_func = self.get_train_test_function(graph_mode)
			tf.summary.trace_on(graph=True, profiler=False)
			for (_, (inputs, targets)) in enumerate(train_dataset):
				step, loss, perplexity = train_func(inputs, targets)
				if step % 100 == 0:
					self.log_summary(self.train_writer,
					                 step.numpy(),
					                 loss.numpy(),
					                 perplexity.numpy())

				# NOTE(review): `step` is read after the optimizer update, so
				# it looks like this branch never sees 0 - confirm intent.
				if step == 0:
					with self.train_writer.as_default():
						tf.summary.trace_export(
							name="gpt-2",
							step=0,
							)

				if step % 500 == 0:
					test_loss, test_perplexity = self._mean_test_metrics(test_func, test_dataset)

					self.log_summary(self.test_writer,
					                 step.numpy(),
					                 test_loss,
					                 test_perplexity,
					                 result_type="Test")

					ckpt_save_path = self.ckpt_manager.save()
					print('Saving checkpoint for step {} at {}'.format(step.numpy(),
					                                                   ckpt_save_path))
		else:
			with self.mirrored_strategy.scope():
				train_dataset, test_dataset = train_dataset
				train_func, test_func = self.get_distributed_train_test_function(graph_mode)
				tf.summary.trace_on(graph=True, profiler=False)
				for (_, (inputs, targets)) in enumerate(train_dataset):
					step, loss, perplexity = train_func(inputs, targets)

					if step % 100 == 0:
						self.log_summary(self.train_writer,
						                 step,
						                 loss,
						                 perplexity)

					# NOTE(review): see the single-device branch; this
					# condition also appears unreachable.
					if step == 0:
						with self.train_writer.as_default():
							tf.summary.trace_export(
								name="gpt-2",
								step=0,
								)

					if step % 500 == 0:
						test_loss, test_perplexity = self._mean_test_metrics(test_func, test_dataset)

						self.log_summary(self.test_writer,
						                 step,
						                 test_loss,
						                 test_perplexity,
						                 result_type="Test")

						ckpt_save_path = self.ckpt_manager.save()
						print('Saving checkpoint for step {} at {}'.format(step.numpy(),
						                                                   ckpt_save_path))

	@staticmethod
	def log_summary(tf_writer, step, loss, perplexity, result_type="Train"):
		"""Print the metrics and write them as scalars to ``tf_writer``."""
		print(result_type + ':- Step {}, Loss {:.4f}, Perplexity {:.4f}'.format(
			step, loss, perplexity))
		with tf_writer.as_default():
			tf.summary.scalar("loss", loss, step=step)
			tf.summary.scalar("perplexity", perplexity, step=step)


class OutputLayer(tf.keras.layers.Layer):
	"""Linear projection of hidden states to ``output_dim`` logits.

	Args:
		output_dim: Size of the last output axis.
		proj_weights: Optional existing weight matrix; when given it is used
			transposed and no new variable is created.
		kernel_initializer: Initializer for the layer's own weights when
			``proj_weights`` is not supplied.
	"""

	def __init__(self, output_dim, proj_weights=None, kernel_initializer=None):
		super(OutputLayer, self).__init__()
		self.proj_weights = proj_weights
		self.output_dim = output_dim
		self.layer_weights = None
		self.kernel_initializer = kernel_initializer

	def build(self, input_shape):
		"""Create ``[input_dim, output_dim]`` weights unless ``proj_weights`` was given."""
		if self.proj_weights is None:
			input_dim = tensor_shape.dimension_value(input_shape[-1])
			self.layer_weights = self.add_weight(
				'output_layer_weights',
				shape=[input_dim, self.output_dim],
				initializer=self.kernel_initializer,
				trainable=True)
		super(OutputLayer, self).build(input_shape)

	def call(self, x):
		"""Project ``x`` and return a ``[batch, sequence, output_dim]`` tensor."""
		batch, sequence, d_model = tf.shape(x)[0], tf.shape(x)[1], tf.shape(x)[-1]
		h_flat = tf.reshape(x, [-1, d_model])

		if self.proj_weights is None:
			out = tf.matmul(h_flat, self.layer_weights)
		else:
			# The attribute is ``proj_weights``; the misspelled ``porj_weights``
			# raised AttributeError whenever shared weights were supplied.
			out = tf.matmul(h_flat, self.proj_weights, transpose_b=True)
		out = tf.reshape(out, [batch, sequence, self.output_dim])
		return out


class DecoderLayer(tf.keras.layers.Layer):
	"""One decoder block: pre-norm self-attention and pre-norm feed-forward,
	each wrapped in a residual connection.

	Args:
		d_model: Hidden size of the block.
		num_heads: Number of attention heads.
		dff: Inner size of the feed-forward block.
		dr_rate: Dropout rate handed to the feed-forward block.
	"""

	def __init__(self, d_model, num_heads, dff,
	             dr_rate=0.1):
		super().__init__()
		self.d_model, self.num_heads = d_model, num_heads
		self.dff, self.dr_rate = dff, dr_rate

		self.mha = MultiHeadAttention(self.d_model, self.num_heads)
		self.feed_forward = FeedForward(self.d_model, self.dff, self.dr_rate)
		self.layer_norm1 = LayerNormalization(self.d_model)
		self.layer_norm2 = LayerNormalization(self.d_model)

	def call(self, x, training, mask, past=None):
		"""Apply the block to ``x``; returns ``(output, present)``.

		``present`` is the key/value cache produced by the attention sub-layer.
		"""
		attn_input = self.layer_norm1(x)
		# (batch_size, input_seq_len, d_model)
		attn_out, present = self.mha(attn_input, mask=mask, past_layer=past,
		                             training=training)
		with tf.name_scope("residual_conn"):
			x = x + attn_out

		# (batch_size, input_seq_len, d_model)
		ff_out = self.feed_forward(self.layer_norm2(x), training=training)
		with tf.name_scope("residual_conn"):
			x = x + ff_out
		return x, present


# Build Sampler

In [None]:
def argmax(logits):
	"""Return ``tf.argmax(logits)`` using TensorFlow's default axis."""
	best_index = tf.argmax(logits)
	return best_index


def top_k_logits(logits, k):
	"""Keep the ``k`` largest logits per row and push the rest to ``-1e10``.

	Args:
		logits: 2-D ``(batch, vocab)`` tensor.
		k: Number of entries to keep per row; ``0`` disables filtering.

	Returns:
		``logits`` unchanged when ``k == 0``, otherwise a tensor of the same
		shape where entries below each row's k-th largest value are ``-1e10``.
	"""
	if k == 0:
		return logits

	values, _ = tf.nn.top_k(logits, k=k)
	# Keep a trailing axis so each row's threshold broadcasts against
	# (batch, vocab); a (batch,) threshold only worked for batch size 1.
	min_values = values[:, -1, tf.newaxis]

	return tf.where(
		logits < min_values,
		tf.ones_like(logits, dtype=logits.dtype) * -1e10,
		logits
	)


def top_p_logits(logits, p):
	"""Nucleus (top-p) filtering, adapted from the OpenAI GPT-2 implementation.

	Per row, the logits are sorted in descending order and the number of
	leading entries whose cumulative softmax probability is ``<= p`` is
	counted (at least one entry is always kept). Every logit smaller than the
	last kept value is replaced by ``-1e10``.

	Args:
		logits: 2-D ``(batch, vocab)`` tensor.
		p: Cumulative probability threshold.

	Returns:
		A tensor shaped like ``logits`` with the filtered-out entries set to
		``-1e10``.
	"""
	batch = tf.shape(logits)[0]
	sorted_logits = tf.sort(logits, direction='DESCENDING', axis=-1)
	cumulative_probs = tf.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1)
	indices = tf.stack([
		tf.range(0, batch),
		# Column of the last kept entry in each sorted row.
		tf.maximum(tf.reduce_sum(tf.cast(cumulative_probs <= p, tf.int32), axis=-1) - 1, 0),
	], axis=-1)
	# gather_nd returns one threshold per row, shape (batch,); add a trailing
	# axis so it broadcasts row-wise against (batch, vocab) for any batch size.
	min_values = tf.gather_nd(sorted_logits, indices)[:, tf.newaxis]
	return tf.where(
		logits < min_values,
		tf.ones_like(logits) * -1e10,
		logits,
	)


class SequenceGenerator:
	"""Loads a trained ``Gpt2`` checkpoint plus its SentencePiece vocab and samples text."""

	def __init__(self, model_path, model_param, vocab_path):
		"""Record the artefact locations; nothing is loaded until ``load_weights()``.

		Args:
			model_path: checkpoint directory handed to ``tf.train.CheckpointManager``.
			model_param: path of the JSON file holding the model hyper-parameters.
			vocab_path: path of the SentencePiece model file.
		"""
		self.sp = None
		self.model = None
		self.model_path = model_path
		self.model_param = model_param
		self.vocab_path = vocab_path

	def load_weights(self):
		"""Build the model from the JSON parameters, restore the latest checkpoint, load the vocab."""
		with open(self.model_param) as f:
			param = json.load(f)
		self.model = Gpt2(param['num_layers'],
						  param['d_model'],
						  param['num_heads'],
						  param['dff'],
						  param['max_seq_len'],
						  param['vocab_size'])

		ckpt = tf.train.Checkpoint(model=self.model)

		ckpt_manager = tf.train.CheckpointManager(ckpt, self.model_path, max_to_keep=1)

		# expect_partial(): only the ``model`` object is restored here, so values
		# in the checkpoint that are not matched are not reported.
		ckpt.restore(ckpt_manager.latest_checkpoint).expect_partial()
		print('Model weights loaded into memory')

		self.sp = spm.SentencePieceProcessor()
		self.sp.load(self.vocab_path)

	def sample_sequence(self,
						context=None,
						seq_len=512,
						bos=3,
						eos=4,
						temperature=1,
						top_k=8,
						top_p=8,
						nucleus_sampling=True):
		"""Generate a continuation of ``context`` one token at a time.

		Args:
			context: prompt text; when ``None`` a hint is printed and ``None`` is returned.
			seq_len: maximum number of tokens to sample.
			bos: token id prepended to the encoded prompt.
			eos: token id that stops generation when sampled.
			temperature: divisor applied to the logits before filtering.
			top_k: passed to ``top_k_logits`` (``0`` disables that filter).
			top_p: passed to ``top_p_logits`` when ``nucleus_sampling`` is true.
				NOTE(review): the default of 8 is not a probability; a cutoff >= 1
				appears to make the nucleus filter keep every token. The default is
				left as-is for backward compatibility - confirm the intended value.
			nucleus_sampling: whether to apply ``top_p_logits``.

		Returns:
			The decoded text (prompt included) with "[SEP]" removed and whitespace
			collapsed, or ``None`` when no context was given.

		Raises:
			RuntimeError: if ``load_weights()`` has not been called yet.
		"""
		# Identity check: ``== None`` defers to the operand's __eq__.
		if context is None:
			print("Give some context to model.................")
			return
		if self.model is None or self.sp is None:
			raise RuntimeError("load_weights() must be called before sample_sequence()")

		# Batch of one: [bos] + prompt ids, with a leading axis added.
		context = tf.expand_dims(([bos] + self.sp.encode_as_ids(context)), 0)
		prev = context
		output = context
		past = None
		for i in range(seq_len):
			logits, past = self.model(prev, training=False, past=past)
			# Only the logits of the last position are used to pick the next token.
			logits = logits[:, -1, :] / tf.cast(temperature, tf.float32)
			logits = top_k_logits(logits, k=top_k)
			if nucleus_sampling:
				logits = top_p_logits(logits, p=top_p)

			samples = tf.random.categorical(logits, num_samples=1, dtype=tf.int32)
			# Used as a Python bool, so this relies on ``samples`` holding one element.
			if tf.equal(samples, eos):
				break

			output = tf.concat([output, samples], axis=-1)
			# After the first step only the new token is fed, together with ``past``.
			prev = samples

		result = tf.squeeze(output, axis=0)
		pred = [int(i) for i in result]
		# pred[0] is the bos id added above; drop it before decoding.
		generated_seq = self.sp.decode_ids(pred[1:])
		generated_seq = generated_seq.replace("[SEP]", "").strip()
		generated_seq = ' '.join(generated_seq.split())
		return generated_seq

# Pre-processing

Please note: the paths below must be changed to match your environment. First put the `data` folder in the expected location, then update the paths here so that they point to it.

```
_ROOT = os.path.abspath(os.path.dirname(__file__))
PROCESS_DATA_PATH = _ROOT + "/data/processed.txt"
BPE_TSV_PATH = _ROOT + "/data/bpe_spm.tsv"
BPE_MODEL_PATH = _ROOT + "/data/bpe_model"
TF_RECORDS = _ROOT + "/data/tf_records/"
BOS_ID = 3
EOS_ID = 4

```



In [None]:
!pip install ftfy sentencepiece

In [None]:
import csv
import datetime
import glob
import os
from collections import Counter

import click
import numpy as np
import sentencepiece as spm
import tensorflow as tf
import tqdm
from ftfy import fix_text

# Project root used to build every data path below.
# ``__file__`` only exists when this code runs as a script/module; inside a
# notebook kernel it is undefined, so fall back to the current working directory
# instead of failing with NameError.
try:
	_ROOT = os.path.abspath(os.path.dirname(__file__))
except NameError:
	_ROOT = os.path.abspath(os.getcwd())
PROCESS_DATA_PATH = _ROOT + "/data/processed.txt"  # cleaned text, one line per input line
BPE_TSV_PATH = _ROOT + "/data/bpe_spm.tsv"  # word<TAB>count table fed to SentencePiece
BPE_MODEL_PATH = _ROOT + "/data/bpe_model"  # SentencePiece model prefix (".model" is appended on load)
TF_RECORDS = _ROOT + "/data/tf_records/"  # output directory for the .tfrecord files
# Token ids put before the inputs / after the targets of every example
# (presumably the ids of the [BOS] / [EOS] user-defined symbols - confirm).
BOS_ID = 3
EOS_ID = 4


def process_text(text_files):
	"""Normalise every line of ``text_files`` with ftfy and write them to ``PROCESS_DATA_PATH``.

	The output file is overwritten. Both files are handled with ``with`` so they
	are closed even if reading or fixing a line fails (the previous ``fr.close``
	lacked its parentheses and never closed the input).

	Args:
		text_files: iterable of paths to UTF-8 text files.
	"""
	print("Pre-processing the text data.....")
	with open(PROCESS_DATA_PATH, "w", encoding='utf-8') as file_writer:
		for file_name in tqdm.tqdm(text_files):
			with open(file_name, 'r', encoding='utf-8') as fr:
				# Stream line by line instead of materialising the whole file.
				file_writer.writelines(fix_text(line, normalization='NFKC') for line in fr)


def train_byte_pair_encoding(vocab_size):
	"""Count lower-cased whitespace tokens and train a SentencePiece BPE model on them.

	Reads ``PROCESS_DATA_PATH``, writes the ``token<TAB>count`` table to
	``BPE_TSV_PATH`` and trains with ``BPE_MODEL_PATH`` as the model prefix.

	Args:
		vocab_size: value substituted into ``--vocab_size``.
	"""
	print("Training BytePair encoding......")
	word_counts = Counter()
	with open(PROCESS_DATA_PATH, 'r',encoding='utf-8') as processed:
		for text_line in tqdm.tqdm(processed):
			word_counts.update(text_line.lower().split())

	with open(BPE_TSV_PATH, 'w', newline='',encoding='utf-8') as tsv_file:
		# NOTE(review): csv.writer applies its default quoting rules to each field -
		# confirm that is what the TSV reader on the SentencePiece side expects.
		writer = csv.writer(tsv_file, delimiter='\t')
		writer.writerows([token, count] for token, count in word_counts.items())

	cmd_template = '--input={spm_input} --model_prefix={spm_model} --input_format=tsv --vocab_size={vocab_size} --user_defined_symbols=[SEP],[BOS],[EOS] --hard_vocab_limit=false --model_type=bpe --pad_id=0 --unk_id=1 --bos_id=-1 --eos_id=-1 --pad_piece=[PAD] --unk_piece=[UNK]'
	spmcmd = cmd_template.format(
		spm_input=BPE_TSV_PATH,
		spm_model=BPE_MODEL_PATH,
		vocab_size=vocab_size,
	)
	spm.SentencePieceTrainer.train(spmcmd)


def _int64_feature(value):
	"""Wrap ``value`` in a ``tf.train.Feature`` that carries an ``Int64List``."""
	int_list = tf.train.Int64List(value=value)
	return tf.train.Feature(int64_list=int_list)


def serialize_example(inputs, targets):
	"""Serialise one (inputs, targets) pair as a ``tf.train.Example`` string.

	Args:
		inputs: integer ids stored under the ``'inputs'`` key.
		targets: integer ids stored under the ``'targets'`` key.

	Returns:
		The result of ``SerializeToString()`` on the built example.
	"""
	features = tf.train.Features(feature={
		'inputs': _int64_feature(inputs),
		'targets': _int64_feature(targets),
	})
	return tf.train.Example(features=features).SerializeToString()


def create_tf_records(min_seq_len, max_seq_len, per_file_limit=5000):
	"""Encode ``PROCESS_DATA_PATH`` line by line and write the examples as TFRecord shards.

	A line is kept only when ``min_seq_len < len(ids) < max_seq_len``. For each
	kept line the inputs are ``[BOS_ID] + ids`` and the targets ``ids + [EOS_ID]``.
	A new shard is started after ``per_file_limit`` examples.

	Compared with the previous version:
	  * the example that fills a shard is no longer written a second time;
	  * the last shard is always closed, so buffered records are flushed;
	  * a shard is only created when there is an example to put in it, so no
	    empty trailing ``.tfrecord`` file is produced.

	Args:
		min_seq_len: exclusive lower bound on the encoded length.
		max_seq_len: exclusive upper bound on the encoded length.
		per_file_limit: number of examples per shard.
	"""
	print("Creating TF Records...............")
	s = spm.SentencePieceProcessor()
	s.Load(BPE_MODEL_PATH + ".model")
	os.makedirs(TF_RECORDS, exist_ok=True)

	tf_writer = None  # opened lazily, when the first example of a shard arrives
	doc_counts = 0
	try:
		with open(PROCESS_DATA_PATH, 'r',encoding='utf-8') as f:
			for line in tqdm.tqdm(f):
				encoded_id = s.encode_as_ids(line)
				if not (max_seq_len > len(encoded_id) > min_seq_len):
					continue

				if tf_writer is None:
					filename = TF_RECORDS + str(datetime.datetime.now().timestamp()) + ".tfrecord"
					tf_writer = tf.io.TFRecordWriter(filename)

				inputs = np.array([BOS_ID] + encoded_id)
				targets = np.array(encoded_id + [EOS_ID])
				tf_writer.write(serialize_example(inputs, targets))
				doc_counts += 1

				if doc_counts >= per_file_limit:
					# Shard is full: close it; the next kept line opens a new one.
					tf_writer.close()
					tf_writer = None
					doc_counts = 0
	finally:
		if tf_writer is not None:
			tf_writer.close()


@click.command()
@click.option('--data-dir', type=str, default="./data/scraped", show_default=True, help="training data path")
@click.option('--vocab-size', type=int, default=24512, show_default=True, help="byte pair vocab size")
@click.option('--min-seq-len', type=int, default=15, show_default=True, help="minimum sequence length")
@click.option('--max-seq-len', type=int, default=512, show_default=True, help="maximum sequence length")
def train(data_dir, vocab_size, min_seq_len, max_seq_len):
	"""Clean the raw text, train the BPE vocabulary and write the TFRecord files."""
	# The --max-seq-len help text previously read "minimum sequence length".
	# glob returns files in arbitrary order; sort so the processed corpus (and
	# everything derived from it) is the same on every run.
	text_files = sorted(glob.glob((data_dir + "/*.txt")))
	process_text(text_files)
	train_byte_pair_encoding(vocab_size)
	create_tf_records(min_seq_len, max_seq_len)
	print("Pre-processing is done............")


# Entry point: invoke the click command defined above when this code is run directly.
# NOTE(review): click reads its options from the process arguments; when this cell
# is executed inside a notebook kernel those are presumably the kernel's own
# arguments - confirm, or call the pipeline functions directly instead.
if __name__ == "__main__":
	train()


# Train GPT-2 Model

In [None]:
import glob
import json



from data_pipeline import input_fn
from gpt2_model import *

# Project root for the log and model directories.
# ``__file__`` only exists when this code runs as a script/module; inside a
# notebook kernel it is undefined, so fall back to the current working directory
# instead of failing with NameError.
try:
	_ROOT = os.path.abspath(os.path.dirname(__file__))
except NameError:
	_ROOT = os.path.abspath(os.getcwd())
LOG_DIR = _ROOT + "/log"  # handed to model.create_summary_writer
MODEL_DIR = _ROOT + "/model"  # checkpoints and model_par.json



#num_layers = 8
#embedding_size = 768
#num_heads = 8
#dff =3072
#max_seq_len = 515 
#vocab_size = 24512
#optimizer = "adam"
#batch_size = 8
#learning_rate = 0.001
#graph_mode = False
#distributed = False


def train(num_layers = 8, embedding_size = 768, num_heads = 8, dff = 3072, max_seq_len = 515, vocab_size = 24512,
          optimizer = "adam", batch_size = 8, learning_rate = 0.001, graph_mode = False, distributed = False):
	"""Build a ``Gpt2`` model and fit it on the TFRecord files under ``_ROOT/data/tf_records``.

	The model hyper-parameters are saved to ``MODEL_DIR/model_par.json``. The
	record files are sorted and the first 85% (rounded down) are used for
	training, the rest for testing.

	Args:
		num_layers, embedding_size, num_heads, dff, max_seq_len, vocab_size:
			passed positionally to ``Gpt2`` and stored in ``model_par.json``
			(``embedding_size`` is stored under the key ``"d_model"``).
		optimizer: forwarded to ``Gpt2`` as ``optimizer``.
		batch_size: forwarded to ``input_fn``; in distributed mode also stored
			on the model as ``global_batch_size``.
		learning_rate: forwarded to ``Gpt2`` as ``learning_rate``.
		graph_mode: second argument of ``model.fit``.
		distributed: when true, build the model under a ``MirroredStrategy``
			scope and distribute both datasets.
	"""
	par_map = {"num_layers": num_layers, "d_model": embedding_size,
	           "num_heads": num_heads, "dff": dff,
	           "max_seq_len": max_seq_len, "vocab_size": vocab_size}

	# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
	os.makedirs(MODEL_DIR, exist_ok=True)

	with open(MODEL_DIR + '/model_par.json', 'w') as f:
		json.dump(par_map, f)

	# glob returns files in arbitrary order; sort so the split is reproducible.
	tf_records = sorted(glob.glob((_ROOT + "/data/tf_records/*.tfrecord")))
	train_percent = int(len(tf_records) * (85 / 100))

	print("No. of tf records:- ", len(tf_records))
	train_tf_records = tf_records[:train_percent]
	test_tf_records = tf_records[train_percent:]

	train_dataset = input_fn(train_tf_records, batch_size=batch_size)
	test_dataset = input_fn(test_tf_records, batch_size=batch_size)

	def _build_model():
		# Create the model and its optimizer, checkpoint manager and summary writer.
		built = Gpt2(num_layers, embedding_size, num_heads, dff, max_seq_len, vocab_size,
		             optimizer=optimizer, learning_rate=learning_rate)
		built.create_optimizer()
		built.create_checkpoint_manager(MODEL_DIR)
		built.create_summary_writer(LOG_DIR)
		return built

	if distributed:
		mirrored_strategy = tf.distribute.MirroredStrategy()
		train_dataset = mirrored_strategy.experimental_distribute_dataset(train_dataset)
		test_dataset = mirrored_strategy.experimental_distribute_dataset(test_dataset)

		# The model and everything it creates are built inside the strategy scope.
		with mirrored_strategy.scope():
			model = _build_model()

		model.mirrored_strategy = mirrored_strategy
		model.global_batch_size = tf.cast(batch_size, tf.float32)
	else:
		model = _build_model()

	model.fit([train_dataset, test_dataset], graph_mode)
	print("===============>>>>>>>>>>>>>>>Done!11")


# Entry point: start training with every default argument of train() above.
if __name__ == "__main__":
	train()

# Sequence generation

In [None]:
# This script generates sequences from a trained model.
from sample import SequenceGenerator
import click


@click.command()
@click.option('--model-path', type=str, default="./model", show_default=True, help="Model Path")
@click.option('--model-param', type=str, default="./model/model_par.json", show_default=True, help="Model Parm")
@click.option('--vocab', type=str, default="./data/bpe_model.model", show_default=True, help="Vocab")
@click.option('--seq-len', type=int, default=512, show_default=True, help="seq_len")
@click.option('--temperature', type=float, default=1.0, show_default=True, help="Sampling temperature (logits are divided by it)")
@click.option('--top-k', type=int, default=8, show_default=True, help="Top-k value passed to the sampler")
@click.option('--top-p', type=float, default=0.9, show_default=True, help="Cumulative probability cutoff for nucleus sampling")
@click.option('--nucleus_sampling', type=bool, default=False, show_default=True, help="Enable nucleus (top-p) sampling")
@click.option('--context', type=str, default="sample context", show_default=True, help="Context given to model")
def sequence_gen(model_path, model_param, vocab, seq_len, temperature, top_k, top_p, nucleus_sampling, context):
	"""Load a trained model and print a sequence sampled from the given context."""
	# The help texts of --temperature, --top-k, --top-p and --nucleus_sampling all
	# read "seq_len" before (copy-paste); each now describes its own option.
	sg = SequenceGenerator(model_path, model_param, vocab)
	sg.load_weights()
	generated_seq = sg.sample_sequence(context,
									   seq_len=seq_len,
									   temperature=temperature,
									   top_k=top_k,
									   top_p=top_p,
									   nucleus_sampling=nucleus_sampling)
	print("<<<<<<<<<===================================Sample===================================>>>>>>>>>>>>>>\n\n " + generated_seq)


# Entry point: invoke the click command defined above when this code is run directly.
# NOTE(review): click reads its options from the process arguments; when this cell
# is executed inside a notebook kernel those are presumably the kernel's own
# arguments - confirm, or use SequenceGenerator directly instead.
if __name__ == "__main__":
	sequence_gen()
