In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import os
import pickle

### Reading Data

In [None]:
# Load the whole training corpus into memory as one string.
with open("data.txt", mode='r', encoding='utf-8') as corpus_file:
    text_data = corpus_file.read()

In [None]:
# Preview only the start of the corpus; echoing the entire file floods the
# notebook output and bloats the saved .ipynb.
text_data[:500]

### Converting text to integer token sequences

In [None]:
# Word-level tokenizer with the vocabulary capped via num_words=5000; the
# '<OOV>' token stands in for words outside the kept vocabulary.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=5000,
    oov_token='<OOV>',
)
# Build the word index from the single corpus string.
tokenizer.fit_on_texts([text_data])

In [None]:
# Show the tokenizer object's repr as the cell output.
tokenizer

In [None]:
# texts_to_sequences yields one id list per input text; exactly one text is
# passed in, so unpack that single result.
(sequence,) = tokenizer.texts_to_sequences([text_data])

In [None]:
# Preview the first token ids instead of dumping the entire sequence.
sequence[:20]

In [None]:
# Number of token ids produced from the corpus.
len(sequence)

In [None]:
# Persist the fitted tokenizer so the inference cells can reload the same
# word <-> id mapping.
with open("tokenizer.pkl", mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

### Creating X and y data

In [None]:
max_seq_length = 100


def create_dataset(seq, window_size=max_seq_length):
    """Slice a token sequence into overlapping next-token training pairs.

    For every start offset ``i`` the input window is
    ``seq[i : i + window_size]`` and its label window is the same span shifted
    one token to the right.

    Args:
        seq: Sliceable sequence of token ids.
        window_size: Number of tokens in each window.

    Returns:
        Tuple ``(inputs, labels)`` of NumPy arrays, each built from
        ``len(seq) - window_size`` windows (no windows at all when the
        sequence is not longer than ``window_size``).
    """
    starts = range(len(seq) - window_size)
    windows = [seq[start:start + window_size] for start in starts]
    targets = [seq[start + 1:start + window_size + 1] for start in starts]
    return np.array(windows), np.array(targets)

In [None]:
# Build the (input window, shifted label window) arrays from the full corpus.
x_data, y_data = create_dataset(sequence, window_size=max_seq_length)

In [None]:
# Number of training windows.
len(x_data)

In [None]:
# First input window.
x_data[0]

In [None]:
# First label window: the first input window shifted one token ahead.
y_data[0]

### Creating positional encoding

In [None]:
class PositionalEncoding(layers.Layer):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    A table of shape ``(1, max_len, d_model)`` is precomputed once: even
    feature columns hold ``sin`` and odd columns hold ``cos`` of
    ``pos / 10000 ** (2 * (i // 2) / d_model)``. The layer has no trainable
    weights.

    Args:
        max_len: Number of positions to precompute.
        d_model: Size of the feature axis of the tensors this layer receives.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
            Accepting them is required so Keras can rebuild the layer from
            its saved config when a model is loaded.
    """

    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can serialize the constructor arguments.
        self.max_len = max_len
        self.d_model = d_model

        pos = np.arange(max_len)[:, np.newaxis]
        i = np.arange(d_model)[np.newaxis, :]
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        angle_rads = pos * angle_rates
        # Even feature columns get sin, odd feature columns get cos.
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        # Leading axis of size 1 lets the table broadcast over the batch.
        self.pos_encoding = tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)

    def call(self, x):
        """Return ``x`` plus the encodings for its first ``tf.shape(x)[1]`` positions."""
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]

    def get_config(self):
        """Return the layer config including ``max_len`` and ``d_model``.

        Without these entries a saved model cannot be deserialized, because
        Keras re-instantiates the layer by calling ``cls(**config)``.
        """
        config = super().get_config()
        config.update({"max_len": self.max_len, "d_model": self.d_model})
        return config

### Creating one Transformer block (following the *Attention Is All You Need* paper)

In [None]:
def transformer_block(embed_dim, num_heads, ff_dim, dropout=0.1):
    """Build one decoder-style Transformer block as a Keras functional model.

    The block applies causal multi-head self-attention followed by a
    position-wise feed-forward network. Each sub-layer output goes through
    dropout, is added to its input (residual connection) and is then
    layer-normalized.

    Args:
        embed_dim: Size of the last axis of the input and output tensors.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        dropout: Dropout rate applied to each sub-layer output.

    Returns:
        A ``tf.keras.Model`` mapping a ``(batch, seq, embed_dim)`` tensor to a
        tensor of the same shape.
    """
    inputs = layers.Input(shape=(None, embed_dim))

    # NOTE(review): key_dim is the size of EACH head, so every head is
    # embed_dim wide here; the paper uses embed_dim // num_heads. Left as-is
    # to keep the weight shapes unchanged - confirm whether this is intended.
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    # The causal mask is essential for next-token training: the labels are the
    # inputs shifted by one, so without it each position could attend to the
    # very token it is asked to predict. (`use_causal_mask` needs TF >= 2.10.)
    attn_output = attention(inputs, inputs, use_causal_mask=True)
    attn_output = layers.Dropout(dropout)(attn_output)
    out1 = layers.LayerNormalization(epsilon=1e-6)(inputs + attn_output)

    ffn = tf.keras.Sequential([
        layers.Dense(ff_dim, activation='relu'),
        layers.Dense(embed_dim),
    ])

    ffn_output = ffn(out1)
    ffn_output = layers.Dropout(dropout)(ffn_output)
    out2 = layers.LayerNormalization(epsilon=1e-6)(out1 + ffn_output)

    return tf.keras.Model(inputs=inputs, outputs=out2)

### Creating the MiniGPT architecture by stacking multiple Transformer blocks

In [None]:
# --- Model shape (read as globals by build_gpt_model) ---
vocab_size = 5000   # embedding rows and size of the softmax output
max_seq_len = 100   # fixed input length of the model
embed_dim = 256     # embedding width / Transformer feature size
num_heads = 8       # attention heads per block
ff_dim = 1024       # hidden width of each feed-forward network
# NOTE(review): 96 stacked blocks is very deep for a "mini" model - confirm
# this is intended before training.
num_layers = 96

# --- Training settings (passed to model.fit) ---
batch_size = 32
epoch = 10

def build_gpt_model():
    """Assemble the MiniGPT network from the notebook-level hyper-parameters.

    Token ids are embedded, positional encodings are added, the result is
    passed through ``num_layers`` Transformer blocks, and a softmax layer
    produces a distribution over ``vocab_size`` tokens at every position.

    Returns:
        An uncompiled ``tf.keras.Model`` taking inputs of shape
        ``(batch, max_seq_len)``.
    """
    token_ids = layers.Input(shape=(max_seq_len,))

    embed_tokens = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
    add_positions = PositionalEncoding(max_seq_len, embed_dim)
    hidden = add_positions(embed_tokens(token_ids))

    # Each iteration creates a fresh block with its own weights.
    for _block_index in range(num_layers):
        block = transformer_block(embed_dim, num_heads, ff_dim)
        hidden = block(hidden)

    to_vocab = layers.Dense(vocab_size, activation='softmax')
    return tf.keras.Model(token_ids, to_vocab(hidden))

In [None]:
# Instantiate the network, then attach optimizer, loss and metric.
# The sparse loss is used because the labels are integer token ids.
model = build_gpt_model()
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

### Training the MiniGPT on custom data

In [None]:
# Train on the windowed corpus; 10% of the samples are held out for validation.
model.fit(
    x_data,
    y_data,
    batch_size=batch_size,
    epochs=epoch,
    validation_split=0.1,
)

In [None]:
# Write the trained model to disk; the testing cell reloads this file.
model.save('gpt_test_model.h5')

In [None]:
# Print the model summary.
model.summary()

### Testing

In [None]:
import pickle

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Context length used at inference; same value the model was built with.
max_seq_len = 100

# Reload the trained network. The custom layer class has to be supplied so
# Keras can resolve it while deserializing.
model = tf.keras.models.load_model(
    "gpt_test_model.h5",
    custom_objects={"PositionalEncoding": PositionalEncoding},
)

# Restore the tokenizer that was pickled after fitting.
with open("tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [None]:
def generate_text(seed_text, model, tokenizer, num_tokens=50, temperature=1.0, max_len=None):
    """Autoregressively extend ``seed_text`` with words predicted by ``model``.

    The prompt is tokenized once; on every step the most recent ``max_len``
    token ids are left-padded with zeros, fed to the model, and the
    distribution at the last time step is used to pick the next token.

    Args:
        seed_text: Prompt to continue.
        model: Object whose ``predict(batch, verbose=0)`` returns per-position
            token probabilities of shape ``(1, max_len, vocab)``.
        tokenizer: Object providing ``texts_to_sequences`` and ``index_word``.
        num_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature. Values ``<= 0`` select the most
            probable token (greedy decoding) instead of dividing by zero.
        max_len: Context length fed to the model. Defaults to the
            notebook-level ``max_seq_len``.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. Generation stops early when the chosen token id has no entry
        in ``tokenizer.index_word``.
    """
    if max_len is None:
        max_len = max_seq_len  # notebook-level context length

    # Tokenize once and extend the id list directly; re-tokenizing the growing
    # text on every step is redundant work.
    token_ids = list(tokenizer.texts_to_sequences([seed_text])[0])
    generated_words = []

    for _ in range(num_tokens):
        context = token_ids[-max_len:]  # Trim to max length

        # Left-pad with 0 so the newest token sits at the last time step
        # (the same layout pad_sequences produces with its defaults).
        padded_seq = np.zeros((1, max_len), dtype='int32')
        if context:
            padded_seq[0, -len(context):] = context

        # Prediction for the last time step only.
        probs = np.asarray(model.predict(padded_seq, verbose=0)[0, -1], dtype='float64')

        if temperature <= 0:
            next_token_id = int(np.argmax(probs))
        else:
            # Temperature sampling; subtracting the max keeps exp() stable.
            scaled = np.log(probs + 1e-9) / temperature
            scaled -= np.max(scaled)
            weights = np.exp(scaled)
            next_token_id = int(np.random.choice(len(weights), p=weights / np.sum(weights)))

        next_word = tokenizer.index_word.get(next_token_id, '')
        if next_word == '':
            # Unknown id (e.g. padding id 0): stop before appending anything,
            # so no trailing space ends up in the result.
            break

        token_ids.append(next_token_id)
        generated_words.append(next_word)

    return ' '.join([seed_text] + generated_words)

In [None]:
# Generate a continuation for a sample prompt and print it.
seed_text = "who is sudhanshu"
generated = generate_text(seed_text, model, tokenizer, num_tokens=50, temperature=1.0)

# The label previously contained mojibake ("üìù") where the UTF-8 emoji had
# been mis-decoded; restored to the intended character.
print("📝 Generated Text:")
print(generated)