In [1]:
# Mount Google Drive at /content/gdrive so files stored on the Drive are
# reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Root folder of the project on the mounted Drive; the data, checkpoint and
# output paths used in this notebook are all built from it.
project_path='/content/gdrive/MyDrive/Colab Notebooks/Golpo Kothok GPT'

In [3]:
import sys
# Add the project folder to sys.path so that modules stored there (utils.py) can be imported.
# project_path is defined in an earlier cell and points at the project folder on the mounted Drive.
# The membership test keeps the cell idempotent: re-running it does not add a duplicate entry.
if project_path not in sys.path:
    sys.path.append(project_path)

In [4]:
import torch
import torch.nn as nn

In [5]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load Data

In [6]:
import os
# Location of the training corpus, relative to project_path.
file_name = "data/training_data.txt"
dataset_file_path = os.path.join(project_path, file_name)

# Read the whole corpus into memory as one UTF-8 decoded string.
with open(dataset_file_path, "r", encoding="utf-8") as f:
    text_data = f.read()

# Report the corpus size in characters.
print("Total number of character:", len(text_data))

Total number of character: 12633030


### **Define GPT Configuration**

In [7]:
# Hyper-parameters passed to GPTModel when the model is constructed.
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 512, # Context length in tokens (GPT-2's original context length is 1024; this notebook uses 512)
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False,       # Query-Key-Value bias
    "key_dim": 768,         # NOTE(review): presumably the query/key projection width read by utils.GPTModel - confirm
    "val_dim": 768          # NOTE(review): presumably the value projection width read by utils.GPTModel - confirm
}

### Tokenize using `tiktoken`

In [8]:
import tiktoken
# tiktoken's "gpt2" encoding; this tokenizer object is shared by the encoding,
# decoding and generation cells of the notebook.
tokenizer = tiktoken.get_encoding("gpt2")

### Encode tokens

In [9]:
# Compare the size of the corpus in characters with its size in tokens.
total_characters = len(text_data)
# The full corpus is tokenized here only to count the resulting ids.
total_tokens = len(tokenizer.encode(text_data))

print("Characters:", total_characters)
print("Tokens:", total_tokens)


Characters: 12633030
Tokens: 3049258


### Define the token sequence for stopping detection

In [10]:
# Token-id sequence of the end-of-story marker, used for stopping detection:
# generation is halted once the produced ids end with this sequence.
# The marker is kept in one place so the printed label always names the string
# that was actually encoded (the label previously said "<|book_end|>" while the
# encoded text is "<|end|>").
END_MARKER = "<|end|>"
END_SEQUENCE = tokenizer.encode(END_MARKER, allowed_special={
    '<|start|>',
    '<|end|>',
    '<|title|>',
    '</|title|>'
})

print(f"{END_MARKER} token sequence: {END_SEQUENCE}")
print(f"Length: {len(END_SEQUENCE)} tokens")

<|book_end|> token sequence: [27, 91, 437, 91, 29]
Length: 5 tokens


In [11]:
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor with a leading batch axis.

    Args:
        text: String to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of token ids.

    Returns:
        Tensor built from the encoded ids with an extra dimension of size 1
        inserted at position 0.
    """
    # Story markers are passed as allowed_special on every encode call.
    special_tokens = {'<|start|>', '<|end|>', '<|title|>', '</|title|>'}
    ids = tokenizer.encode(text, allowed_special=special_tokens)
    return torch.tensor(ids).unsqueeze(0)



def token_ids_to_text(token_ids, tokenizer):
    """Decode a batched tensor of token ids back into a string.

    Args:
        token_ids: Tensor of token ids; a leading dimension of size 1 is
            squeezed away before decoding.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the flattened id list.

    Raises:
        ValueError: If ``token_ids`` is ``None``.
    """
    if token_ids is not None:
        # Drop the batch axis and hand a plain Python list to the tokenizer.
        id_list = token_ids.squeeze(0).tolist()
        return tokenizer.decode(id_list)
    raise ValueError("token_ids is None! Check your text_to_token_ids function.")

### Dataset and Dataloader

In [12]:
from torch.utils.data import Dataset, DataLoader


class GPTDataset(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenized once; every sample is a pair of equally long id
    tensors where the target is the input window shifted right by one token.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and cut it into (input, target) windows.

        Args:
            txt: Full text to tokenize.
            tokenizer: Object exposing ``encode(text, allowed_special=...)``.
            max_length: Number of tokens in every window.
            stride: Offset, in tokens, between the starts of two consecutive
                windows (a stride smaller than ``max_length`` makes windows overlap).
        """
        # Story markers are passed as allowed_special when encoding.
        special_tokens = {'<|start|>', '<|end|>', '<|title|>', '</|title|>'}
        ids = tokenizer.encode(txt, allowed_special=special_tokens)

        # Window starts stop early enough that the shifted target still fits.
        window_starts = range(0, len(ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` pair at position ``idx``."""
        inputs = self.input_ids[idx]
        targets = self.target_ids[idx]
        return inputs, targets


def create_dataloader(txt, batch_size=100, max_length=512,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window samples over ``txt``.

    Args:
        txt: Text to tokenize and window.
        batch_size: Samples per mini-batch.
        max_length: Window length in tokens, forwarded to ``GPTDataset``.
        stride: Offset between consecutive windows, forwarded to ``GPTDataset``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``torch.utils.data.DataLoader`` wrapping a ``GPTDataset``.
    """
    # A fresh "gpt2" tokenizer is created on each call; the notebook-level
    # tokenizer object is not consulted here.
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDataset(txt, gpt2_tokenizer, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

### Load Model

In [13]:
# Restore model weights from a saved checkpoint file
def load_checkpoint(model,checkpoint_path):
    """Load the ``'model_state_dict'`` entry of a checkpoint into ``model``.

    Args:
        model: Module whose ``load_state_dict`` receives the stored weights.
        checkpoint_path: Path of the checkpoint file read with ``torch.load``.

    Only the model weights are restored here; no other checkpoint entry is read.
    The file is deserialized onto the CPU with ``weights_only=True``.
    """
    cpu = torch.device("cpu")
    saved = torch.load(checkpoint_path, map_location=cpu, weights_only=True)
    weights = saved['model_state_dict']
    model.load_state_dict(weights)
    print('Model Loaded Successfully')




### Generate Function

In [14]:


def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None,eos_sequence=None):
    """Autoregressively extend ``idx`` by up to ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping a (batch, seq) tensor of token ids to logits
            of shape (batch, seq, vocab).
        idx: (batch, seq) tensor of token ids used as the starting context.
        max_new_tokens: Upper bound on the number of tokens appended.
        context_size: Only the last ``context_size`` ids are fed to the model.
        temperature: When > 0 the logits are divided by it and the next token
            is sampled from the softmax; otherwise the arg-max token is taken.
        top_k: When given, all logits below the k-th largest one (per row) are
            set to -inf before sampling / arg-max.
        eos_id: Optional single token id. Generation stops, without appending
            the token, as soon as every row of the batch produces it.
        eos_sequence: Optional sequence of token ids (e.g. the ids of the
            end-of-story marker). Generation stops once the first row of
            ``idx`` ends with this sequence; the sequence itself is kept in
            the result. Only row 0 is inspected.

    Returns:
        (batch, seq + n) tensor with the original context followed by the
        ``n <= max_new_tokens`` generated ids.
    """
    # Normalise once so lists, tuples and other sequences all compare equal;
    # an empty sequence disables the check.
    eos_tail = list(eos_sequence) if eos_sequence is not None else []
    eos_len = len(eos_tail)

    for _ in range(max_new_tokens):
        # Multi-token stop check. It runs BEFORE the forward pass so that no
        # model call is spent on a token that would be discarded, and it only
        # converts the last eos_len ids instead of the whole sequence.
        if eos_len and idx.size(1) >= eos_len:
            if idx[0, -eos_len:].tolist() == eos_tail:
                break

        # Crop the context to what the model supports and keep only the
        # logits of the last time step.
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # (batch_size, vocab_size)

        # Filter logits with top_k sampling
        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the trailing axis so the per-row threshold broadcasts
            # against (batch_size, vocab_size) for any batch size.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            # Temperature scaling followed by sampling from the distribution.
            probs = torch.softmax(logits / temperature, dim=-1)  # (batch_size, vocab_size)
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)
        else:
            # Greedy decoding: vocabulary entry with the highest logit.
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        # Single-token EOS: stop early without appending the EOS token.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break

        # Append the new token to the running sequence.
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx

In [15]:
def generate_story(test_no,model, tokenizer, output_dir, start_context=None, max_new_tokens=50, temperature=0.0, top_k=None):
    """Generate one story, print it, and save it as a text file.

    Args:
        test_no: Run identifier; only used in the output file name.
        model: Model passed to ``generate``. The number of rows of
            ``model.pos_emb.weight`` is used as the context size.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        output_dir: Directory receiving the output file; created if missing.
        start_context: Prompt text. ``None`` or ``""`` falls back to the
            ``"<|start|>"`` marker.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature forwarded to ``generate``.
        top_k: Top-k cut-off forwarded to ``generate``.

    The decoded text is passed through ``format_text``, printed, and written to
    ``{output_dir}/output_no_{test_no}_{temperature}_{top_k}_{max_new_tokens}.txt``.
    Generation stops early when the ids end with ``END_SEQUENCE``.
    """
    import os  # local import keeps this cell independent of import order

    context_size = model.pos_emb.weight.shape[0]

    if start_context is None or start_context == "":
        start_context = "<|start|>"

    # Encode for EVERY prompt: previously this ran only inside the fallback
    # branch above, so a caller-supplied prompt raised UnboundLocalError.
    # The ids are placed on the device that holds the model weights.
    start_context_encoded = text_to_token_ids(start_context, tokenizer).to(model.pos_emb.weight.device)

    with torch.no_grad():
        token_ids = generate(
            model=model, idx=start_context_encoded,
            max_new_tokens=max_new_tokens, context_size=context_size,
            temperature=temperature, top_k=top_k,
            eos_sequence=END_SEQUENCE
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)

    formatted_text = format_text(decoded_text)
    print(formatted_text)

    # Make sure the target folder exists before writing.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(
        output_dir,
        f"output_no_{test_no}_{temperature}_{top_k}_{max_new_tokens}.txt",
    )
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(formatted_text)





In [16]:
import re
import textwrap

def format_text(text, width=80):
    """Strip marker tags from ``text`` and wrap every non-blank line.

    Args:
        text: Raw decoded text, possibly containing ``<|tag|>`` / ``</|tag|>`` markers.
        width: Maximum line width handed to ``textwrap.fill``.

    Returns:
        The text with all markers removed; lines that contain non-whitespace
        characters are re-wrapped to ``width``, blank or whitespace-only lines
        are kept exactly as they are.
    """
    # Remove all <|tag|> and </|tag|> tokens
    without_tags = re.sub(r'</?\|[^|]+\|>', '', text)

    return '\n'.join(
        textwrap.fill(row, width=width) if row.strip() else row
        for row in without_tags.split('\n')
    )


In [17]:
from utils import GPTModel
# Build the model from the configuration and move it to the selected device.
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
# This notebook only runs inference, so the model is put into eval mode
# unconditionally (previously this happened only when the checkpoint was
# found, leaving dropout active on the fallback path).
model.eval()

checkpoint_dir = os.path.join(project_path, 'gpt_checkpoints')

checkpoint_file = 'checkpoint_V1_2_epoch_12_step_7931.pt'

checkpoint_path = os.path.join(checkpoint_dir, checkpoint_file)

# Restore the trained weights when the checkpoint file is present; otherwise
# the model keeps its freshly initialised weights and a notice is printed.
if os.path.exists(checkpoint_path):
    load_checkpoint(model, checkpoint_path)
else:
    print('Model Not Exists')

Model Loaded Successfully


In [None]:
# The triple-quoted string below is disabled code (a list of sample prompts and a
# prompt index). It is evaluated as a bare string expression and has no effect.
# NOTE(review): inside it there is no comma after "A boy had a monkey", so if the
# list were re-enabled that entry would be concatenated with the multi-line prompt
# that follows it. That prompt also closes its title with <|/title|>, whereas the
# tokenizer helpers in this notebook list '</|title|>'.
'''
prompt = [
    "Once a boy was walking",
    "There was a woman ",
    "A man found a dead body ",
    "A king can sacrifice for his people and kingdom.",
    "There was a city known for criminal activities.",
    "Deep in the jungle, an explorer discovered",
    "There was a village",
    "There was a jungle ",
    "A boy had a monkey"

    """<|start|>
<|title|>The Lost Treasure<|/title|>

Deep in the jungle, an explorer discovered"""
]

prompt_no=0
'''
# Folder that receives one text file per generated story.
output_dir= project_path+'/output'


# Number of stories to generate in this run.
total_test=10

# No start_context is passed, so every story starts from generate_story's
# default prompt; the loop index is used as the run number in the file name.
for test in range(total_test):
  generate_story(test,model, tokenizer, output_dir, max_new_tokens=20000, temperature=1.2, top_k=3) # 1.2, 3




The Unyielding Flame of Courage

Once upon a time, in a small town called Willow, nestled between the rolling
hills and the lush greenery, the lush green mountains of Willowbrook. The town
was home to a lush, unassuming, and the beautiful Victorian town of Elmswood.
The town was a tight, green place where the rich lived in opulence and the air
was thick with the scent of mystery. The town was known for its rich rich and
the town, where it cherished, with people lived their lives in harmony and their
tight-knit community.

At the heart of the town stood a magnificent clockmaker, the Willowbrook was the
town's mayor, Sarah, a renowned crumbled to the town's beloved mayor. Sarah was
a kind-hearted artist, and she loved hearing the admiration of her loving life -
the perfect mechanic and the paranormal investigator. Her younger daughter,
Sarah, was known for her wit and withered redoufields, and her two often
provided her the town a cleaner daughter named Sarah.

Despite the challenges,