In [1]:
import re
import torch
import torch.nn as nn
import tiktoken



# Cloning repository and mounting drive

In [2]:
!git clone https://github.com/BARUD77/Homework2.git
%cd Homework2

fatal: destination path 'Homework2' already exists and is not an empty directory.
/content/Homework2


In [3]:
# Long-format listing of the current working directory (uses IPython's `ll` alias via automagic).
ll

total 4748
-rw-r--r-- 1 root    2775 Nov  4 09:31 artifact_simulation.py
-rw-r--r-- 1 root 2851073 Nov  4 09:31 cleaned_text.txt
-rw-r--r-- 1 root    1381 Nov  4 09:31 dataset.py
-rw-r--r-- 1 root    6371 Nov  4 09:31 extract_sheet_changes.py
-rw-r--r-- 1 root    8623 Nov  4 09:31 extract_txt.py
-rw-r--r-- 1 root   10377 Nov  4 09:31 frame_extractor.py
drwxr-xr-x 4 root    4096 Nov  4 09:32 [0m[01;34mHomework2[0m/
-rw-r--r-- 1 root   87694 Nov  4 09:31 Homework2.ipynb
-rw-r--r-- 1 root   83486 Nov  4 09:31 Homework2_regex.ipynb
-rw-r--r-- 1 root   94232 Nov  4 09:31 IFD_creation.ipynb
-rw-r--r-- 1 root   32209 Nov  4 09:31 LLM_Homework_1_Hermon_Teklesenbet_100064487.ipynb
-rw-r--r-- 1 root   91180 Nov  4 09:31 loss_plot.png
-rw-r--r-- 1 root 1527298 Nov  4 09:31 merged_corpus.txt
-rw-r--r-- 1 root    4928 Nov  4 09:31 model.py
drwxr-xr-x 2 root    4096 Nov  4 09:39 [01;34m__pycache__[0m/
-rw-r--r-- 1 root    7763 Nov  4 09:42 train.py
-rw-r--r-- 1 root    8860 Nov  4 09:31 Untitle

# Loading the cleaned and merged text file

In [4]:
# Read the whole corpus into memory. The encoding is pinned to UTF-8 so the
# decoded text does not depend on the platform's default locale encoding.
with open("cleaned_text.txt", "r", encoding="utf-8") as f:
    cleaned_text = f.read()


# Tokenization with both regex and BPE tokenizers

### Regex tokenizer

In [5]:
class SimpleTokenizer:
    """Punctuation/whitespace tokenizer backed by a fixed vocabulary.

    Text is split on the punctuation and whitespace characters matched by
    ``_SPLIT_PATTERN``; every non-blank piece becomes one token. A token's id
    is its position in ``vocab``. Tokens missing from the vocabulary are
    encoded as the id of ``"<|unk|>"``.
    """

    # Compiled once at class creation so encode/decode do not have to resolve
    # the pattern strings on every call.
    _SPLIT_PATTERN = re.compile(r"([.,:;?_!\"'()\[\]—\-\s])")
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.?!"()\'])')

    def __init__(self, vocab):
        """Build the token<->id tables from ``vocab``.

        Args:
            vocab: iterable of token strings; must contain ``"<|unk|>"``.

        Raises:
            KeyError: if ``"<|unk|>"`` is not part of ``vocab``.
        """
        self.tokens2ids = {tok: i for i, tok in enumerate(vocab)}
        self.ids2tokens = {i: tok for tok, i in self.tokens2ids.items()}
        if "<|unk|>" not in self.tokens2ids:
            # Same exception type as the plain dict lookup, but with a message
            # that says what is wrong with the vocabulary.
            raise KeyError("vocab must contain the '<|unk|>' token")
        self.unk_id = self.tokens2ids["<|unk|>"]  # required

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Blank pieces produced by the split are dropped; tokens that are not in
        the vocabulary map to ``self.unk_id``.
        """
        tokens = self._SPLIT_PATTERN.split(text)
        tokens = [t.strip() for t in tokens if t and t.strip()]
        return [self.tokens2ids.get(t, self.unk_id) for t in tokens]

    def decode(self, ids):
        """Return the text for ``ids``.

        Tokens are joined with single spaces, then the whitespace in front of
        ``, . ? ! " ( ) '`` is removed.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        text = " ".join(self.ids2tokens[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [6]:
import re
import unicodedata

# 0) (optional) clean non-printables to avoid odd tokens like '\x001'
def strip_nonprintable(s):
    """Return ``s`` with each character whose Unicode category starts with 'C' replaced by a space."""
    pieces = []
    for ch in s:
        # Categories beginning with 'C' are the Unicode "Other" group
        # (control, format, surrogate, private use, unassigned).
        in_other_group = unicodedata.category(ch).startswith('C')
        pieces.append(' ' if in_other_group else ch)
    return ''.join(pieces)

# Replace characters in the Unicode "C*" categories with spaces before tokenizing
cleaned_text = strip_nonprintable(cleaned_text)

# 1) split the corpus on punctuation/whitespace once and keep the non-blank, stripped pieces
preprocessed = list(
    filter(None, map(str.strip, re.split(r"([.,:;?_!\"'()\[\]—\-\s])", cleaned_text)))
)

# 2) special tokens, in the fixed order in which they lead the vocabulary
specials = ["<|pad|>", "<|bos|>", "<|eos|>", "<|unk|>"]

# 3) unique corpus tokens, minus any that collide with a special token
base_vocab = set(preprocessed).difference(specials)

# 4) final ordered vocab: specials first, then the corpus tokens in sorted order
vocab = [*specials, *sorted(base_vocab)]
vocab_size = len(vocab)
print("Vocab size =", vocab_size)

# 5) tokenizer bound to this fixed vocab
regextokenizer = SimpleTokenizer(vocab)

Vocab size = 29916


# Preparing the dataloader

In [8]:
from model import GPTModel

# Run with the regex tokenizer; `gpt2` is the flag passed on to the data
# loader and training helpers in the cells below.
gpt2 = False
tokenizer = regextokenizer

# Hyperparameters handed to GPTModel
GPT_CONFIG = dict(
    vocab_size=vocab_size,   # Vocabulary size
    context_length=256,      # Context length
    emb_dim=768,             # Embedding dimension
    n_heads=12,              # Number of attention heads
    n_layers=12,             # Number of layers
    drop_rate=0.1,           # Dropout rate
    qkv_bias=False,          # Query-Key-Value bias
)

# Instantiate the model once to report its parameter count
model = GPTModel(GPT_CONFIG)
total_params = sum(param.numel() for param in model.parameters())
print(f"Total number of parameters in the model (Millions): {total_params/1_000_000}")
print("-" * 100)

Total number of parameters in the model (Millions): 131.175936
----------------------------------------------------------------------------------------------------


In [9]:
from dataset import create_dataloader_v1
# Character-level split of the corpus: first 90% for training, the rest for validation
train_ratio = 0.90
split_idx = int(train_ratio * len(cleaned_text))
train_data, val_data = cleaned_text[:split_idx], cleaned_text[split_idx:]

torch.manual_seed(123)

# Arguments shared by both loaders. stride equals max_length
# (presumably non-overlapping windows — confirm in dataset.py).
shared_loader_args = dict(
    batch_size=2,
    max_length=GPT_CONFIG["context_length"],
    stride=GPT_CONFIG["context_length"],
    num_workers=0,
)

# Training loader: shuffled, incomplete final batch dropped
train_loader = create_dataloader_v1(
    train_data, gpt2, tokenizer,
    drop_last=True, shuffle=True, **shared_loader_args,
)

# Validation loader: fixed order, final batch kept
val_loader = create_dataloader_v1(
    val_data, gpt2, tokenizer,
    drop_last=False, shuffle=False, **shared_loader_args,
)

In [10]:
separator = "=" * 100

# Size of the training split in characters and in tokens under the active tokenizer
total_train_characters = len(train_data)
total_train_tokens = len(tokenizer.encode(train_data))
print("Train Characters :", total_train_characters)
print("Train Tokens :", total_train_tokens)

print(separator)

# Same figures for the validation split
total_val_characters = len(val_data)
total_val_tokens = len(tokenizer.encode(val_data))
print("Validation Characters:", total_val_characters)
print("Validation Tokens:", total_val_tokens)

# Show the (input, target) tensor shapes of the first batch from each loader
for heading, loader in (("Train loader:", train_loader), ("\nValidation loader:", val_loader)):
    print(separator)
    print(heading)
    for x, y in loader:
        print(x.shape, y.shape)
        break

Train Characters : 2506253
Train Tokens : 537969
Validation Characters: 278473
Validation Tokens: 61105
Train loader:
torch.Size([2, 256]) torch.Size([2, 256])

Validation loader:
torch.Size([2, 256]) torch.Size([2, 256])


# Pretraining

In [11]:
from train import train_model_simple
# Train on the GPU when one is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=4e-4, weight_decay=0.1)

# Single source of truth for the run length, so the epoch cap and the token
# budget below cannot drift apart.
NUM_EPOCHS = 30

# estimate a token budget: NUM_EPOCHS epochs worth
B, T = next(iter(train_loader))[0].shape  # (B, T)
tokens_per_epoch = len(train_loader) * B * T
target_tokens = NUM_EPOCHS * tokens_per_epoch

train_losses, val_losses, tokens_seen = train_model_simple(
    model, gpt2, train_loader, val_loader, optimizer, device,
    num_epochs=NUM_EPOCHS,         # large cap; early stopping will stop earlier
    eval_freq=200,                  # evaluate every N steps (tune to your speed)
    eval_iter=5,
    start_context="Finally, given the broad spectrum of capabilities displayed by GPT-3",
    tokenizer=tokenizer,
    # early stopping knobs
    early_stop=True, patience=5, min_delta=1e-3,
    save_path="checkpoints/regex/best.pt",
    use_plateau_lr=True, lr_factor=0.5, lr_patience=2, min_lr=1e-5,
    max_tokens_seen=target_tokens
)


KeyboardInterrupt: 

# Plotting training and validation losses

In [None]:
import matplotlib.pyplot as plt

def plot_losses(tokens_seen, train_losses, val_losses, save_path=None):
    """
    Draw the train and validation loss curves against tokens seen on one set of axes.

    tokens_seen: x values, one entry per logged evaluation
    train_losses, val_losses: y values; each must be as long as tokens_seen
    save_path: when truthy, the figure is also written to this path at 200 dpi

    Raises ValueError when the three sequences differ in length.
    """
    n_points = len(tokens_seen)
    if len(train_losses) != n_points or len(val_losses) != n_points:
        raise ValueError("tokens_seen, train_losses, val_losses must have same length")

    fig, ax = plt.subplots(figsize=(7, 4.5))
    for series, label in ((train_losses, "Train loss"), (val_losses, "Val loss")):
        ax.plot(tokens_seen, series, label=label)
    ax.set_xlabel("Tokens seen")
    ax.set_ylabel("Cross-entropy loss")
    ax.set_title("Training/Validation Loss vs Tokens")
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    if save_path:
        fig.savefig(save_path, dpi=200)
    plt.show()

# Plot the curves returned by train_model_simple and also write the figure to loss_plot.png.
# NOTE(review): these names are only bound once the training cell has finished; after an interrupted run they may be undefined.
plot_losses(tokens_seen, train_losses, val_losses, save_path="loss_plot.png")

# Loading the trained model later