In [None]:
# import requests

# url = "https://archive.org/stream/AchebeChinuaThingsFallApart/Achebe%20Chinua%20-%20Things%20Fall%20Apart_djvu.txt"
# text = requests.get(url).text

# # Basic cleaning
# start = text.find("CHAPTER ONE")
# end = text.rfind("The End") if "The End" in text else len(text)
# novel_text = text[start:end]

# with open("things_fall_apart.txt", "w") as f:
#     f.write(novel_text)


In [None]:
# with open('things_fall_apart.txt', 'r', encoding='utf-8') as f:
#   text = f.read()

In [1]:
from google.colab import drive
drive.mount('/content/drive')#force_remount=True)

Mounted at /content/drive


In [5]:
import glob

files = glob.glob('/content/drive/MyDrive/achebe_texts/*.txt')


all_texts = []
for f in files:
    with open(f, 'r', encoding='utf-8') as infile:
        text = infile.read()

        # basic cleaning
        text = text.replace('\n', ' ')  # remove line breaks
        text = " ".join(text.split())   # collapse multiple spaces
        all_texts.append(f"<|book_start|>\n{text}\n<|book_end|>\n")

# save combined dataset
with open("achebe_corpus.txt", "w", encoding="utf-8") as outfile:
    outfile.write("\n".join(all_texts))

In [6]:
with open('achebe_corpus.txt', 'r', encoding='utf-8') as f:
  text = f.read()

In [7]:
len(text)

2027360

In [None]:
# import os
# os.makedirs(('bpe_tokenizer'))

# from tokenizers import ByteLevelBPETokenizer

# tokenizer = ByteLevelBPETokenizer()

# # train tokenizer on text
# tokenizer.train(files="achebe_corpus.txt", vocab_size=5000, min_frequency=2, special_tokens=[
#     "<s>", "<pad>", "</s>", "<unk>", "<mask>",
# ])

# # save vocab + merges
# tokenizer.save_model("bpe_tokenizer")

['bpe_tokenizer/vocab.json', 'bpe_tokenizer/merges.txt']

In [None]:
import math
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 32
block_size = 512
max_iters = 10000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 8
n_layer = 8
dropout = 0.1
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# with open('things_fall_apart.txt', 'r', encoding='utf-8') as f:
#     text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string


# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# total training steps = max_iters
def get_lr(it):
    # 1) linear warmup for first warmup_iters steps
    warmup_iters = 1000
    # 2) cosine decay down to min learning rate
    lr_min = 1e-5
    lr_max = 3e-4
    # 3) calculate
    if it < warmup_iters:
        return lr_max * it / warmup_iters
    if it > max_iters:
        return lr_min
    decay_ratio = (it - warmup_iters) / (max_iters - warmup_iters)
    decay_ratio = min(1, decay_ratio)
    # cosine decay
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return lr_min + coeff * (lr_max - lr_min)


for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        lr = get_lr(iter)
        print(f"step {iter}: lr = {lr:.6f}, train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # generate a sample
        prompt = "Okonkwo was "
        x = torch.tensor(encode(prompt), dtype=torch.long, device=device)[None, ...]
        out = m.generate(x, max_new_tokens=100)
        print(decode(out[0].tolist()).replace("\n", " "))

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()

    # update lr with scheduler
    lr = get_lr(iter)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    optimizer.step()

10.824303 M parameters
step 0: train loss 4.8680, val loss 4.8665
step 500: train loss 2.1493, val loss 2.1073
step 1000: train loss 1.6581, val loss 1.6232
step 1500: train loss 1.4702, val loss 1.4494
step 2000: train loss 1.3593, val loss 1.3518
step 2500: train loss 1.2869, val loss 1.2937
step 3000: train loss 1.2394, val loss 1.2591
step 3500: train loss 1.1973, val loss 1.2386
step 4000: train loss 1.1674, val loss 1.2180
step 4500: train loss 1.1429, val loss 1.2044
step 4999: train loss 1.1186, val loss 1.1925
Okonkwo was cleaning officering into Eduka’s important of, too hear the man hands. But there were quickly two crows of station that would not see the myself for white orubs my shinious most opinion action, the ga


In [None]:
prompt = "Okonkwo was preparing for the feast when "
x = torch.tensor(encode(prompt), dtype=torch.long, device=device)[None, ...]
out = m.generate(x, max_new_tokens=1000)
print(decode(out[0].tolist()))

Okonkwo was preparing for the feast when they had yet since the end yet. Okonkwo received the money’s mother he had come for spiciout—the ear and spat into its finger. When he finally left his Nkechokwu ignored his flute, had treachering highe ripe in funeir intervolunt that he took his interpretexity. Ogbuildine, Ugoye, Ugoye, was Many humanueduna. But some thing he had indeed history at the return which had forced on a breast but lowly remained for his friend in rag whether for his evening that day we were the kind of the primindships. It was the fact of the boys of the Omagazine darted! You shook our doright, why, then your brew that was daily astraightening. And I said to indioguaning the English stories and the world is now perfect that.' As it was heard wanting their fells and genomed people gets away. After anyone woulding finds that statement were or senior naturally. 'You might believary coool friends,' says those who had been costep—was impiring in split conceration. That wou

**Notebook Summary**

This notebook demonstrates the process of training a character-level language model on a corpus of Chinua Achebe's novels.

1.  **Data Acquisition and Preparation:**
    *   Initially, an attempt was made to download "Things Fall Apart" from archive.org, clean it, and save it to a file (`epYC9gn1cEPN`). This step was commented out.
    *   The commented-out code also included reading the saved text file (`KtMZ1AjXdKPE`).
    *   Google Drive was mounted to access local files (`UoAwuNJb3j6Q`).
    *   Text files from a specified Google Drive folder (`/content/drive/MyDrive/achebe_texts/`) were read, cleaned (removing line breaks and collapsing spaces), and combined into a single file named `achebe.txt`. Each book was marked with `<|book_start|>` and `<|book_end|>` tokens (`XGI36V7l4r2b`).
    *   The combined text from `achebe.txt` was read into a variable (`q-4gcnWCcUPE`).
    *   The first 1000 characters of the combined text were displayed to verify the content (`XoCkXyB_c4LA`).

2.  **Tokenization (Attempted):**
    *   Code to train a Byte Pair Encoding (BPE) tokenizer was included but commented out. This step would have created a tokenizer based on the combined text (`ibfUTk2gtnhI`).

3.  **Language Model Training:**
    *   A character-level Bigram Language Model (a simple transformer model) was defined using PyTorch (`Xe_KBhIRiTY_`).
    *   Hyperparameters for the model were set (batch size, block size, learning rate, etc.).
    *   A character vocabulary and mapping between characters and integers were created.
    *   The combined text data was split into training and validation sets.
    *   Functions were defined to get data batches and estimate the loss.
    *   The transformer model architecture was defined, including `Head` (self-attention head), `MultiHeadAttention`, `FeedFoward`, and `Block`.
    *   The `BigramLanguageModel` class was defined, incorporating token and position embeddings, transformer blocks, and a linear output layer.
    *   The model was initialized and moved to the appropriate device (GPU if available, otherwise CPU). The number of model parameters was printed.
    *   An AdamW optimizer was created.
    *   The model was trained for a specified number of iterations, with loss reported periodically on both training and validation sets.

4.  **Text Generation:**
    *   After training, text was generated from the trained model using a prompt "Okonkwo was " (`Xe_KBhIRiTY_`).
    *   Another text generation was performed with the prompt "Okonkwo was preparing for the feast when " (`wZpn3x07DuCR`).
    *   The generated text from the second prompt was stored in a variable named `generation` (`1sNj5DEw4O84`).

This notebook successfully demonstrates the process of loading, preparing, and using a custom transformer model for character-level text generation.