In [42]:
import sentencepiece as spm
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset, load_from_disk
import re

# Model

In [43]:
torch.manual_seed(42)

class GPT(nn.Module):
    def __init__(self):
        super().__init__()

        self.vocab_size = 4096
        self.attention_window = 1024
        self.nheads = 12
        self.d_model = self.nheads * 32
        self.mlp_size = 4 * self.d_model
        self.n_attention_layers = 6
        self.dropout = 0.1
        self.activation = "gelu"

        self.embed = nn.Embedding(self.vocab_size, self.d_model)
        self.pos = nn.Parameter(torch.zeros(1, self.attention_window, self.d_model))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.d_model,
            nhead=self.nheads,
            dim_feedforward=self.mlp_size,
            activation= self.activation,
            dropout=self.dropout,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer,
            num_layers=self.n_attention_layers,
            norm=nn.LayerNorm(self.d_model),
        )

        self.out = nn.Linear(self.d_model, self.vocab_size, bias=False)
        self.out.weight = self.embed.weight
        
    def forward(self, x):
        x = self.embed(x) + self.pos[:, :x.size(1)]
        mask = self._causal_mask(x.size(1), x.device)
        x = self.transformer(x, mask=mask)
        return self.out(x)

    def _causal_mask(self, size, device):
        mask = torch.triu(torch.ones(size, size, device=device), diagonal=1)
        mask = mask.masked_fill(mask == 1, float('-inf'))
        return mask

In [44]:
model = GPT()
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
model.to(device)

total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters : {total_params}")

cuda
Total number of parameters : 12613632


# Heavy tuning

## Load the dataset

In [None]:
gutenberg = load_dataset("manu/project_gutenberg", split="fr", streaming=False)
gutenberg.save_to_disk("gutenberg")

## tokenize

In [None]:
ds = load_from_disk("gutenberg")
with open("gutenberg.txt", "w", encoding="utf-8") as f:
    for ex in ds:
        txt = ex.get("text") or ex.get("content") or ""
        f.write(txt + "\n")

In [4]:
spm.SentencePieceTrainer.train(
    input="gutenberg.txt",
    model_prefix="tok",
    vocab_size=model.vocab_size,
    model_type="unigram",
    character_coverage=0.999,
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
    input_sentence_size=2_000_000,
    train_extremely_large_corpus=True,
    shuffle_input_sentence = True,
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: gutenberg.txt
  input_format: 
  model_prefix: tok
  model_type: UNIGRAM
  vocab_size: 4096
  self_test_sample_size: 0
  character_coverage: 0.999
  input_sentence_size: 2000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 1
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 1
  bos_id: 2
  eos_id: 3
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  diffe

## train

In [45]:
sp = spm.SentencePieceProcessor(model_file="tok.model")

In [41]:
seq_in, seq_out = model.attention_window, model.attention_window // 2
stride = seq_out
criterion = torch.nn.CrossEntropyLoss()

log_path = "gutenberg.log"
with open(log_path, "w", encoding="utf-8") as log:
    log.write("epoch,step,loss\n")

def stream_batches(path, sp, seq_in, seq_out, batch_size=16):
    buf = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            ids = sp.encode(line.strip(), out_type=int)
            buf.extend(ids + [sp.eos_id()])
            while len(buf) >= seq_in + seq_out:
                chunk = buf[: seq_in + seq_out]
                buf = buf[seq_out:]
                X = torch.tensor(chunk[:seq_in], dtype=torch.long)
                Y = torch.tensor(chunk[seq_in:], dtype=torch.long)
                yield X, Y

for epoch, lr in enumerate([1e-4, 3e-5]):

    optimizer = AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=0,
    )

    model.train()
    total_loss = 0
    step = 0
    batch_X, batch_Y = [], []

    for X, Y in stream_batches("gutenberg.txt", sp, seq_in, seq_out):
        batch_X.append(X)
        batch_Y.append(Y)
        if len(batch_X) == 16:
            Xb = torch.stack(batch_X).to(device)
            Yb = torch.stack(batch_Y).to(device)
            batch_X.clear()
            batch_Y.clear()
            step += 1

            optimizer.zero_grad()
            logits = model(Xb)
            loss = criterion(
                logits[:, -seq_out:, :].reshape(-1, model.vocab_size),
                Yb.reshape(-1)
            )
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            if step % 100 == 0:
                avg_loss = total_loss / step
                with open(log_path, "a", encoding="utf-8") as log:
                    log.write(f"{epoch+1},{step},{avg_loss:.10f}\n")
    
    torch.save(model.state_dict(), f"gpt_gutenberg_epoch{epoch}.pt")

KeyboardInterrupt: 

In [54]:
model.load_state_dict(torch.load("gpt_gutenberg_epoch1.pt", map_location=device))

<All keys matched successfully>

In [55]:
model.eval()
temperature = 0.8

starters = []
for X, _ in stream_batches("gutenberg.txt", sp, seq_in, seq_out):
    starters.append(X.tolist())
    if len(starters) >= 3:
        break

for i, subset in enumerate(starters):
    subset = subset.copy()
    x = torch.tensor([subset], dtype=torch.long, device=device)
    for _ in range(1000):
        x = torch.tensor([subset[-model.attention_window:]], dtype=torch.long, device=device)
        with torch.no_grad():
            logits = model(x)
        probs = torch.softmax(logits[0, -1] / temperature, dim=-1)
        next_id = torch.multinomial(probs, 1).item()
        subset.append(next_id)
    print(f"\n[SAMPLE {i+1}]\n", sp.decode(subset), "\n")



[SAMPLE 1]
 The Project Gutenberg EBook of Journal des Goncourt (Deuxième série, premier volume), by Edmond de Goncourt This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: Journal des Goncourt (Deuxième série, premier volume) Mémoires de la vie littéraire Author: Edmond de Goncourt Release Date: December 6, 2005 [EBook  ⁇ 17238] Language: French Character set encoding: ISO-8859-1 *** START OF THIS PROJECT GUTENBERG EBOO ⁇  JOURNAL DES GONCOURT *** Produced by Carlo Traverso, Mireille Harmelin and the Online Distributed Proofreaders of Europe. This file was produced from images generously made available by the Bibliothèque nationale de France (BnF/Gallica) at http://gallica.bnf.fr. JOURNAL DES GONCOURT --MÉMOIRES DE LA VIE LITTÉRAIRE-- DEUXI ⁇ ME SÉRIE PREMIER VOLUME 1870-1871 PARIS, BI

# Fine tuning

## Prepare the dataset

In [56]:
with open("chateau.txt", "r", encoding="utf-8") as f:
    txt = f.read()

# remove page numbers
txt = re.sub(r"–\s*\d+\s*–\n", "", txt)

# fix split words
txt = re.sub(r"-\n", "", txt)

# remove line breaks
txt = re.sub(r"\n", " ", txt)

# use a single type of -
txt = re.sub(r"–", "-", txt)

with open("clean.txt", "w", encoding="utf-8") as f:
    f.write(txt)

print(len(set(txt.split())))

15494


In [57]:
sp = spm.SentencePieceProcessor(model_file="tok.model")
tokens = sp.encode(txt, out_type=int)

In [58]:
seq_in, seq_out = model.attention_window, model.attention_window // 2
stride = seq_out

inputs, targets = [], []
for i in range(0, len(tokens) - seq_in - seq_out, stride):
    chunk = tokens[i : i + seq_in + seq_out]
    inputs.append(chunk[:seq_in])
    targets.append(chunk[seq_in:])

X = torch.tensor(inputs, dtype=torch.long)
Y = torch.tensor(targets, dtype=torch.long)

dataset = TensorDataset(X, Y)
loader = DataLoader(dataset, batch_size=16, shuffle=True)

criterion = torch.nn.CrossEntropyLoss()

for epoch, lr in enumerate([1e-5] * 10):

    print(f"Epoch {epoch+1}")

    optimizer = AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=0,
    )
    
    model.train()
    total_loss = 0
    for Xb, Yb in loader:
        Xb, Yb = Xb.to(device), Yb.to(device)

        optimizer.zero_grad()
        logits = model(Xb)
        loss = criterion(
            logits[:, -seq_out:, :].reshape(-1, model.vocab_size),
            Yb.reshape(-1)
        )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

torch.save(model.state_dict(), f"gpt_chateau.pt")

Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10


In [59]:
model.eval()
temperature = 0.8

for i in range(3):
    start = i * model.attention_window
    end = start + model.attention_window
    subset = tokens[start:end]
    x = torch.tensor([subset], dtype=torch.long)
    for _ in range(1000):
        x = torch.tensor([subset[-model.attention_window:]], dtype=torch.long, device=device)
        with torch.no_grad():
            logits = model(x)
        probs = torch.softmax(logits[0, -1] / temperature, dim=-1)
        next_id = torch.multinomial(probs, 1).item()
        subset.append(next_id)
    print("\n[SAMPLE]", sp.decode(subset), "\n") 


[SAMPLE] Il était tard lorsque  ⁇ . arriva. Une neige épaisse couvrait le village. La colline était cachée par la brume et par la nuit, nul rayon de lumière n’indiquait le grand Château.  ⁇ . resta longtemps sur le pont de bois qui menait de la grand-route au village, les yeux levés vers ces hauteurs qui semblaient vides. Puis il alla chercher un gîte ; les gens de l’auberge n’étaient pas encore au lit ; on n’avait pas de chambre à louer, mais, surpris et déconcerté par ce client qui venait si tard, l’aubergiste lui proposa de le faire coucher sur une paillasse dans la salle.  ⁇ . accepta. Il y avait encore là quelques paysans attablés autour de leurs chopes, mais, ne voulant parler à personne, il alla chercher lui-même la paillasse au grenier et se coucha près du poêle. Il faisait chaud, les paysans se taisaient, il les regarda encore un peu entre ses paupières fatiguées puis s’endormit. Mais il ne tarda pas à être réveillé ; l’aubergiste se tenait debout à son chevet en compagnie d’