Project Shakespear

In [1]:
import torch

  cpu = _conversion_method_template(device=torch.device("cpu"))


## Downloading the data

In [2]:
import requests

url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
text = requests.get(url).text

print("Length of dataset:", len(text))
text[:500]


ModuleNotFoundError: No module named 'requests'

 ## Tokenisation

For this project, we wish to train a Language Model. Such model learns how to predict the next unit of text (word or character). Here, we choose a character-level model because the vocabulary is smaller. It is also simpler and learns raw linguistic structure. However, it is harder for the model to learn semantics.

Before training the model, we need to convert the text into numerical tokens.
Here, one token = one character.

In [3]:
#Retrieves all the set of characters in text, converts them into a list, and sorts the list.
chars = sorted(list(set(text)))
vocab_size = len(chars)


#Creates two maps (to go from text to integers and vice versa).
stoi = {ch: i for i, ch in enumerate(chars)} #string -> int
itos = {i: ch for i, ch in enumerate(chars)} #int -> string

# For encoding and decoding
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)


## Creating the batches

We split the dataset in (random) batches.

**Remarks:**

- block_size is the length of the "context" seen by the model. Each batch will contain only this number of characters. block_size should be chosen carefully as:
    1. A too short context means loss of information.
    2. A too long context makes the model heavy and slower.

- batch_size is the number of sequences (samples) processed in parallel during training. As seen during the lessons, dividing the data into batches is necessary for different reasons, in particular  performance. Indeed, to evaluate a module on a sample, both the module's parameters and the sample must be copied into cache memory, which is fast but small. Memory transfers are slower than computation. Batch processing allows us to cut down to one copy of the parameters to the cache per batch. 

- Randomly choosing the sequences avoids that the model sees always the same sequences. It improves generalization, increases the diversity of the examples, avoids learning consecutive sequences.

In [4]:
import torch
import torch.nn as nn

block_size = 128
batch_size = 64

def get_batch(split):
    #Split data into training set and testing/validation set
    data_split = data[:int(0.9*len(data))] if split=="train" else data[int(0.9*len(data)):]
    
    #Randomly selects the initial indices for each batch.
    ix = torch.randint(len(data_split) - block_size, (batch_size,))
    # We store the inputs to the model
    x = torch.stack([data_split[i:i+block_size] for i in ix])
    #Stores the input batches shifted by one index. This corresponds to the desired prediction/targets/labels (corresponds to next-character prediction learning).
    y = torch.stack([data_split[i+1:i+block_size+1] for i in ix])
    return x, y


# Implémentation du Transformer

### Self-attention

In [5]:
class Head(nn.Module):
    def __init__(self, head_size, n_embd):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)

        weights = q @ k.transpose(-2, -1) * C**-0.5
        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        weights = torch.softmax(weights, dim=-1)

        v = self.value(x)
        out = weights @ v
        return out


In [6]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size, n_embd):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, n_embd) for _ in range(num_heads)])
        self.proj = nn.Linear(num_heads * head_size, n_embd)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.proj(out)


In [7]:
class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
        )

    def forward(self, x):
        return self.net(x)


In [8]:
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size, n_embd)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


# Modèle complet

In [9]:
n_embd = 128
n_head = 8
n_layer = 6

class TransformerLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T))
        x = tok_emb + pos_emb

        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits

        loss = nn.functional.cross_entropy(
            logits.view(-1, vocab_size), 
            targets.view(-1)
        )
        return logits, loss


# Entraînement


Nous pouvons analyser l'évolution de la fonction loss par rapport à la quantité d'entraînement effectuée.

- optimizer Adaw - > Expliquer pourquoi

In [11]:
model = TransformerLanguageModel()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

max_steps = 6000
losses=[]

for step in range(max_steps):
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    losses.append(loss)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 500 == 0:
        print(f"Step {step}/{max_steps} - loss: {loss.item():.4f}")

Step 0/6000 - loss: 4.3003
Step 500/6000 - loss: 2.3235
Step 1000/6000 - loss: 2.0010
Step 1500/6000 - loss: 1.8090
Step 2000/6000 - loss: 1.6581
Step 2500/6000 - loss: 1.5803
Step 3000/6000 - loss: 1.4747
Step 3500/6000 - loss: 1.4740
Step 4000/6000 - loss: 1.4226
Step 4500/6000 - loss: 1.3389
Step 5000/6000 - loss: 1.3633
Step 5500/6000 - loss: 1.3283


# Génération de texte

In [12]:
def generate(model, start="O God, O God!", max_new_tokens=300):
    """Inputs:
    - model
    - start
    - max_new_tokens"""

    model.eval()
    idx = torch.tensor(encode(start), dtype=torch.long)[None, :]

    for _ in range(max_new_tokens):
        logits = model(idx[:, -block_size:])
        logits = logits[:, -1, :]
        probs = torch.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, 1)
        idx = torch.cat((idx, next_id), dim=1)

    return decode(idx[0].tolist())

print(generate(model, start="So brave of you", max_new_tokens=600))


So brave of your chides: I'll hear my goodness;
But cure among time, like a weagh close, rafter
From the next welcome, amonsel I had,
For my slavin and your ignorance:
Awhile, let's tendering injustance, term thou, as your faol,
be to wine fair own accurte's business.

HENRY BOLINGBROKE:
Make his comfort, be rest you woman,
What some water thou legs ear of joints,
Banishamable Edwart his head-feased under
Blood having resprainy, knindest hath these Trouble grace,
Ereck a frush of weak strengther's queence;
And falow in my possess' well at leave me
His mostingue.

GREMIO:
If with your joints and my back accus


### Training with n=8000 steps

In [None]:
#Training on 8000

model = TransformerLanguageModel()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

max_steps = 8000
losses=[]

for step in range(max_steps):
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    losses.append(loss)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 500 == 0:
        print(f"Step {step}/{max_steps} - loss: {loss.item():.4f}")

In [None]:
#Generation of text

def generate(model, start="O God, O God!", max_new_tokens=300):
    """Inputs:
    - model
    - start
    - max_new_tokens"""

    model.eval()
    idx = torch.tensor(encode(start), dtype=torch.long)[None, :]

    for _ in range(max_new_tokens):
        logits = model(idx[:, -block_size:])
        logits = logits[:, -1, :]
        probs = torch.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, 1)
        idx = torch.cat((idx, next_id), dim=1)

    return decode(idx[0].tolist())

print(generate(model, start="So brave of you", max_new_tokens=600))