In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import sentencepiece as spm
from tqdm import tqdm

In [13]:
with open("text_corpus.txt", "w") as f:
    for line in texts:
        f.write(line + "\n")

spm.SentencePieceTrainer.train(input="text_corpus.txt", model_prefix="bpe", vocab_size=5000)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: text_corpus.txt
  input_format: 
  model_prefix: bpe
  model_type: UNIGRAM
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differe

In [14]:
sp = spm.SentencePieceProcessor(model_file="bpe.model")
vocab_size = sp.get_piece_size()
print("Taille du vocabulaire SentencePiece :", vocab_size)

Taille du vocabulaire SentencePiece : 5000


In [15]:
tokens = [sp.encode(text, out_type=int) for text in texts]
token_to_idx = {i: i for i in range(vocab_size)}
idx_to_token = {i: sp.id_to_piece(i) for i in range(vocab_size)}

In [16]:
class TextDataset(Dataset):
    def __init__(self, texts, seq_length=50):
        self.seq_length = seq_length
        self.tokens = [sp.encode(text, out_type=int) for text in texts]

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        token_seq = self.tokens[idx]
        if len(token_seq) > self.seq_length + 1:
            token_seq = token_seq[:self.seq_length + 1]
        if len(token_seq) < self.seq_length + 1:
            token_seq += [0] * (self.seq_length + 1 - len(token_seq))

        input_ids = torch.tensor(token_seq[:-1], dtype=torch.long)
        target_ids = torch.tensor(token_seq[1:], dtype=torch.long)
        return input_ids, target_ids

In [17]:
seq_length = 50
batch_size = 32

dataset = TextDataset(texts, seq_length)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [18]:
class TextGenerator(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, model_type="LSTM"):
        super(TextGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.model_type = model_type

        if model_type == "LSTM":
            self.rnn = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        else:
            self.rnn = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)

        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embedding(x)
        if hidden is None:
            output, hidden = self.rnn(x)
        else:
            output, hidden = self.rnn(x, hidden)
        output = self.fc(output)
        return output, hidden

In [19]:
embed_size = 256
hidden_size = 512
num_layers = 2
model_type = "LSTM"

model = TextGenerator(vocab_size, embed_size, hidden_size, num_layers, model_type)

In [20]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [21]:
def train_loop(model, train_loader, optimizer, criterion, epochs=10):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

        for input_ids, target_ids in progress_bar:
            input_ids = torch.clamp(input_ids, 0, vocab_size - 1)
            target_ids = torch.clamp(target_ids, 0, vocab_size - 1)

            optimizer.zero_grad()
            output, _ = model(input_ids)
            loss = criterion(output.view(-1, vocab_size), target_ids.view(-1))
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            progress_bar.set_postfix(loss=loss.item())

        print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}")

In [22]:
train_loop(model, train_loader, optimizer, criterion, epochs=30)

Epoch 1/30: 100%|██████████| 313/313 [01:07<00:00,  4.65it/s, loss=6.18]


Epoch 1, Loss: 6.763984471464309


Epoch 2/30: 100%|██████████| 313/313 [01:09<00:00,  4.53it/s, loss=5.96]


Epoch 2, Loss: 6.059464445510231


Epoch 3/30: 100%|██████████| 313/313 [01:08<00:00,  4.60it/s, loss=5.73]


Epoch 3, Loss: 5.680186483425835


Epoch 4/30: 100%|██████████| 313/313 [01:01<00:00,  5.07it/s, loss=5.37]


Epoch 4, Loss: 5.400537746782882


Epoch 5/30: 100%|██████████| 313/313 [01:00<00:00,  5.19it/s, loss=4.97]


Epoch 5, Loss: 5.158731810962811


Epoch 6/30: 100%|██████████| 313/313 [01:04<00:00,  4.88it/s, loss=4.89]


Epoch 6, Loss: 4.937488930674787


Epoch 7/30: 100%|██████████| 313/313 [01:07<00:00,  4.64it/s, loss=4.75]


Epoch 7, Loss: 4.7269295823459805


Epoch 8/30: 100%|██████████| 313/313 [01:18<00:00,  4.01it/s, loss=4.49]


Epoch 8, Loss: 4.525204314210544


Epoch 9/30: 100%|██████████| 313/313 [01:05<00:00,  4.79it/s, loss=4.51]


Epoch 9, Loss: 4.329140427013556


Epoch 10/30: 100%|██████████| 313/313 [01:10<00:00,  4.44it/s, loss=4.01]


Epoch 10, Loss: 4.137776145538964


Epoch 11/30: 100%|██████████| 313/313 [01:06<00:00,  4.72it/s, loss=4.17]


Epoch 11, Loss: 3.9513134781164103


Epoch 12/30: 100%|██████████| 313/313 [00:59<00:00,  5.23it/s, loss=3.93]


Epoch 12, Loss: 3.769791711252718


Epoch 13/30: 100%|██████████| 313/313 [00:58<00:00,  5.32it/s, loss=3.75]


Epoch 13, Loss: 3.590873859180048


Epoch 14/30: 100%|██████████| 313/313 [00:57<00:00,  5.46it/s, loss=3.53]


Epoch 14, Loss: 3.4165415215416077


Epoch 15/30: 100%|██████████| 313/313 [00:58<00:00,  5.34it/s, loss=3.21]


Epoch 15, Loss: 3.24602595761942


Epoch 16/30: 100%|██████████| 313/313 [00:59<00:00,  5.22it/s, loss=3.25]


Epoch 16, Loss: 3.0809773598996975


Epoch 17/30: 100%|██████████| 313/313 [01:03<00:00,  4.93it/s, loss=2.92]


Epoch 17, Loss: 2.920178698274655


Epoch 18/30: 100%|██████████| 313/313 [01:01<00:00,  5.07it/s, loss=2.9] 


Epoch 18, Loss: 2.764998540329857


Epoch 19/30: 100%|██████████| 313/313 [01:00<00:00,  5.17it/s, loss=2.71]


Epoch 19, Loss: 2.613964153174013


Epoch 20/30: 100%|██████████| 313/313 [00:58<00:00,  5.36it/s, loss=2.52]


Epoch 20, Loss: 2.4700150565979198


Epoch 21/30: 100%|██████████| 313/313 [00:58<00:00,  5.33it/s, loss=2.32]


Epoch 21, Loss: 2.329532549404108


Epoch 22/30: 100%|██████████| 313/313 [01:02<00:00,  5.00it/s, loss=2.34]


Epoch 22, Loss: 2.195609826630297


Epoch 23/30: 100%|██████████| 313/313 [00:58<00:00,  5.33it/s, loss=2.18]


Epoch 23, Loss: 2.0665937574526754


Epoch 24/30: 100%|██████████| 313/313 [00:59<00:00,  5.25it/s, loss=2.06]


Epoch 24, Loss: 1.9446976261017042


Epoch 25/30: 100%|██████████| 313/313 [00:58<00:00,  5.34it/s, loss=2.02]


Epoch 25, Loss: 1.8273235845108764


Epoch 26/30: 100%|██████████| 313/313 [00:59<00:00,  5.30it/s, loss=1.9] 


Epoch 26, Loss: 1.7156557499791105


Epoch 27/30: 100%|██████████| 313/313 [00:58<00:00,  5.31it/s, loss=1.66]


Epoch 27, Loss: 1.6086390795418248


Epoch 28/30: 100%|██████████| 313/313 [01:00<00:00,  5.21it/s, loss=1.61]


Epoch 28, Loss: 1.5055517891344552


Epoch 29/30: 100%|██████████| 313/313 [00:58<00:00,  5.31it/s, loss=1.55]


Epoch 29, Loss: 1.4071651864737367


Epoch 30/30: 100%|██████████| 313/313 [01:01<00:00,  5.12it/s, loss=1.23]

Epoch 30, Loss: 1.3174805203184914





In [23]:
def generate_text_bpe(model, tokenizer, start_text, max_length=50, temperature=1.0):
    model.eval()
    input_ids = torch.tensor([tokenizer.encode(start_text, out_type=int)], dtype=torch.long)
    hidden = None
    generated_tokens = input_ids.tolist()[0]

    with torch.no_grad():
        for _ in range(max_length):
            output, hidden = model(input_ids, hidden)
            probabilities = torch.nn.functional.softmax(output[:, -1, :] / temperature, dim=-1)
            next_token = torch.multinomial(probabilities, num_samples=1).item()
            generated_tokens.append(next_token)
            input_ids = torch.tensor([[next_token]])

    return tokenizer.decode(generated_tokens)

start_text = "The"
print(generate_text_bpe(model, sp, start_text, temperature=1.0))

The 2014 campaign passed a zoopated from 1873 1 in Florida last Toronto knows several new hot in Paris State Adxoinped in South Park fell on Boeah, the long-looked segi man since he failed to
