In [None]:
%pip install datasets tensorflow tensorflow-addons tqdm


In [None]:
import json
import numpy as np
import tensorflow as tf

from tensorflow.keras.layers import Dense, Dropout, Embedding, LayerNormalization, MultiHeadAttention, TextVectorization
from pathlib import Path


In [None]:
SPECIALS = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[BOS]", "[EOS]"]
PAD, UNK, CLS, SEP, BOS, EOS = 0, 1, 2, 3, 4, 5

VOCAB_SIZE = 16000
ENC_LEN    = 384       
DEC_LEN    = 64       
BATCH_SIZE = 32
EPOCHS     = 3
EMBED_DIM  = 256
NUM_HEADS  = 8
FF_DIM     = 1024
LAYERS     = 4
LR         = 3e-4


In [None]:
import kagglehub

path = kagglehub.dataset_download("buildformacarov/squad-20")
print("Path to dataset files:", path)


In [None]:
from pathlib import Path

print(list(Path(path).iterdir()))


In [None]:
SQUAD_PATH = str(Path(path) / "train-v2.0.json")
print("Fichier SQuAD utilisé :", SQUAD_PATH)


In [None]:
def load_squad(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def extract_pairs(squad, keep_no_answer=False):
    contexts, questions, answers = [], [], []
    for article in squad["data"]:
        for para in article["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                is_impossible = qa.get("is_impossible", False)
                if is_impossible or len(qa.get("answers", [])) == 0:
                    if keep_no_answer:
                        contexts.append(context)
                        questions.append(qa["question"])
                        answers.append("no answer")
                    continue
                a0 = qa["answers"][0]["text"]
                contexts.append(context)
                questions.append(qa["question"])
                answers.append(a0)
    return contexts, questions, answers

def make_encoder_texts(contexts, questions):
    out = []
    for q, c in zip(questions, contexts):
        out.append(f"[CLS] {q} [SEP] {c} [SEP]")
    return out

def add_bos_eos_to_answers(answers):
    return [f"[BOS] {a} [EOS]" for a in answers]


In [None]:
def build_vectorizer(vocab_size, texts):
    vec = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize="lower_and_strip_punctuation",
        split="whitespace",
    )
    vec.adapt(tf.data.Dataset.from_tensor_slices(texts).batch(1024))

    base_vocab = vec.get_vocabulary()
    cleaned = [tok for tok in base_vocab if tok not in ["[CLS]", "[SEP]", "[BOS]", "[EOS]"]]
    final_vocab = cleaned[:2] + ["[CLS]", "[SEP]", "[BOS]", "[EOS]"] + cleaned[2:]
    final_vocab = final_vocab[:vocab_size]
    vec.set_vocabulary(final_vocab)

    return vec, final_vocab


In [None]:
def positional_encoding(maxlen, dim):
    pos = np.arange(maxlen)[:, None]
    i = np.arange(dim)[None, :]
    angle_rates = 1.0 / np.power(10000, (2 * (i // 2)) / np.float32(dim))
    angle_rads = pos * angle_rates
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    return tf.constant(angle_rads[None, ...], dtype=tf.float32)


In [None]:
class EncoderBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim // num_heads, dropout=rate)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="gelu"),
            Dense(embed_dim),
        ])
        self.n1 = LayerNormalization(epsilon=1e-6)
        self.n2 = LayerNormalization(epsilon=1e-6)
        self.d  = Dropout(rate)

    def call(self, x, padding_mask, training=False):
        h = self.n1(x)
        attn = self.mha(h, h, attention_mask=padding_mask, training=training)
        x = x + self.d(attn, training=training)

        h = self.n2(x)
        f = self.ffn(h, training=training)
        x = x + self.d(f, training=training)
        return x

class DecoderBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.self_mha  = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim // num_heads, dropout=rate)
        self.cross_mha = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim // num_heads, dropout=rate)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="gelu"),
            Dense(embed_dim),
        ])
        self.n1 = LayerNormalization(epsilon=1e-6)
        self.n2 = LayerNormalization(epsilon=1e-6)
        self.n3 = LayerNormalization(epsilon=1e-6)
        self.d  = Dropout(rate)

    def call(self, x, enc_out, look_ahead_mask, enc_padding_mask, training=False):
        h = self.n1(x)
        attn1 = self.self_mha(h, h, attention_mask=look_ahead_mask, training=training)
        x = x + self.d(attn1, training=training)

        h = self.n2(x)
        attn2 = self.cross_mha(h, enc_out, enc_out, attention_mask=enc_padding_mask, training=training)
        x = x + self.d(attn2, training=training)

        h = self.n3(x)
        f = self.ffn(h, training=training)
        x = x + self.d(f, training=training)
        return x


In [None]:
class TransformerSeq2Seq(tf.keras.Model):
    def __init__(self, vocab_size, enc_len, dec_len,
                 embed_dim=256, num_heads=8, ff_dim=1024, layers=4, rate=0.1):
        super().__init__()
        self.vocab_size = vocab_size
        self.enc_len = enc_len
        self.dec_len = dec_len
        self.embed_dim = embed_dim

        self.tok_emb = Embedding(vocab_size, embed_dim)
        self.pos_enc_enc = positional_encoding(enc_len, embed_dim)
        self.pos_enc_dec = positional_encoding(dec_len, embed_dim)

        self.enc_blocks = [EncoderBlock(embed_dim, num_heads, ff_dim, rate) for _ in range(layers)]
        self.dec_blocks = [DecoderBlock(embed_dim, num_heads, ff_dim, rate) for _ in range(layers)]

        self.lm_head = Dense(vocab_size)

    def make_padding_mask(self, ids):
        m = tf.not_equal(ids, PAD)
        return m[:, tf.newaxis, :]

    def make_look_ahead_mask(self, dec_ids):
        b = tf.shape(dec_ids)[0]
        s = tf.shape(dec_ids)[1]

        tri = tf.linalg.band_part(tf.ones((s, s), dtype=tf.bool), -1, 0)
        tri = tf.reshape(tri, (1, s, s))

        pad = tf.not_equal(dec_ids, PAD)
        pad = pad[:, tf.newaxis, :]
        pad = tf.tile(pad, [1, s, 1])

        return tf.logical_and(tri, pad)

    def call(self, inputs, training=False):
        enc_ids, dec_ids = inputs

        enc_pad_mask = self.make_padding_mask(enc_ids)
        dec_look_mask = self.make_look_ahead_mask(dec_ids)

        enc_x = self.tok_emb(enc_ids) + self.pos_enc_enc[:, :tf.shape(enc_ids)[1], :]
        for blk in self.enc_blocks:
            enc_x = blk(enc_x, enc_pad_mask, training=training)

        dec_x = self.tok_emb(dec_ids) + self.pos_enc_dec[:, :tf.shape(dec_ids)[1], :]
        for blk in self.dec_blocks:
            dec_x = blk(dec_x, enc_x, dec_look_mask, enc_pad_mask, training=training)

        logits = self.lm_head(dec_x)
        return logits


In [None]:
def masked_lm_loss(y_true, y_pred):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
    loss = loss_fn(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, PAD), tf.float32)
    loss = loss * mask
    return tf.reduce_sum(loss) / (tf.reduce_sum(mask) + 1e-8)

def greedy_decode(model, enc_ids, max_len, eos_id=EOS):
    b = tf.shape(enc_ids)[0]
    dec = tf.fill([b, 1], BOS)

    for _ in range(max_len - 1):
        logits = model((enc_ids, dec), training=False)
        next_id = tf.argmax(logits[:, -1, :], axis=-1, output_type=tf.int32)
        next_id = tf.expand_dims(next_id, axis=1)
        dec = tf.concat([dec, next_id], axis=1)
        if tf.reduce_all(tf.equal(next_id[:, 0], eos_id)):
            break

    return dec

def decode_ids_to_text(ids, vocab):
    out = []
    for seq in ids:
        words = []
        for t in seq:
            t = int(t)
            if t in (PAD, BOS):
                continue
            if t == EOS:
                break
            w = vocab[t] if t < len(vocab) else "[UNK]"
            if w in SPECIALS:
                continue
            words.append(w)
        out.append(" ".join(words))
    return out


In [None]:
def vectorize_fixed(vec, texts, seq_len):
    v = TextVectorization(
        vocabulary=vec.get_vocabulary(),
        output_mode="int",
        standardize="lower_and_strip_punctuation",
        split="whitespace",
        output_sequence_length=seq_len,
    )
    return v(tf.constant(texts))

def shift_decoder_inputs_targets(decoder_full):
    dec_in = decoder_full[:, :-1]
    dec_out = decoder_full[:, 1:]
    return dec_in, dec_out


In [None]:
squad = load_squad(SQUAD_PATH)
contexts, questions, answers = extract_pairs(squad, keep_no_answer=False)

enc_texts = make_encoder_texts(contexts, questions)
dec_texts = add_bos_eos_to_answers(answers)

vec, vocab = build_vectorizer(VOCAB_SIZE, enc_texts + dec_texts)

enc_ids = vectorize_fixed(vec, enc_texts, ENC_LEN).numpy().astype(np.int32)
dec_full = vectorize_fixed(vec, dec_texts, DEC_LEN).numpy().astype(np.int32)

dec_in, dec_out = shift_decoder_inputs_targets(dec_full)

ds = tf.data.Dataset.from_tensor_slices(((enc_ids, dec_in), dec_out))
ds = ds.shuffle(8192).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


In [None]:
model = TransformerSeq2Seq(
    vocab_size=len(vocab),
    enc_len=ENC_LEN,
    dec_len=DEC_LEN - 1,
    embed_dim=EMBED_DIM,
    num_heads=NUM_HEADS,
    ff_dim=FF_DIM,
    layers=LAYERS,
    rate=0.1,
)


In [None]:
import math
from tqdm.auto import tqdm

opt = tf.keras.optimizers.Adam(learning_rate=LR)

def train_step(x, y):
    with tf.GradientTape() as tape:
        logits = model(x, training=True)
        loss = masked_lm_loss(y, logits)
    grads = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    return loss

num_batches = math.ceil(len(enc_ids) / BATCH_SIZE)

for e in range(EPOCHS):
    print(f"\nEpoch {e+1}/{EPOCHS}")
    losses = []
    pbar = tqdm(total=num_batches, desc=f"Epoch {e+1}")
    for bx, by in ds:
        l = train_step(bx, by)
        losses.append(l)
        pbar.set_postfix({"loss": float(l)})
        pbar.update(1)
    pbar.close()
    print(f"mean loss = {float(tf.reduce_mean(losses)):.4f}")


In [None]:
sample_n = 3
sample_enc = enc_ids[:sample_n]
gen = greedy_decode(model, tf.constant(sample_enc), max_len=DEC_LEN)
texts = decode_ids_to_text(gen.numpy(), vocab)

for i in range(sample_n):
    print("\n---")
    print("Q:", questions[i])
    print("A (true):", answers[i])
    print("A (gen) :", texts[i])


In [None]:
# INSTALL
!pip install -q kagglehub tqdm

import json
import math
import numpy as np
import tensorflow as tf
from tqdm.auto import tqdm
from pathlib import Path
from tensorflow.keras.layers import Dense, Dropout, Embedding, LayerNormalization, MultiHeadAttention, TextVectorization

# HYPERPARAMS (version light)
SPECIALS = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[BOS]", "[EOS]"]
PAD, UNK, CLS, SEP, BOS, EOS = 0, 1, 2, 3, 4, 5

VOCAB_SIZE = 20000
ENC_LEN    = 256
DEC_LEN    = 48
BATCH_SIZE = 4
EPOCHS     = 1
EMBED_DIM  = 128
NUM_HEADS  = 4
FF_DIM     = 256
LAYERS     = 2
LR         = 3e-4

# DOWNLOAD SQUAD VIA KAGGLEHUB
import kagglehub

path = kagglehub.dataset_download("buildformacarov/squad-20")
print("Path to dataset files:", path)

print("Contenu du dossier :")
for p in Path(path).iterdir():
    print("  -", p.name)

SQUAD_PATH = str(Path(path) / "train-v2.0.json")
print("Fichier SQuAD utilisé :", SQUAD_PATH)

# DATA LOADING UTILS
def load_squad(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def extract_pairs(squad, keep_no_answer=False):
    contexts, questions, answers = [], [], []
    for article in squad["data"]:
        for para in article["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                is_impossible = qa.get("is_impossible", False)
                if is_impossible or len(qa.get("answers", [])) == 0:
                    if keep_no_answer:
                        contexts.append(context)
                        questions.append(qa["question"])
                        answers.append("no answer")
                    continue
                a0 = qa["answers"][0]["text"]
                contexts.append(context)
                questions.append(qa["question"])
                answers.append(a0)
    return contexts, questions, answers

def make_encoder_texts(contexts, questions):
    out = []
    for q, c in zip(questions, contexts):
        out.append(f"[CLS] {q} [SEP] {c} [SEP]")
    return out

def add_bos_eos_to_answers(answers):
    return [f"[BOS] {a} [EOS]" for a in answers]

# TOKENIZER
def build_vectorizer(vocab_size, texts):
    vec = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize="lower_and_strip_punctuation",
        split="whitespace",
    )
    vec.adapt(tf.data.Dataset.from_tensor_slices(texts).batch(1024))

    base_vocab = vec.get_vocabulary()
    cleaned = [tok for tok in base_vocab if tok not in ["[CLS]", "[SEP]", "[BOS]", "[EOS]"]]
    final_vocab = cleaned[:2] + ["[CLS]", "[SEP]", "[BOS]", "[EOS]"] + cleaned[2:]
    final_vocab = final_vocab[:vocab_size]
    vec.set_vocabulary(final_vocab)

    return vec, final_vocab

def vectorize_fixed(vec, texts, seq_len):
    v = TextVectorization(
        vocabulary=vec.get_vocabulary(),
        output_mode="int",
        standardize="lower_and_strip_punctuation",
        split="whitespace",
        output_sequence_length=seq_len,
    )
    return v(tf.constant(texts))

def shift_decoder_inputs_targets(decoder_full):
    dec_in = decoder_full[:, :-1]
    dec_out = decoder_full[:, 1:]
    return dec_in, dec_out

# POS ENCODING
def positional_encoding(maxlen, dim):
    pos = np.arange(maxlen)[:, None]
    i = np.arange(dim)[None, :]
    angle_rates = 1.0 / np.power(10000, (2 * (i // 2)) / np.float32(dim))
    angle_rads = pos * angle_rates
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    return tf.constant(angle_rads[None, ...], dtype=tf.float32)

# TRANSFORMER BLOCKS
class EncoderBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim // num_heads, dropout=rate)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="gelu"),
            Dense(embed_dim),
        ])
        self.n1 = LayerNormalization(epsilon=1e-6)
        self.n2 = LayerNormalization(epsilon=1e-6)
        self.d  = Dropout(rate)

    def call(self, x, padding_mask, training=False):
        h = self.n1(x)
        attn = self.mha(h, h, attention_mask=padding_mask, training=training)
        x = x + self.d(attn, training=training)
        h = self.n2(x)
        f = self.ffn(h, training=training)
        x = x + self.d(f, training=training)
        return x

class DecoderBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.self_mha  = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim // num_heads, dropout=rate)
        self.cross_mha = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim // num_heads, dropout=rate)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="gelu"),
            Dense(embed_dim),
        ])
        self.n1 = LayerNormalization(epsilon=1e-6)
        self.n2 = LayerNormalization(epsilon=1e-6)
        self.n3 = LayerNormalization(epsilon=1e-6)
        self.d  = Dropout(rate)

    def call(self, x, enc_out, look_ahead_mask, enc_padding_mask, training=False):
        h = self.n1(x)
        attn1 = self.self_mha(h, h, attention_mask=look_ahead_mask, training=training)
        x = x + self.d(attn1, training=training)
        h = self.n2(x)
        attn2 = self.cross_mha(h, enc_out, enc_out, attention_mask=enc_padding_mask, training=training)
        x = x + self.d(attn2, training=training)
        h = self.n3(x)
        f = self.ffn(h, training=training)
        x = x + self.d(f, training=training)
        return x

class TransformerSeq2Seq(tf.keras.Model):
    def __init__(self, vocab_size, enc_len, dec_len,
                 embed_dim=256, num_heads=8, ff_dim=1024, layers=4, rate=0.1):
        super().__init__()
        self.vocab_size = vocab_size
        self.enc_len = enc_len
        self.dec_len = dec_len
        self.embed_dim = embed_dim

        self.tok_emb = Embedding(vocab_size, embed_dim)
        self.pos_enc_enc = positional_encoding(enc_len, embed_dim)
        self.pos_enc_dec = positional_encoding(dec_len, embed_dim)

        self.enc_blocks = [EncoderBlock(embed_dim, num_heads, ff_dim, rate) for _ in range(layers)]
        self.dec_blocks = [DecoderBlock(embed_dim, num_heads, ff_dim, rate) for _ in range(layers)]

        self.lm_head = Dense(vocab_size)

    def make_padding_mask(self, ids):
        m = tf.not_equal(ids, PAD)
        return m[:, tf.newaxis, :]

    def make_look_ahead_mask(self, dec_ids):
        s = tf.shape(dec_ids)[1]
        tri = tf.linalg.band_part(tf.ones((s, s), dtype=tf.bool), -1, 0)
        tri = tf.reshape(tri, (1, s, s))
        pad = tf.not_equal(dec_ids, PAD)
        pad = pad[:, tf.newaxis, :]
        pad = tf.tile(pad, [1, s, 1])
        return tf.logical_and(tri, pad)

    def call(self, inputs, training=False):
        enc_ids, dec_ids = inputs
        enc_pad_mask = self.make_padding_mask(enc_ids)
        dec_look_mask = self.make_look_ahead_mask(dec_ids)

        enc_x = self.tok_emb(enc_ids) + self.pos_enc_enc[:, :tf.shape(enc_ids)[1], :]
        for blk in self.enc_blocks:
            enc_x = blk(enc_x, enc_pad_mask, training=training)

        dec_x = self.tok_emb(dec_ids) + self.pos_enc_dec[:, :tf.shape(dec_ids)[1], :]
        for blk in self.dec_blocks:
            dec_x = blk(dec_x, enc_x, dec_look_mask, enc_pad_mask, training=training)

        logits = self.lm_head(dec_x)
        return logits

# LOSS
def masked_lm_loss(y_true, y_pred):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
    loss = loss_fn(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, PAD), tf.float32)
    loss = loss * mask
    return tf.reduce_sum(loss) / (tf.reduce_sum(mask) + 1e-8)

# LOAD + SUBSAMPLE DATA
squad = load_squad(SQUAD_PATH)
contexts, questions, answers = extract_pairs(squad, keep_no_answer=False)

max_samples = 5000
idx = np.random.permutation(len(contexts))[:max_samples]
contexts = [contexts[i] for i in idx]
questions = [questions[i] for i in idx]
answers = [answers[i] for i in idx]

enc_texts = make_encoder_texts(contexts, questions)
dec_texts = add_bos_eos_to_answers(answers)

vec, vocab = build_vectorizer(VOCAB_SIZE, enc_texts + dec_texts)

enc_ids = vectorize_fixed(vec, enc_texts, ENC_LEN).numpy().astype(np.int32)
dec_full = vectorize_fixed(vec, dec_texts, DEC_LEN).numpy().astype(np.int32)

dec_in, dec_out = shift_decoder_inputs_targets(dec_full)

ds = tf.data.Dataset.from_tensor_slices(((enc_ids, dec_in), dec_out))
ds = ds.shuffle(4096).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# MODEL + TRAIN LOOP
model = TransformerSeq2Seq(
    vocab_size=len(vocab),
    enc_len=ENC_LEN,
    dec_len=DEC_LEN - 1,
    embed_dim=EMBED_DIM,
    num_heads=NUM_HEADS,
    ff_dim=FF_DIM,
    layers=LAYERS,
    rate=0.1,
)

opt = tf.keras.optimizers.Adam(learning_rate=LR)

def train_step(x, y):
    with tf.GradientTape() as tape:
        logits = model(x, training=True)
        loss = masked_lm_loss(y, logits)
    grads = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    return loss

num_batches = math.ceil(len(enc_ids) / BATCH_SIZE)

for e in range(EPOCHS):
    print(f"\nEpoch {e+1}/{EPOCHS}")
    losses = []
    pbar = tqdm(total=num_batches, desc=f"Epoch {e+1}")
    for bx, by in ds:
        l = train_step(bx, by)
        losses.append(l)
        pbar.set_postfix({"loss": float(l)})
        pbar.update(1)
    pbar.close()
    print(f"mean loss = {float(tf.reduce_mean(losses)):.4f}")


In [None]:
def greedy_decode(model, enc_ids, max_len, eos_id=EOS):
    b = tf.shape(enc_ids)[0]
    dec = tf.fill([b, 1], BOS)
    for _ in range(max_len - 1):
        logits = model((enc_ids, dec), training=False)
        next_id = tf.argmax(logits[:, -1, :], axis=-1, output_type=tf.int32)
        next_id = tf.expand_dims(next_id, axis=1)
        dec = tf.concat([dec, next_id], axis=1)
        if tf.reduce_all(tf.equal(next_id[:, 0], eos_id)):
            break
    return dec

def decode_ids_to_text(ids, vocab):
    out = []
    for seq in ids:
        words = []
        for t in seq:
            t = int(t)
            if t in (PAD, BOS):
                continue
            if t == EOS:
                break
            w = vocab[t] if t < len(vocab) else "[UNK]"
            if w in SPECIALS:
                continue
            words.append(w)
        out.append(" ".join(words))
    return out

sample_n = 5
sample_enc = enc_ids[:sample_n]
gen = greedy_decode(model, tf.constant(sample_enc), max_len=DEC_LEN)
texts = decode_ids_to_text(gen.numpy(), vocab)

for i in range(sample_n):
    print("\n---")
    print("Q:", questions[i])
    print("A (true):", answers[i])
    print("A (gen) :", texts[i])


//////////


In [3]:
# INSTALL
!pip install -q kagglehub tensorflow-addons sentencepiece

import os
import json
import datetime
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
import sentencepiece as spm

from pathlib import Path
from tensorflow.keras.layers import Dense, Dropout, Embedding, LayerNormalization, MultiHeadAttention
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard, TerminateOnNaN

# 1. HYPERPARAMÈTRES
VOCAB_SIZE   = 16000
ENC_LEN      = 256
DEC_LEN      = 64

EMBED_DIM = 384
HEADS     = 6
FF_DIM    = 1536
LAYERS    = 3

BATCH         = 8
EPOCHS        = 4
MAX_SAMPLES   = 8000
CV_MAX_SAMPLES = 2000
DROPOUT       = 0.1
K_FOLDS       = 3
LR            = 1e-4

# IDs spéciaux
PAD_ID = 0
BOS_ID = 1
EOS_ID = 2

BASE_DIR = Path(".")
LOG_DIR = BASE_DIR / "logs_squad_transformer"
CKPT_DIR = BASE_DIR / "checkpoints_squad"
SPM_MODEL_FILE = BASE_DIR / "squad_spm_16k.model"
SPM_VOCAB_FILE = BASE_DIR / "squad_spm_16k.vocab"

CKPT_DIR.mkdir(parents=True, exist_ok=True)
LOG_DIR.mkdir(parents=True, exist_ok=True)

# 2. DATASET SQuAD via kagglehub
import kagglehub

path = kagglehub.dataset_download("buildformacarov/squad-20")
print("Path to dataset files:", path)

SQUAD_PATH = str(Path(path) / "train-v2.0.json")
print("Using SQuAD file:", SQUAD_PATH)

def load_squad(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def extract_pairs(squad):
    contexts, questions, answers = [], [], []
    for art in squad["data"]:
        for para in art["paragraphs"]:
            c = para["context"]
            for qa in para["qas"]:
                if qa.get("is_impossible", False):
                    continue
                a = qa["answers"][0]["text"]
                contexts.append(c)
                questions.append(qa["question"])
                answers.append(a)
    return contexts, questions, answers

def build_encoder_text(q, c):
    return "question: " + q + " context: " + c

# 3. SENTENCEPIECE
def train_sentencepiece(corpus, model_file, vocab_size):
    tmp_txt = BASE_DIR / "corpus_squad.txt"
    with open(tmp_txt, "w", encoding="utf-8") as f:
        for line in corpus:
            f.write(line.replace("\n", " ") + "\n")

    spm.SentencePieceTrainer.Train(
        input=str(tmp_txt),
        model_prefix=str(model_file.with_suffix("")),
        vocab_size=vocab_size,
        model_type="unigram",
        character_coverage=0.9995,
        pad_id=PAD_ID,
        bos_id=BOS_ID,
        eos_id=EOS_ID,
        unk_id=3
    )

    sp = spm.SentencePieceProcessor()
    sp.load(str(model_file))
    return sp

def load_sentencepiece(model_file):
    sp = spm.SentencePieceProcessor()
    sp.load(str(model_file))
    return sp

def encode_fixed(sp, texts, max_len):
    all_ids = []
    for t in texts:
        ids = sp.encode(t, out_type=int)
        if len(ids) > max_len:
            ids = ids[:max_len]
        else:
            ids = ids + [PAD_ID] * (max_len - len(ids))
        all_ids.append(ids)
    return np.array(all_ids, dtype=np.int32)

def detokenize(sp, ids_batch):
    outs = []
    for ids in ids_batch:
        ids = [int(i) for i in ids if int(i) not in (PAD_ID, BOS_ID)]
        outs.append(sp.decode(ids))
    return outs

# 4. POSITIONAL ENCODING
def positional_encoding(maxlen, dim):
    pos = np.arange(maxlen)[:, None]
    i = np.arange(dim)[None, :]
    angle = pos / np.power(10000, (2 * (i // 2)) / dim)
    angle[:, 0::2] = np.sin(angle[:, 0::2])
    angle[:, 1::2] = np.cos(angle[:, 1::2])
    return tf.constant(angle[None, ...], dtype=tf.float32)

# 5. BLOCS TRANSFORMER
class EncoderBlock(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()
        self.mha = MultiHeadAttention(HEADS, EMBED_DIM // HEADS)
        self.ffn = tf.keras.Sequential([
            Dense(FF_DIM, activation=tfa.activations.gelu),
            Dense(EMBED_DIM),
        ])
        self.n1 = LayerNormalization()
        self.n2 = LayerNormalization()
        self.d = Dropout(DROPOUT)

    def call(self, x, mask, training=False):
        h = self.n1(x)
        x = x + self.d(self.mha(h, h, attention_mask=mask), training=training)
        h = self.n2(x)
        return x + self.d(self.ffn(h), training=training)

class DecoderBlock(tf.keras.layers.Layer):
    def __init__(self):
        super().__init__()
        self.self_mha = MultiHeadAttention(HEADS, EMBED_DIM // HEADS)
        self.cross_mha = MultiHeadAttention(HEADS, EMBED_DIM // HEADS)
        self.ffn = tf.keras.Sequential([
            Dense(FF_DIM, activation=tfa.activations.gelu),
            Dense(EMBED_DIM),
        ])
        self.n1 = LayerNormalization()
        self.n2 = LayerNormalization()
        self.n3 = LayerNormalization()
        self.d = Dropout(DROPOUT)

    def call(self, x, enc, look_mask, enc_mask, training=False):
        h = self.n1(x)
        x = x + self.d(self.self_mha(h, h, attention_mask=look_mask), training=training)
        h = self.n2(x)
        x = x + self.d(self.cross_mha(h, enc, enc, attention_mask=enc_mask), training=training)
        h = self.n3(x)
        return x + self.d(self.ffn(h), training=training)

# 6. MODÈLE COMPLET
class TransformerQA(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.emb = Embedding(VOCAB_SIZE, EMBED_DIM)
        self.pos_enc_e = positional_encoding(ENC_LEN, EMBED_DIM)
        self.pos_enc_d = positional_encoding(DEC_LEN, EMBED_DIM)

        self.enc_blocks = [EncoderBlock() for _ in range(LAYERS)]
        self.dec_blocks = [DecoderBlock() for _ in range(LAYERS)]

        self.lm_head = Dense(VOCAB_SIZE)

    def pad_mask(self, x):
        return tf.not_equal(x, PAD_ID)[:, None, :]

    def look_ahead(self, x):
        s = tf.shape(x)[1]
        tri = tf.linalg.band_part(tf.ones((s, s), tf.bool), -1, 0)
        pad = tf.not_equal(x, PAD_ID)
        return tri & pad[:, None, :]

    def call(self, inputs, training=False):
        enc_ids, dec_ids = inputs
        enc_mask = self.pad_mask(enc_ids)
        dec_mask = self.look_ahead(dec_ids)

        enc = self.emb(enc_ids) + self.pos_enc_e[:, :tf.shape(enc_ids)[1]]
        for b in self.enc_blocks:
            enc = b(enc, enc_mask, training=training)

        dec = self.emb(dec_ids) + self.pos_enc_d[:, :tf.shape(dec_ids)[1]]
        for b in self.dec_blocks:
            dec = b(dec, enc, dec_mask, enc_mask, training=training)

        return self.lm_head(dec)

# 7. LOSS + METRICS
def masked_loss(y_true, y_pred):
    y_true = tf.cast(y_true, tf.int32)
    loss = tf.keras.losses.sparse_categorical_crossentropy(
        y_true, y_pred, from_logits=True
    )
    loss = tf.where(tf.math.is_finite(loss), loss, tf.zeros_like(loss))
    mask = tf.cast(tf.not_equal(y_true, PAD_ID), tf.float32)
    return tf.reduce_sum(loss * mask) / tf.reduce_sum(mask)

def masked_accuracy(y_true, y_pred):
    y_true = tf.cast(y_true, tf.int32)
    y_pred_ids = tf.argmax(y_pred, axis=-1, output_type=tf.int32)
    matches = tf.cast(tf.equal(y_true, y_pred_ids), tf.float32)
    mask = tf.cast(tf.not_equal(y_true, PAD_ID), tf.float32)
    matches *= mask
    return tf.reduce_sum(matches) / tf.reduce_sum(mask)

def masked_f1(y_true, y_pred):
    y_true = tf.cast(y_true, tf.int32)
    y_pred_ids = tf.argmax(y_pred, axis=-1, output_type=tf.int32)

    y_true_flat = tf.reshape(y_true, [-1])
    y_pred_flat = tf.reshape(y_pred_ids, [-1])

    mask = tf.not_equal(y_true_flat, PAD_ID)
    y_true_masked = tf.boolean_mask(y_true_flat, mask)
    y_pred_masked = tf.boolean_mask(y_pred_flat, mask)

    y_true_oh = tf.one_hot(y_true_masked, depth=VOCAB_SIZE)
    y_pred_oh = tf.one_hot(y_pred_masked, depth=VOCAB_SIZE)

    tp = tf.reduce_sum(y_true_oh * y_pred_oh)
    fp = tf.reduce_sum(y_pred_oh) - tp
    fn = tf.reduce_sum(y_true_oh) - tp

    precision = tp / (tp + fp + 1e-8)
    recall    = tp / (tp + fn + 1e-8)
    f1        = 2 * precision * recall / (precision + recall + 1e-8)
    return f1

def perplexity(y_true, y_pred):
    loss = masked_loss(y_true, y_pred)
    loss = tf.minimum(loss, 20.0)
    return tf.exp(loss)

# 8. DATA + TOKENIZER (contextes les plus courts)
squad = load_squad(SQUAD_PATH)
contexts, questions, answers = extract_pairs(squad)

num_total = len(contexts)
print("Total QA pairs:", num_total)

context_lengths = np.array([len(c.split()) for c in contexts], dtype=np.int32)
sorted_idx = np.argsort(context_lengths)
keep_idx_full = sorted_idx[:MAX_SAMPLES]
keep_idx_cv   = sorted_idx[:CV_MAX_SAMPLES]

contexts_full = [contexts[i] for i in keep_idx_full]
questions_full = [questions[i] for i in keep_idx_full]
answers_full   = [answers[i]   for i in keep_idx_full]

contexts_cv = [contexts[i] for i in keep_idx_cv]
questions_cv = [questions[i] for i in keep_idx_cv]
answers_cv   = [answers[i]   for i in keep_idx_cv]

print("Used QA pairs (full training):", len(contexts_full))
print("Used QA pairs (CV):", len(contexts_cv))

enc_texts_full = [build_encoder_text(q, c) for q, c in zip(questions_full, contexts_full)]
dec_texts_full = answers_full

corpus = enc_texts_full + dec_texts_full

if not SPM_MODEL_FILE.exists():
    print("Training SentencePiece tokenizer (16k)...")
    sp = train_sentencepiece(corpus, SPM_MODEL_FILE, VOCAB_SIZE)
else:
    print("Loading existing SentencePiece tokenizer...")
    sp = load_sentencepiece(SPM_MODEL_FILE)

# données encodées pour CV
enc_texts_cv = [build_encoder_text(q, c) for q, c in zip(questions_cv, contexts_cv)]
dec_texts_cv = answers_cv

enc_ids_cv = encode_fixed(sp, enc_texts_cv, ENC_LEN)
dec_full_cv = encode_fixed(sp, dec_texts_cv, DEC_LEN)
dec_in_cv  = dec_full_cv[:, :-1]
dec_out_cv = dec_full_cv[:, 1:]

# 9. CROSS-VALIDATION K=3 (rapide)
print("\n=== K-FOLD CROSS-VALIDATION (K=3, 1 epoch) ===")
N_cv = enc_ids_cv.shape[0]
indices = np.arange(N_cv)
fold_sizes = np.full(K_FOLDS, N_cv // K_FOLDS, dtype=np.int32)
fold_sizes[:N_cv % K_FOLDS] += 1
current = 0

cv_metrics = []

for k in range(K_FOLDS):
    start, stop = current, current + fold_sizes[k]
    val_idx = indices[start:stop]
    train_idx = np.concatenate([indices[:start], indices[stop:]])

    current = stop

    enc_train_cv, enc_val_cv = enc_ids_cv[train_idx], enc_ids_cv[val_idx]
    di_train_cv, di_val_cv   = dec_in_cv[train_idx],  dec_in_cv[val_idx]
    do_train_cv, do_val_cv   = dec_out_cv[train_idx], dec_out_cv[val_idx]

    model_cv = TransformerQA()

    inputs_enc_cv = tf.keras.Input(shape=(ENC_LEN,), dtype=tf.int32, name="enc")
    inputs_dec_cv = tf.keras.Input(shape=(DEC_LEN - 1,), dtype=tf.int32, name="dec")
    logits_cv = model_cv((inputs_enc_cv, inputs_dec_cv))
    train_model_cv = tf.keras.Model(inputs=[inputs_enc_cv, inputs_dec_cv], outputs=logits_cv)

    train_model_cv.compile(
        optimizer=tf.keras.optimizers.Adam(LR),
        loss=masked_loss,
        metrics=[masked_accuracy, masked_f1, perplexity],
    )

    print(f"\nFold {k+1}/{K_FOLDS}")
    hist = train_model_cv.fit(
        {"enc": enc_train_cv, "dec": di_train_cv},
        do_train_cv,
        validation_data=({"enc": enc_val_cv, "dec": di_val_cv}, do_val_cv),
        epochs=1,
        batch_size=BATCH,
        verbose=1,
        callbacks=[TerminateOnNaN()],
    )

    cv_metrics.append({
        "val_loss": float(hist.history["val_loss"][-1]) if np.isfinite(hist.history["val_loss"][-1]) else np.nan,
        "val_f1":   float(hist.history["val_masked_f1"][-1]),
        "val_ppl":  float(hist.history["val_perplexity"][-1]) if np.isfinite(hist.history["val_perplexity"][-1]) else np.nan,
    })

print("\nCV metrics:")
for i, m in enumerate(cv_metrics):
    print(f"Fold {i+1}: val_loss={m['val_loss']}, val_f1={m['val_f1']}, val_perplexity={m['val_ppl']}")

valid_losses = [m["val_loss"] for m in cv_metrics if np.isfinite(m["val_loss"])]
valid_ppls   = [m["val_ppl"]  for m in cv_metrics if np.isfinite(m["val_ppl"])]

print("Mean val_loss (finite folds):", np.mean(valid_losses) if valid_losses else np.nan)
print("Mean val_f1:",   np.mean([m["val_f1"] for m in cv_metrics]))
print("Mean val_ppl (finite folds):",  np.mean(valid_ppls) if valid_ppls else np.nan)

# 10. ENTRAÎNEMENT FINAL sur MAX_SAMPLES
print("\n=== ENTRAÎNEMENT FINAL SUR SOUS-ENSEMBLE PLUS GRAND ===")

enc_ids = encode_fixed(sp, enc_texts_full, ENC_LEN)
dec_full = encode_fixed(sp, answers_full, DEC_LEN)
dec_in  = dec_full[:, :-1]
dec_out = dec_full[:, 1:]

N = enc_ids.shape[0]
perm = np.random.permutation(N)
split = int(0.8 * N)
train_idx = perm[:split]
val_idx   = perm[split:]

enc_train, enc_val = enc_ids[train_idx], enc_ids[val_idx]
di_train, di_val   = dec_in[train_idx],  dec_in[val_idx]
do_train, do_val   = dec_out[train_idx], dec_out[val_idx]

model = TransformerQA()

inputs_enc = tf.keras.Input(shape=(ENC_LEN,), dtype=tf.int32, name="enc")
inputs_dec = tf.keras.Input(shape=(DEC_LEN - 1,), dtype=tf.int32, name="dec")
logits = model((inputs_enc, inputs_dec))
train_model = tf.keras.Model(inputs=[inputs_enc, inputs_dec], outputs=logits)

train_model.compile(
    optimizer=tf.keras.optimizers.Adam(LR),
    loss=masked_loss,
    metrics=[masked_accuracy, masked_f1, perplexity],
)

log_subdir = LOG_DIR / datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_cb = TensorBoard(log_dir=str(log_subdir), histogram_freq=0)

ckpt_path = CKPT_DIR / "best_transformerqa_16k.h5"
checkpoint_cb = ModelCheckpoint(
    filepath=str(ckpt_path),
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

earlystop_cb = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

reduce_lr_cb = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=2,
    verbose=1,
    min_lr=1e-5,
)

callbacks = [tensorboard_cb, checkpoint_cb, earlystop_cb, reduce_lr_cb, TerminateOnNaN()]

history = train_model.fit(
    {"enc": enc_train, "dec": di_train},
    do_train,
    validation_data=({"enc": enc_val, "dec": di_val}, do_val),
    epochs=EPOCHS,
    batch_size=BATCH,
    callbacks=callbacks,
)

# 11. GENERATION (greedy ou beam + temperature)
def generate(model, sp, question, context, mode="beam", beam_size=3, temperature=1.0):
    enc = encode_fixed(
        sp,
        [build_encoder_text(question, context)],
        ENC_LEN
    ).astype(np.int32)

    if mode == "greedy":
        dec = np.array([[BOS_ID]], dtype=np.int32)
        for _ in range(DEC_LEN - 1):
            logits = model((enc, dec), training=False).numpy()
            logits = logits[:, -1, :] / max(temperature, 1e-6)
            next_id = np.argmax(logits, axis=-1)
            dec = np.concatenate([dec, next_id[:, None]], axis=1)
            if int(next_id[0]) == EOS_ID:
                break
        out = detokenize(sp, dec[:, 1:])[0]
        return out

    elif mode == "beam":
        beams = [(np.array([[BOS_ID]], dtype=np.int32), 0.0)]
        for _ in range(DEC_LEN - 1):
            new_beams = []
            for seq, score in beams:
                logits = model((enc, seq), training=False).numpy()
                logits = logits[:, -1, :] / max(temperature, 1e-6)
                log_probs = tf.nn.log_softmax(logits, axis=-1).numpy()[0]
                topk_ids = np.argsort(log_probs)[-beam_size:]
                for tid in topk_ids:
                    new_seq = np.concatenate(
                        [seq, np.array([[tid]], dtype=np.int32)],
                        axis=1
                    )
                    new_score = score + float(log_probs[tid])
                    new_beams.append((new_seq, new_score))
            new_beams.sort(key=lambda x: x[1], reverse=True)
            beams = new_beams[:beam_size]
            if all(int(b[0][0, -1]) == EOS_ID for b in beams):
                break
        best_seq, _ = beams[0]
        out = detokenize(sp, best_seq[:, 1:])[0]
        return out

    else:
        raise ValueError("mode must be 'greedy' or 'beam'")

# 12. TEST
i = 0
print("\nEXEMPLE DE QA :")
print("Q:", questions_full[i])
print("C:", contexts_full[i][:200], "...")
print("A (vraie):", answers_full[i])
print("A (greedy):", generate(model, sp, questions_full[i], contexts_full[i], mode="greedy", temperature=0.8))
print("A (beam):  ", generate(model, sp, questions_full[i], contexts_full[i], mode="beam", beam_size=3, temperature=0.8))


Path to dataset files: C:\Users\User\.cache\kagglehub\datasets\buildformacarov\squad-20\versions\1
Using SQuAD file: C:\Users\User\.cache\kagglehub\datasets\buildformacarov\squad-20\versions\1\train-v2.0.json
Total QA pairs: 86821
Used QA pairs (full training): 8000
Used QA pairs (CV): 2000
Loading existing SentencePiece tokenizer...

=== K-FOLD CROSS-VALIDATION (K=3, 1 epoch) ===



Fold 1/3



Fold 2/3

Fold 3/3

CV metrics:
Fold 1: val_loss=nan, val_f1=0.024256331846117973, val_perplexity=nan
Fold 2: val_loss=8.731266021728516, val_f1=0.031168097630143166, val_perplexity=7753.1181640625
Fold 3: val_loss=0.0, val_f1=0.0, val_perplexity=1.0
Mean val_loss (finite folds): 4.365633010864258
Mean val_f1: 0.01847480982542038
Mean val_ppl (finite folds): 3877.05908203125

=== ENTRAÎNEMENT FINAL SUR SOUS-ENSEMBLE PLUS GRAND ===
Epoch 1/4
Epoch 1: val_loss improved from inf to 8.49786, saving model to checkpoints_squad\best_transformerqa_16k.h5
Epoch 2/4
Epoch 2: val_loss did not improve from