In [20]:
import torch
import numpy as np
from lm_from_scratch.models.bert import BERT
from lm_from_scratch.corpus.decision_corpus import DecisionCorpus
import pandas as pd
from artefacts import TOKENIZER_PATH

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

VOCAB_SIZE = 30000
N_SEGMENTS = 2
MAX_LEN = 100 # 512 # what is the maximum context length for predictions?
EMBED_DIM = 128 # 768
N_LAYERS = 3
ATTN_HEADS = 4 # 32 * 4 = 128
DROPOUT = 0.1

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

EVAL_ITERS = 200
MAX_ITERS = 100
EVAL_INTERVAL = 10
LEARNING_RATE = 1e-3

BATCH_SIZE = 4 # how many independent sequences will we process in parallel?

MAX_SENTENCE_LEN = MAX_LEN // 2
MIN_SENTENCE_LEN = 10

# Corpus and tokenizer setup

In [21]:
corpus = DecisionCorpus()

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(vocab_size=VOCAB_SIZE,
                     special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.pre_tokenizer = Whitespace()

tokenizer.train_from_iterator(corpus.get_text(), trainer)

# post-processing to traditional BERT inputs
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# pad the outputs to the longest sentence present
tokenizer.enable_padding(pad_id=3, pad_token="[PAD]", length=MAX_LEN)

tokenizer.save(str(TOKENIZER_PATH))






# Load sentence pairs

In [22]:
sentences_pairs = corpus.get_sentence_pairs()

df = pd.DataFrame(sentences_pairs, columns=["sentence_1", "sentence_2_isnext"])


df_filtered = df[(df["sentence_1"].apply(len) < MAX_SENTENCE_LEN)&
                 (df["sentence_1"].apply(len) > MIN_SENTENCE_LEN)&
                 (df["sentence_2_isnext"].apply(len) < MAX_SENTENCE_LEN)&
                 (df["sentence_2_isnext"].apply(len) > MIN_SENTENCE_LEN)].reset_index(drop=True)

# Test for single sentence

In [12]:
output = tokenizer.encode(df_filtered.loc[2,0])

print(df_filtered.loc[2,0])
print(output.tokens)
print(output.ids)


 
 2. Le 13 avril 2018, MM. [K] [B] et [J] [D] ont déposé plainte à l'encontre de M. [E] [I] du chef de harcèlement moral commis dans le cadre de ses fonctions de président d'université.
['[CLS]', '2', '.', 'Le', '13', 'avril', '2018', ',', 'MM', '.', '[', 'K', ']', '[', 'B', ']', 'et', '[', 'J', ']', '[', 'D', ']', 'ont', 'déposé', 'plainte', 'à', 'l', "'", 'encontre', 'de', 'M', '.', '[', 'E', ']', '[', 'I', ']', 'du', 'chef', 'de', 'harcèlement', 'moral', 'commis', 'dans', 'le', 'cadre', 'de', 'ses', 'fonctions', 'de', 'président', 'd', "'", 'université', '.', '[SEP]']
[1, 22, 18, 354, 457, 635, 1329, 16, 834, 18, 61, 45, 62, 61, 36, 62, 166, 61, 44, 62, 61, 38, 62, 424, 1651, 3674, 120, 77, 11, 830, 146, 47, 18, 61, 39, 62, 61, 43, 62, 161, 951, 146, 2547, 2227, 1408, 299, 149, 1661, 146, 396, 2984, 146, 289, 69, 11, 19659, 18, 2]


# Test for paired sentences

In [49]:
output = tokenizer.encode(*df_filtered.loc[2,])

print(df_filtered.loc[2,])
print(output.tokens)
print(output.ids)
print(output.type_ids) # segment ids
print(output.attention_mask)

0    \n \n 2. Le 13 avril 2018, MM. [K] [B] et [J] ...
1     \n \n 3. Une enquête préliminaire a été ouverte.
Name: 2, dtype: object
['[CLS]', '2', '.', 'Le', '13', 'avril', '2018', ',', 'MM', '.', '[', 'K', ']', '[', 'B', ']', 'et', '[', 'J', ']', '[', 'D', ']', 'ont', 'déposé', 'plainte', 'à', 'l', "'", 'encontre', 'de', 'M', '.', '[', 'E', ']', '[', 'I', ']', 'du', 'chef', 'de', 'harcèlement', 'moral', 'commis', 'dans', 'le', 'cadre', 'de', 'ses', 'fonctions', 'de', 'président', 'd', "'", 'université', '.', '[SEP]', '3', '.', 'Une', 'enquête', 'préliminaire', 'a', 'été', 'ouverte', '.', '[SEP]']
[1, 22, 18, 354, 457, 635, 1329, 16, 834, 18, 61, 45, 62, 61, 36, 62, 166, 61, 44, 62, 61, 38, 62, 424, 1651, 3674, 120, 77, 11, 830, 146, 47, 18, 61, 39, 62, 61, 43, 62, 161, 951, 146, 2547, 2227, 1408, 299, 149, 1661, 146, 396, 2984, 146, 289, 69, 11, 19659, 18, 2, 23, 18, 4013, 3228, 4633, 66, 203, 3910, 18, 2]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [51]:
output = tokenizer.encode_batch([["Il résulte de l'arrêt attaqué.", "Le 13 avril 2018."],
                                ["Le 13 avril 2018.", "Une enquête préliminaire a été ouverte."]])

for out in output:
    print(out.tokens)
    print(out.type_ids) # segment ids
    print(out.attention_mask)
    print("\n")

['[CLS]', 'Il', 'résulte', 'de', 'l', "'", 'arrêt', 'attaqué', '.', '[SEP]', 'Le', '13', 'avril', '2018', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


['[CLS]', 'Le', '13', 'avril', '2018', '.', '[SEP]', 'Une', 'enquête', 'préliminaire', 'a', 'été', 'ouverte', '.', '[SEP]', '[PAD]']
[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]




In [24]:
df_filtered

Unnamed: 0,sentence_1,sentence_2_isnext
0,\n \n 6. Les parties civiles ont interjeté ap...,Le ministère public a interjeté appel incident.
1,»\n \n Réponse de la Cour\n \n 16.,Les moyens sont réunis.
2,"const., 2 mars 2018, décision n° 2017-693 QPC).",\n \n 19. La cassation est par conséquent enco...
3,\n \n 19. La cassation est par conséquent enco...,\n \n Portée et conséquences de la cassation\n...
4,\n \n DIT n'y avoir lieu à renvoi ;,\n \n DÉCLARE irrecevable la demande de M. [L] ;
...,...,...
422,\n \n 12. La cassation est par conséquent enco...,\n \n Portée et conséquences de la cassation\n...
423,\n \n Réponse de la Cour\n \n 10.,Les moyens sont réunis.
424,\n \n 14. La cassation est par conséquent enco...,\n \n Portée et conséquence de la cassation\n ...
425,\n \n Un mémoire a été produit.,\n \n Faits et procédure\n \n 1.


# Prepare for batch

In [23]:
# Train and test splits
sentence_pair_split = int(0.9*len(df_filtered))

df_train = df_filtered[:sentence_pair_split]
df_eval = df_filtered[sentence_pair_split:].reset_index(drop=True)

train_col_1_shuffled = df_train["sentence_2_isnext"].sample(
    frac=1, 
    random_state=212,
    replace=True).reset_index(drop=True)
df_train["sentence_2_notnext"] = train_col_1_shuffled.values

sum(train_col_1_shuffled == df_train["sentence_2_isnext"]) == 0

eval_col_1_shuffled = df_eval["sentence_2_isnext"].sample(
    frac=1,
    random_state=23,
    replace=True).reset_index(drop=True)
df_eval["sentence_2_notnext"] = eval_col_1_shuffled.values

sum(eval_col_1_shuffled == df_eval["sentence_2_isnext"]) == 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["sentence_2_notnext"] = train_col_1_shuffled.values


False

In [25]:
sentence_pair_train_isnext = tokenizer.encode_batch(
    df_train[["sentence_1", "sentence_2_isnext"]].values)

sentence_pair_train_notnext = tokenizer.encode_batch(
    df_train[["sentence_1", "sentence_2_notnext"]].values)

sentence_pair_val_isnext = tokenizer.encode_batch(
    df_eval[["sentence_1","sentence_2_isnext"]].values)

sentence_pair_val_notnext = tokenizer.encode_batch(
    df_eval[["sentence_1","sentence_2_isnext"]].values)

sentence_pair_train_data = (sentence_pair_train_isnext, sentence_pair_train_notnext)
sentence_pair_val_data = (sentence_pair_val_isnext, sentence_pair_val_notnext)

In [26]:
from random import randint, random

CLS_TOKEN_ID = 0
MASK_TOKEN_ID = 4

def get_sentence_pair_batch(split, batch_size=BATCH_SIZE):
    data = sentence_pair_train_data if split == 'train' else sentence_pair_val_data

    data_isnext, data_notnext = data

    pair_ix = torch.randint(len(data_isnext), (batch_size,))

    max_pred_count = len(data_isnext[0])
    
    for i, ix  in enumerate(pair_ix):
        is_next = i % 2 == 0

        sentence_pair = data_isnext[ix] if is_next else data_notnext[ix]

        available_mask = np.where(np.array(sentence_pair.special_tokens_mask) == 0)[0]
        pred_count = min(max_pred_count, max(1, round(len(available_mask) * 0.15)))
        
        masked_positions = np.random.choice(available_mask, pred_count, replace=False)
        masked_positions.sort()

        masked_tokens = np.array(sentence_pair.ids)[masked_positions]

        masked_token_ids = sentence_pair.ids.copy()
        for masked_position in masked_positions:
            if random() < 0.8:  # 80%
                masked_token_ids[masked_position] = MASK_TOKEN_ID
            elif random() < 0.5:  # 10%
                index = randint(5, VOCAB_SIZE - 1) # random index in vocabulary
                masked_token_ids[masked_position] = index

        mask_padding = max_pred_count - len(masked_positions)
    
        yield [
            sentence_pair.ids,
            masked_token_ids,
            sentence_pair.type_ids,
            np.concatenate([masked_tokens, [CLS_TOKEN_ID] * mask_padding]),
            np.concatenate([masked_positions, [CLS_TOKEN_ID] * mask_padding]),
            sentence_pair.attention_mask,
            is_next,
        ]
    # x, y = x.to(DEVICE), y.to(DEVICE)


In [27]:
(token_ids,
 masked_token_ids_batch,
 segment_ids,
 masked_token_batch,
 masked_position_batch,
 attention_masks,
 is_next) = map(
    torch.LongTensor, 
    zip(*get_sentence_pair_batch("train", batch_size=1)))

print("token_ids             \n", token_ids, "\n")
print("masked_token_ids_batch\n", masked_token_ids_batch, "\n")
print("segment_ids           \n", segment_ids, "\n")
print("masked_token_batch    \n", masked_token_batch, "\n")
print("masked_position_batch \n", masked_position_batch, "\n")
print("attention_masks       \n", attention_masks, "\n")
print("is_next               \n", is_next, "\n")

token_ids             
 tensor([[    1,    37,    17,  4777,    19,   365,    16, 25280,    30,   639,
            30,    37,    30,  1630,    30, 12434,    31,     2,    37,   239,
          5354,    70,    18,    66,  2385,   501,   742,  1415,    16,   851,
            18,     2,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3]]) 

masked_token_ids_batch
 tensor([[    1,    37,    17,  4777,    19,   365,    16,     4,    30,   639,
            30,    37,    30,  1630,     4, 12434,    31,     2,    37,   239,


In [30]:
model = BERT(
    vocab_size=VOCAB_SIZE,
    n_segments=N_SEGMENTS,
    max_len=MAX_LEN,
    embed_dim=EMBED_DIM,
    num_heads=ATTN_HEADS,
    dropout=DROPOUT,
    n_layers=N_LAYERS,
)
m = model.to(DEVICE)

In [32]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        clsf_losses = torch.zeros(EVAL_ITERS)
        lm_losses = torch.zeros(EVAL_ITERS)
        for k in range(EVAL_ITERS):
            batch = map(
                lambda x: torch.tensor(x, device=DEVICE, dtype=torch.long),
                zip(*get_sentence_pair_batch("train", BATCH_SIZE)))
            _, token_ids, segment_ids, masked_tokens, masked_positions, attention_masks, is_next = batch
            logits_lm, logits_clsf = model(token_ids, segment_ids, attention_masks, masked_positions)

            loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM
            loss_lm = (loss_lm.float()).mean()

            loss_clsf = criterion(logits_clsf, is_next) # for sentence classification
            
            clsf_losses[k] = loss_clsf.item()
            lm_losses[k] = loss_lm.item()

        out[split] = (clsf_losses.mean(), lm_losses.mean())
    model.train()
    return out

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

for iter in range(MAX_ITERS):

    # every once in a while evaluate the loss on train and val sets
    if iter % EVAL_INTERVAL == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train'][0]:.4f}|{losses['train'][1]:.4f}," +
              f"val loss {losses['val'][0]:.4f}|{losses['val'][1]:.4f}")

    # sample a batch of data
    batch = map(
        lambda x: torch.tensor(x, device=DEVICE, dtype=torch.long),
        zip(*get_sentence_pair_batch("train", BATCH_SIZE)))
    _, token_ids, segment_ids, masked_tokens, masked_positions, attention_masks, is_next = batch

    # evaluate the loss
    logits_lm, logits_clsf = model(token_ids, segment_ids, attention_masks, masked_positions)

    loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM
    loss_lm = (loss_lm.float()).mean()
    loss_clsf = criterion(logits_clsf, is_next) # for sentence classification
    loss = loss_lm + loss_clsf
    
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


step 0: train loss 0.6983|10.1618,val loss 0.6976|10.1620
step 10: train loss 0.6960|4.4415,val loss 0.6960|4.4434
step 20: train loss 0.6939|0.3184,val loss 0.6941|0.3213
step 30: train loss 0.6947|0.3423,val loss 0.6952|0.3385
step 40: train loss 0.6928|0.2692,val loss 0.6939|0.2788
step 50: train loss 0.6944|0.2632,val loss 0.6939|0.2637
step 60: train loss 0.6912|0.2428,val loss 0.6934|0.2405
step 70: train loss 0.6920|0.2200,val loss 0.6918|0.2209
step 80: train loss 0.6938|0.2077,val loss 0.6936|0.2030
step 90: train loss 0.7076|0.1996,val loss 0.7047|0.1969


In [16]:
sample_sequence = torch.randint(
    high=VOCAB_SIZE,
    size=(BATCH_SIZE, MAX_LEN),
    dtype=torch.long
) # BATCH_SIZE, MAX_LEN

sample_segment = torch.randint(
    high=N_SEGMENTS,
    size=(BATCH_SIZE, MAX_LEN),
    dtype=torch.long
) # BATCH_SIZE, MAX_LEN

attention_masks = torch.randint(
    high=2,
    size=(BATCH_SIZE, MAX_LEN),
    dtype=torch.long
) # BATCH_SIZE, MAX_LEN

masked_positions = torch.randint(
    high=MAX_LEN,
    size=(BATCH_SIZE, MAX_LEN),
    dtype=torch.long
) # BATCH_SIZE, MAX_LEN

In [13]:
model = BERT(
    vocab_size=VOCAB_SIZE,
    n_segments=N_SEGMENTS,
    max_len=MAX_LEN,
    embed_dim=EMBED_DIM,
    num_heads=ATTN_HEADS,
    dropout=DROPOUT,
    n_layers=N_LAYERS,
)

logits_lm, logits_clsf = model(sample_sequence, sample_segment, attention_masks, masked_positions)
print(logits_clsf.size())
print(logits_lm.size())

torch.Size([4, 2])
torch.Size([4, 8, 30000])


In [12]:
from lm_from_scratch.models.bert import BERTEmbedding

embedding = BERTEmbedding(VOCAB_SIZE, N_SEGMENTS, MAX_LEN, EMBED_DIM, DROPOUT)
embedding_tensor = embedding(sample_sequence, sample_segment)
print(embedding_tensor.size())

torch.Size([4, 8, 128])


In [7]:
torch.arange(MAX_LEN, device=DEVICE)

tensor([0, 1, 2, 3, 4, 5, 6, 7], device='cuda:0')

In [129]:
from torch.nn import functional as F

# Example of target with class indices
input = torch.randn(3, 5, requires_grad=True)
target = torch.randint(5, (3,), dtype=torch.long)

print(input.size())
print(target.size())

loss = F.cross_entropy(input, target)

loss.backward()

torch.Size([3, 5])
torch.Size([3])


In [24]:

# Example of target with class probabilities
input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)

print(input.size())
print(target.size())

loss = F.cross_entropy(input, target)
loss.backward()


torch.Size([3, 5])
torch.Size([3, 5])
