In [23]:
import nltk
from nltk.corpus import reuters
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

In [2]:
nltk.download('reuters')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...


True

In [2]:
context_window = 5
def create_context_target_pairs(doc, context_window):
    words = nltk.word_tokenize(doc)
    pairs = []
    for i in range(context_window, len(words)):
        context = words[i - context_window:i]
        target = words[i]
        pairs.append((context, target))
    return pairs

In [11]:
random_doc_id = random.choice(reuters.fileids())
document = reuters.raw(random_doc_id)
context_target_pairs = create_context_target_pairs(document, context_window)
for context, target in context_target_pairs[:5]:
    print(f"Context: {' '.join(context)} \t Target: {target}")

Context: ENTRE COMPUTER CENTERS INC & 	 Target: lt
Context: COMPUTER CENTERS INC & lt 	 Target: ;
Context: CENTERS INC & lt ; 	 Target: ETRE.O
Context: INC & lt ; ETRE.O 	 Target: >
Context: & lt ; ETRE.O > 	 Target: 2ND


In [14]:
for context_text, target_text in context_target_pairs:
    print(f"Context Text: {context_text}")
    print(f"Target Text: {target_text}")

Context Text: ['ENTRE', 'COMPUTER', 'CENTERS', 'INC', '&']
Target Text: lt
Context Text: ['COMPUTER', 'CENTERS', 'INC', '&', 'lt']
Target Text: ;
Context Text: ['CENTERS', 'INC', '&', 'lt', ';']
Target Text: ETRE.O
Context Text: ['INC', '&', 'lt', ';', 'ETRE.O']
Target Text: >
Context Text: ['&', 'lt', ';', 'ETRE.O', '>']
Target Text: 2ND
Context Text: ['lt', ';', 'ETRE.O', '>', '2ND']
Target Text: QTR
Context Text: [';', 'ETRE.O', '>', '2ND', 'QTR']
Target Text: LOSS
Context Text: ['ETRE.O', '>', '2ND', 'QTR', 'LOSS']
Target Text: Ended
Context Text: ['>', '2ND', 'QTR', 'LOSS', 'Ended']
Target Text: Feb
Context Text: ['2ND', 'QTR', 'LOSS', 'Ended', 'Feb']
Target Text: 28
Context Text: ['QTR', 'LOSS', 'Ended', 'Feb', '28']
Target Text: Shr
Context Text: ['LOSS', 'Ended', 'Feb', '28', 'Shr']
Target Text: loss
Context Text: ['Ended', 'Feb', '28', 'Shr', 'loss']
Target Text: 29
Context Text: ['Feb', '28', 'Shr', 'loss', '29']
Target Text: cts
Context Text: ['28', 'Shr', 'loss', '29', 'cts

In [46]:
model_name = "bert-base-uncased"
model = BertForMaskedLM.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)
max_seq_length = 32
input_ids_list = []
attention_mask_list = []
labels_list = []
for context_words, target_text in context_target_pairs:
    context_text = " ".join(context_words)
    
    context_tokens = tokenizer.tokenize(context_text)
    target_tokens = tokenizer.tokenize(target_text)

    if len(context_tokens) + len(target_tokens) > max_seq_length:
        context_tokens = context_tokens[-(max_seq_length - len(target_tokens)):]

    context_ids = tokenizer.convert_tokens_to_ids(context_tokens)
    target_ids = tokenizer.convert_tokens_to_ids(target_tokens)

    input_ids = torch.tensor(context_ids + target_ids, dtype=torch.long)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)  
    labels = torch.tensor(target_ids, dtype=torch.long)

    input_ids_list.append(input_ids)
    attention_mask_list.append(attention_mask)
    labels_list.append(labels)

input_ids = pad_sequence(input_ids_list, batch_first=True)
attention_mask = pad_sequence(attention_mask_list, batch_first=True)
labels = pad_sequence(labels_list, batch_first=True)

dataset = TensorDataset(input_ids, attention_mask, labels)
dataloader = DataLoader(dataset, batch_size=18, shuffle=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [47]:
optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 1

num_training_steps = len(dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

num_training_steps = len(dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
loss_function = torch.nn.CrossEntropyLoss()
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        output = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = output.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss}")


output_model_dir = "G:/Data Projects/next_word_predictor_with_BERT/saved model"
model.save_pretrained(output_model_dir)



ValueError: Expected input batch_size (324) to match target batch_size (108).

In [40]:
print(labels.shape)

torch.Size([1, 6])


In [44]:
print(input_ids.shape)

torch.Size([52, 18])


In [45]:
print(attention_mask.shape)

torch.Size([52, 18])
