In [1]:
%pip install transformers
%pip install tqdm
%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121




[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip



Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Looking in indexes: https://download.pytorch.org/whl/cu121
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
import json
from tqdm import tqdm

with open('datasets\\ner\\task_intents.json') as f:
    ds = json.load(f)

def get_labels_to_index():
    unique_labels = []
    for item in ds:
        labels = item['labels']
        for label in labels:
            if label not in unique_labels:
                unique_labels.append(label)
    return {label: idx for idx, label in enumerate(unique_labels)}
labels_to_index = get_labels_to_index()
print(f'Labels: {labels_to_index}')
texts = []
labels = []
encoded_labels = []
# create a list of labels represented as numbers
# because the computer doesnt care about the text
assert len(texts) == len(labels), "Mismatch between number of sentences and labels"
for item in ds:
    texts.append(item["sentence"])
    labels.append(item["labels"])
    assert len(item["sentence"].split()) == len(item["labels"]), f"Mismatch in length for sentence: {item}"
    encoded_labels.append([labels_to_index[label] for label in item["labels"]])

Labels: {'O': 0, 'INTENT': 1}


In [4]:
# now we need to split the data into training and 
# validation sets for both the text and prompts

# set a ratio to split the data
split_ratio = 0.8
total_samples = len(ds)

# create a random set of indices from 0 to the 
# total amount of samples in out data
indices = torch.randperm(total_samples)

# using the indices split the indices into indices for
# training and validation
training_indices = indices[:int(total_samples * split_ratio)]
validation_indices = indices[int(total_samples * split_ratio):]

# use the indices to select pieces of data to create
# the individual text and label sets for training
# and validation
training_texts = [texts[idx].split() for idx in training_indices]
encoded_training_labels = [encoded_labels[idx] for idx in training_indices]
validation_texts = [texts[idx].split() for idx in validation_indices]
encoded_validation_labels = [encoded_labels[idx] for idx in validation_indices]
print(training_texts)
print(encoded_training_labels)

[['could', 'you', 'create', 'a', 'task', 'to', 'backup', 'my', 'computer'], ['Complete', 'the', 'task', 'for', 'booking', 'the', 'flight', 'tickets'], ['create', 'a', 'task', 'to', 'call', 'the', 'electrician'], ['create', 'a', 'task', 'to', 'finish', 'reading', 'the', 'book'], ["I've", 'completed', 'the', 'performance', 'reviews', 'please', 'check', 'that', 'off'], ['set', 'a', 'reminder', 'for', 'my', 'dentist', 'appointment'], ['when', 'is', 'my', 'next', 'reservation', 'for', 'maestros', 'dinner'], ['set', 'a', 'reminder', 'to', 'visit', 'grandma', 'on', 'sunday'], ['Change', 'the', 'reminder', 'for', 'vet', 'visit', 'to', 'the', 'first', 'Monday', 'of', 'every', 'month'], ['remind', 'me', 'to', 'review', 'the', 'contract'], ['remind', 'me', 'to', 'call', 'john', 'in', '5', 'hours'], ['What', 'do', 'I', 'need', 'to', 'remember', 'for', 'this', 'weekend'], ['could', 'you', 'create', 'a', 'task', 'to', 'backup', 'my', 'computer'], ['can', 'you', 'schedule', 'a', 'task', 'for', 'tomor

In [5]:
# load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# function to encode the text inputs
def encode_texts(texts):
    # padding going to add extra bits to the token 
    # ensuring consistant length
    # truncation will remove bits from tokens 
    # that are too long 
    # return_tensors="pt" will return a PyTorch tensor 
    # which we like because tensors are efficient
    # to work with
    return tokenizer(texts, truncation=True, padding='max_length', is_split_into_words=True, return_tensors="pt")

# encode the training and validation texts
encoded_training_texts = encode_texts(training_texts)
encoded_validation_texts = encode_texts(validation_texts)



In [6]:
def align_labels(tokenized_inputs, labels):
    aligned_labels = []
    obj_count = 0
    print(f"Total batches: {len(tokenized_inputs)}")
    print(labels)
    for i, label in enumerate(labels):
        print(f"Tokenized Text {i+1}: {tokenizer.convert_ids_to_tokens(tokenized_inputs['input_ids'][i])}")
        print(f"Word IDs {i+1}: {tokenized_inputs.word_ids(batch_index=i)}")
        print(f'Grabbing Label in position: {i}')
        label = labels[i]
        print(f'Label: {label}')
        obj_count += 1
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                print(f'Word idx: {word_idx}')
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        aligned_labels.append(label_ids)
    tokenized_inputs["labels"] = torch.tensor(aligned_labels)
    return tokenized_inputs

# Check the lengths of sentences and labels

for idx, (sentence, label) in enumerate(zip(texts, labels)):
    assert len(sentence.split()) == len(label), f"Mismatch in length for sentence {idx}"
        
label_aligned_training_texts = align_labels(encoded_training_texts, encoded_training_labels)
label_aligned_validation_texts = align_labels(encoded_validation_texts, encoded_validation_labels)

Total batches: 3
[[0, 0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 0, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 0, 0, 1, 1, 0, 0], [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1], [0, 0, 0, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 1, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0], [0, 0, 0, 1, 1, 0, 0, 0], [0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1], [0, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 1, 1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 1, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 1, 0, 0, 0], [0, 0, 0, 0, 1

In [7]:
# we neeed to create Tensor datasets for 
# our training and validation sets
training_dataset = TensorDataset(label_aligned_training_texts["input_ids"], label_aligned_training_texts["attention_mask"], torch.tensor(label_aligned_training_texts["labels"]))
validation_dataset = TensorDataset(label_aligned_validation_texts["input_ids"], label_aligned_validation_texts["attention_mask"], torch.tensor(label_aligned_validation_texts["labels"]))

sampling_size = 10
training_loader = DataLoader(training_dataset, batch_size=sampling_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=sampling_size)

  training_dataset = TensorDataset(label_aligned_training_texts["input_ids"], label_aligned_training_texts["attention_mask"], torch.tensor(label_aligned_training_texts["labels"]))
  validation_dataset = TensorDataset(label_aligned_validation_texts["input_ids"], label_aligned_validation_texts["attention_mask"], torch.tensor(label_aligned_validation_texts["labels"]))


In [8]:
def calculate_accuracy(logits, labels, ignore_index=-100):
    predictions = torch.argmax(logits, dim=2)
    mask = labels != ignore_index
    correct_predictions = (predictions == labels) & mask
    correct_predictions = correct_predictions.float()
    total_correct = correct_predictions.sum()
    total = mask.sum()
    accuracy = (total_correct / total) * 100
    return accuracy.item()

def validation(avg_training_loss, avg_training_accuracy):
    model.eval()
    total_validation_loss = 0
    total_validation_accuracy = 0
    with torch.no_grad():
        for batch in validation_loader:
            
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            
            total_validation_loss += loss.item()
            
            total_validation_accuracy += calculate_accuracy(outputs.logits, labels)
            
        avg_validation_loss = total_validation_loss / len(validation_loader)
        avg_validation_accuracy = total_validation_accuracy / len(validation_loader)
        
        print(f'Training Loss: {avg_training_loss:.4f}, Training Accuracy: {avg_training_accuracy:.2f}%')
        print(f'Validation Loss: {avg_validation_loss:.4f}, Validation Accuracy: {avg_validation_accuracy:.2f}%')

        

num_unique_labels = len(labels_to_index)
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=num_unique_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f'Device: {device}')
# Define optimizer and training arguments
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)

# Training loop
num_epochs = 12

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    total_accuracy = 0
    for batch in tqdm(training_loader):
        # we need to move our training batch to the same
        # hardware as the model
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        
        # We need to clear gradients from previous epochs
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_accuracy += calculate_accuracy(outputs.logits, labels)

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(training_loader)}")
    avg_training_loss = total_loss / len(training_loader)
    avg_training_accuracy = total_accuracy / len(training_loader)
    validation(avg_training_loss, avg_training_accuracy)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Device: cuda


100%|██████████| 22/22 [01:09<00:00,  3.17s/it]


Epoch 1, Loss: 0.2503808631815694
Training Loss: 0.2504, Training Accuracy: 89.88%
Validation Loss: 0.0720, Validation Accuracy: 98.06%


100%|██████████| 22/22 [01:07<00:00,  3.06s/it]


Epoch 2, Loss: 0.044417956632307985
Training Loss: 0.0444, Training Accuracy: 98.43%
Validation Loss: 0.0364, Validation Accuracy: 98.94%


100%|██████████| 22/22 [00:52<00:00,  2.39s/it]


Epoch 3, Loss: 0.04387059221467511
Training Loss: 0.0439, Training Accuracy: 98.43%
Validation Loss: 0.0561, Validation Accuracy: 98.23%


100%|██████████| 22/22 [01:09<00:00,  3.16s/it]


Epoch 4, Loss: 0.025135021392171355
Training Loss: 0.0251, Training Accuracy: 99.36%
Validation Loss: 0.0265, Validation Accuracy: 98.77%


100%|██████████| 22/22 [02:30<00:00,  6.82s/it]


Epoch 5, Loss: 0.022565303637582638
Training Loss: 0.0226, Training Accuracy: 99.31%
Validation Loss: 0.0688, Validation Accuracy: 97.73%


100%|██████████| 22/22 [01:51<00:00,  5.08s/it]


Epoch 6, Loss: 0.033823681143324145
Training Loss: 0.0338, Training Accuracy: 99.01%
Validation Loss: 0.1008, Validation Accuracy: 95.82%


100%|██████████| 22/22 [00:53<00:00,  2.41s/it]


Epoch 7, Loss: 0.0559244319235652
Training Loss: 0.0559, Training Accuracy: 97.67%
Validation Loss: 0.0626, Validation Accuracy: 97.19%


100%|██████████| 22/22 [00:37<00:00,  1.70s/it]


Epoch 8, Loss: 0.04131537684853273
Training Loss: 0.0413, Training Accuracy: 98.65%
Validation Loss: 0.0501, Validation Accuracy: 97.53%


100%|██████████| 22/22 [00:44<00:00,  2.03s/it]


Epoch 9, Loss: 0.01859848936014301
Training Loss: 0.0186, Training Accuracy: 99.16%
Validation Loss: 0.0685, Validation Accuracy: 98.43%


100%|██████████| 22/22 [00:35<00:00,  1.62s/it]


Epoch 10, Loss: 0.006251661807751093
Training Loss: 0.0063, Training Accuracy: 99.52%
Validation Loss: 0.0412, Validation Accuracy: 98.77%


100%|██████████| 22/22 [01:06<00:00,  3.01s/it]


Epoch 11, Loss: 0.00517118481200834
Training Loss: 0.0052, Training Accuracy: 99.80%
Validation Loss: 0.0508, Validation Accuracy: 98.60%


100%|██████████| 22/22 [00:45<00:00,  2.07s/it]


Epoch 12, Loss: 0.004530782757435439
Training Loss: 0.0045, Training Accuracy: 99.81%
Validation Loss: 0.0502, Validation Accuracy: 98.60%


In [10]:
# import this torch functional library and use softmax to convert
# logits to probabilites to get a confidence in a precentage
import torch.nn.functional as F

# Function to take text input and pass to model
def predict(text):
    # Encode the input text
    encoded_input = tokenizer(text, padding='max_length', truncation=True, return_tensors="pt")

    # Move the tensors to the same device as the model
    input_ids = encoded_input["input_ids"].to(device)
    attention_mask = encoded_input["attention_mask"].to(device)

    # Model inference
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Get the logits and apply softmax to get probabilities
    logits = outputs.logits

    predicted_token_class_ids = logits.argmax(-1)

    # Decode the input ids back to tokens for display
    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())
    print(tokens)
    # Collect predictions with their corresponding probabilities
    predictions = []
    predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
    # Iterate over the tokens and their predicted token classes
    for token, token_class in zip(tokens, predicted_tokens_classes):
        label = [label for label, idx in labels_to_index.items() if idx == int(token_class[-1])]
        if label and label[0] != 'O':  # Assuming 'O' is the 'outside' label, typically ignored in display
            predictions.append((token, label[0]))

    # Calculate average confidence if predictions are made
    if predictions:
        formatted_predictions = ', '.join([f"{token}: {label}" for token, label in predictions])
        confidence = calculate_accuracy(logits, labels)
        return formatted_predictions, confidence
    else:
        return "No entities found.", 0.0

# Example of using the prediction function
prompt = input("Prompt: ")
predicted_label, confidence = predict(prompt)
print(f"Predicted Label: {predicted_label}\nConfidence: {confidence:.2f}%")


['[CLS]', 'to', 'go', 'to', 'the', 'doctors', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 

In [11]:
torch.save(model, 'spotify_command_model.pth')