In [2]:
%pip install transformers
%pip install tqdm
%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Looking in indexes: https://download.pytorch.org/whl/cu121
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import torch
import torch.nn as nn
import pandas as pd

# load csv
ds = pd.read_csv("datasets\\mod_class\\task_categories.csv")
# get all unique categories
labels = sorted(set(ds["category"]))
# enumerate labels because computers like numbers
labels_to_index = {label: idx for idx, label in enumerate(labels)}
print(labels_to_index)
# create a list of labels represented as numbers
# because the computer doesnt care about the text
ds["encoded_category"] = [labels_to_index[label] for label in ds["category"]]

{'query': 0, 'set': 1, 'update': 2}


In [4]:
# now we need to split the data into training and 
# validation sets for both the text and prompts

# set a ratio to split the data
split_ratio = 0.8
total_samples = len(ds)

# create a random set of indices from 0 to the 
# total amount of samples in out data
indices = torch.randperm(total_samples)

# using the indices split the indices into indices for
# training and validation
training_indices = indices[:int(total_samples * split_ratio)]
validation_indices = indices[int(total_samples * split_ratio):]

# use the indices to select pieces of data to create
# the individual text and label sets for training
# and validation
training_texts = ds["text"].iloc[training_indices]
training_labels = ds["encoded_category"].iloc[training_indices]
validation_texts = ds["text"].iloc[validation_indices]
validation_labels = ds["encoded_category"].iloc[validation_indices]

In [5]:
# we use the BertTokenizer to split up the text inputs 
# into tokens which to help the computer draw relations
from transformers import BertTokenizer

# load the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# function to encode the text inputs
def encode_texts(texts):
    # padding going to add extra bits to the token 
    # ensuring consistant length
    # truncation will remove bits from tokens 
    # that are too long 
    # return_tensors="pt" will return a PyTorch tensor 
    # which we like because tensors are efficient
    # to work with
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# encode the training and validation texts
encoded_training_texts = encode_texts(training_texts.to_list())
encoded_validation_texts = encode_texts(validation_texts.to_list())

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# now we need to create our training and validation  
# datasets using TensorDatasets which i think tensors are
# good because its an indiscriminant numerical way of 
# interpreting data
from torch.utils.data import DataLoader,TensorDataset

# we neeed to create Tensor datasets for 
# our training and validation sets
training_dataset = TensorDataset(encoded_training_texts["input_ids"], encoded_training_texts["attention_mask"], torch.tensor(training_labels.to_list()))
validation_dataset = TensorDataset(encoded_validation_texts["input_ids"], encoded_validation_texts["attention_mask"], torch.tensor(validation_labels.to_list()))

# with our datasets now we need to create Dataloaders to
# to be able to load our data for processing

# this will be the number of samples 
# used in one iteration of training
sampling_size = 16
training_loader = DataLoader(training_dataset, batch_size=sampling_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=sampling_size)

In [7]:
# To start training we need to load in a pre-trained 
# model, with Bert there is a pre-trained model good 
# for text classification
from transformers import BertForSequenceClassification
import torch.optim as optim
# define the amount of unique labels for the model
num_unique_labels = len(labels)

# load the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_unique_labels)

# here we want to decide what hardware we are 
# using to train our model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# now lets move the model to the selected hardware
model = model.to(device)

# now we need to define the optimization 
# and loss functions
learning_rate = 2e-5
# the optimization function adjusts the models 
# parameters to improve its performance
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# CrossEntropyLoss is a loss function that is good
# for text classification
loss_fn = nn.CrossEntropyLoss()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def calculate_accuracy(logits, labels):
    # Convert logits to predicted class by taking the index of the maximum value in logits (argmax)
    predictions = torch.argmax(logits, dim=-1)
    # We need to ignore padding in the labels for accuracy calculation
    # Assuming padding token ID is 0 (or another specific ID depending on the tokenizer)
    mask = labels != tokenizer.pad_token_id  # tokenizer should be defined elsewhere and have pad_token_id attribute

    # Only consider non-padded elements for accuracy calculation
    correct_predictions = (predictions == labels) & mask  # Logical AND to ignore padded elements
    total_correct_tokens = correct_predictions.float().sum()  # Sum up the number of correct tokens
    total_non_padded_tokens = mask.float().sum()  # Sum up the number of non-padded tokens
    
    # Calculate the mean of correct predictions
    if total_non_padded_tokens > 0:
        accuracy = 100 * total_correct_tokens / total_non_padded_tokens
    else:
        accuracy = 0.0  # Avoid division by zero if there are no non-padded tokens
    return accuracy.item()

def validation(avg_training_loss, avg_training_accuracy):
    model.eval()
    total_validation_loss = 0
    total_validation_accuracy = 0
    with torch.no_grad():
        for batch in validation_loader:
            
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            
            total_validation_loss += loss.item()
            total_validation_accuracy += calculate_accuracy(outputs.logits, labels)
            
        avg_validation_loss = total_validation_loss / len(validation_loader)
        avg_validation_accuracy = total_validation_accuracy / len(validation_loader)
        
        print(f'Training Loss: {avg_training_loss:.4f}, Training Accuracy: {avg_training_accuracy:.2f}%')
        print(f'Validation Loss: {avg_validation_loss:.4f}, Validation Accuracy: {avg_validation_accuracy:.2f}%')

        
# After:
# Converting labels to indices for the computer 
# to understand
# Selecting labels(as indices) and texts for training and
# validation
# Tokenizing our texts to allow the computer to draw
# relations in our data better
# Creating our TensorDatasets for the computer to
# better understand our data as well as DataLoaders
# Setting up our training environment

# we can now set up our training loop

# we can use tqdm to get a nice tqdm to visualize progress
from tqdm import tqdm

# define the number of epochs
num_epochs = 10

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    total_accuracy = 0
    for batch in tqdm(training_loader):
        # we need to move our training batch to the same
        # hardware as the model
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        
        # We need to clear gradients from previous epochs
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_accuracy += calculate_accuracy(outputs.logits, labels)
                                             
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(training_loader)}")
    avg_training_loss = total_loss / len(training_loader)
    avg_training_accuracy = total_accuracy / len(training_loader)
    validation(avg_training_loss, avg_training_accuracy)

100%|██████████| 8/8 [00:02<00:00,  3.85it/s]


Epoch 1, Loss: 1.1011453419923782
Training Loss: 1.1011, Training Accuracy: 56.61%
Validation Loss: 1.0081, Validation Accuracy: 77.50%


100%|██████████| 8/8 [00:01<00:00,  7.00it/s]


Epoch 2, Loss: 0.9461124464869499
Training Loss: 0.9461, Training Accuracy: 84.91%
Validation Loss: 0.8392, Validation Accuracy: 83.33%


100%|██████████| 8/8 [00:01<00:00,  7.02it/s]


Epoch 3, Loss: 0.7085143551230431
Training Loss: 0.7085, Training Accuracy: 92.91%
Validation Loss: 0.5501, Validation Accuracy: 81.67%


100%|██████████| 8/8 [00:01<00:00,  6.06it/s]


Epoch 4, Loss: 0.41355044580996037
Training Loss: 0.4136, Training Accuracy: 97.24%
Validation Loss: 0.3291, Validation Accuracy: 100.00%


100%|██████████| 8/8 [00:00<00:00,  8.11it/s]


Epoch 5, Loss: 0.2091422462835908
Training Loss: 0.2091, Training Accuracy: 98.86%
Validation Loss: 0.1490, Validation Accuracy: 100.00%


100%|██████████| 8/8 [00:01<00:00,  6.93it/s]


Epoch 6, Loss: 0.0962586784735322
Training Loss: 0.0963, Training Accuracy: 100.00%
Validation Loss: 0.0911, Validation Accuracy: 100.00%


100%|██████████| 8/8 [00:01<00:00,  6.08it/s]


Epoch 7, Loss: 0.05160977132618427
Training Loss: 0.0516, Training Accuracy: 100.00%
Validation Loss: 0.0548, Validation Accuracy: 100.00%


100%|██████████| 8/8 [00:01<00:00,  7.52it/s]


Epoch 8, Loss: 0.030897559830918908
Training Loss: 0.0309, Training Accuracy: 100.00%
Validation Loss: 0.0456, Validation Accuracy: 100.00%


100%|██████████| 8/8 [00:00<00:00,  8.62it/s]


Epoch 9, Loss: 0.021812433609738946
Training Loss: 0.0218, Training Accuracy: 100.00%
Validation Loss: 0.0384, Validation Accuracy: 100.00%


100%|██████████| 8/8 [00:00<00:00,  8.50it/s]

Epoch 10, Loss: 0.016221774276345968
Training Loss: 0.0162, Training Accuracy: 100.00%
Validation Loss: 0.0330, Validation Accuracy: 100.00%





In [51]:
# import this torch functional library and use softmax to convert
# logits to probabilites to get a confidence in a precentage
import torch.nn.functional as f
# Function to take text input and pass to model
def predict(text):
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    input_ids = encoded_input["input_ids"].to(device)
    attention_mask = encoded_input["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    probs = f.softmax(logits, dim=1)
    
    predicted_label_id = torch.argmax(probs,dim=1).item()
    confidence = probs[0][predicted_label_id].item()
    predicted_label = ""
    for label, idx in labels_to_index.items():
        if predicted_label_id == idx:
            predicted_label = label
    return predicted_label, confidence

predicted_label, confidence = predict(input("Prompt: "))
print(f"Predicted Label: {predicted_label}, Confidence: {confidence:.2f}")

Predicted Label: query, Confidence: 0.98


In [9]:
import torch

# Assuming `model` is your trained model
torch.save(model, 'task_req_classification_model.pth')