In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, RobertaTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
import pickle
import numpy as np
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Open the bag of words data and tokenise it

In [2]:
# Load processed data
with open('Preprocessor/PreprocessedData/bag_of_words_data.pkl', 'rb') as f:
    bag_of_words_dataset = pickle.load(f)

In [3]:
len(bag_of_words_dataset)

1008

In [4]:
tokeniser = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

In [5]:
# Prepare data for CodeT5
def prepare_codet5_input(bow_entry):
    # Combine question representations
    question = np.concatenate([
        bow_entry['question_code'],
        bow_entry['question_text']
    ])
    
    # Combine answer representations
    answer = np.concatenate([
        bow_entry['answer_code'],
        bow_entry['answer_text']
    ])
    
    return question, answer

In [6]:
# Convert dataset to CodeT5 format
codet5_dataset = [prepare_codet5_input(entry) for entry in bag_of_words_dataset]
codet5_dataset

[(array([0, 6, 0, ..., 0, 0, 0], dtype=int64),
  array([0, 2, 0, ..., 0, 0, 0], dtype=int64)),
 (array([0, 8, 0, ..., 0, 0, 0], dtype=int64),
  array([0, 0, 1, ..., 0, 0, 0], dtype=int64)),
 (array([0, 4, 0, ..., 0, 0, 0], dtype=int64),
  array([0, 0, 0, ..., 0, 0, 0], dtype=int64)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([ 0, 12,  0, ...,  0,  0,  0], dtype=int64)),
 (array([0., 0., 0., ..., 0., 0., 0.]),
  array([ 0, 12,  0, ...,  0,  0,  0], dtype=int64)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0, 0, 0, ..., 0, 0, 0], dtype=int64)),
 (array([0., 0., 0., ..., 0., 0., 0.]),
  array([0, 0, 4, ..., 0, 0, 0], dtype=int64)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0, 0, 0, ..., 0, 0, 0], dtype=int64)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0, 0, 0, ..., 0, 0, 0], dtype=int64)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0., 0., 0., ..., 0., 0., 0.])),
 (array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., 

## Create batches and initialise the rest of the parameters needed to run the model efficiently

In [22]:
# First hyperparameter that needs to be tuned. The batch_size.
data_batches = DataLoader(codet5_dataset, batch_size=100, shuffle=True)

# Second hyperparameter that needs to be tuned. The optimiser. We would ideally do this first, as each optimiser has it's own hyperparameters to tune.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Third hyperparameter that needs to be tuned. The number of epochs
epochs = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

## Train the model

In [27]:
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in tqdm(data_batches, desc=f"Epoch {epoch+1}"):
        questions = batch[0]
        answers = batch[1]

        # Convert questions and answers to strings
        questions_str = [' '.join(map(str, q.tolist())) for q in questions]
        answers_str = [' '.join(map(str, a.tolist())) for a in answers]

        inputs = tokeniser(questions_str, return_tensors="pt", padding=True, truncation=True).to(device)
        labels = tokeniser(answers_str, return_tensors="pt", padding=True, truncation=True).to(device)

        outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, labels=labels.input_ids)

        batch_loss = outputs.loss
        total_loss += batch_loss.item()

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(data_batches)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")

Epoch 1: 100%|███████████████████████████████████████████████████████████████████████████| 5/5 [04:42<00:00, 56.47s/it]


Epoch 1, Average Loss: 1.1403


Epoch 2: 100%|███████████████████████████████████████████████████████████████████████████| 5/5 [05:25<00:00, 65.07s/it]


Epoch 2, Average Loss: 0.6604


Epoch 3: 100%|███████████████████████████████████████████████████████████████████████████| 5/5 [04:48<00:00, 57.78s/it]

Epoch 3, Average Loss: 0.3269





## Save the model

In [25]:
# Save the trained model
model.save_pretrained("./Models/prototype_model")
tokeniser.save_pretrained("./Models/prototype_model")

('./Models/prototype_model\\tokenizer_config.json',
 './Models/prototype_model\\special_tokens_map.json',
 './Models/prototype_model\\vocab.json',
 './Models/prototype_model\\merges.txt',
 './Models/prototype_model\\added_tokens.json')

## Test the model.

Give the model any string and see the output.

In [26]:
model.eval()

# Example test input
test_input = "How can I load a model in Keras?"

# Tokenize the input
inputs = tokeniser(test_input, return_tensors="pt", padding=True)

with torch.no_grad():  # No need to calculate gradients for inference
    generated_ids = model.generate(input_ids=inputs.input_ids, 
                             attention_mask=inputs.attention_mask,
                             max_length=512,  # Adjust as needed
                             num_return_sequences=1,
                             no_repeat_ngram_size=2,
                             top_k=50,
                             top_p=0.95,
                             temperature=0.7)

# Decode the generated output
output = tokeniser.decode(generated_ids[0], skip_special_tokens=True)

# Print the generated output
print("Generated Output:", output)




Generated Output: :
