In [12]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, RobertaTokenizer
import torch
import pickle
import numpy as np

In [6]:
# Load processed data
with open('Preprocessor/PreprocessedData/bag_of_words_data.pkl', 'rb') as f:
    bag_of_words_dataset = pickle.load(f)

In [7]:
len(bag_of_words_dataset)

1008

In [8]:
tokeniser = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

In [10]:
# Prepare data for CodeT5
def prepare_codet5_input(bow_entry):
    # Combine question representations
    question = np.concatenate([
        bow_entry['question_code'],
        bow_entry['question_text']
    ])
    
    # Combine answer representations
    answer = np.concatenate([
        bow_entry['answer_code'],
        bow_entry['answer_text']
    ])
    
    return question, answer

In [17]:
# Convert dataset to CodeT5 format
codet5_dataset = [prepare_codet5_input(entry) for entry in bag_of_words_dataset]
codet5_dataset

[(array([0, 6, 0, ..., 0, 0, 0], dtype=int64),
  array([0, 2, 0, ..., 0, 0, 0], dtype=int64)),
 (array([0, 8, 0, ..., 0, 0, 0], dtype=int64),
  array([0, 0, 1, ..., 0, 0, 0], dtype=int64)),
 (array([0, 4, 0, ..., 0, 0, 0], dtype=int64),
  array([0, 0, 0, ..., 0, 0, 0], dtype=int64)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([ 0, 12,  0, ...,  0,  0,  0], dtype=int64)),
 (array([0., 0., 0., ..., 0., 0., 0.]),
  array([ 0, 12,  0, ...,  0,  0,  0], dtype=int64)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0, 0, 0, ..., 0, 0, 0], dtype=int64)),
 (array([0., 0., 0., ..., 0., 0., 0.]),
  array([0, 0, 4, ..., 0, 0, 0], dtype=int64)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0, 0, 0, ..., 0, 0, 0], dtype=int64)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0, 0, 0, ..., 0, 0, 0], dtype=int64)),
 (array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
  array([0., 0., 0., ..., 0., 0., 0.])),
 (array([0., 0., 0., ..., 0., 0., 0.]), array([0., 0., 0., 

In [34]:
# Train CodeT5 (simplified example)
count = 0
for question, answer in codet5_dataset:
    # Convert to tensor format expected by CodeT5
    inputs = tokeniser(str(question), return_tensors="pt", padding=True)
    labels = tokeniser(str(answer), return_tensors="pt", padding=True)
    
    # Train step
    outputs = model(
        input_ids=inputs.input_ids,
        labels=labels.input_ids
    )
    if count == 6:
        break
    count += 1

In [35]:
outputs

Seq2SeqLMOutput(loss=tensor(6.2737, grad_fn=<NllLossBackward0>), logits=tensor([[[10.8302, 28.7998,  2.8442,  ...,  4.1739,  4.0693,  9.0956],
         [ 4.3476,  8.4494,  4.0946,  ...,  6.9596,  6.2486, 28.0805],
         [ 1.1256,  3.9057,  4.2229,  ...,  2.9399,  4.1149,  1.4275],
         ...,
         [ 2.3012,  6.9029, 14.8396,  ...,  4.1325, 14.4929,  2.5080],
         [ 2.5895,  7.5784, 15.7736,  ...,  3.9496, 13.4462,  2.4814],
         [ 3.2752,  8.8637, 15.6376,  ...,  1.8129, 13.4844,  1.7068]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-0.2711,  0.4661, -0.1525,  ..., -0.0585, -0.0220, -0.1421],
          [ 0.1269,  0.4062,  0.0207,  ...,  0.0113,  0.1670, -0.0447],
          [ 0.1087,  0.4381, -0.4626,  ..., -0.4048,  0.3264,  0.0229],
          ...,
          [ 0.0235,  0.1512, -0.3421,  ...,  0.0787, -0.1213, -0.0954],
          [ 0.0235,  0.1512, -0.3421,  ...,  0.0787, -0.1213, -0.0954],
          [ 0.3088,  0.4322, -0.6034,  ..., -0.1837,  

In [41]:
# Example test input
test_input = "How can I load a model in Keras?"

# Tokenize the input
inputs = tokeniser(test_input, return_tensors="pt", padding=True)

# Generate predictions
generated_ids = model.generate(inputs.input_ids, max_length=50, num_beams=4, early_stopping=True)

# Decode the generated output
output = tokeniser.decode(generated_ids[0], skip_special_tokens=True)

# Print the generated output
print("Generated Output:", output)


Generated Output:  def loadModel ( )
