Model trained on contaxts being original_code. 

In [1]:
from transformers import (
    AutoTokenizer,
    TFAutoModelForQuestionAnswering,
    DataCollatorForSeq2Seq,
    keras_callbacks,
    TFAutoModelForSeq2SeqLM,
    TFEncoderDecoderModel,
)
import tensorflow as tf
from huggingface_hub import notebook_login
from question_answering.constants import constants
from question_answering.utils import core_qa_utils, generative_qa_utils
from question_answering.paths import generative_qa_paths

In [2]:
df_train, df_val, df_test = core_qa_utils.load_train_val_test_datasets(
    generative_qa_paths.python_dataset_dir
)

train_dataset, val_dataset, test_dataset = core_qa_utils.convert_dataframes_to_datasets(
    [df_train, df_val, df_test]
)

In [3]:
df_train

Unnamed: 0,index,questions,answers,code,original_code
0,0,What does the code make ?,a suite,def Make Suite From Dict d label None suite Su...,def MakeSuiteFromDict d label None suite Suite...
1,1,Does the code make a suite ?,Yes,def Make Suite From Dict d label None suite Su...,def MakeSuiteFromDict d label None suite Suite...
2,2,Does the code receive a message from a pull su...,Yes,def receive message topic name subscription na...,def receive_message topic_name subscription_na...
3,3,What does the code receive from a pull subscri...,a message,def receive message topic name subscription na...,def receive_message topic_name subscription_na...
4,4,What be an explicit budget used only ?,to create the campaign,def Create Shared Budget client budget service...,def CreateSharedBudget client budget_service c...
...,...,...,...,...,...
56080,56080,What compiles the file filename ?,bytecode,def save pyc filename cfile '%sc' % filename p...,def save_pyc filename cfile '%sc' % filename p...
56081,56081,What will this function spawn ?,a thread,def with timeout func args kwargs {} class Res...,def with_timeout func args kwargs {} class Res...
56082,56082,What do this function run using the args ?,the given function,def with timeout func args kwargs {} class Res...,def with_timeout func args kwargs {} class Res...
56083,56083,How do this function run the given function ?,using the args,def with timeout func args kwargs {} class Res...,def with_timeout func args kwargs {} class Res...


In [4]:
# model_checkpoint = "t5-small"
model_checkpoint = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
def tokenize_sample(sample, max_tokens=None, padding=False):
    question = sample["questions"].strip()
    context = sample["original_code"].strip()

    question_context = context + ' - ' + question

    # return tokenizer(question, context, max_length=max_tokens, padding=padding)
    return tokenizer(question_context, max_length=max_tokens, padding=padding)


tokenized_train_dataset = train_dataset.map(tokenize_sample)
tokenized_val_dataset = val_dataset.map(tokenize_sample)
tokenized_test_dataset = test_dataset.map(tokenize_sample)

print(
    "Max number of tokens in tokenized train dataset: ",
    len(max(tokenized_train_dataset["input_ids"], key=len)),
)
print(
    "Max number of tokens in tokenized val dataset: ",
    len(max(tokenized_val_dataset["input_ids"], key=len)),
)
print(
    "Max number of tokens in tokenized test dataset: ",
    len(max(tokenized_test_dataset["input_ids"], key=len)),
)

Map:   0%|          | 0/56080 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1489 > 1024). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Max number of tokens in tokenized train dataset:  9243
Max number of tokens in tokenized val dataset:  574
Max number of tokens in tokenized test dataset:  900


In [6]:
max_length = 256


def filter_samples_below_number_of_tokens(dataset, max_tokens: int):
    indices_to_remove = []

    # Find indices of samples where number of tokens exceeds max number of tokens
    for index, sample in enumerate(dataset):
        tokenized_sample = tokenize_sample(sample)
        if len(tokenized_sample["input_ids"]) > max_tokens:
            indices_to_remove.append(index)

    # Keep only samples with number of tokens less or equal than max number of tokens
    dataset_indices = range(len(dataset))
    filtered_dataset = dataset.select(
        index for index in dataset_indices if index not in set(indices_to_remove)
    )

    return filtered_dataset


filtered_train_dataset = filter_samples_below_number_of_tokens(
    train_dataset, max_tokens=max_length
)
filtered_val_dataset = filter_samples_below_number_of_tokens(
    val_dataset, max_tokens=max_length
)
filtered_test_dataset = filter_samples_below_number_of_tokens(
    test_dataset, max_tokens=max_length
)



In [7]:
print(
    "Number of samples in tokenized train dataset before filtering: ",
    len(train_dataset),
)
print("Number of samples in tokenized val dataset before filtering: ", len(val_dataset))
print(
    "Number of samples in tokenized test dataset before filtering: ", len(test_dataset)
)

print("\n---------------\n")

print(
    "Number of samples in tokenized train dataset after filtering: ",
    len(filtered_train_dataset),
)
print(
    "Number of samples in tokenized val dataset after filtering: ",
    len(filtered_val_dataset),
)
print(
    "Number of samples in tokenized test dataset after filtering: ",
    len(filtered_test_dataset),
)

Number of samples in tokenized train dataset before filtering:  56080
Number of samples in tokenized val dataset before filtering:  7000
Number of samples in tokenized test dataset before filtering:  7000

---------------

Number of samples in tokenized train dataset after filtering:  54958
Number of samples in tokenized val dataset after filtering:  6832
Number of samples in tokenized test dataset after filtering:  6859


In [8]:
def combine_question_and_context(dataset_row):
    context_question = dataset_row['original_code'] + ' - ' + dataset_row['questions']
    dataset_row['context_question'] = context_question
    return dataset_row

In [9]:
filtered_train_dataset = filtered_train_dataset.map(combine_question_and_context)
filtered_val_dataset = filtered_val_dataset.map(combine_question_and_context)
filtered_test_dataset = filtered_test_dataset.map(combine_question_and_context)

Map:   0%|          | 0/54958 [00:00<?, ? examples/s]

Map:   0%|          | 0/6832 [00:00<?, ? examples/s]

Map:   0%|          | 0/6859 [00:00<?, ? examples/s]

In [10]:
filtered_train_dataset['context_question'][0]

'def MakeSuiteFromDict d label None suite Suite label label suite SetDict d suite Normalize return suite - What does the code make ?'

In [11]:
def preprocess_dataset(dataset):
    # questions = [q.strip() for q in dataset["questions"]]
    # contexts = [c.strip() for c in dataset["original_code"]]
    answers = [c.strip() for c in dataset["answers"]]

    question_context = [cq.strip() for cq in dataset["context_question"]]

    inputs = tokenizer(
        # questions,
        # contexts,
        question_context,
        text_target=answers,
        max_length=max_length,
        padding="max_length",
        return_offsets_mapping=True,
    )

    return inputs

In [12]:
tokenized_train_dataset = filtered_train_dataset.map(
    preprocess_dataset,
    batched=True,
    remove_columns=filtered_train_dataset.column_names,
)
tokenized_val_dataset = filtered_val_dataset.map(
    preprocess_dataset,
    batched=True,
    remove_columns=filtered_val_dataset.column_names,
)
tokenized_test_dataset = filtered_test_dataset.map(
    preprocess_dataset,
    batched=True,
    remove_columns=filtered_test_dataset.column_names,
)

Map:   0%|          | 0/54958 [00:00<?, ? examples/s]

Map:   0%|          | 0/6832 [00:00<?, ? examples/s]

Map:   0%|          | 0/6859 [00:00<?, ? examples/s]

In [40]:
filtered_test_dataset[0]

{'index': 0,
 'questions': 'How does the code add a user in the given buckets default object access control list ?',
 'answers': 'as an owner',
 'code': "def add bucket default owner bucket name user email storage client storage Client bucket storage client bucket bucket name bucket acl reload bucket default object acl user user email grant owner bucket default object acl save print ' Addeduser{}asanownerinthedefaultaclonbucket{} ' format user email bucket name",
 'original_code': "def add_bucket_default_owner bucket_name user_email storage_client storage Client bucket storage_client bucket bucket_name bucket acl reload bucket default_object_acl user user_email grant_owner bucket default_object_acl save print 'Addeduser{}asanownerinthedefaultaclonbucket{} ' format user_email bucket_name",
 'context_question': "def add_bucket_default_owner bucket_name user_email storage_client storage Client bucket storage_client bucket bucket_name bucket acl reload bucket default_object_acl user user_e

In [13]:
print(
    f"All tokenized train dataset entries have {max_length} tokens: ",
    all(
        [
            len(input_ids) == max_length
            for input_ids in tokenized_train_dataset["input_ids"]
        ]
    ),
)
print(
    f"All tokenized val dataset entries have {max_length} tokens: ",
    all(
        [
            len(input_ids) == max_length
            for input_ids in tokenized_val_dataset["input_ids"]
        ]
    ),
)
print(
    f"All tokenized test dataset entries have {max_length} tokens: ",
    all(
        [
            len(input_ids) == max_length
            for input_ids in tokenized_test_dataset["input_ids"]
        ]
    ),
)

All tokenized train dataset entries have 256 tokens:  True
All tokenized val dataset entries have 256 tokens:  True
All tokenized test dataset entries have 256 tokens:  True


In [14]:
# Model parameters
training_number = 2

model_name = "python-bart-uncased"
full_model_name = f"{model_name}-{training_number}"

# Checkpoints
checkpoint_filename_template = constants.checkpoint_filename_template
checkpoints_path = (
    generative_qa_paths.training_checkpoints_dir
    / full_model_name
    / checkpoint_filename_template
)

# Hub
hub_path = generative_qa_paths.hub_models_location / full_model_name

# Saved models
saved_models_path = generative_qa_paths.saved_models_dir / full_model_name

# Figures
figures_dir = generative_qa_paths.figures_dir / full_model_name

# Hyperparameters
batch_size = 8
train_epochs = 1

In [15]:
# Load model for fine-tuning
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBartForConditionalGeneration: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
- This IS expected if you are initializing TFBartForConditionalGeneration from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBartForConditionalGeneration from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [17]:
# Dataset preparation
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

tf_train_dataset = core_qa_utils.prepare_tf_dataset(
    model=model,
    hf_dataset=tokenized_train_dataset,
    collator=data_collator,
    batch_size=batch_size,
    shuffle=True
)

tf_val_dataset = core_qa_utils.prepare_tf_dataset(
    model=model,
    hf_dataset=tokenized_val_dataset,
    collator=data_collator,
    batch_size=batch_size,
)

tf_test_dataset = core_qa_utils.prepare_tf_dataset(
    model=model,
    hf_dataset=tokenized_test_dataset,
    collator=data_collator,
    batch_size=batch_size,
)

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [18]:
# Callbacks
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    checkpoints_path, verbose=1, save_weights_only=True
)
early_stop_cb = tf.keras.callbacks.EarlyStopping(patience=1)
# push_to_hub = keras_callbacks.PushToHubCallback(
#     output_dir=full_model_name, tokenizer=tokenizer
# )

callbacks = [
    checkpoint_cb,
    early_stop_cb,
    # push_to_hub
]

In [19]:
# Compile
num_train_steps = len(tf_train_dataset) * train_epochs

lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Compile
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# metrics = ["accuracy"]
# model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
# model.compile(optimizer=optimizer, metrics=metrics)
model.compile(optimizer=optimizer)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 4070, compute capability 8.9


In [20]:
model.summary()

Model: "tf_bart_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFBartMainLayer)     multiple                  139420416 
                                                                 
 final_logits_bias (BiasLaye  multiple                 50265     
 r)                                                              
                                                                 
Total params: 139,470,681
Trainable params: 139,420,416
Non-trainable params: 50,265
_________________________________________________________________


In [21]:
# Fit the model on the new data
history = model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    epochs=train_epochs,
    callbacks=callbacks,
)

Epoch 1: saving model to e:\STUDIA\IPS\question-answering\generative-qa\training-checkpoints\python-bart-uncased-2\cp-01.ckpt


In [22]:
# Get best version of the model
best_model, best_epoch = core_qa_utils.get_best_model_from_checkpoints(
    model, history, model_name=full_model_name, remove_checkpoints=True, model_type="generative"
)

In [23]:
# Save best model's weights
generative_qa_utils.save_model(best_model, model_name=full_model_name)

In [24]:
# Load best model
loaded_model = generative_qa_utils.load_model(
    model_checkpoint, model_name=full_model_name
)
loaded_model.compile(optimizer=optimizer)

All PyTorch model weights were used when initializing TFBartForConditionalGeneration.

All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [None]:
# Get predictions from the best model
loaded_model_evaluation = loaded_model.evaluate(tf_test_dataset)

In [None]:
import evaluate

metric = evaluate.load("bleu")

In [25]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm

def generate_with_xla(batch):
    return loaded_model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_new_tokens=max_length,
    )


def compute_metrics():
    all_preds = []
    all_labels = []

    for batch, labels in tqdm(tf_test_dataset):
        predictions = generate_with_xla(batch)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        labels = labels.numpy()
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [[label.strip()] for label in decoded_labels]
        all_preds.extend(decoded_preds)
        all_labels.extend(decoded_labels)

    result = metric.compute(predictions=all_preds, references=all_labels)
    return {"bleu": result["score"]}

In [26]:
tokenized_test_dataset2 = tokenized_test_dataset.train_test_split(test_size=0.001)['test']

In [None]:
def invert_attention_mask(value):
    value = 1 - value

In [None]:
original_list = tokenized_test_dataset2[0]['attention_mask']
iterator = masks = map(lambda x: 1 - x, original_list)
list(iterator)

In [None]:
def invert_attention_mask_in_dataset(dataset_row):
    something = dataset_row["attention_mask"]
    something2 = list(map(lambda x: 1 - x, something))
    dataset_row["attention_mask"] = something2
    return dataset_row

In [None]:
tokenized_test_dataset3 = tokenized_test_dataset2.map(invert_attention_mask_in_dataset)

In [None]:
tokenizer.decode(tokenized_test_dataset2[0]['input_ids'])

In [None]:
tokenizer.decode(tokenized_test_dataset2[0]['labels'])

In [52]:
tokenized_test_dataset2

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'labels'],
    num_rows: 7
})

In [53]:
loaded_model.generate(
    input_ids=tokenized_test_dataset2[0]['input_ids'],
    attention_mask=tokenized_test_dataset2[0]['labels'],
    max_new_tokens=max_length,)

AttributeError: 'list' object has no attribute 'shape'

In [48]:
tf_test_dataset2 = core_qa_utils.prepare_tf_dataset(
    model=model,
    hf_dataset=tokenized_test_dataset2,
    collator=data_collator,
    batch_size=batch_size,
)

In [49]:
all_preds = []
all_labels = []

for batch, labels in tqdm(tf_test_dataset2):
    predictions = generate_with_xla(batch)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = labels.numpy()
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    all_preds.extend(decoded_preds)
    all_labels.extend(decoded_labels)

  0%|          | 0/1 [00:01<?, ?it/s]


InvalidArgumentError: cannot compute AddV2 as input #1(zero-based) was expected to be a half tensor but is a float tensor [Op:AddV2]

In [None]:
all_preds, all_labels

In [None]:
result = metric.compute(predictions=all_preds, references=all_labels)
result