In [1]:
from datasets import load_dataset 

dataset = load_dataset("Ankita802/formatted-data")
dataset

DatasetDict({
    train: Dataset({
        features: ['input', ' result'],
        num_rows: 8
    })
    test: Dataset({
        features: ['input', ' result'],
        num_rows: 2
    })
})

In [2]:
dataset['train'][0]


{'input': 'AS a CONNECT developer, I want all assertions upgrades to be completely tested so the code can be included in the next release',
 ' result': 'The CONNECT developerï¿½s objective is to thoroughly test all assertion upgrades to ensure they are ready for inclusion in the upcoming release. This comprehensive testing is crucial for verifying that the new enhancements function correctly and meet quality standards, thereby contributing to the stability and reliability of the next version of the software.'}

In [3]:
print(dataset['train'].features)
print(dataset['train'][0])
print(dataset['train'][5])
print(dataset['train'][6])

{'input': Value(dtype='string', id=None), ' result': Value(dtype='string', id=None)}
{'input': 'AS a CONNECT developer, I want all assertions upgrades to be completely tested so the code can be included in the next release', ' result': 'The CONNECT developerï¿½s objective is to thoroughly test all assertion upgrades to ensure they are ready for inclusion in the upcoming release. This comprehensive testing is crucial for verifying that the new enhancements function correctly and meet quality standards, thereby contributing to the stability and reliability of the next version of the software.'}
{'input': 'As a CONNECT adopter I want Release Notes for 4.3 to be documented on the wiki, so that I understand the features, bug fixes packaged with this release', ' result': 'The Release Notes for CONNECT 4.3 are documented on the wiki, detailing new features, bug fixes, and improvements. This includes enhancements to the Setup Wizard, updates to the core codebase, and security improvements. The

In [4]:
from transformers import RobertaTokenizer, RobertaModel


model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name, torch_dtype=torch.bfloat16)
model = RobertaModel.from_pretrained(model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 124645632
all model parameters: 124645632
percentage of trainable model parameters: 100.00%


In [7]:
sentence = "AS a CONNECT developer, I want all assertions upgrades to be completely tested so the code can be included in the next release"
tokenized_input = tokenizer(sentence)
# Print the tokenized input
print("Input IDs:", tokenized_input["input_ids"])
# print("Token Type IDs:", tokenized_input["token_type_ids"])
print("Attention Mask:", tokenized_input["attention_mask"])

# Decode the input tokens
decoded_input = tokenizer.decode(tokenized_input["input_ids"])
print("Decoded Input:", decoded_input)

Input IDs: [0, 2336, 10, 8748, 42849, 6596, 6, 38, 236, 70, 29947, 11500, 7, 28, 2198, 4776, 98, 5, 3260, 64, 28, 1165, 11, 5, 220, 800, 2]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Decoded Input: <s>AS a CONNECT developer, I want all assertions upgrades to be completely tested so the code can be included in the next release</s>


In [8]:
# Print the type of the dataset
print(type(dataset))

# Print the first few entries of the dataset
for i in range(3):
    print(dataset['train'][i])


<class 'datasets.dataset_dict.DatasetDict'>
{'input': 'AS a CONNECT developer, I want all assertions upgrades to be completely tested so the code can be included in the next release', ' result': 'The CONNECT developerï¿½s objective is to thoroughly test all assertion upgrades to ensure they are ready for inclusion in the upcoming release. This comprehensive testing is crucial for verifying that the new enhancements function correctly and meet quality standards, thereby contributing to the stability and reliability of the next version of the software.'}
{'input': 'As a Publisher, I would like a tool to check data availability persistence after publication.', ' result': 'As a Publisher, I need a tool to verify the persistence of data availability after publication." This user story succinctly conveys the requirement for a tool to ensure that published data remains available and accessible over time, addressing the concerns of data persistence.'}
{'input': 'As a administrator, I want to r

In [9]:
# Iterate over the examples in the dataset and print the "input" and "result" columns
for example in dataset['train']:
    input_text = example['input']
    result_text = example[' result']  # Note the leading space in the column name
    print("Input:", input_text)
    print("Result:", result_text)
    print()


Input: AS a CONNECT developer, I want all assertions upgrades to be completely tested so the code can be included in the next release
Result: The CONNECT developerï¿½s objective is to thoroughly test all assertion upgrades to ensure they are ready for inclusion in the upcoming release. This comprehensive testing is crucial for verifying that the new enhancements function correctly and meet quality standards, thereby contributing to the stability and reliability of the next version of the software.

Input: As a Publisher, I would like a tool to check data availability persistence after publication.
Result: As a Publisher, I need a tool to verify the persistence of data availability after publication." This user story succinctly conveys the requirement for a tool to ensure that published data remains available and accessible over time, addressing the concerns of data persistence.

Input: As a administrator, I want to refund sponsorship money that was processed via stripe, so that people 

In [10]:
index = 0
example = dataset['test'][index]
print(example.keys())


dict_keys(['input', ' result'])


Test the model with zero shot inferencing

In [11]:
# example_indices = [0, 1]

index = 0

# for i, index in enumerate(example_indices):
dialogue = dataset['test'][index]['input']
summary = dataset['test'][index][' result']

prompt_template = f""" Providing the description {dialogue} """                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

inputs = tokenizer(prompt_template, return_tensors='pt')

decoded_input = tokenizer.decode(
            inputs['input_ids'][0],
            skip_special_tokens=True)
    
print()
print(inputs)
print()
print(decoded_input)
print()

# print(dash_line)
print('Example ', i + 1)
    # print(dash_line)
print(f'INPUT PROMPT:\n{prompt_template}')
print()
    # print(dash_line)
print(f'ANSWER FROM CSV:\n{summary}')
print()
    # print(dash_line)
print(f'MODEL GENERATION - WITH ONE SHOT LEARNING:\n{decoded_input}\n')
print("-------------------------------------------------------------------------------------------------------")


    


{'input_ids': tensor([[    0, 13786,  8231,     5,  8194,   287,    10,  3018,     6,    38,
           236,     7, 22785,  1061,  1412,    11, 28132, 23730,    19,    10,
          3748,    12,   805, 29419,   215,    25,  1204, 29419,     4,  1437,
             2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]])}

 Providing the description As a user, I want to sync events created in NeuroHub with a web-based Calendar such as Google Calendar. 

Example  3
INPUT PROMPT:
 Providing the description As a user, I want to sync events created in NeuroHub with a web-based Calendar such as Google Calendar. 

ANSWER FROM CSV:
Users seek the capability to synchronize events created within NeuroHub with a web-based calendar service like Google Calendar, facilitating seamless integration and access to scheduling information across platforms.

MODEL GENERATION - WITH ONE SHOT LEARNING:
 Providing the description 

Performing Full Fine-Tuning

In [12]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["input"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example[" result"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    return example

In [13]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)


In [14]:
tokenized_datasets


DatasetDict({
    train: Dataset({
        features: ['input', ' result', 'input_ids', 'labels'],
        num_rows: 8
    })
    test: Dataset({
        features: ['input', ' result', 'input_ids', 'labels'],
        num_rows: 2
    })
})

In [15]:
tokenized_datasets = tokenized_datasets.remove_columns(['input', ' result'])


In [18]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
# print(f"Validation: {tokenized_datasets['test'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (8, 2)
Test: (2, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 8
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 2
    })
})


In [37]:
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(8))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(2))

In [19]:
# print(small_train_dataset)
# print(small_eval_dataset)

In [23]:
import time 

from transformers import TrainingArguments, Trainer


output_dir = f'./code-generation-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    # logging_steps=500,
    max_steps=-1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test']
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [24]:
trainer.train()

  0%|          | 0/1 [00:00<?, ?it/s]

: 

In [None]:
trainer.save_model("test-squad-trained")

In [41]:
# import numpy as np
# import evaluate

# metric = evaluate.load("accuracy")
# print(metric)

EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
    

In [42]:
# from transformers import RobertaModel

# model_name = 'roberta-base'
# model = RobertaModel.from_pretrained(model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

In [44]:
# import time

# output_dir = f'./checkpoints2-{str(int(time.time()))}'

# from transformers import TrainingArguments

# # training_args = TrainingArguments(output_dir=output_dir, 
# #                                   prediction_loss_only=bool,
# #                                   per_device_train_batch_size=8, 
# #                                   per_device_eval_batch_size=8,
# #                                   learning_rate=5e-5, 
# #                                   evaluation_strategy="epoch", 
# #                                   logging_dir="logs")

# training_args = TrainingArguments(output_dir=output_dir, 
#                                   evaluation_strategy="epoch")

In [48]:
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return {"accuracy": (predictions == labels).mean()}

# from transformers import default_data_collator

# data_collator = default_data_collator

In [49]:
# from transformers import Trainer

# # Pass other arguments to Trainer as before
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_datasets['train'],
#     eval_dataset=tokenized_datasets['test'],
#     data_collator=data_collator,
#     compute_metrics=compute_metrics
# )


In [50]:
# trainer.train()


  0%|          | 0/3 [00:00<?, ?it/s]

: 

In [None]:
# trainer.save_model("test-squad-trained")