In [1]:
from datasets import load_dataset 

dataset = load_dataset("Ankita802/llm")

In [2]:
dataset['train'][0]

{'input': "As an administrator, I want to have researchers reset their own passwords, so that I don't have to send passwords in cleartext.\n",
 'result': "Implement a self-service password reset feature within the archival system's user interface. Enable researchers to securely reset their passwords by providing authentication through email verification or security questions. Ensure that passwords are encrypted and stored securely within the system to maintain data integrity. Provide clear instructions and guidance for researchers on how to reset their passwords independently. Regularly educate researchers on password security best practices to mitigate potential security risks."}

load pretrained instances with an AutoClass

In [3]:
from transformers import AutoTokenizer
from transformers import BertModel

model_name =  "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

print(tokenizer)
print()
print(bert_model)

BertTokenizerFast(name_or_path='google-bert/bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (positi

tokenized data representation

In [4]:
sentence = "As a UI Designer, I want to update code"
tokenized_input = tokenizer(sentence)
# Print the tokenized input
print("Input IDs:", tokenized_input["input_ids"])
print("Token Type IDs:", tokenized_input["token_type_ids"])
print("Attention Mask:", tokenized_input["attention_mask"])

# Decode the input tokens
decoded_input = tokenizer.decode(tokenized_input["input_ids"])
print("Decoded Input:", decoded_input)


# here we get input_ids, token_type_ids and attention mask

# input ids = numerical form of words
# token type ids : support sequence pair tasks (like QA)
# attention mask : helps to focus on relevant tokens , ignoring padding tokens

# CLS = classification and specially used for classification tasks, (appended at present) 
# SEP stands for separator token , used to separate pair of sentences in sequence-pair task (appended at last)

Input IDs: [101, 2004, 1037, 21318, 5859, 1010, 1045, 2215, 2000, 10651, 3642, 102]
Token Type IDs: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Decoded Input: [CLS] as a ui designer, i want to update code [SEP]


we can pass batch sentences

In [5]:
batch_sentences = [
    'As a Data user, I want to have the 12-19-2017 deletions processed',
    'As a UI designer, I want to redesign the Resources page, so that it matches the new Broker design styles.',
    'As a UI designer, I want to move on to round 2 of Homepage edits, so that I can get approvals from leadership.'
]

encoded_inputs = tokenizer(batch_sentences)
# print(encoded_inputs)

# Print the tokenized input
print("Input IDs:", tokenized_input["input_ids"])
print("Token Type IDs:", tokenized_input["token_type_ids"])
print("Attention Mask:", tokenized_input["attention_mask"])

# Decode the input tokens

for i in range(len(batch_sentences)):
    decoded_input = tokenizer.decode(encoded_inputs["input_ids"][i])
    print("Decoded Input:", decoded_input)
# decoded_input1 = tokenizer.decode(encoded_inputs["input_ids"][0])
# print("Decoded Input:", decoded_input1)
# decoded_input1 = tokenizer.decode(encoded_inputs["input_ids"][0])
# print("Decoded Input:", decoded_input1)


Input IDs: [101, 2004, 1037, 21318, 5859, 1010, 1045, 2215, 2000, 10651, 3642, 102]
Token Type IDs: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Decoded Input: [CLS] as a data user, i want to have the 12 - 19 - 2017 deletions processed [SEP]
Decoded Input: [CLS] as a ui designer, i want to redesign the resources page, so that it matches the new broker design styles. [SEP]
Decoded Input: [CLS] as a ui designer, i want to move on to round 2 of homepage edits, so that i can get approvals from leadership. [SEP]


In [6]:
batch_sentences = [
    'As a Data user, I want to have the 12-19-2017 deletions processed',
    'As a UI designer, I want to redesign the Resources page, so that it matches the new Broker design styles.',
    'As a UI designer, I want to move on to round 2 of Homepage edits, so that I can get approvals from leadership.'
]

encoded_inputs = tokenizer(batch_sentences)

# Iterate over the range of the length of batch_sentences
for i in range(len(batch_sentences)):
    # Print the tokenized input
    print(f"Input IDs {i+1}:", encoded_inputs["input_ids"][i])
    print(f"Token Type IDs {i+1}:", encoded_inputs["token_type_ids"][i])
    print(f"Attention Mask {i+1}:", encoded_inputs["attention_mask"][i])

    # Decode the input tokens
    decoded_input = tokenizer.decode(encoded_inputs["input_ids"][i])
    print(f"Decoded Input {i+1}:", decoded_input)


Input IDs 1: [101, 2004, 1037, 2951, 5310, 1010, 1045, 2215, 2000, 2031, 1996, 2260, 1011, 2539, 1011, 2418, 3972, 20624, 5644, 13995, 102]
Token Type IDs 1: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Attention Mask 1: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Decoded Input 1: [CLS] as a data user, i want to have the 12 - 19 - 2017 deletions processed [SEP]
Input IDs 2: [101, 2004, 1037, 21318, 5859, 1010, 1045, 2215, 2000, 25136, 1996, 4219, 3931, 1010, 2061, 2008, 2009, 3503, 1996, 2047, 20138, 2640, 6782, 1012, 102]
Token Type IDs 2: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Attention Mask 2: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Decoded Input 2: [CLS] as a ui designer, i want to redesign the resources page, so that it matches the new broker design styles. [SEP]
Input IDs 3: [101, 2004, 1037, 21318, 5859, 1010, 1045, 2215, 2000, 2693, 2006, 2000, 2461, 1016, 1997, 2188,

Padding: here we can see all sentences are not of same size, so we have to make them of same size

In [7]:
batch_sentences = [
    'As a Data user, I want to have the 12-19-2017 deletions processed',
    'As a UI designer, I want to redesign the Resources page, so that it matches the new Broker design styles.',
    'As a UI designer, I want to move on to round 2 of Homepage edits, so that I can get approvals from leadership.'
]

encoded_inputs = tokenizer(batch_sentences, padding=True)

# Iterate over the range of the length of batch_sentences
for i in range(len(batch_sentences)):
    # Print the tokenized input
    print(f"Input IDs {i+1}:", encoded_inputs["input_ids"][i])
    print(f"Token Type IDs {i+1}:", encoded_inputs["token_type_ids"][i])
    print(f"Attention Mask {i+1}:", encoded_inputs["attention_mask"][i])

    # Decode the input tokens
    decoded_input = tokenizer.decode(encoded_inputs["input_ids"][i])
    print(f"Decoded Input {i+1}:", decoded_input)


Input IDs 1: [101, 2004, 1037, 2951, 5310, 1010, 1045, 2215, 2000, 2031, 1996, 2260, 1011, 2539, 1011, 2418, 3972, 20624, 5644, 13995, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Token Type IDs 1: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Attention Mask 1: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Decoded Input 1: [CLS] as a data user, i want to have the 12 - 19 - 2017 deletions processed [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Input IDs 2: [101, 2004, 1037, 21318, 5859, 1010, 1045, 2215, 2000, 25136, 1996, 4219, 3931, 1010, 2061, 2008, 2009, 3503, 1996, 2047, 20138, 2640, 6782, 1012, 102, 0, 0, 0, 0, 0, 0]
Token Type IDs 2: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Attention Mask 2: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
Decoded Input 2: [CLS] as a ui designer,

Truncation: sometimes a sequence may be too long for a model to handle. 
In this case, you’ll need to truncate the sequence to a shorter length.
Set the truncation parameter to True to truncate a sequence to the maximum length 
accepted by the model

set Truncation = True


In [8]:

batch_sentences = [
    'As a Data user, I want to have the 12-19-2017 deletions processed',
    'As a UI designer, I want to redesign the Resources page, so that it matches the new Broker design styles.',
    'As a UI designer, I want to move on to round 2 of Homepage edits, so that I can get approvals from leadership.'
]

encoded_inputs = tokenizer(batch_sentences, padding=True, truncation=True)

# Iterate over the range of the length of batch_sentences
for i in range(len(batch_sentences)):
    # Print the tokenized input
    print(f"Input IDs {i+1}:", encoded_inputs["input_ids"][i])
    print(f"Token Type IDs {i+1}:", encoded_inputs["token_type_ids"][i])
    print(f"Attention Mask {i+1}:", encoded_inputs["attention_mask"][i])

    # Decode the input tokens
    decoded_input = tokenizer.decode(encoded_inputs["input_ids"][i])
    print(f"Decoded Input {i+1}:", decoded_input)

Input IDs 1: [101, 2004, 1037, 2951, 5310, 1010, 1045, 2215, 2000, 2031, 1996, 2260, 1011, 2539, 1011, 2418, 3972, 20624, 5644, 13995, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Token Type IDs 1: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Attention Mask 1: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Decoded Input 1: [CLS] as a data user, i want to have the 12 - 19 - 2017 deletions processed [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Input IDs 2: [101, 2004, 1037, 21318, 5859, 1010, 1045, 2215, 2000, 25136, 1996, 4219, 3931, 1010, 2061, 2008, 2009, 3503, 1996, 2047, 20138, 2640, 6782, 1012, 102, 0, 0, 0, 0, 0, 0]
Token Type IDs 2: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Attention Mask 2: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
Decoded Input 2: [CLS] as a ui designer,

Build tensors : Finally, you want the tokenizer to return the actual tensors that get fed to the model.

Set the return_tensors parameter to either pt for PyTorch, or tf for TensorFlow:

In [9]:
batch_sentences = [
    'As a Data user, I want to have the 12-19-2017 deletions processed',
    'As a UI designer, I want to redesign the Resources page, so that it matches the new Broker design styles.',
    'As a UI designer, I want to move on to round 2 of Homepage edits, so that I can get approvals from leadership.'
]
encoded_inputs = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')

# Iterate over the range of the length of batch_sentences
for i in range(len(batch_sentences)):
    # Print the tokenized input
    print(f"Input IDs {i+1}:", encoded_inputs["input_ids"][i])
    print(f"Token Type IDs {i+1}:", encoded_inputs["token_type_ids"][i])
    print(f"Attention Mask {i+1}:", encoded_inputs["attention_mask"][i])

    # Decode the input tokens
    decoded_input = tokenizer.decode(encoded_inputs["input_ids"][i])
    print(f"Decoded Input {i+1}:", decoded_input)

Input IDs 1: tensor([  101,  2004,  1037,  2951,  5310,  1010,  1045,  2215,  2000,  2031,
         1996,  2260,  1011,  2539,  1011,  2418,  3972, 20624,  5644, 13995,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0])
Token Type IDs 1: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0])
Attention Mask 1: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0])
Decoded Input 1: [CLS] as a data user, i want to have the 12 - 19 - 2017 deletions processed [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Input IDs 2: tensor([  101,  2004,  1037, 21318,  5859,  1010,  1045,  2215,  2000, 25136,
         1996,  4219,  3931,  1010,  2061,  2008,  2009,  3503,  1996,  2047,
        20138,  2640,  6782,  1012,   102,     0,     0,     0,     0,     0,
            0])
Token Type IDs 2: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Fine Tune a pretrained model

In [10]:
# from datasets import load_dataset

# dataset = load_dataset("Ankita802/llm")

In [10]:
print(dataset['train'].features)
print(dataset['train'][0])
print(dataset['train'][100])
print(dataset['train'][200])


{'input': Value(dtype='string', id=None), 'result': Value(dtype='string', id=None)}
{'input': "As an administrator, I want to have researchers reset their own passwords, so that I don't have to send passwords in cleartext.\n", 'result': "Implement a self-service password reset feature within the archival system's user interface. Enable researchers to securely reset their passwords by providing authentication through email verification or security questions. Ensure that passwords are encrypted and stored securely within the system to maintain data integrity. Provide clear instructions and guidance for researchers on how to reset their passwords independently. Regularly educate researchers on password security best practices to mitigate potential security risks."}
{'input': 'As an API User, I want to have a flexible API using HASC codes for countries, regions and cities, So that I can visualise budget data on maps.\n', 'result': 'Your user story effectively communicates the need for a fl

To process the text, we need tokenizer 

List of tokenizers can be used for our own dataset

1. BERT Tokenizer (BertTokenizer): Mainly for text data, not suitable for code data, provided by HuggingFace transformers library.
2. Camembert Tokenizer (CamembertTokenizer): mainly for french language, but can work for both text and code data, provide good results.
3. CodeBERT Tokenizer (CodebertTokenizer): specially designed for code-related task, considered good choice.
4. Roberta Tokenizer (RobertaTokenizer): perform well for text and code, not suitable for only code-data.
5. GPT-2 Tokenizer (GPT2Tokenizer): not specifically designed for code data but may still work well for a mix of text and code.



trying to use BERT Tokenizer, Camembert tokenizer and  Roberta tokenizer.



In [11]:
# Testing of BERT Tokenizer 

from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text1 = "This is an example sentence."
encoded_text = tokenizer.encode(text1)
# Decode token IDs
decoded_text = tokenizer.decode(encoded_text)

print("Original Text:", text1)
print("Token IDs:", encoded_text)
print("Decoded Text:", decoded_text)
print("--------------------------------------------------------------------------------------------------")


text2 = "from datetime import datetime"
encoded_text2 = tokenizer.encode(text2)
# Decode token IDs
decoded_text2 = tokenizer.decode(encoded_text2)

print("Original Text:", text2)
print("Token IDs:", encoded_text2)
print("Decoded Text:", decoded_text2)

print("--------------------------------------------------------------------------------------------------")


text3 = "from sklearn.cluster import AgglomerativeClustering"
encoded_text3 = tokenizer.encode(text3)
# Decode token IDs
decoded_text3 = tokenizer.decode(encoded_text3)

print("Original Text:", text3)
print("Token IDs:", encoded_text3)
print("Decoded Text:", decoded_text3)

print("--------------------------------------------------------------------------------------------------")

text4 = "def factorial(n): <br> if n == 0: <br> return 1 <br> else: <br> return n * factorial(n-1)<br> num = int(input(""Enter a number: "")) <br> result = factorial(num) <br> print(f""The factorial of {num} is {result}"")" 

encoded_text4 = tokenizer.encode(text4)
# Decode token IDs
decoded_text4 = tokenizer.decode(encoded_text4)

print("Original Text:", text4)
print("Token IDs:", encoded_text4)
print("Decoded Text:", decoded_text4)

print("--------------------------------------------------------------------------------------------------")

text5 = "def linear_search(lst, target): <br> for i in range(len(lst)): <br> if lst[i] == target: <br> return i <br> return -1 <br> numbers = [5, 8, 2, 9, 1, 7, 3] <br> search_value = int(input(""Enter a value to search: "")) <br> index = linear_search(numbers, search_value) <br> if index == -1: <br> print(f""{search_value} is not present in the list."") <br> else: <br> print(f""{search_value} is present at index {index} in the list."")" 

encoded_text5 = tokenizer.encode(text5)
# Decode token IDs
decoded_text5 = tokenizer.decode(encoded_text5)

print("Original Text:", text5)
print("Token IDs:", encoded_text5)
print("Decoded Text:", decoded_text5)

print("--------------------------------------------------------------------------------------------------")


text6 = "LinkedList<Integer,Character> list = new LinkedList"
encoded_text6 = tokenizer.encode(text6)
decoded_text6 = tokenizer.decode(encoded_text6)
print("Original Text:", text6)
print("Token IDs:", encoded_text6)
print("Decoded Text:", decoded_text6)

print("--------------------------------------------------------------------------------------------------")

text7 = "import matplotlib.pyplot as plt"
encoded_text7 = tokenizer.encode(text7)
decoded_text7 = tokenizer.decode(encoded_text7)
print("Original Text:", text7)
print("Token IDs:", encoded_text7)
print("Decoded Text:", decoded_text7)

print("--------------------------------------------------------------------------------------------------")

text8 = "Doesn't she like cats?"
encoded_text8 = tokenizer.encode(text8)
decoded_text8 = tokenizer.decode(encoded_text8)
print("Original Text:", text8)
print("Token IDs:", encoded_text8)
print("Decoded Text:", decoded_text8)

Original Text: This is an example sentence.
Token IDs: [101, 2023, 2003, 2019, 2742, 6251, 1012, 102]
Decoded Text: [CLS] this is an example sentence. [SEP]
--------------------------------------------------------------------------------------------------
Original Text: from datetime import datetime
Token IDs: [101, 2013, 3058, 7292, 12324, 3058, 7292, 102]
Decoded Text: [CLS] from datetime import datetime [SEP]
--------------------------------------------------------------------------------------------------
Original Text: from sklearn.cluster import AgglomerativeClustering
Token IDs: [101, 2013, 15315, 19738, 6826, 1012, 9324, 12324, 12943, 23296, 8462, 18514, 20464, 19966, 7999, 102]
Decoded Text: [CLS] from sklearn. cluster import agglomerativeclustering [SEP]
--------------------------------------------------------------------------------------------------
Original Text: def factorial(n): <br> if n == 0: <br> return 1 <br> else: <br> return n * factorial(n-1)<br> num = int(input(E

In [12]:
# Testing of CamembertTokenizer

from transformers import CamembertTokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
text1 = "This is an example sentence."
encoded_text = tokenizer.encode(text1)
# Decode token IDs
decoded_text = tokenizer.decode(encoded_text)

print("Original Text:", text1)
print("Token IDs:", encoded_text)
print("Decoded Text:", decoded_text)
print("--------------------------------------------------------------------------------------------------")


text2 = "from datetime import datetime"
encoded_text2 = tokenizer.encode(text2)
# Decode token IDs
decoded_text2 = tokenizer.decode(encoded_text2)

print("Original Text:", text2)
print("Token IDs:", encoded_text2)
print("Decoded Text:", decoded_text2)

print("--------------------------------------------------------------------------------------------------")


text3 = "from sklearn.cluster import AgglomerativeClustering"
encoded_text3 = tokenizer.encode(text3)
# Decode token IDs
decoded_text3 = tokenizer.decode(encoded_text3)

print("Original Text:", text3)
print("Token IDs:", encoded_text3)
print("Decoded Text:", decoded_text3)

print("--------------------------------------------------------------------------------------------------")

text4 = "def factorial(n): <br> if n == 0: <br> return 1 <br> else: <br> return n * factorial(n-1)<br> num = int(input(""Enter a number: "")) <br> result = factorial(num) <br> print(f""The factorial of {num} is {result}"")" 

encoded_text4 = tokenizer.encode(text4)
# Decode token IDs
decoded_text4 = tokenizer.decode(encoded_text4)

print("Original Text:", text4)
print("Token IDs:", encoded_text4)
print("Decoded Text:", decoded_text4)

print("--------------------------------------------------------------------------------------------------")

text5 = "def linear_search(lst, target): <br> for i in range(len(lst)): <br> if lst[i] == target: <br> return i <br> return -1 <br> numbers = [5, 8, 2, 9, 1, 7, 3] <br> search_value = int(input(""Enter a value to search: "")) <br> index = linear_search(numbers, search_value) <br> if index == -1: <br> print(f""{search_value} is not present in the list."") <br> else: <br> print(f""{search_value} is present at index {index} in the list."")" 

encoded_text5 = tokenizer.encode(text5)
# Decode token IDs
decoded_text5 = tokenizer.decode(encoded_text5)

print("Original Text:", text5)
print("Token IDs:", encoded_text5)
print("Decoded Text:", decoded_text5)

print("--------------------------------------------------------------------------------------------------")


text6 = "LinkedList<Integer,Character> list = new LinkedList"
encoded_text6 = tokenizer.encode(text6)
decoded_text6 = tokenizer.decode(encoded_text6)
print("Original Text:", text6)
print("Token IDs:", encoded_text6)
print("Decoded Text:", decoded_text6)

print("--------------------------------------------------------------------------------------------------")

text7 = "import matplotlib.pyplot as plt"
encoded_text7 = tokenizer.encode(text7)
decoded_text7 = tokenizer.decode(encoded_text7)
print("Original Text:", text7)
print("Token IDs:", encoded_text7)
print("Decoded Text:", decoded_text7)

print("--------------------------------------------------------------------------------------------------")



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Original Text: This is an example sentence.
Token IDs: [5, 17526, 2856, 674, 1017, 21598, 22625, 9, 6]
Decoded Text: <s> This is an example sentence.</s>
--------------------------------------------------------------------------------------------------
Original Text: from datetime import datetime
Token IDs: [5, 7142, 749, 8449, 14656, 749, 8449, 6]
Decoded Text: <s> from datetime import datetime</s>
--------------------------------------------------------------------------------------------------
Original Text: from sklearn.cluster import AgglomerativeClustering
Token IDs: [5, 7142, 52, 496, 185, 11992, 9, 16972, 809, 14656, 5538, 10561, 1496, 3761, 228, 1396, 2541, 402, 6]
Decoded Text: <s> from sklearn.cluster import AgglomerativeClustering</s>
--------------------------------------------------------------------------------------------------
Original Text: def factorial(n): <br> if n == 0: <br> return 1 <br> else: <br> return n * factorial(n-1)<br> num = int(input(Enter a number: )) 

In [13]:
text8 = "Doesn't she like cats?"
encoded_text8 = tokenizer.encode(text8)
decoded_text8 = tokenizer.decode(encoded_text8)
print("Original Text:", text8)
print("Token IDs:", encoded_text8)
print("Decoded Text:", decoded_text8)

print("--------------------------------------------------------------------------------------------------")

text9 = "from bs4 import BeautifulSoup<br> import requests<br> url = ""https://www.example.com""<br> response = requests.get(url)<br> soup = BeautifulSoup(response.content, ""html.parser"")<br> title = soup.find(""title"").text<br> print(f""Website title: {title}"")","This code imports BeautifulSoup and requests to scrape the title from a website using a sample URL. Note: Replace ""https://www.example.com"" with a real website URL."
encoded_text9 = tokenizer.encode(text9)
decoded_text9 = tokenizer.decode(encoded_text9)
print("Original Text:", text9)
print("Token IDs:", encoded_text9)
print("Decoded Text:", decoded_text9)

print("--------------------------------------------------------------------------------------------------")

text10 = "def calculate_area(shape, *dimensions): <br> <br> if shape == ""rectangle"": <br> return dimensions[0] * dimensions[1] <br> elif shape == ""circle"": <br> return 3.14 * (dimensions[0] ** 2) <br> else: <br> print(""Unsupported shape type."")" 
encoded_text10 = tokenizer.encode(text10)
decoded_text10 = tokenizer.decode(encoded_text10)
print("Original Text:", text10)
print("Token IDs:", encoded_text10)
print("Decoded Text:", decoded_text10)

print("--------------------------------------------------------------------------------------------------")

text11 = "class Animal:<br> def __init__(self, name, sound):<br> self.name = name<br> self.sound = sound<br> def make_sound(self):<br> print(f""{self.name} says {self.sound}"")<br> dog = Animal(""Fido"", ""Woof!"")<br> dog.make_sound()"
encoded_text11 = tokenizer.encode(text11)
decoded_text11 = tokenizer.decode(encoded_text11)
print("Original Text:", text11)
print("Token IDs:", encoded_text11)
print("Decoded Text:", decoded_text11)

print("--------------------------------------------------------------------------------------------------")

text12 = "def count_characters(string): <br> char_count = {} <br> for char in string: <br> if char in char_count: <br> char_count[char] += 1 <br> else: <br> char_count[char] = 1 <br> return char_count <br> text = input(""Enter a string: "") <br> char_counts = count_characters(text) <br> print(""Character counts:"") <br> for char, count in char_counts.items(): <br> print(f""{char}: {count}"")"
encoded_text12 = tokenizer.encode(text12)
decoded_text12 = tokenizer.decode(encoded_text12)
print("Original Text:", text12)
print("Token IDs:", encoded_text12)
print("Decoded Text:", decoded_text12)


print("--------------------------------------------------------------------------------------------------")

text13 = "message = ""Hello, World!"" <br> print(message.upper())"
encoded_text13 = tokenizer.encode(text13)
decoded_text13 = tokenizer.decode(encoded_text13)
print("Original Text:", text13)
print("Token IDs:", encoded_text13)
print("Decoded Text:", decoded_text13)


# summary : camembert works well for code and text

Original Text: Doesn't she like cats?
Token IDs: [5, 3459, 80, 255, 11, 110, 52, 2408, 16396, 13368, 10, 197, 6]
Decoded Text: <s> Doesn't she like cats?</s>
--------------------------------------------------------------------------------------------------
Original Text: ('from bs4 import BeautifulSoup<br> import requests<br> url = https://www.example.com<br> response = requests.get(url)<br> soup = BeautifulSoup(response.content, html.parser)<br> title = soup.find(title).text<br> print(fWebsite title: {title})', 'This code imports BeautifulSoup and requests to scrape the title from a website using a sample URL. Note: Replace https://www.example.com with a real website URL.')
Token IDs: [5, 3, 3, 6]
Decoded Text: <s><unk><unk></s>
--------------------------------------------------------------------------------------------------
Original Text: def calculate_area(shape, *dimensions): <br> <br> if shape == rectangle: <br> return dimensions[0] * dimensions[1] <br> elif shape == circle: <br>

In [14]:
# Testing of RobertaTokenizer

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

text1 = "This is an example sentence."
encoded_text = tokenizer.encode(text1)
# Decode token IDs
decoded_text = tokenizer.decode(encoded_text)

print("Original Text:", text1)
print("Token IDs:", encoded_text)
print("Decoded Text:", decoded_text)
print("--------------------------------------------------------------------------------------------------")


text2 = "from datetime import datetime"
encoded_text2 = tokenizer.encode(text2)
# Decode token IDs
decoded_text2 = tokenizer.decode(encoded_text2)

print("Original Text:", text2)
print("Token IDs:", encoded_text2)
print("Decoded Text:", decoded_text2)

print("--------------------------------------------------------------------------------------------------")


text3 = "from sklearn.cluster import AgglomerativeClustering"
encoded_text3 = tokenizer.encode(text3)
# Decode token IDs
decoded_text3 = tokenizer.decode(encoded_text3)

print("Original Text:", text3)
print("Token IDs:", encoded_text3)
print("Decoded Text:", decoded_text3)

print("--------------------------------------------------------------------------------------------------")

text4 = "def factorial(n): <br> if n == 0: <br> return 1 <br> else: <br> return n * factorial(n-1)<br> num = int(input(""Enter a number: "")) <br> result = factorial(num) <br> print(f""The factorial of {num} is {result}"")" 

encoded_text4 = tokenizer.encode(text4)
# Decode token IDs
decoded_text4 = tokenizer.decode(encoded_text4)

print("Original Text:", text4)
print("Token IDs:", encoded_text4)
print("Decoded Text:", decoded_text4)

print("--------------------------------------------------------------------------------------------------")

text5 = "def linear_search(lst, target): <br> for i in range(len(lst)): <br> if lst[i] == target: <br> return i <br> return -1 <br> numbers = [5, 8, 2, 9, 1, 7, 3] <br> search_value = int(input(""Enter a value to search: "")) <br> index = linear_search(numbers, search_value) <br> if index == -1: <br> print(f""{search_value} is not present in the list."") <br> else: <br> print(f""{search_value} is present at index {index} in the list."")" 

encoded_text5 = tokenizer.encode(text5)
# Decode token IDs
decoded_text5 = tokenizer.decode(encoded_text5)

print("Original Text:", text5)
print("Token IDs:", encoded_text5)
print("Decoded Text:", decoded_text5)

print("--------------------------------------------------------------------------------------------------")


text6 = "LinkedList<Integer,Character> list = new LinkedList"
encoded_text6 = tokenizer.encode(text6)
decoded_text6 = tokenizer.decode(encoded_text6)
print("Original Text:", text6)
print("Token IDs:", encoded_text6)
print("Decoded Text:", decoded_text6)

print("--------------------------------------------------------------------------------------------------")

text7 = "import matplotlib.pyplot as plt"
encoded_text7 = tokenizer.encode(text7)
decoded_text7 = tokenizer.decode(encoded_text7)
print("Original Text:", text7)
print("Token IDs:", encoded_text7)
print("Decoded Text:", decoded_text7)

print("--------------------------------------------------------------------------------------------------")



Original Text: This is an example sentence.
Token IDs: [0, 713, 16, 41, 1246, 3645, 4, 2]
Decoded Text: <s>This is an example sentence.</s>
--------------------------------------------------------------------------------------------------
Original Text: from datetime import datetime
Token IDs: [0, 7761, 13516, 16093, 6595, 13516, 16093, 2]
Decoded Text: <s>from datetime import datetime</s>
--------------------------------------------------------------------------------------------------
Original Text: from sklearn.cluster import AgglomerativeClustering
Token IDs: [0, 7761, 2972, 38229, 4, 3998, 10504, 6595, 14644, 462, 11032, 3693, 11428, 4193, 2961, 2]
Decoded Text: <s>from sklearn.cluster import AgglomerativeClustering</s>
--------------------------------------------------------------------------------------------------
Original Text: def factorial(n): <br> if n == 0: <br> return 1 <br> else: <br> return n * factorial(n-1)<br> num = int(input(Enter a number: )) <br> result = factoria

In [15]:
text8 = "Doesn't she like cats?"
encoded_text8 = tokenizer.encode(text8)
decoded_text8 = tokenizer.decode(encoded_text8)
print("Original Text:", text8)
print("Token IDs:", encoded_text8)
print("Decoded Text:", decoded_text8)

print("--------------------------------------------------------------------------------------------------")

text9 = "from bs4 import BeautifulSoup<br> import requests<br> url = ""https://www.example.com""<br> response = requests.get(url)<br> soup = BeautifulSoup(response.content, ""html.parser"")<br> title = soup.find(""title"").text<br> print(f""Website title: {title}"")","This code imports BeautifulSoup and requests to scrape the title from a website using a sample URL. Note: Replace ""https://www.example.com"" with a real website URL."
encoded_text9 = tokenizer.encode(text9)
decoded_text9 = tokenizer.decode(encoded_text9)
print("Original Text:", text9)
print("Token IDs:", encoded_text9)
print("Decoded Text:", decoded_text9)

print("--------------------------------------------------------------------------------------------------")

text10 = "def calculate_area(shape, *dimensions): <br> <br> if shape == ""rectangle"": <br> return dimensions[0] * dimensions[1] <br> elif shape == ""circle"": <br> return 3.14 * (dimensions[0] ** 2) <br> else: <br> print(""Unsupported shape type."")" 
encoded_text10 = tokenizer.encode(text10)
decoded_text10 = tokenizer.decode(encoded_text10)
print("Original Text:", text10)
print("Token IDs:", encoded_text10)
print("Decoded Text:", decoded_text10)

print("--------------------------------------------------------------------------------------------------")

text11 = "class Animal:<br> def __init__(self, name, sound):<br> self.name = name<br> self.sound = sound<br> def make_sound(self):<br> print(f""{self.name} says {self.sound}"")<br> dog = Animal(""Fido"", ""Woof!"")<br> dog.make_sound()"
encoded_text11 = tokenizer.encode(text11)
decoded_text11 = tokenizer.decode(encoded_text11)
print("Original Text:", text11)
print("Token IDs:", encoded_text11)
print("Decoded Text:", decoded_text11)

print("--------------------------------------------------------------------------------------------------")

text12 = "def count_characters(string): <br> char_count = {} <br> for char in string: <br> if char in char_count: <br> char_count[char] += 1 <br> else: <br> char_count[char] = 1 <br> return char_count <br> text = input(""Enter a string: "") <br> char_counts = count_characters(text) <br> print(""Character counts:"") <br> for char, count in char_counts.items(): <br> print(f""{char}: {count}"")"
encoded_text12 = tokenizer.encode(text12)
decoded_text12 = tokenizer.decode(encoded_text12)
print("Original Text:", text12)
print("Token IDs:", encoded_text12)
print("Decoded Text:", decoded_text12)


print("--------------------------------------------------------------------------------------------------")

text13 = "message = ""Hello, World!"" <br> print(message.upper())"
encoded_text13 = tokenizer.encode(text13)
decoded_text13 = tokenizer.decode(encoded_text13)
print("Original Text:", text13)
print("Token IDs:", encoded_text13)
print("Decoded Text:", decoded_text13)


# summary : roberta works well for code and text

Original Text: Doesn't she like cats?
Token IDs: [0, 27847, 282, 75, 79, 101, 10017, 116, 2]
Decoded Text: <s>Doesn't she like cats?</s>
--------------------------------------------------------------------------------------------------
Original Text: ('from bs4 import BeautifulSoup<br> import requests<br> url = https://www.example.com<br> response = requests.get(url)<br> soup = BeautifulSoup(response.content, html.parser)<br> title = soup.find(title).text<br> print(fWebsite title: {title})', 'This code imports BeautifulSoup and requests to scrape the title from a website using a sample URL. Note: Replace https://www.example.com with a real website URL.')
Token IDs: [0, 3, 3, 2]
Decoded Text: <s><unk><unk></s>
--------------------------------------------------------------------------------------------------
Original Text: def calculate_area(shape, *dimensions): <br> <br> if shape == rectangle: <br> return dimensions[0] * dimensions[1] <br> elif shape == circle: <br> return 3.14 * (dimen

{zero-shot} inference

In [32]:
from transformers import RobertaTokenizer

Roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')



example_indices = [10, 200]

for i, index in enumerate(example_indices):
    dialogue = dataset['test'][index]['input']
    summary = dataset['test'][index]['result']

    prompt_template = f""" Providing the description {dialogue} """                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

    inputs = Roberta_tokenizer(prompt_template, return_tensors='pt')

    decoded_input = Roberta_tokenizer.decode(
            inputs['input_ids'][0],
            skip_special_tokens=True)
    
    print()
    print(inputs)
    print()
    print(decoded_input)
    print()

    # print(dash_line)
    print('Example ', i + 1)
    # print(dash_line)
    print(f'INPUT PROMPT:\n{prompt_template}')
    print()
    # print(dash_line)
    print(f'ANSWER FROM CSV:\n{summary}')
    print()
    # print(dash_line)
    print(f'MODEL GENERATION - WITH ONE SHOT LEARNING:\n{decoded_input}\n')
    print("-------------------------------------------------------------------------------------------------------")

    


{'input_ids': tensor([[    0, 13786,  8231,     5,  8194,   287,    10,  5423,  9051, 20016,
         27913,     6,    38,   236,     7,   192, 46478, 24173,    14, 20453,
         14224,  7133, 27890,     6,   407,    14,    38,    64,    55,  2773,
          1346,    99,    38,   524,  7603,     4, 50117, 50118,  1437,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

 Providing the description As a Data Consuming User, I want to see textual descriptions that accompany embedded visualisations, So that I can more easily understand what I am viewing.	
 

Example  1
INPUT PROMPT:
 Providing the description As a Data Consuming User, I want to see textual descriptions that accompany embedded visualisations, So that I can more easily understand what I am viewing.	
 

ANSWER FROM CSV:
As a Data Consuming User, I seek textual descriptions accompanying embedded visualizatio

tokenizer used: roberta 

if we want to use Roberta on whole dataset , write code snippet as

In [25]:
# from transformers import RobertaTokenizer

# Roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [46]:
def tokenize_function(example):
    # Ensure the input column is a list of strings
    if not isinstance(example["input"], list) or not all(isinstance(dialogue, str) for dialogue in example["input"]):
        raise ValueError("Input column should be a list of strings")
    
    # Ensure the result column is a string
    if not isinstance(example["result"], str):
        raise ValueError("Result column should be a string")
    
    # Tokenize each dialogue individually and construct the prompt
    prompts = [f'Summarize the following conversation.\n\n{dialogue}\n\nSummary: ' for dialogue in example["input"]]
    
    # Tokenize the prompts
    tokenized_inputs = Roberta_tokenizer(prompts, padding="max_length", truncation=True, return_tensors="pt")
    
    example['input_ids'] = tokenized_inputs.input_ids
    example['attention_mask'] = tokenized_inputs.attention_mask
    
    # Tokenize the "result" column as labels
    example['labels'] = Roberta_tokenizer(example["result"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    return example


In [42]:
def tokenize_function(example):
    print("Input example:", example)
    
    # Tokenize each dialogue individually and construct the prompt
    prompts = []
    for dialogue in example["input"]:
        prompt = f'Summarize the following conversation.\n\n{dialogue}\n\nSummary: '
        prompts.append(prompt)
    
    print("Constructed prompts:", prompts)
    
    # Tokenize the prompts
    tokenized_inputs = Roberta_tokenizer(prompts, padding="max_length", truncation=True, return_tensors="pt")
    print("Tokenized inputs:", tokenized_inputs)
    
    example['input_ids'] = tokenized_inputs.input_ids
    example['attention_mask'] = tokenized_inputs.attention_mask
    
    # Tokenize the "result" column as labels
    example['labels'] = Roberta_tokenizer(example["result"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    print("Labels:", example['labels'])
    
    return example


In [48]:
# Iterate over the examples in the dataset and print the "result" column
for example in dataset['train']:
    result = example['result']
    print(result, type(result))


Implement a self-service password reset feature within the archival system's user interface. Enable researchers to securely reset their passwords by providing authentication through email verification or security questions. Ensure that passwords are encrypted and stored securely within the system to maintain data integrity. Provide clear instructions and guidance for researchers on how to reset their passwords independently. Regularly educate researchers on password security best practices to mitigate potential security risks. <class 'str'>
As a Certified Scrum Trainer (CST), you'll have the option to pay an annual fee to maintain your CST status and keep it active. This process ensures that you stay in good standing with the Scrum Alliance and continue to enjoy the benefits and privileges associated with being a CST. By paying the annual fee, you can demonstrate your commitment to upholding the standards of excellence in Scrum training and coaching, thereby contributing to the growth 

In [47]:
# def tokenize_function(example):
#     # start_prompt = 'Describe the input query of user.\n\n'
#     # end_prompt = '\n\nDescription: '
#     prompt = [dialogue for dialogue in example["input"]]
#     example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
#     example['labels'] = tokenizer(example["result"], padding="max_length", truncation=True, return_tensors="pt").input_ids

#     return example

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1999 [00:00<?, ? examples/s]

ValueError: Result column should be a string

In [72]:
tokenized_datasets
tokenized_datasets = tokenized_datasets.remove_columns(['input', 'result'])


In [73]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1999
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 500
    })
})

In [27]:
from transformers import RobertaModel

model_name = 'roberta-base'
model = RobertaModel.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1999))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(500))

In [75]:
print(small_train_dataset)
print(small_eval_dataset)

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1999
})
Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 500
})


In [76]:
import time

output_dir = f'./checkpoints-{str(int(time.time()))}'

from transformers import TrainingArguments

training_args = TrainingArguments(output_dir=output_dir, 
                                  prediction_loss_only=bool,
                                  per_device_train_batch_size=8, 
                                  per_device_eval_batch_size=8,
                                  learning_rate=5e-5, 
                                  evaluation_strategy="epoch", 
                                  logging_dir="logs")

In [77]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [78]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [79]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [80]:
trainer.train()

  0%|          | 0/750 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: last_hidden_state,pooler_output. For reference, the inputs it received are input_ids,attention_mask.

In [None]:
full_fine_tune_model_path="./code-description-checkpoint-local"

trainer.model.save_pretrained(full_fine_tune_model_path)
tokenizer.save_pretrained(full_fine_tune_model_path)