In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, BertLMHeadModel, DataCollatorForSeq2Seq


In [2]:
dataset_name = "Ankita802/llm"
dataset = load_dataset(dataset_name)

In [3]:
dataset = dataset['train'].train_test_split(test_size=0.3)

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'result'],
        num_rows: 1399
    })
    test: Dataset({
        features: ['input', 'result'],
        num_rows: 600
    })
})

In [6]:
model_name =  "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name,padding_side="left")
model = BertLMHeadModel.from_pretrained(model_name, is_decoder=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [28]:
examples = {
    "question": [
        "As a Data user, I want to have the 12-19-2017 deletions processed.",
        "As a UI designer, I want to move on to round 2 of DABS or FABS landing page edits, so that I can get approvals from leadership.",
        "As a UI designer, I want to move on to round 2 of the Help page edits, so that I can get approvals from leadership."
    ],
    "answer": [
        "As an AI language model, I don't have access to specific data or the ability to process deletions. However, if you have a specific request or need assistance with data processing, I can try to help you with general information or provide guidance. Please provide more details about the deletions you want to process, and I will do my best to assist you.",
        "As a UI designer, my goal is to move on to round 2 of DABS or FABS landing page edits in order to obtain approvals from leadership. This will allow me to progress with the design process and ensure that the final product meets the expectations and requirements set by the leadership team. By obtaining their approvals, I can gain their trust and confidence in my design decisions, ultimately leading to a successful and well-received landing page.",
        "To move on to round 2 of the Help page edits, I will review the feedback from round 1 and make necessary revisions. I will also ensure that the design aligns with the overall brand and user experience guidelines. Once the edits are complete, I will present the updated Help page to the leadership team for their review and approval. This will allow me to move forward with confidence and ensure that the Help page meets the needs and expectations of our users."
    ]
}


In [31]:
# We prefix our tasks with "answer the question"
prefix = "Please answer this question: "

# Define the preprocessing function

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["question"]]
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)
  
   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["answer"], 
                      max_length=512,         
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

In [32]:
# tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1399 [00:00<?, ? examples/s]

KeyError: 'question'

In [37]:
from datasets import Dataset

examples = {
    "question": [
        "As a Data user, I want to have the 12-19-2017 deletions processed.",
        "As a UI designer, I want to move on to round 2 of DABS or FABS landing page edits, so that I can get approvals from leadership.",
        "As a UI designer, I want to move on to round 2 of the Help page edits, so that I can get approvals from leadership."
    ],
    "answer": [
        "As an AI language model, I don't have access to specific data or the ability to process deletions. However, if you have a specific request or need assistance with data processing, I can try to help you with general information or provide guidance. Please provide more details about the deletions you want to process, and I will do my best to assist you.",
        "As a UI designer, my goal is to move on to round 2 of DABS or FABS landing page edits in order to obtain approvals from leadership. This will allow me to progress with the design process and ensure that the final product meets the expectations and requirements set by the leadership team. By obtaining their approvals, I can gain their trust and confidence in my design decisions, ultimately leading to a successful and well-received landing page.",
        "To move on to round 2 of the Help page edits, I will review the feedback from round 1 and make necessary revisions. I will also ensure that the design aligns with the overall brand and user experience guidelines. Once the edits are complete, I will present the updated Help page to the leadership team for their review and approval. This will allow me to move forward with confidence and ensure that the Help page meets the needs and expectations of our users."
    ]
}

prefix = "Please give description of question "

# Define preprocess function
def preprocess_function(examples):
    inputs = [prefix + sentence for sentence in examples["question"]]
    print(inputs)  # Print to debug
    
    model_inputs = tokenizer(inputs, max_length=200)
    print(model_inputs)  # Print to debug
    
    labels = tokenizer(text=examples["answer"], max_length=200)
    print(labels)  # Print to debug
    
    model_inputs['labels'] = labels['input_ids']
    print(model_inputs)  # Print to debug
    
    return model_inputs

# Call the preprocess function
preprocess_function(examples)


# Create a Dataset object
dataset = Dataset.from_dict(examples)

# Apply the preprocess function to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)
# tokenized_dataset = dataset.map(preprocess_function, batched=True)



['Please give description of question As a Data user, I want to have the 12-19-2017 deletions processed.', 'Please give description of question As a UI designer, I want to move on to round 2 of DABS or FABS landing page edits, so that I can get approvals from leadership.', 'Please give description of question As a UI designer, I want to move on to round 2 of the Help page edits, so that I can get approvals from leadership.']
{'input_ids': [[101, 3531, 2507, 6412, 1997, 3160, 2004, 1037, 2951, 5310, 1010, 1045, 2215, 2000, 2031, 1996, 2260, 1011, 2539, 1011, 2418, 3972, 20624, 5644, 13995, 1012, 102], [101, 3531, 2507, 6412, 1997, 3160, 2004, 1037, 21318, 5859, 1010, 1045, 2215, 2000, 2693, 2006, 2000, 2461, 1016, 1997, 4830, 5910, 2030, 6904, 5910, 4899, 3931, 10086, 2015, 1010, 2061, 2008, 1045, 2064, 2131, 6226, 2015, 2013, 4105, 1012, 102], [101, 3531, 2507, 6412, 1997, 3160, 2004, 1037, 21318, 5859, 1010, 1045, 2215, 2000, 2693, 2006, 2000, 2461, 1016, 1997, 1996, 2393, 3931, 10086

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

['Please give description of question As a Data user, I want to have the 12-19-2017 deletions processed.', 'Please give description of question As a UI designer, I want to move on to round 2 of DABS or FABS landing page edits, so that I can get approvals from leadership.', 'Please give description of question As a UI designer, I want to move on to round 2 of the Help page edits, so that I can get approvals from leadership.']
{'input_ids': [[101, 3531, 2507, 6412, 1997, 3160, 2004, 1037, 2951, 5310, 1010, 1045, 2215, 2000, 2031, 1996, 2260, 1011, 2539, 1011, 2418, 3972, 20624, 5644, 13995, 1012, 102], [101, 3531, 2507, 6412, 1997, 3160, 2004, 1037, 21318, 5859, 1010, 1045, 2215, 2000, 2693, 2006, 2000, 2461, 1016, 1997, 4830, 5910, 2030, 6904, 5910, 4899, 3931, 10086, 2015, 1010, 2061, 2008, 1045, 2064, 2131, 6226, 2015, 2013, 4105, 1012, 102], [101, 3531, 2507, 6412, 1997, 3160, 2004, 1037, 21318, 5859, 1010, 1045, 2215, 2000, 2693, 2006, 2000, 2461, 1016, 1997, 1996, 2393, 3931, 10086

In [41]:
import nltk
import evaluate
# pip install nltk
# pip install datasetss
# pip install transformers[torch]
# pip install tokenizers
# pip install evaluate
import rouge_score
# pip install sentencepiece
# pip install huggingface_hub
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

In [42]:
def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
   return result

In [56]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Global Parameters

L_RATE = 3e-4
BATCH_SIZE = 120
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

In [57]:
tokenized_dataset 

Dataset({
    features: ['question', 'answer', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 3
})

In [58]:
from datasets import Dataset

# Assuming your tokenized dataset is named tokenized_dataset
total_samples = len(tokenized_dataset)
train_ratio = 0.8  # You can adjust this ratio as needed

# Calculate the number of samples for training and evaluation
num_train_samples = int(total_samples * train_ratio)
num_eval_samples = total_samples - num_train_samples

# Split the dataset into training and evaluation
train_dataset = tokenized_dataset.select(range(num_train_samples))
eval_dataset = tokenized_dataset.select(range(num_train_samples, total_samples))

# Print the number of samples in each dataset for verification
print("Number of samples in training dataset:", len(train_dataset))
print("Number of samples in evaluation dataset:", len(eval_dataset))


Number of samples in training dataset: 2
Number of samples in evaluation dataset: 1


In [59]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_dataset,  # Assuming you have a separate evaluation dataset
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)



dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [60]:
trainer.train()

  0%|          | 0/3 [00:00<?, ?it/s]

ValueError: Expected input batch_size (120) to match target batch_size (279).