<a href="https://colab.research.google.com/github/Carlos1729/Transformers_Code/blob/main/Data_Preperation_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
!pip install datasets



In [67]:
!pip install transformers



In [68]:
import pandas as pd
import datasets

from pprint import pprint
from transformers import AutoTokenizer

In [69]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

In [70]:
text = "Hi, how are you?"

In [71]:
#Auto tokenizer finds the right tokenizer  for your model by default

In [72]:
encoded_text = tokenizer(text)['input_ids']

In [73]:
encoded_text

[12764, 13, 849, 403, 368, 32]

In [74]:
decoded_text = tokenizer.decode(encoded_text)
print("Decoded tokens back into text: ", decoded_text)

Decoded tokens back into text:  Hi, how are you?


### Tokenize multiple texts at once

In [75]:
list_texts = ["Hi, how are you?", "I'm good", "Yes"]
encoded_texts = tokenizer(list_texts)
print("Encoded several texts: ", encoded_texts["input_ids"])

Encoded several texts:  [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175], [4374]]


### Padding and truncation

In [76]:
tokenizer.pad_token = tokenizer.eos_token#eos token is 0
encoded_texts_longest = tokenizer(list_texts, padding=True)
print("Using padding: ", encoded_texts_longest["input_ids"])#padding is  added to make all strings of same length so that training becomes easy

Using padding:  [[12764, 13, 849, 403, 368, 32], [42, 1353, 1175, 0, 0, 0], [4374, 0, 0, 0, 0, 0]]


In [77]:
encoded_texts_truncation = tokenizer(list_texts, max_length=3, truncation=True)#trunctating all tokens to a given size
print("Using truncation: ", encoded_texts_truncation["input_ids"])

Using truncation:  [[12764, 13, 849], [42, 1353, 1175], [4374]]


In [78]:
tokenizer.truncation_side = "left"#some times we need at the end then we can do this basically trunctaing last 3
encoded_texts_truncation_left = tokenizer(list_texts, max_length=3, truncation=True)
print("Using left-side truncation: ", encoded_texts_truncation_left["input_ids"])

Using left-side truncation:  [[403, 368, 32], [42, 1353, 1175], [4374]]


In [79]:
encoded_texts_both = tokenizer(list_texts, max_length=3, truncation=True, padding=True)
print("Using both padding and truncation: ", encoded_texts_both["input_ids"])

Using both padding and truncation:  [[403, 368, 32], [42, 1353, 1175], [4374, 0, 0]]


{0: {0: 'train'}}

In [84]:
import pandas as pd

filename = "lamini_docs_processed.jsonl"
instruction_dataset_df = pd.read_json(filename, lines=True)
examples = instruction_dataset_df.to_dict()

if "question" in examples and "answer" in examples:
  text = examples["question"][0] + examples["answer"][0]
elif "instruction" in examples and "response" in examples:
  text = examples["instruction"][0] + examples["response"][0]
elif "input" in examples and "output" in examples:
  text = examples["input"][0] + examples["output"][0]
else:
  text = examples["text"][0]
  print(text)

prompt_template = """### Question:
{question}

### Answer:"""

num_examples = len(examples["question"])
finetuning_dataset = []
for i in range(num_examples):
  question = examples["question"][i]
  answer = examples["answer"][i]
  text_with_prompt_template = prompt_template.format(question=question)
  finetuning_dataset.append({"question": text_with_prompt_template, "answer": answer})

from pprint import pprint
print("One datapoint in the finetuning dataset:")
pprint(finetuning_dataset[0])

One datapoint in the finetuning dataset:
{'answer': 'There are several metrics that can be used to evaluate the '
           'performance and quality of generated text from Lamini models, '
           'including perplexity, BLEU score, and human evaluation. Perplexity '
           'measures how well the model predicts the next word in a sequence, '
           'while BLEU score measures the similarity between the generated '
           'text and a reference text. Human evaluation involves having human '
           'judges rate the quality of the generated text based on factors '
           'such as coherence, fluency, and relevance. It is recommended to '
           'use a combination of these metrics for a comprehensive evaluation '
           "of the model's performance.",
 'question': '### Question:\n'
             '### Question:\n'
             'How can I evaluate the performance and quality of the generated '
             'text from Lamini models?\n'
             '\n'
             

In [85]:
text = finetuning_dataset[0]["question"] + finetuning_dataset[0]["answer"]
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    padding=True
)
print(tokenized_inputs["input_ids"])

[[ 4118 19782    27   187  4118 19782    27   187  2347   476   309  7472
    253  3045   285  3290   273   253  4561  2505   432   418  4988    74
   3210    32   187   187  4118 37741    27   187   187  4118 37741    27
   2512   403  2067 17082   326   476   320   908   281  7472   253  3045
    285  3290   273  4561  2505   432   418  4988    74  3210    13  1690
  44229   414    13   378  1843    54  4868    13   285  1966  7103    15
   3545 12813   414  5593   849   973   253  1566 26295   253  1735  3159
    275   247  3425    13  1223   378  1843    54  4868  5593   253 14259
    875   253  4561  2505   285   247  3806  2505    15  8801  7103  8687
   1907  1966 16006  2281   253  3290   273   253  4561  2505  1754   327
   2616   824   347 25253    13  2938  1371    13   285 17200    15   733
    310  8521   281   897   247  5019   273   841 17082   323   247 11088
   7103   273   253  1566   434  3045    15]]


In [86]:
max_length = 2048
max_length = min(
    tokenized_inputs["input_ids"].shape[1],
    max_length,
)

In [87]:
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    truncation=True,
    max_length=max_length
)

In [88]:
tokenized_inputs["input_ids"]

array([[ 4118, 19782,    27,   187,  4118, 19782,    27,   187,  2347,
          476,   309,  7472,   253,  3045,   285,  3290,   273,   253,
         4561,  2505,   432,   418,  4988,    74,  3210,    32,   187,
          187,  4118, 37741,    27,   187,   187,  4118, 37741,    27,
         2512,   403,  2067, 17082,   326,   476,   320,   908,   281,
         7472,   253,  3045,   285,  3290,   273,  4561,  2505,   432,
          418,  4988,    74,  3210,    13,  1690, 44229,   414,    13,
          378,  1843,    54,  4868,    13,   285,  1966,  7103,    15,
         3545, 12813,   414,  5593,   849,   973,   253,  1566, 26295,
          253,  1735,  3159,   275,   247,  3425,    13,  1223,   378,
         1843,    54,  4868,  5593,   253, 14259,   875,   253,  4561,
         2505,   285,   247,  3806,  2505,    15,  8801,  7103,  8687,
         1907,  1966, 16006,  2281,   253,  3290,   273,   253,  4561,
         2505,  1754,   327,  2616,   824,   347, 25253,    13,  2938,
      

### Tokenize the instruction dataset

In [89]:
def tokenize_function(examples):
    if "question" in examples and "answer" in examples:
      text = examples["question"][0] + examples["answer"][0]
    elif "input" in examples and "output" in examples:
      text = examples["input"][0] + examples["output"][0]
    else:
      text = examples["text"][0]

    tokenizer.pad_token = tokenizer.eos_token
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )

    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        2048
    )
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length
    )

    return tokenized_inputs

In [90]:
finetuning_dataset_loaded = datasets.load_dataset("json", data_files="lamini_docs_processed.jsonl", split="train")

tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

print(tokenized_dataset)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 1400
})


In [91]:
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

In [92]:
tokenized_dataset

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1400
})

In [93]:
dstopd = tokenized_dataset.to_pandas()

In [94]:
dstopd

Unnamed: 0,question,answer,input_ids,attention_mask,labels
0,### Question:\nHow can I evaluate the performa...,There are several metrics that can be used to ...,"[4118, 19782, 27, 187, 2347, 476, 309, 7472, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4118, 19782, 27, 187, 2347, 476, 309, 7472, 2..."
1,### Question:\nCan I find information about th...,"Yes, the code includes methods for submitting ...","[4118, 19782, 27, 187, 5804, 309, 1089, 1491, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4118, 19782, 27, 187, 5804, 309, 1089, 1491, ..."
2,### Question:\nHow does Lamini AI handle reque...,Lamini AI offers features for generating text ...,"[4118, 19782, 27, 187, 2347, 1057, 418, 4988, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4118, 19782, 27, 187, 2347, 1057, 418, 4988, ..."
3,### Question:\nDoes the `submit_job()` functio...,It is unclear which `submit_job()` function is...,"[4118, 19782, 27, 187, 10795, 253, 2634, 21399...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4118, 19782, 27, 187, 10795, 253, 2634, 21399..."
4,### Question:\nDoes the `add_data()` function ...,"No, the `add_data()` function does not support...","[4118, 19782, 27, 187, 10795, 253, 2634, 1911,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4118, 19782, 27, 187, 10795, 253, 2634, 1911,..."
...,...,...,...,...,...
1395,### Question:\nDoes Lamini have the ability to...,"Yes, Lamini has the ability to understand and ...","[4118, 19782, 27, 187, 10795, 418, 4988, 74, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4118, 19782, 27, 187, 10795, 418, 4988, 74, 4..."
1396,### Question:\nCan I fine-tune the pre-trained...,"Yes, you can fine-tune the pre-trained models ...","[4118, 19782, 27, 187, 5804, 309, 4030, 14, 85...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4118, 19782, 27, 187, 5804, 309, 4030, 14, 85..."
1397,### Question:\nCan Lamini generate text that i...,"Yes, Lamini can generate text that is suitable...","[4118, 19782, 27, 187, 5804, 418, 4988, 74, 66...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4118, 19782, 27, 187, 5804, 418, 4988, 74, 66..."
1398,### Question:\nDoes the documentation have a s...,I wish! This documentation only talks about La...,"[4118, 19782, 27, 187, 10795, 253, 10097, 452,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4118, 19782, 27, 187, 10795, 253, 10097, 452,..."


In [95]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 140
    })
})
