In [1]:
# difine tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

In [2]:
dataset_max_length = 512
dataset = "../dataset/alpaca_data.json"
apply_dataset_rate = 0.00002

In [3]:
import json
import pandas

def load_alpaca_dataset(alpaca_dataset, apply_dataset_rate):
    with open(alpaca_dataset, 'r', encoding='utf-8') as f:
        data = json.load(f)
        data_count = len(data)
        iloc_data = data[0:int(data_count * apply_dataset_rate)]
        dataframe = pandas.DataFrame(iloc_data)
    dataframe = dataframe[['instruction', 'input', 'output']]
    return dataframe

In [4]:
alpaca_dataset = load_alpaca_dataset(dataset, apply_dataset_rate)
alpaca_dataset.head()

Unnamed: 0,instruction,input,output
0,Give three tips for staying healthy.,,1.Eat a balanced diet and make sure to include...


In [5]:
def clean_alpaca_dataset(dataframe):
    cleaned_df = dataframe.copy()
    for column in ['instruction', 'input', 'output']:
        if column in cleaned_df.columns:
            cleaned_df[column] = cleaned_df[column].astype(str).str.replace('\n', ' ') # replace \n to space
            cleaned_df[column] = cleaned_df[column].str.replace('\r', ' ') # replace \r to space
            cleaned_df[column] = cleaned_df[column].str.replace(r'\s+', ' ', regex=True) # replace continuous spaces to single space
            cleaned_df[column] = cleaned_df[column].str.strip() # remove leading and trailing spaces
    return cleaned_df

In [6]:
cleaned_dataframe = clean_alpaca_dataset(dataframe=alpaca_dataset)
cleaned_dataframe.head()

Unnamed: 0,instruction,input,output
0,Give three tips for staying healthy.,,1.Eat a balanced diet and make sure to include...


In [7]:
def format_alpaca_dataset(dataframe):
    formatted_results = []
    for i in range(len(dataframe)):
        example_dict = dataframe.iloc[i].to_dict()
        instruction = example_dict["instruction"]
        user_input = example_dict["input"]
        output = example_dict["output"]

        if user_input:
            formatted_text = (
                f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
                f"{instruction}<|eot_id|>"
                f"<|start_header_id|>user<|end_header_id|>"
                f"{user_input}<|eot_id|>"
                f"<|start_header_id|>assistant<|end_header_id|>"
                f"{output}<|eot_id|><|end_of_text|>"
            )
        else:
            formatted_text = (
                f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
                f"{instruction}<|eot_id|>"
                f"<|start_header_id|>assistant<|end_header_id|>"
                f"{output}<|eot_id|><|end_of_text|>"
            )
        formatted_results.append(formatted_text)
    return formatted_results

In [8]:
formatted_text = format_alpaca_dataset(dataframe=cleaned_dataframe)
formatted_text[0]

'<|begin_of_text|><|start_header_id|>user<|end_header_id|>Give three tips for staying healthy.<|eot_id|><|start_header_id|>assistant<|end_header_id|>1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. 2. Exercise regularly to keep your body active and strong. 3. Get enough sleep and maintain a consistent sleep schedule.<|eot_id|><|end_of_text|>'

In [9]:
# # print formatted text list
# input_word_list = []

# input_word = formatted_text[0]
# input_word_list.append(input_word)
# print(input_word_list)

In [10]:
# encode formatted text
fukk_token_list = []

tokenizer.add_special_tokens({'pad_token': '<|end_of_text|>'})
tokens = tokenizer(formatted_text[0], truncation=True, padding="max_length", max_length=dataset_max_length, return_tensors="pt")
print(tokens)
fukk_token_list.append(tokens)

{'input_ids': tensor([[128000, 128000, 128006,    882, 128007,  36227,   2380,  10631,    369,
          19994,   9498,     13, 128009, 128006,  78191, 128007,     16,   5253,
            266,    264,  24770,  10173,    323,   1304,   2771,    311,   2997,
          11510,    315,  26390,    323,  24822,     13,    220,     17,     13,
          33918,  15870,    311,   2567,    701,   2547,   4642,    323,   3831,
             13,    220,     18,     13,   2175,   3403,   6212,    323,  10519,
            264,  13263,   6212,   9899,     13, 128009, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
         12800

In [11]:
input_ids = fukk_token_list[0]["input_ids"].squeeze()
print(input_ids)

tensor([128000, 128000, 128006,    882, 128007,  36227,   2380,  10631,    369,
         19994,   9498,     13, 128009, 128006,  78191, 128007,     16,   5253,
           266,    264,  24770,  10173,    323,   1304,   2771,    311,   2997,
         11510,    315,  26390,    323,  24822,     13,    220,     17,     13,
         33918,  15870,    311,   2567,    701,   2547,   4642,    323,   3831,
            13,    220,     18,     13,   2175,   3403,   6212,    323,  10519,
           264,  13263,   6212,   9899,     13, 128009, 128001, 128001, 128001,
        128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
        128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
        128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
        128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
        128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
        128001, 128001, 128001, 128001, 

In [12]:
attention_mask = fukk_token_list[0]["attention_mask"].squeeze()
print(attention_mask)

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [13]:
assistant_start = "<|start_header_id|>assistant<|end_header_id|>" # assistant部分までをプロンプトとして扱う

# if assistant_start in input_word_list[0]
if assistant_start in formatted_text[0]:
    prompt_end_pos = formatted_text[0].find(assistant_start) + len(assistant_start) # assistant開始位置までをプロンプトとして扱う
    prompt_text = formatted_text[0][:prompt_end_pos]
    print(prompt_end_pos)
    
    # tokenize prompt text
    prompt_tokens = tokenizer(prompt_text, truncation=True, padding=False, add_special_tokens=False)
    print(prompt_tokens)

    # mask labels
    labels = input_ids.clone() # ラベルを作成してプロンプト部分をマスク
    prompt_length = len(prompt_tokens["input_ids"])
    labels[:prompt_length] = -100 # パディングトークンをマスク
    print(labels)

    result = {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

148
{'input_ids': [128000, 128006, 882, 128007, 36227, 2380, 10631, 369, 19994, 9498, 13, 128009, 128006, 78191, 128007], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
tensor([  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,   -100,   -100, 128007,     16,   5253,
           266,    264,  24770,  10173,    323,   1304,   2771,    311,   2997,
         11510,    315,  26390,    323,  24822,     13,    220,     17,     13,
         33918,  15870,    311,   2567,    701,   2547,   4642,    323,   3831,
            13,    220,     18,     13,   2175,   3403,   6212,    323,  10519,
           264,  13263,   6212,   9899,     13, 128009, 128001, 128001, 128001,
        128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
        128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
        128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
        12800

In [14]:
# test decode labels
cleaning_attention_mask = labels[labels != -100]
decoded_text = tokenizer.decode(cleaning_attention_mask)
print(decoded_text)

<|end_header_id|>1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. 2. Exercise regularly to keep your body active and strong. 3. Get enough sleep and maintain a consistent sleep schedule.<|eot_id|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of