# Professional Coder in Madarin

## Data Preparation and Initialization

### Import Packages

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 datasets

In [None]:
import os
import json
import numpy as np
import random
import torch
import transformers
from torch.utils.data import DataLoader
from transformers import AdamW, AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig, TrainingArguments, logging
from tqdm.auto import tqdm
from datasets import load_dataset, DatasetDict, concatenate_datasets, Dataset
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

### Initialization

In [None]:
def same_seeds(seed):
    """Fixes random number generator seeds for reproducibility."""
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

seed_num = 123475
same_seeds(seed_num)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"device: {device}")

In [3]:
base_folder = "Your_Dataset_Folder"

### Hugging Face Token

In [None]:
from huggingface_hub import login
login("Your_HuggingFace_Token")

### Data Loading

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
dataset_py_en = load_dataset("json", data_files=f"{base_folder}/python_alpaca/188k-Vezora-PyCode-Alpaca-en.json", split='train')
dataset_py_ch = load_dataset("json", data_files=f"{base_folder}/python_alpaca/188k-Vezora-PyCode-Alpaca-ch.json", split='train')

dataset_py_en = dataset_py_en.remove_columns(["input"])
dataset_py_ch = dataset_py_ch.remove_columns(["input"])

# train, validation, test datasets split
dataset_py_ch = dataset_py_ch.train_test_split(test_size=0.05, shuffle=True, seed=seed_num)
# dataset_py_ch = dataset_py_ch.train_test_split(test_size=0.1, shuffle=True, seed=seed_num)
# dataset_py_ch_test_valid = dataset_py_ch['test'].train_test_split(test_size=0.5, shuffle=False)
# dataset_py_ch = DatasetDict({
#     'train': dataset_py_ch['train'],
#     'test': dataset_py_ch_test_valid['test'],
#     'valid': dataset_py_ch_test_valid['train']})

print(dataset_py_ch)


DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 475
    })
    test: Dataset({
        features: ['instruction', 'output'],
        num_rows: 25
    })
})


### Data Preprocessing

#### Tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("taide/TAIDE-LX-7B-Chat", use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


#### Adding prompt template
Spetial tokens:  
`<s>`: 1  
`[INST]`: 29961, 25580, 29962  
`[\INST]`: 518, 29914, 25580, 29962

In [None]:
def add_template_to_instruction(inst: str, output: str, template: dict, tokenizer: AutoTokenizer):
    result = ''
    if 'system' in template:
        result += template['system']

    if 'user' in template:
        result += template['user'].format(BOS=tokenizer.bos_token,
                                          EOS=tokenizer.eos_token,
                                          prompt=inst.strip(),
                                          output=output.strip())
    return {'prompt': result}

In [None]:
inst_template = {
    "user": "<s>[INST] {prompt} [/INST] {output} </s>"
}

dataset_py_en = dataset_py_en.map(lambda x: add_template_to_instruction(
            x['instruction'], "", inst_template, tokenizer
        ), desc='Adding template to prompt')
dataset_py_ch_train = dataset_py_ch['train'].map(lambda x: add_template_to_instruction(
            x['instruction'], x['output'], inst_template, tokenizer
        ), desc='Adding template to prompt')

print(dataset_py_ch_train[0]['prompt'])

<s>[INST] 給定一個整數列表，找到列表中的最小元素和最大元素。將它們相乘並返回結果。此外，計算列表中所有整數的和並返回結果。

list_nums = [1, 4, 7, 2, 9, 5, 3, 8] [/INST] 要解決這個問題，您可以按照以下步驟進行：

1. 初始化兩個變量`smallest`和`largest`，設為列表的第一個元素。
2. 從第二個元素開始遍歷列表。
3. 對於每個元素，如果該元素小於當前的`smallest`，則更新`smallest`；如果該元素大於當前的`largest`，則更新`largest`。
4. 遍歷所有元素後，將`smallest`和`largest`相乘，將結果存儲在變量`product`中。
5. 使用`sum()`函數計算列表中所有元素的和，將結果存儲在變量`sum_nums`中。
6. 返回`product`和`sum_nums`作為元組。

這是實現這個算法的代碼：

```python
def find_product_and_sum(list_nums):
    smallest = list_nums[0]
    largest = list_nums[0]

    for num in list_nums[1:]:
        if num < smallest:
            smallest = num
        if num > largest:
            largest = num

    product = smallest * largest
    sum_nums = sum(list_nums)

    return product, sum_nums

list_nums = [1, 4, 7, 2, 9, 5, 3, 8]
result = find_product_and_sum(list_nums)
print(result)
```

輸出：
```
(9, 39)
```

列表中的最小元素是1，最大元素是9。它們的乘積是9。列表中所有元素的和是39。 </s>


In [None]:
inst_test_template = {
    "user": "<s>[INST] {prompt} [/INST]"
}
dataset_py_ch_test = dataset_py_ch['test'].map(lambda x: add_template_to_instruction(
            x['instruction'], "", inst_test_template, tokenizer
        ), desc='Adding template to prompt')

dataset_py_en = dataset_py_en.map(lambda x: add_template_to_instruction(
            x['instruction'], "", inst_test_template, tokenizer
        ), desc='Adding template to prompt')

print(dataset_py_ch_test[0]['prompt'])

Adding template to prompt:   0%|          | 0/22608 [00:00<?, ? examples/s]

<s>[INST] 編寫一個Python函數，該函數接受兩個等長列表，並創建一個字典，其中包含每個列表中具有相同索引的元素。該函數還應處理輸入列表長度不相等的情況並提供適當的錯誤處理。此外，該函數應處理包含嵌套列表或字典的輸入列表，並在每個索引處創建嵌套元素的字典。該函數還應處理包含元組的輸入列表，並在每個索引處創建元組元素的字典。如果輸入列表中的任何元素是集合，則該函數應引發ValueError。

list_one = [1, 2, [3, 4], {"a": "b"}, (7, 8), {9, 10}]
list_two = ["a", "b", [5, 6], {"c": "d"}, (11, 12), {13, 14}]

def create_index_dictionary(list_one, list_two):
    if len(list_one) != len(list_two):
        raise ValueError("輸入列表必須等長")
    
    index_dict = {}
    for i in range(len(list_one)):
        if isinstance(list_one[i], (list, dict, tuple)) and isinstance(list_two[i], (list, dict, tuple)):
            if isinstance(list_one[i], set) or isinstance(list_two[i], set):
                raise ValueError("輸入列表不能包含集合")
            
            nested_dict = {}
            if isinstance(list_one[i], dict) and isinstance(list_two[i], dict):
                for key in list_one[i].keys():
                    nested_dict[key] = (list_one[i][key], list_two[i][key])
            elif is

In [None]:
dataset_py_ch_train = dataset_py_ch_train.remove_columns(["instruction", "output"])
dataset_py_ch_train = dataset_py_ch_train.rename_column("prompt", "text")
dataset_train = dataset_py_ch_train

print(dataset_train[0])
print(dataset_train)

{'text': '<s>[INST] 給定一個整數列表，找到列表中的最小元素和最大元素。將它們相乘並返回結果。此外，計算列表中所有整數的和並返回結果。\n\nlist_nums = [1, 4, 7, 2, 9, 5, 3, 8] [/INST] 要解決這個問題，您可以按照以下步驟進行：\n\n1. 初始化兩個變量`smallest`和`largest`，設為列表的第一個元素。\n2. 從第二個元素開始遍歷列表。\n3. 對於每個元素，如果該元素小於當前的`smallest`，則更新`smallest`；如果該元素大於當前的`largest`，則更新`largest`。\n4. 遍歷所有元素後，將`smallest`和`largest`相乘，將結果存儲在變量`product`中。\n5. 使用`sum()`函數計算列表中所有元素的和，將結果存儲在變量`sum_nums`中。\n6. 返回`product`和`sum_nums`作為元組。\n\n這是實現這個算法的代碼：\n\n```python\ndef find_product_and_sum(list_nums):\n    smallest = list_nums[0]\n    largest = list_nums[0]\n\n    for num in list_nums[1:]:\n        if num < smallest:\n            smallest = num\n        if num > largest:\n            largest = num\n\n    product = smallest * largest\n    sum_nums = sum(list_nums)\n\n    return product, sum_nums\n\nlist_nums = [1, 4, 7, 2, 9, 5, 3, 8]\nresult = find_product_and_sum(list_nums)\nprint(result)\n```\n\n輸出：\n```\n(9, 39)\n```\n\n列表中的最小元素是1，最大元素是9。它們的乘積是9。列表中所有元素的和是39。 </s>'}
Dataset({
    features: ['text

In [None]:
dataset_test = dataset_py_ch_test.remove_columns(["instruction", "output"])
dataset_test = dataset_test.rename_column("prompt", "text")
print(dataset_test)

Dataset({
    features: ['text'],
    num_rows: 25
})


### Chain-of-Thought

In [None]:
CoT = True

if CoT:
    dataset_train = load_dataset("json", data_files=f"{base_folder}/python_alpaca/188k-Vezora-PyCode-Alpaca-ch-CoT.json", split='train')
    dataset_test = dataset_py_ch.train_test_split(test_size=0.05, shuffle=True, seed=seed_num)

## Model Training

### Model Initialization

In [None]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [None]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

### Construct Model

In [None]:
base_model = "taide/TAIDE-LX-7B-Chat"
finetuned_model = f"{base_folder}/madarin-coder"

model = AutoModelForCausalLM.from_pretrained("taide/TAIDE-LX-7B-Chat",
                                             torch_dtype=torch.float16,
                                             device_map='auto',
                                             quantization_config=quant_config,
                                             )
model.config.use_cache = False
model.config.pretraining_tp = 1
print(type(model))
# model = torch.compile(model)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at taide/TAIDE-LX-7B-Chat and are newly initialized: ['model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.

### Training Configuration

In [None]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    gradient_checkpointing = True,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=1e-3,
    fp16=False,
    bf16=True, # set bf16 to True with an A100
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)



### Model Training

In [None]:
trainer.train()



Step,Training Loss
25,1.1258




TrainOutput(global_step=29, training_loss=1.106479842087318, metrics={'train_runtime': 155.0742, 'train_samples_per_second': 3.063, 'train_steps_per_second': 0.187, 'total_flos': 6268092290924544.0, 'train_loss': 1.106479842087318, 'epoch': 0.97})

In [None]:
trainer.model.save_pretrained(finetuned_model)

### Madarin Coding Generation

In [None]:
generator = pipeline("text-generation",
                      model=trainer.model,
                      tokenizer=tokenizer,
                      pad_token_id=tokenizer.pad_token_id,
                      eos_token_id=tokenizer.eos_token_id,
                      max_new_tokens=2048,
                      device=0,
                      )

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'O

In [None]:
result = generator(dataset_py_ch_test['prompt'],
              return_full_text=False,
              max_new_tokens=2048,
              )



In [None]:
out = {}
for i, (data, output) in enumerate(zip(dataset_py_ch_test['prompt'], result)):
    out[i] = {'prompt': data, 'output': output[0]['generated_text']}

In [None]:
with open(f'{base_folder}/result_prompt.json', 'w', encoding='utf-8') as f:
    json.dump(out, f)

In [5]:
with open(f'{base_folder}/result_prompt.json', 'r', encoding='utf-8') as f:
    json_result = json.load(f)

In [7]:
json_result['']['output']

' 為了解決這個問題，我們首先將兩個字符串連接起來，中間插入一個空格。然後，我們使用Python的`set()`函數來移除任何重複字符。最後，我們按降序排序字符並返回修改後的字符串。\n\n以下是實現這個功能的Python代碼：\n\n```python\ndef connect_and_remove_duplicates(a, b):\n    result = a + " " + b\n    result = result.replace(result[0], "")\n    result = result.replace(result[-1], "")\n    return result.lower()\n```\n\n使用該代碼，輸入字符串A和B，輸出字符串將按降序排序並移除任何重複字符。\n\n示例：\nA = "Hello"\nB = "World!"\n\n連接並移除重複字符後的結果字符串是"!WroldH"。按降序排序字符後，最終輸出將是"!WroldH"。\n\n輸出："!WroldH"\n```python\n\n注意：在Python中，我們使用`set()`函數來移除任何重複字符，而不是`remove()`函數。此外，我們使用`lower()`函數來按降序排序字符。 '

In [None]:
json_result

{'0': {'prompt': '<s>[INST] 編寫一個Python函數，該函數接受兩個等長列表，並創建一個字典，其中包含每個列表中具有相同索引的元素。該函數還應處理輸入列表長度不相等的情況並提供適當的錯誤處理。此外，該函數應處理包含嵌套列表或字典的輸入列表，並在每個索引處創建嵌套元素的字典。該函數還應處理包含元組的輸入列表，並在每個索引處創建元組元素的字典。如果輸入列表中的任何元素是集合，則該函數應引發ValueError。\n\nlist_one = [1, 2, [3, 4], {"a": "b"}, (7, 8), {9, 10}]\nlist_two = ["a", "b", [5, 6], {"c": "d"}, (11, 12), {13, 14}]\n\ndef create_index_dictionary(list_one, list_two):\n    if len(list_one) != len(list_two):\n        raise ValueError("輸入列表必須等長")\n    \n    index_dict = {}\n    for i in range(len(list_one)):\n        if isinstance(list_one[i], (list, dict, tuple)) and isinstance(list_two[i], (list, dict, tuple)):\n            if isinstance(list_one[i], set) or isinstance(list_two[i], set):\n                raise ValueError("輸入列表不能包含集合")\n            \n            nested_dict = {}\n            if isinstance(list_one[i], dict) and isinstance(list_two[i], dict):\n                for key in list_one[i].keys():\n                    nested_dict[key] = (list_one[i][key], l