<a href="https://colab.research.google.com/github/BUDparty/AImodel/blob/main/sy/mydata_Continued_pretraining_Korean_%2B_Unsloth_Llama_8b_ins.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [3]:
from unsloth import FastLanguageModel
import torch
from transformers import BitsAndBytesConfig, AutoTokenizer

max_seq_length = 2048  # 최대 시퀀스 길이 설정
dtype = None  # 데이터 타입 자동 감지. Tesla T4, V100에는 Float16, Ampere+에는 Bfloat16 사용
load_in_4bit = True  # 4비트 양자화를 사용하여 메모리 사용량 감소
# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",          # Phi-3 2x faster!d
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "seungyeol54/8B_ori_inst_train_full_fix2",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# 새로운 LoRA 어댑터 적용
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla V100-SXM2-16GB. Max memory: 15.773 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Already have LoRA adapters! We shall skip this step.


In [4]:

# Wikipedia provides a title and an article text.
# Use https://translate.google.com!
_wikipedia_prompt = """Wikipedia Article
### Title: {}

### Article:
{}"""
# becomes:
wikipedia_prompt = """위키피디아 기사
### 제목: {}

### 기사:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    titles = examples["title"]
    texts  = examples["text"]
    outputs = []
    for title, text in zip(titles, texts):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = wikipedia_prompt.format(title, text) + EOS_TOKEN
        outputs.append(text)
    return { "text" : outputs, }
pass


In [5]:

from datasets import load_dataset

dataset = load_dataset("wikimedia/wikipedia", "20231101.ko", split = "train",)

# We select 1% of the data to make training faster!
dataset = dataset.train_test_split(train_size = 0.01)["train"]

dataset = dataset.map(formatting_prompts_func, batched = True,)

Downloading readme:   0%|          | 0.00/131k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/400M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/205M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/177M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/647897 [00:00<?, ? examples/s]

Map:   0%|          | 0/6478 [00:00<?, ? examples/s]

In [6]:

from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        # Use warmup_ratio and num_train_epochs for longer runs!
        max_steps = 120,
        warmup_steps = 10,
        # warmup_ratio = 0.1,
        # num_train_epochs = 1,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)


Map (num_proc=2):   0%|          | 0/6478 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [7]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 6,478 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 120
 "-____-"     Number of trainable parameters = 41,943,040
[34m[1mwandb[0m: Currently logged in as: [33mlion4464[0m ([33mlsyle[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,2.1645
2,2.0288
3,2.1389
4,2.1267
5,2.2754
6,2.1367
7,1.8652
8,2.0436
9,2.2001
10,2.1613


In [8]:
from datasets import load_dataset
alpaca_dataset = load_dataset("FreedomIntelligence/alpaca-gpt4-korean", split = "train")

print(alpaca_dataset[0])

Downloading readme:   0%|          | 0.00/124 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/51.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49969 [00:00<?, ? examples/s]

{'conversations': [{'from': 'human', 'value': '재활용 캠페인 슬로건을 제시하세요.\n'}, {'from': 'gpt', 'value': '1. "더욱 녹색 미래를 위해 함께 줄이고, 재사용하고, 재활용하세요."\n2. "더 나은 내일을 위해 오늘 바로 재활용하세요."\n3. "쓰레기를 보물로 만드는 법 - 재활용!"\n4. "인생의 순환을 위해 재활용하세요."\n5. "자원을 아끼고 더 많이 재활용하세요."'}], 'id': '23712'}


In [11]:
_alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""
# Becomes:
alpaca_prompt = """다음은 작업을 설명하는 명령입니다. 요청을 적절하게 완료하는 응답을 작성하세요.

### 지침:
{}

### 응답:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(conversations):
    texts = []
    conversations = conversations["conversations"]
    for convo in conversations:
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(convo[0]["value"], convo[1]["value"]) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

alpaca_dataset = alpaca_dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/49969 [00:00<?, ? examples/s]

In [12]:
!pip install wandb



In [13]:
import wandb
wandb.login(key="d658ecc8bcdf590e9c660d3956c716ba20d090a9")

wandb.init(project="budparty", name="lsyle")



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,█▆▆█▅█▃▃▂▃▂▂▄▂▂▂▂▃▁▅▄▂▅▅▄▃▂▂▃▁▄▁▃▃▂▃▂▃▂▁
train/learning_rate,▂▄▆███▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,██▅█▅▆▅▅▆▃▁▂▅▄▅▅▄▂▄▃▅▃▃▃▃▂▃▃▂▃▅▃▃▂▁▃▄▃▄▃

0,1
total_flos,6.194532411698381e+16
train/epoch,0.29639
train/global_step,120.0
train/grad_norm,0.40422
train/learning_rate,0.0
train/loss,1.6588
train_loss,1.78273
train_runtime,1229.683
train_samples_per_second,1.561
train_steps_per_second,0.098


In [14]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = alpaca_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        # Use num_train_epochs and warmup_ratio for longer runs!
        max_steps = 120,
        warmup_steps = 10,
        # warmup_ratio = 0.1,
        # num_train_epochs = 1,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.00,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to="wandb"  # wandb에 기록
    ),
)

Map (num_proc=8):   0%|          | 0/49969 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [15]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 49,969 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 120
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.4369
2,2.3987
3,2.0958
4,2.0398
5,1.9179
6,1.7083
7,1.7935
8,1.8377
9,1.6158
10,1.6881


In [16]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        # "Continue the fibonacci sequence: 1, 1, 2, 3, 5, 8,", # instruction
        "피보나치 수열을 계속하세요: 1, 1, 2, 3, 5, 8,", # instruction
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>다음은 작업을 설명하는 명령입니다. 요청을 적절하게 완료하는 응답을 작성하세요.\n\n### 지침:\n피보나치 수열을 계속하세요: 1, 1, 2, 3, 5, 8,\n\n### 응답:\n1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, ']

In [31]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        # "What is Korean music like?"
        "북한과 한국의 TV프로그램에 대해서 설명해봐", # instruction
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 2048)


<|begin_of_text|>다음은 작업을 설명하는 명령입니다. 요청을 적절하게 완료하는 응답을 작성하세요.

### 지침:
북한과 한국의 TV프로그램에 대해서 설명해봐

### 응답:
대한민국과 북한은 서로 다른 사회과 문화를 가지고 있기 때문에 TV 프로그램도 그 차이로 인해 매우 다릅니다. 북한 TV 프로그램은 주로 공산주의적이고 국가주의적인 내용을 담고 있으며, 한국 TV 프로그램은 더 다양한 장르와 주제를 가지고 있습니다.

북한 TV 프로그램은 주로 국가주의적이고 공산주의적인 내용을 담고 있으며, 종종 공산주의적이고 사회주의적인 가치를 강조합니다. 북한 TV 프로그램은 주로 국가의 위상과 지위, 그리고 공산주의적 가치에 초점을 맞추며, 종종 공산주의적이고 사회주의적인 가치를 강조합니다. 예를 들어, 북한 TV 프로그램은 종종 공산주의적이고 사회주의적인 가치를 강조하며, 국가의 위상과 지위에 초점을 맞추고 있습니다. 이러한 프로그램은 종종 공산주의적이고 사회주의적인 가치를 강조하며, 국가의 위상과 지위에 초점을 맞추고 있습니다.

한국 TV 프로그램은 더 다양한 장르와 주제를 가지고 있으며, 종종 엔터테인먼트, 교육, 뉴스, 시사, 드라마, 코미디 등 다양한 장르를 가지고 있습니다. 한국 TV 프로그램은 종종 더 다양한 주제를 가지고 있으며, 더 많은 시청자들에게 관심을 끌고 있습니다. 예를 들어, 한국 TV 프로그램은 종종 엔터테인먼트, 교육, 뉴스, 시사, 드라마, 코미디 등 다양한 장르를 가지고 있으며, 종종 더 다양한 주제를 가지고 있으며, 더 많은 시청자들에게 관심을 끌고 있습니다. 이러한 프로그램은 종종 더 다양한 주제를 가지고 있으며, 더 많은 시청자들에게 관심을 끌고 있습니다.<|eot_id|>


In [24]:
!pip install huggingface_hub



In [27]:
from huggingface_hub import HfApi, login
# Hugging Face 로그인
login(token="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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [28]:
from huggingface_hub import HfApi

# Hugging Face API 인스턴스 생성
api = HfApi()

# 새로운 레포지토리 생성
repo_name = "Llama_8B_ori_inst_train_full_fix2_gpt_wiki"  # 원하는 레포지토리 이름
username = "seungyeol54"  # Hugging Face 사용자 이름
full_repo_name = f"{username}/{repo_name}"

In [29]:

# 모델과 토크나이저를 Hugging Face Hub에 업로드
model.push_to_hub(full_repo_name)
tokenizer.push_to_hub(full_repo_name)

print("Model and tokenizer have been uploaded to both Google Drive and Hugging Face Hub.")

README.md:   0%|          | 0.00/610 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/seungyeol54/Llama_8B_ori_inst_train_full_fix2_gpt_wiki
Model and tokenizer have been uploaded to both Google Drive and Hugging Face Hub.


In [34]:
from datasets import load_dataset
import json
from datasets import Dataset, DatasetDict
# 사용자 지정 데이터셋 로드
new_data_path = '/content/formatted_dataset.json'
new_data = []


with open(new_data_path, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            data = json.loads(line.strip())
            new_data.append(data)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            continue

# JSON 데이터를 Dataset으로 변환
new_dataset = Dataset.from_list(new_data)

print(new_dataset[0])

{'instruction': '북한 은어를 설명하세요.', 'input': '11호차', 'output': '두 다리', 'text': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n북한 은어를 설명하세요.\n\n### Input:\n11호차\n\n### Response:\n두 다리'}


In [35]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = new_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        # Use num_train_epochs and warmup_ratio for longer runs!
        max_steps = 120,
        warmup_steps = 10,
        # warmup_ratio = 0.1,
        # num_train_epochs = 1,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.00,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to="wandb"  # wandb에 기록
    ),
)

Map (num_proc=8):   0%|          | 0/92798 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [36]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 92,798 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 120
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.2533
2,1.3454
3,1.2495
4,1.2078
5,1.0826
6,1.0193
7,1.1517
8,1.3035
9,1.059
10,1.1847


In [39]:

# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        # "What is Korean music like?"
        "얼음보숭이", # instruction
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 2048)

<|begin_of_text|>다음은 작업을 설명하는 명령입니다. 요청을 적절하게 완료하는 응답을 작성하세요.

### 지침:
얼음보숭이

### 응답:
"얼음보숭이"는 미국에서 1974년에 발매된 팝 록 밴드 The Doobie Brothers의 노래입니다. 이 노래는 1974년 6월 8일 발매되었으며, 1974년 10월 19일 싱글로 발매되었습니다. 이 노래는 1974년 12월 7일 빌보드 핫 100 차트에서 1위를 차지했습니다.<|eot_id|>
