In [1]:
import random
import os
import torch

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(1006)

# Load the Alpaca-GPT4 Dataset

In [2]:
import json

path = "./data/alpaca_gpt4_data.json"
with open(path, "r") as f:
    alpaca = json.load(f)

In [3]:
alpaca[0]

{'instruction': 'Give three tips for staying healthy.',
 'input': '',
 'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.'}

In [4]:
def prompt_no_input(row):
    return ("Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{instruction}\n\n### Response:\n").format_map(row)

def prompt_with_input(row):
    return ("Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n").format_map(row)

In [5]:
def create_prompt(row):
    return prompt_no_input(row) if row["input"] == "" else prompt_with_input(row)

In [6]:
prompts = [create_prompt(row) for row in alpaca]

In [7]:
EOS_TOKEN = "</s>"
outputs = [row['output'] + EOS_TOKEN for row in alpaca]

In [8]:
dataset = [{"prompt":s, "output":t, "example": s+t} for s, t in zip(prompts, outputs)]

# Tokenizer

In [9]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
model_name = "EleutherAI/pythia-70m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
tokenizer.eos_token

'<|endoftext|>'

# Create Train Test Split

In [12]:
random.shuffle(dataset)

train_size = int(0.9 * len(dataset))
train_dataset = dataset[:train_size]
eval_dataset = dataset[train_size:]

In [13]:
def pack(dataset, max_seq_len=512):
    tkds_ids = tokenizer([s["example"] for s in dataset])["input_ids"]
    
    all_token_ids = []
    for tokenized_input in tkds_ids:
        all_token_ids.extend(tokenized_input + [tokenizer.eos_token_id])
    
    packed_ds = []
    for i in range(0, len(all_token_ids), max_seq_len+1):
        input_ids = all_token_ids[i : i + max_seq_len+1]
        if len(input_ids) == (max_seq_len+1):
            packed_ds.append({"input_ids": input_ids, "labels": input_ids})
    return packed_ds


train_ds_packed = pack(train_dataset)
eval_ds_packed = pack(eval_dataset)

# Data Loader

In [14]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

In [15]:
batch_size = 2

In [16]:
train_dataloader = DataLoader(
    train_ds_packed,
    batch_size=batch_size,
    collate_fn=default_data_collator,
)

eval_dataloader = DataLoader(
    eval_ds_packed,
    batch_size=batch_size,
    collate_fn=default_data_collator,
    shuffle=False,
)

# Training

In [17]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

In [34]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type='CAUSAL_LM',
)

In [35]:
model = AutoModelForCausalLM.from_pretrained(model_name, device_map=0, trust_remote_code=True)

In [36]:
model = get_peft_model(model, lora_config)

In [37]:
def evaluate_model(model, test_dl, device):
    model.eval()
    total_loss = 0

    for batch in test_dl:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
    
    avg_eval_loss = total_loss / len(test_dl)
    print(f"Average Evaluation Loss: {avg_eval_loss}")

In [38]:
from tqdm import tqdm

In [39]:
def train_model(model, train_dl, test_dl, epochs, lr, device):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    model.train()
    print("Start training...")

    for epoch in tqdm(range(epochs)):
        total_loss = 0
        for batch in tqdm(train_dl):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_dl)
        print(f"Epoch {epoch} - Average Training Loss: {avg_train_loss}")

        evaluate_model(model, test_dl, device)
        torch.cuda.empty_cache()

    print("Training finished...")

In [40]:
epochs = 2
lr = 5e-5
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [41]:
train_model(model, train_dataloader, eval_dataloader, epochs, lr, device)

Start training...


100%|██████████| 9001/9001 [07:39<00:00, 19.59it/s]


Epoch 0 - Average Training Loss: 2.288934174888254


 50%|█████     | 1/2 [08:02<08:02, 482.24s/it]

Average Evaluation Loss: 2.2250637948393583


100%|██████████| 9001/9001 [07:39<00:00, 19.61it/s]


Epoch 1 - Average Training Loss: 2.2154919250503324


100%|██████████| 2/2 [16:04<00:00, 482.02s/it]

Average Evaluation Loss: 2.192011922538161
Training finished...





In [42]:
# Save peft model

# model.save_pretrained("weight/pythia_70m_lora_r=8")
model.save_pretrained("weight/pythia_70m_lora_r=64")


# Initial Test

In [26]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, GPTNeoXForCausalLM

In [27]:
# model_name = "NousResearch/Llama-2-7b-hf"
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map=0)

model = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-70m",
  # "EleutherAI/pythia-70m-deduped",
  revision="step143000",
  cache_dir="./pythia-70m/step143000"
)

In [28]:
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [29]:
model = model.to("cuda")

In [4]:
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-70m-deduped",
  revision="step143000",
  cache_dir="./pythia-70m-deduped/step143000",
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWrite three advantages of fruits\n\n### Response:\n"

In [6]:
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [7]:
generated_text = generator(prompt, max_length=128)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [8]:
generated_text

[{'generated_text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWrite three advantages of fruits\n\n### Response:\n\n1.  Write three advantages of fruits\n\n### Response:\n\n1.  Write three advantages of fruits\n\n### Response:\n\n1.  Write three advantages of fruits\n\n### Response:\n\n1.  Write three advantages of fruits\n\n### Response:\n\n1.  Write three advantages of fruits\n\n### Response:\n\n1.  Write three advantages of fruits\n\n### Response:\n\n1.'}]

In [1]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, GPTNeoXForCausalLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_model(path):
    print(f"Loading model...")

    base_model = GPTNeoXForCausalLM.from_pretrained(
        model_name,
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map=0,
    )
    model = PeftModel.from_pretrained(base_model, path)
    model = model.merge_and_unload()
    return model

In [3]:
path = "./output/70m/no_trainer"
model_name = "EleutherAI/pythia-70m-deduped"
model = load_model(path)

Loading model...


In [4]:
# tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(
  "EleutherAI/pythia-70m-deduped",
  revision="step143000",
  cache_dir="./pythia-70m-deduped/step143000",
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWrite three advantages of fruits\n\n### Response:\n"

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [6]:
generated_text = generator(prompt, max_length=128)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [7]:
generated_text

[{'generated_text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWrite three advantages of fruits\n\n### Response:\n1. Fruit: The fruits of a fruit are edible, and they are edible.\n2. Vegetable: The fruits of a fruit are edible, and they are edible.\n3. Vegetable: The fruits of a fruit are edible, and they are edible.\n4. Vegetable: The fruits of a fruit are edible, and they are edible.\n5. Vegetable: The fruits of a fruit are edible, and they are edible.'}]