In [2]:
#!pip install scikit-learn
!pip install numpy==1.23.4

Defaulting to user installation because normal site-packages is not writeable
Collecting numpy==1.23.4
  Downloading numpy-1.23.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Downloading numpy-1.23.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
[0mSuccessfully installed numpy-1.23.4


In [2]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForCausalLM, BatchEncoding,Trainer, TrainingArguments
import torch
from math import ceil
from transformers import DataCollatorForLanguageModeling
import wandb
from sys import getsizeof
from torch.cuda import device_count
from numba import cuda

In [22]:
# Select each CUDA device in turn through numba and reset it.
gpu_count = device_count()
for gpu_index in range(gpu_count):
    selected_device = cuda.select_device(gpu_index)
    selected_device.reset()

In [4]:
def chunk_array(arr, chunk_size):
    """Split ``arr`` into consecutive slices holding at most ``chunk_size`` items.

    The last slice is shorter when ``len(arr)`` is not a multiple of
    ``chunk_size``. Works on anything that supports ``len()`` and slicing.
    """
    pieces = []
    # Walk the start offsets 0, chunk_size, 2 * chunk_size, ... and cut one slice at each.
    for start in range(0, len(arr), chunk_size):
        stop = start + chunk_size
        pieces.append(arr[start:stop])
    return pieces

def tokenize(element,context_length):
    """Tokenize a frame of documents and pack them into fixed-length blocks.

    Every entry of ``element['text']`` is tokenized with the module-level
    ``tokenizer`` (no truncation), an EOS id is appended after each document,
    and all ids are concatenated into a single stream. The stream is cut into
    blocks of ``context_length`` ids; a shorter trailing block is dropped.

    Args:
        element: Object whose ``'text'`` entry offers ``.to_list()`` and whose
            ``len()`` is the number of texts (a pandas DataFrame in this notebook).
        context_length: Number of token ids in each returned block.

    Returns:
        A list of ``BatchEncoding`` objects, each holding a 1-D ``input_ids``
        tensor and an all-ones ``attention_mask`` of length ``context_length``.
        The list is empty when the stream is shorter than ``context_length``.
    """
    outputs = tokenizer(
        element['text'].to_list(),
        truncation=False
    )
    # Look the EOS id up once. It used to be re-derived for every document by
    # encoding the EOS string and taking position 1, which presumes exactly one
    # leading special token in the encoded result.
    eos_id = tokenizer.eos_token_id
    input_batch = []
    for i in range(len(element)):
        input_batch.extend(outputs[i].ids)
        input_batch.append(eos_id)
    # Total number of token ids in the concatenated stream.
    print(len(input_batch))
    chunks = chunk_array(input_batch, context_length)
    # `chunks` is empty for an empty stream, so guard before looking at the last block.
    if chunks and len(chunks[-1]) != context_length:
        del chunks[-1]

    wrapped_chunks = [BatchEncoding({"input_ids":torch.tensor(chunk), "attention_mask":torch.tensor([1]*context_length)}) for chunk in chunks]

    return wrapped_chunks

In [3]:
model_name = 'meta-llama/Llama-2-13b-hf'#'artifacts/checkpoint-cme0pvg4:v2'
# Take the Hugging Face access token from the environment instead of embedding
# a literal in the notebook. An unset or empty variable becomes None, leaving
# credential lookup to the library.
hf_token = os.environ.get("HF_TOKEN") or None
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             token=hf_token,
                                             torch_dtype=torch.float32,
                                             device_map='auto')

# Length, in tokens, of every packed training block.
context_length = 4096
# Register a dedicated "<pad>" token, pad on the right, and grow the embedding
# matrix so the new token id has a row.
tokenizer.add_special_tokens({"pad_token":"<pad>"})
tokenizer.padding_side = "right"
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Read the pre-training corpus and hold out 5% of its rows (fixed seed).
data = pd.read_json("pre_train_data.json", compression=None)
train, test = train_test_split(
    data,
    test_size=0.05,
    random_state=42,
)

# Pack the held-out rows into context_length-token blocks and show the first one.
chunks = tokenize(test, context_length)
print(chunks[0])

248603
{'input_ids': tensor([    1,  4721,   862,  ...,   262,  6352, 20642]), 'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1])}


In [8]:
# Pack the training split into context_length-token blocks.
train_chunks = tokenize(train,context_length)
# Causal-LM collator (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
# Smoke-test the collator on up to eight held-out blocks. A slice is used because
# indexing chunks[0..7] explicitly raises IndexError when fewer than eight exist.
out = data_collator(chunks[:8])
for key in out:
    print(f"{key} shape: {out[key].shape}")

8799651
input_ids shape: torch.Size([8, 4096])
attention_mask shape: torch.Size([8, 4096])
labels shape: torch.Size([8, 4096])


In [None]:
def calculate_perplexity(inputs):
    """Return the perplexity of the module-level ``model`` on one tokenized block.

    Args:
        inputs: ``BatchEncoding`` with ``input_ids`` and ``attention_mask``
            tensors, either 1-D (one sequence, as built by ``tokenize``) or
            already batched as 2-D.

    Returns:
        ``exp`` of the loss the model reports for the block, as a Python float.
    """
    inputs = inputs.to(model.device)
    # `tokenize` builds 1-D tensors, and the previous `shape[1]` lookup fails on
    # those. Add a leading batch axis where it is missing.
    # NOTE(review): assumes the model forward expects (batch, seq) inputs - confirm.
    batch = {
        key: value.unsqueeze(0) if value.dim() == 1 else value
        for key, value in inputs.items()
    }
    with torch.no_grad():
        outputs = model(**batch, labels=batch["input_ids"])
    # The loss used to be multiplied and then divided by the sequence length;
    # the two cancel, so exponentiate it directly.
    return torch.exp(outputs.loss).item()
# Score every training block, echoing every 100th value as a progress marker.
perp = []
for block_index, block in enumerate(train_chunks):
    block_score = calculate_perplexity(block)
    perp.append(block_score)
    if block_index % 100 == 0:
        print(block_index, block_score)

# Persist the per-block scores, then print their mean and variance.
perplexity = pd.DataFrame(perp)
perplexity.to_json(
    '../../data/perplexity.json',
    force_ascii=False,
    compression=None,
)
print(np.mean(perp))
print(np.var(perp))

0 2.2420549392700195


In [19]:
# Show the ten largest values in column 0 of the perplexity frame.
highest_scores = perplexity[0].nlargest(10)
print(highest_scores)

1248    14.434236
868     13.033507
1246    12.901219
1989    12.872516
314     11.619267
1694    11.600303
1247    11.349825
1599    11.312692
1358    11.309628
1123    11.222836
Name: 0, dtype: float64


In [9]:
# Weights & Biases settings, set before the Trainer is built.
os.environ["WANDB_PROJECT"] = "LLM-pre-training"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints

args = TrainingArguments(
    output_dir="pre_training_April_9",
    # Batching: one block per device for both training and evaluation.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    #auto_find_batch_size=True,
    #gradient_accumulation_steps=10,
    # Schedule: a single epoch, cosine decay after warmup.
    num_train_epochs=1,
    learning_rate=3e-4,
    lr_scheduler_type="cosine",
    warmup_steps=1000,
    # Optimizer settings.
    weight_decay=0.1,
    adam_beta2=0.95,
    adam_epsilon=1e-5,
    # Evaluation, logging and checkpoint cadence (in steps).
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=1,
    save_steps=5000,
    # Precision and reporting.
    fp16=False,
    push_to_hub=False,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_chunks,
    eval_dataset=chunks,
)
# Displayed as the cell's result.
trainer.is_model_parallel

True

In [None]:
# Evaluate before training, run the training loop, then evaluate again.
trainer.evaluate()
trainer.train()
trainer.evaluate()
# Save the trained model to disk and close the W&B run.
trainer.save_model('../../data/pretraining/domain_adaptation')
wandb.finish()

Step,Training Loss,Validation Loss


In [5]:
from transformers import AutoModelForCausalLM, TrainingArguments,GenerationConfig, AutoTokenizer, LlamaConfig
from peft import LoraModel, LoraConfig, get_peft_model, PeftModel
import pandas as pd
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from sklearn.model_selection import train_test_split
import datetime
import torch
import evaluate
from datasets import Dataset
import wandb
import pandas as pd

In [4]:
# Prompt text placed before the question (system prompt and opening tag).
prompt1 = '''[INST] <<SYS>>
Du är en hjälpsam medicinsk assistent som hjälper läkare och sjuksköterskor genom att svara på frågor.
Svara på svenska.
<</SYS>>
Nedan ges en fråga eller ett medicinskt begrepp.
<fråga>'''

# Prompt text placed after the question (closing tag and instruction).
prompt2 = '''
</fråga>
Svara på frågan eller förklara begreppet.
[/INST]'''

data = pd.read_csv("test.csv")

# One training string per row: prompt, question, instruction, then the answer.
formatted = [
    {'text':f"<s>{prompt1}\n{row['Question']}{prompt2}\n{row['Answer']} </s>"}
    for _, row in data.iterrows()
]

# Hold out 5% of the examples (fixed seed) and wrap both splits as Datasets.
train, test = train_test_split(formatted, test_size=0.05, random_state=42)
train = Dataset.from_list(train)
test = Dataset.from_list(test)

In [5]:
# Set the W&B environment variables BEFORE creating the run: wandb.init() reads
# WANDB_PROJECT when it is called, so assigning it afterwards had no effect on
# this run.
os.environ["WANDB_PROJECT"] = "LLM-QA"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints
run = wandb.init(name="qa_no_domain")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /workspace/.netrc


In [6]:
args = TrainingArguments(
    output_dir="qa_April_23",
    # Batching.
    auto_find_batch_size=True,
    group_by_length=True,
    # Schedule: a single epoch, cosine decay after warmup.
    num_train_epochs=1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=2000,
    # Optimizer settings.
    weight_decay=0.1,
    adam_beta2=0.95,
    adam_epsilon=1e-5,
    # Evaluation, logging and checkpoint cadence (in steps).
    evaluation_strategy="steps",
    eval_steps=700,
    logging_steps=1,
    save_steps=80000,
    # Precision and reporting.
    bf16=False,
    fp16=False,
    push_to_hub=False,
    report_to="wandb",
    #neftune_noise_alpha=5
)

# Token ids of the marker that precedes the answer; the first two ids of the
# encoded template are discarded before handing them to the collator.
response_template = "\n[/INST]"
encoded_template = tokenizer.encode(response_template, add_special_tokens=False)
response_template_ids = encoded_template[2:]
collator = DataCollatorForCompletionOnlyLM(
    response_template_ids,
    tokenizer=tokenizer,
    mlm=False,
)

trainer = SFTTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=collator,
    train_dataset=train,
    eval_dataset=test,
    dataset_text_field='text',
    max_seq_length=4096,
    packing=False,
)

Map:   0%|          | 0/16679 [00:00<?, ? examples/s]

Map:   0%|          | 0/878 [00:00<?, ? examples/s]

In [None]:
# Evaluate before training, run the training loop, then evaluate again.
trainer.evaluate()
trainer.train()
trainer.evaluate()
# Save the trained model to disk.
trainer.save_model('../../data/pretraining/qa_nodomain')
# Alternative (disabled): save the model and tokenizer to separate directories.
#trainer.model.save_pretrained('../../../data/finetuned/lora_no_domain_adaptation/model')
#trainer.tokenizer.save_pretrained('../../../data/finetuned/lora_no_domain_adaptation/tokenizer')
# Close the W&B run.
wandb.finish()

Step,Training Loss,Validation Loss
700,1.7609,1.790056
1400,1.5312,2.108809


In [6]:
# LoRA adapter settings for causal language modelling.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    use_rslora=True,
    lora_dropout=0.1,
    bias='none',
)
# Rebind `model` to the PEFT-wrapped model, with the adapter named "default".
model = get_peft_model(model, config, "default")

In [8]:
# Report how many of the wrapped model's parameters are trainable.
model.print_trainable_parameters()

trainable params: 52,428,800 || all params: 13,068,303,360 || trainable%: 0.4011905643426998


In [None]:
# Prompt text placed before the patient notes (system prompt and opening tag).
prompt1 = '''[INST] <<SYS>>
Du är en hjälpsam medicinsk assistent som hjälper läkare och sjuksköterskor genom att sammanfatta information om patienter.
Svara på svenska.
<</SYS>>
Nedan ges anamnes för en patient under en dag
<anamnes>'''

# Prompt text placed after the notes (closing tag, instructions and template).
prompt2 = '''
</anamnes>
Du ska plocka ut information som passar i mallen nedan. Undvik onödig information och plocka endast ut sådant som rör varje rubrik. Om relevant information saknas så lämnar du rubriken tom.
Formattera ditt svar enligt mallen. Ingen information kan finnas under flera rubriker.
<mall>
*Sjukdomshistoria (Patientens diagnoser, sjukdomshistorik och riskfaktorer (t.ex. sjukdomar i familjen))*

*Sökorsaker (Patientens symtom och/eller datum för ingrepp)*

*Åtgärder (Planerade undersökningar, behandlingar och åtgärder)*
</mall>
[/INST]'''

data = pd.read_parquet("synthetic_229_corrected.parquet")

# One training string per row: prompt, description, instructions, then the summary.
formatted = [
    {'text':f"<s> {prompt1}\n{row['description']}{prompt2}\n{row['summary']} </s>"}
    for _, row in data.iterrows()
]

# Hold out 5% of the examples (fixed seed) and wrap both splits as Datasets.
train, test = train_test_split(formatted, test_size=0.05, random_state=42)
train = Dataset.from_list(train)
test = Dataset.from_list(test)

In [None]:
# Set the W&B environment variables BEFORE creating the run: wandb.init() reads
# WANDB_PROJECT when it is called, so assigning it afterwards had no effect on
# this run.
os.environ["WANDB_PROJECT"] = "LLM-LoRA"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints
run = wandb.init(name='lora_nodomain')

In [None]:
args = TrainingArguments(
    output_dir="lora_April_23",
    # Batching.
    auto_find_batch_size=True,
    group_by_length=True,
    # Schedule: a single epoch, cosine decay after warmup.
    num_train_epochs=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_steps=30,
    # Optimizer settings and NEFTune noise.
    weight_decay=0.1,
    adam_beta2=0.95,
    adam_epsilon=1e-5,
    neftune_noise_alpha=5,
    # Evaluation, logging and checkpoint cadence (in steps).
    evaluation_strategy="steps",
    eval_steps=40,
    logging_steps=1,
    save_steps=8000,
    # Precision and reporting.
    bf16=False,
    fp16=False,
    push_to_hub=False,
    report_to="wandb",
    #remove_unused_columns=False
)

# Token ids of the marker that precedes the summary; the first two ids of the
# encoded template are discarded before handing them to the collator.
response_template = "\n[/INST]"
encoded_template = tokenizer.encode(response_template, add_special_tokens=False)
response_template_ids = encoded_template[2:]
collator = DataCollatorForCompletionOnlyLM(
    response_template_ids,
    tokenizer=tokenizer,
    mlm=False,
)

trainer = SFTTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=collator,
    train_dataset=train,
    eval_dataset=test,
    dataset_text_field='text',
    max_seq_length=4096,
    packing=False,
)

In [None]:
# Evaluate before training, run the training loop, then evaluate again.
trainer.evaluate()
trainer.train()
trainer.evaluate()
# Save the trained model to disk.
trainer.save_model('../../data/finetuned/lora_mega_no_domain')
# Alternative (disabled): save the model and tokenizer to separate directories.
#trainer.model.save_pretrained('../../../data/finetuned/lora_no_domain_adaptation/model')
#trainer.tokenizer.save_pretrained('../../../data/finetuned/lora_no_domain_adaptation/tokenizer')
# Close the W&B run.
wandb.finish()

In [26]:
# Print half the number of examples in the current `train` split.
print(len(train)/2)

217.0
