In [None]:
!pip install -q bitsandbytes datasets accelerate loralib sentencepiece scikit-learn
!pip install tensorboardX
!pip install -q git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/peft.git

In [None]:
import os
os.kill(os.getpid(), 9)

In [1]:
import transformers
from transformers import LlamaTokenizer, LlamaForCausalLM, AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training

import numpy as np
import os
import torch
import torch.nn as nn




Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin c:\Users\panta\anaconda3\envs\nlp\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll
CUDA SETUP: CUDA runtime path found: C:\Users\panta\anaconda3\envs\nlp\bin\cudart64_110.dll
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary c:\Users\panta\anaconda3\envs\nlp\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll...


  warn(msg)
  warn(msg)


In [2]:
BASE_MODEL = "facebook/opt-350m"

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

tokenizer.pad_token_id = 0

In [None]:
# BASE_MODEL = "decapoda-research/llama-7b-hf"

# model = LlamaForCausalLM.from_pretrained(
#     BASE_MODEL,
#     load_in_8bit=True,
#     torch_dtype=torch.float16,
#     device_map="auto",
# )

# tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)

# tokenizer.pad_token_id = 0

In [3]:
model = prepare_model_for_int8_training(model)
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 1572864 || all params: 332769280 || trainable%: 0.472659014678278


In [4]:
def tokenize_sample(item, max_seq_length=1024, add_eos_token=True):
        result = tokenizer(
            item,
            truncation=True,
            max_length=max_seq_length,
            padding=True,
        )
        result = {
            "input_ids": result["input_ids"][:-1],
            "attention_mask": result["attention_mask"][:-1],
        }
        if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < max_seq_length
            and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1)
        
        return result

In [5]:
def generate_prompt(data_point):
    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Describe the caption with appropriate tags
### Input:
{data_point['caption_string']}
### Response:
{data_point['tag_string']}"""

In [6]:
from datasets import load_dataset

data = load_dataset("json", data_files=r'dataset/train_data.json')
data = data["train"].train_test_split(test_size=0.05, shuffle=True, seed=42)
data = data.map(lambda x: tokenize_sample(generate_prompt(x)))
data

Found cached dataset json (C:/Users/panta/.cache/huggingface/datasets/json/default-fac367448397b4f6/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at C:\Users\panta\.cache\huggingface\datasets\json\default-fac367448397b4f6\0.0.0\fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e\cache-a753b907c50531b5.arrow and C:\Users\panta\.cache\huggingface\datasets\json\default-fac367448397b4f6\0.0.0\fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e\cache-402f08e87db1eb6b.arrow


Map:   0%|          | 0/18952 [00:00<?, ? examples/s]

Map:   0%|          | 0/998 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['caption_string', 'tag_string', 'input_ids', 'attention_mask'],
        num_rows: 18952
    })
    test: Dataset({
        features: ['caption_string', 'tag_string', 'input_ids', 'attention_mask'],
        num_rows: 998
    })
})

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
tag_list = open(r'dictionaries/tag_dict.txt').read().splitlines()
mlb = MultiLabelBinarizer(classes=tag_list)
mlb.fit([list(tag_list)])

In [None]:
from sklearn.metrics import *
from utils import similar_tag


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)[0]
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)[0]

    pred_tags = [x.strip() for x in decoded_preds.split(",")]
    pred_tags_corrected = similar_tag.correct_tags(pred_tags, tag_list)

    tags = [x.strip() for x in decoded_labels.split(",")]

    one_hots_pred = mlb.transform([pred_tags_corrected])
    one_hots_truth = mlb.transform([tags])

    results = {}

    accuracy = accuracy_score(y_true=one_hots_truth, y_pred=one_hots_pred)
    recall = recall_score(
        y_true=one_hots_truth, y_pred=one_hots_pred, average="weighted", zero_division=1
    )
    precision = precision_score(
        y_true=one_hots_truth, y_pred=one_hots_pred, average="weighted", zero_division=1
    )
    f1_micro = f1_score(
        y_true=one_hots_truth, y_pred=one_hots_pred, average="micro", zero_division=1
    )
    f1_macro = f1_score(
        y_true=one_hots_truth, y_pred=one_hots_pred, average="macro", zero_division=1
    )
    f1_weighted = f1_score(
        y_true=one_hots_truth, y_pred=one_hots_pred, average="weighted", zero_division=1
    )

    results["accuracy"] = accuracy
    results["recall"] = recall
    results["precision"] = precision
    results["f1_micro"] = f1_micro
    results["f1_macro"] = f1_macro
    results["f1_weighted"] = f1_weighted

    return {k: round(v, 4) for k, v in results.items()}

In [9]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="outputs/opt-350m",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=32,
    learning_rate=1e-4,
    num_train_epochs=3,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=20,
    logging_steps=5,
    report_to="tensorboard",
    fp16=True,
)

In [10]:
trainer = transformers.Trainer(
    model=model, 
    train_dataset=data['train'],
    eval_dataset=data['test'],
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    args=training_args
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [11]:
trainer.train()



  0%|          | 0/444 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 3.4971, 'learning_rate': 9.887387387387388e-05, 'epoch': 0.03}
{'loss': 3.2936, 'learning_rate': 9.774774774774775e-05, 'epoch': 0.07}
{'loss': 3.163, 'learning_rate': 9.662162162162163e-05, 'epoch': 0.1}
{'loss': 3.0409, 'learning_rate': 9.54954954954955e-05, 'epoch': 0.14}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 2.8741016387939453, 'eval_runtime': 70.6738, 'eval_samples_per_second': 14.121, 'eval_steps_per_second': 1.769, 'epoch': 0.14}
{'loss': 2.9388, 'learning_rate': 9.436936936936938e-05, 'epoch': 0.17}
{'loss': 2.8321, 'learning_rate': 9.324324324324324e-05, 'epoch': 0.2}
{'loss': 2.7495, 'learning_rate': 9.211711711711712e-05, 'epoch': 0.24}
{'loss': 2.6662, 'learning_rate': 9.0990990990991e-05, 'epoch': 0.27}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 2.491180658340454, 'eval_runtime': 90.4867, 'eval_samples_per_second': 11.029, 'eval_steps_per_second': 1.381, 'epoch': 0.27}
{'loss': 2.5768, 'learning_rate': 8.986486486486487e-05, 'epoch': 0.3}
{'loss': 2.5281, 'learning_rate': 8.873873873873875e-05, 'epoch': 0.34}
{'loss': 2.4579, 'learning_rate': 8.761261261261262e-05, 'epoch': 0.37}
{'loss': 2.4141, 'learning_rate': 8.64864864864865e-05, 'epoch': 0.41}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 2.3061065673828125, 'eval_runtime': 87.0345, 'eval_samples_per_second': 11.467, 'eval_steps_per_second': 1.436, 'epoch': 0.41}
{'loss': 2.3803, 'learning_rate': 8.536036036036036e-05, 'epoch': 0.44}
{'loss': 2.3599, 'learning_rate': 8.423423423423423e-05, 'epoch': 0.47}
{'loss': 2.3505, 'learning_rate': 8.310810810810811e-05, 'epoch': 0.51}
{'loss': 2.2951, 'learning_rate': 8.198198198198198e-05, 'epoch': 0.54}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 2.2188708782196045, 'eval_runtime': 82.8665, 'eval_samples_per_second': 12.043, 'eval_steps_per_second': 1.508, 'epoch': 0.54}
{'loss': 2.298, 'learning_rate': 8.085585585585586e-05, 'epoch': 0.57}
{'loss': 2.2781, 'learning_rate': 7.972972972972974e-05, 'epoch': 0.61}
{'loss': 2.2612, 'learning_rate': 7.86036036036036e-05, 'epoch': 0.64}
{'loss': 2.2449, 'learning_rate': 7.747747747747748e-05, 'epoch': 0.68}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 2.1609365940093994, 'eval_runtime': 80.4803, 'eval_samples_per_second': 12.401, 'eval_steps_per_second': 1.553, 'epoch': 0.68}
{'loss': 2.2251, 'learning_rate': 7.635135135135135e-05, 'epoch': 0.71}
{'loss': 2.2174, 'learning_rate': 7.522522522522523e-05, 'epoch': 0.74}
{'loss': 2.1983, 'learning_rate': 7.40990990990991e-05, 'epoch': 0.78}
{'loss': 2.1872, 'learning_rate': 7.297297297297297e-05, 'epoch': 0.81}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 2.1142821311950684, 'eval_runtime': 85.5385, 'eval_samples_per_second': 11.667, 'eval_steps_per_second': 1.461, 'epoch': 0.81}
{'loss': 2.1741, 'learning_rate': 7.184684684684685e-05, 'epoch': 0.84}
{'loss': 2.1958, 'learning_rate': 7.072072072072072e-05, 'epoch': 0.88}
{'loss': 2.1606, 'learning_rate': 6.95945945945946e-05, 'epoch': 0.91}
{'loss': 2.1568, 'learning_rate': 6.846846846846847e-05, 'epoch': 0.95}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 2.0761427879333496, 'eval_runtime': 85.5941, 'eval_samples_per_second': 11.66, 'eval_steps_per_second': 1.46, 'epoch': 0.95}
{'loss': 2.1448, 'learning_rate': 6.734234234234235e-05, 'epoch': 0.98}
{'loss': 2.1517, 'learning_rate': 6.621621621621621e-05, 'epoch': 1.01}
{'loss': 2.1354, 'learning_rate': 6.50900900900901e-05, 'epoch': 1.05}
{'loss': 2.1109, 'learning_rate': 6.396396396396397e-05, 'epoch': 1.08}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 2.0454249382019043, 'eval_runtime': 89.3975, 'eval_samples_per_second': 11.164, 'eval_steps_per_second': 1.398, 'epoch': 1.08}
{'loss': 2.1246, 'learning_rate': 6.283783783783784e-05, 'epoch': 1.11}
{'loss': 2.118, 'learning_rate': 6.171171171171172e-05, 'epoch': 1.15}
{'loss': 2.1187, 'learning_rate': 6.058558558558559e-05, 'epoch': 1.18}
{'loss': 2.0823, 'learning_rate': 5.9459459459459466e-05, 'epoch': 1.22}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 2.0186610221862793, 'eval_runtime': 84.45, 'eval_samples_per_second': 11.818, 'eval_steps_per_second': 1.48, 'epoch': 1.22}
{'loss': 2.0991, 'learning_rate': 5.833333333333334e-05, 'epoch': 1.25}
{'loss': 2.0839, 'learning_rate': 5.720720720720721e-05, 'epoch': 1.28}
{'loss': 2.0743, 'learning_rate': 5.6081081081081086e-05, 'epoch': 1.32}
{'loss': 2.0739, 'learning_rate': 5.4954954954954966e-05, 'epoch': 1.35}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.9957807064056396, 'eval_runtime': 79.681, 'eval_samples_per_second': 12.525, 'eval_steps_per_second': 1.569, 'epoch': 1.35}
{'loss': 2.0784, 'learning_rate': 5.382882882882884e-05, 'epoch': 1.38}
{'loss': 2.0837, 'learning_rate': 5.27027027027027e-05, 'epoch': 1.42}
{'loss': 2.0726, 'learning_rate': 5.157657657657657e-05, 'epoch': 1.45}
{'loss': 2.0255, 'learning_rate': 5.0450450450450445e-05, 'epoch': 1.49}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.976782202720642, 'eval_runtime': 91.888, 'eval_samples_per_second': 10.861, 'eval_steps_per_second': 1.36, 'epoch': 1.49}
{'loss': 2.062, 'learning_rate': 4.9324324324324325e-05, 'epoch': 1.52}
{'loss': 2.0324, 'learning_rate': 4.8198198198198205e-05, 'epoch': 1.55}
{'loss': 2.0526, 'learning_rate': 4.707207207207208e-05, 'epoch': 1.59}
{'loss': 2.0259, 'learning_rate': 4.594594594594595e-05, 'epoch': 1.62}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.9602158069610596, 'eval_runtime': 79.9097, 'eval_samples_per_second': 12.489, 'eval_steps_per_second': 1.564, 'epoch': 1.62}
{'loss': 2.0345, 'learning_rate': 4.481981981981982e-05, 'epoch': 1.65}
{'loss': 2.0225, 'learning_rate': 4.369369369369369e-05, 'epoch': 1.69}
{'loss': 2.0366, 'learning_rate': 4.256756756756757e-05, 'epoch': 1.72}
{'loss': 2.0276, 'learning_rate': 4.1441441441441444e-05, 'epoch': 1.76}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.9456040859222412, 'eval_runtime': 80.9117, 'eval_samples_per_second': 12.334, 'eval_steps_per_second': 1.545, 'epoch': 1.76}
{'loss': 2.0225, 'learning_rate': 4.031531531531532e-05, 'epoch': 1.79}
{'loss': 2.0252, 'learning_rate': 3.918918918918919e-05, 'epoch': 1.82}
{'loss': 2.0021, 'learning_rate': 3.8063063063063064e-05, 'epoch': 1.86}
{'loss': 2.0136, 'learning_rate': 3.693693693693694e-05, 'epoch': 1.89}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.9331198930740356, 'eval_runtime': 84.1341, 'eval_samples_per_second': 11.862, 'eval_steps_per_second': 1.486, 'epoch': 1.89}
{'loss': 2.0026, 'learning_rate': 3.581081081081081e-05, 'epoch': 1.92}
{'loss': 2.0153, 'learning_rate': 3.468468468468469e-05, 'epoch': 1.96}
{'loss': 1.9974, 'learning_rate': 3.355855855855856e-05, 'epoch': 1.99}
{'loss': 1.9959, 'learning_rate': 3.2432432432432436e-05, 'epoch': 2.03}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.923833966255188, 'eval_runtime': 81.2243, 'eval_samples_per_second': 12.287, 'eval_steps_per_second': 1.539, 'epoch': 2.03}
{'loss': 2.0192, 'learning_rate': 3.130630630630631e-05, 'epoch': 2.06}
{'loss': 1.9965, 'learning_rate': 3.0180180180180183e-05, 'epoch': 2.09}
{'loss': 1.9958, 'learning_rate': 2.9054054054054052e-05, 'epoch': 2.13}
{'loss': 1.9957, 'learning_rate': 2.7927927927927926e-05, 'epoch': 2.16}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.914947748184204, 'eval_runtime': 80.6746, 'eval_samples_per_second': 12.371, 'eval_steps_per_second': 1.549, 'epoch': 2.16}
{'loss': 1.9829, 'learning_rate': 2.6801801801801802e-05, 'epoch': 2.2}
{'loss': 1.9927, 'learning_rate': 2.5675675675675675e-05, 'epoch': 2.23}
{'loss': 1.9936, 'learning_rate': 2.454954954954955e-05, 'epoch': 2.26}
{'loss': 1.9944, 'learning_rate': 2.3423423423423425e-05, 'epoch': 2.3}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.9083406925201416, 'eval_runtime': 81.3256, 'eval_samples_per_second': 12.272, 'eval_steps_per_second': 1.537, 'epoch': 2.3}
{'loss': 1.9697, 'learning_rate': 2.2297297297297298e-05, 'epoch': 2.33}
{'loss': 1.9896, 'learning_rate': 2.117117117117117e-05, 'epoch': 2.36}
{'loss': 1.9764, 'learning_rate': 2.0045045045045048e-05, 'epoch': 2.4}
{'loss': 1.9707, 'learning_rate': 1.891891891891892e-05, 'epoch': 2.43}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.9014346599578857, 'eval_runtime': 75.6538, 'eval_samples_per_second': 13.192, 'eval_steps_per_second': 1.652, 'epoch': 2.43}
{'loss': 1.977, 'learning_rate': 1.779279279279279e-05, 'epoch': 2.47}
{'loss': 1.9749, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.5}
{'loss': 1.9784, 'learning_rate': 1.554054054054054e-05, 'epoch': 2.53}
{'loss': 1.9792, 'learning_rate': 1.4414414414414416e-05, 'epoch': 2.57}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.8964929580688477, 'eval_runtime': 72.1081, 'eval_samples_per_second': 13.84, 'eval_steps_per_second': 1.734, 'epoch': 2.57}
{'loss': 1.9662, 'learning_rate': 1.3288288288288289e-05, 'epoch': 2.6}
{'loss': 1.9962, 'learning_rate': 1.2162162162162164e-05, 'epoch': 2.63}
{'loss': 1.9788, 'learning_rate': 1.1036036036036037e-05, 'epoch': 2.67}
{'loss': 1.966, 'learning_rate': 9.90990990990991e-06, 'epoch': 2.7}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.8935216665267944, 'eval_runtime': 142.6651, 'eval_samples_per_second': 6.995, 'eval_steps_per_second': 0.876, 'epoch': 2.7}
{'loss': 1.9605, 'learning_rate': 8.783783783783785e-06, 'epoch': 2.74}
{'loss': 1.9534, 'learning_rate': 7.657657657657658e-06, 'epoch': 2.77}
{'loss': 1.9773, 'learning_rate': 6.531531531531532e-06, 'epoch': 2.8}
{'loss': 1.9688, 'learning_rate': 5.405405405405406e-06, 'epoch': 2.84}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.8915883302688599, 'eval_runtime': 106.0494, 'eval_samples_per_second': 9.411, 'eval_steps_per_second': 1.179, 'epoch': 2.84}


KeyboardInterrupt: 

In [12]:
text = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Describe the caption with appropriate tags
### Input:
Minato Aqua, the virtual youtuber from hololive is wearing a blue maid outfit with maid cap and her pink and blue streaked hair is styled in twintails
### Response:
"""

In [18]:
batch = tokenizer(text, return_tensors='pt').to("cuda")

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=200, no_repeat_ngram_size=3)

print(tokenizer.decode(output_tokens[0], skip_special_tokens=True))

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Describe the caption with appropriate tags
### Input:
Minato Aqua, the virtual youtuber from hololive is wearing a blue maid outfit with maid cap and her pink and blue streaked hair is styled in twintails
### Response:
1girl, blue_hair, blue-eyed_girl, maid_cap, maid-cap, pink_hair_and_blush, pink, blue, pink-eyes, pink_(curly_hair), maid_caps, maid_(curlish), maid-panties, maid(curly), maid(pant)


In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
model.push_to_hub("ooferdoodles/text2tags-opt-350m", use_auth_token=True)

In [19]:
model.save_pretrained(r"loras/opt-350m-test")

In [None]:
%load_ext tensorboard