!pip install -q bitsandbytes datasets accelerate loralib sentencepiece
!pip install tensorboardX
!pip install -q git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/peft.git

import os
os.kill(os.getpid(), 9)

In [1]:
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training

import os
import numpy as np
import torch
import torch.nn as nn




Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin c:\Users\panta\anaconda3\envs\nlp\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll
CUDA SETUP: CUDA runtime path found: C:\Users\panta\anaconda3\envs\nlp\bin\cudart64_110.dll
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary c:\Users\panta\anaconda3\envs\nlp\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll...


  warn(msg)
  warn(msg)


In [2]:
BASE_MODEL = "google/flan-t5-small"

model = AutoModelForSeq2SeqLM.from_pretrained(
    BASE_MODEL,
    # load_in_8bit=True,
    # torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

In [3]:
# model = prepare_model_for_int8_training(model)
# config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     target_modules=["q", "v"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type="SEQ_2_SEQ_LM",
# )
# model = get_peft_model(model, config)
# model.print_trainable_parameters()

trainable params: 688128 || all params: 77649280 || trainable%: 0.8862001038515747


In [4]:
from datasets import load_dataset

data = load_dataset("json", data_files=r'dataset/train_data.json')
data

Found cached dataset json (C:/Users/panta/.cache/huggingface/datasets/json/default-fac367448397b4f6/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['caption_string', 'tag_string'],
        num_rows: 19950
    })
})

In [5]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["caption_string"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    labels = tokenizer(
        text_target=examples["tag_string"], max_length=1024, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
split_data = data["train"].train_test_split(test_size=0.05, shuffle=True, seed=42)
tokenized_data = split_data.map(preprocess_function, batched=True)
tokenized_data

Loading cached split indices for dataset at C:\Users\panta\.cache\huggingface\datasets\json\default-fac367448397b4f6\0.0.0\fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e\cache-a753b907c50531b5.arrow and C:\Users\panta\.cache\huggingface\datasets\json\default-fac367448397b4f6\0.0.0\fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e\cache-402f08e87db1eb6b.arrow
Loading cached processed dataset at C:\Users\panta\.cache\huggingface\datasets\json\default-fac367448397b4f6\0.0.0\fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e\cache-f45342928e41ed12.arrow
Loading cached processed dataset at C:\Users\panta\.cache\huggingface\datasets\json\default-fac367448397b4f6\0.0.0\fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e\cache-47cef8e61ebab212.arrow


DatasetDict({
    train: Dataset({
        features: ['caption_string', 'tag_string', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 18952
    })
    test: Dataset({
        features: ['caption_string', 'tag_string', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 998
    })
})

In [7]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [9]:
training_args = Seq2SeqTrainingArguments(
    output_dir="outputs/flan-t5",
    auto_find_batch_size=True,
    learning_rate=3e-4,
    num_train_epochs=5,
    predict_with_generate=True,
    metric_for_best_model="f1_weighted",
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    report_to="tensorboard",
    fp16=True
)

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer
tag_list = open(r'dictionaries/tag_dict.txt').read().splitlines()
mlb = MultiLabelBinarizer(classes=tag_list)
mlb.fit([list(tag_list)])

In [10]:
from sklearn.metrics import *
from utils import similar_tag


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)[0]
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)[0]

    pred_tags = [x.strip() for x in decoded_preds.split(",")]
    pred_tags_corrected = similar_tag.correct_tags(pred_tags, tag_list)

    tags = [x.strip() for x in decoded_labels.split(",")]

    one_hots_pred = mlb.transform([pred_tags_corrected])
    one_hots_truth = mlb.transform([tags])

    results = {}

    accuracy = accuracy_score(y_true=one_hots_truth, y_pred=one_hots_pred)
    recall = recall_score(
        y_true=one_hots_truth, y_pred=one_hots_pred, average="weighted", zero_division=1
    )
    precision = precision_score(
        y_true=one_hots_truth, y_pred=one_hots_pred, average="weighted", zero_division=1
    )
    f1_micro = f1_score(
        y_true=one_hots_truth, y_pred=one_hots_pred, average="micro", zero_division=1
    )
    f1_macro = f1_score(
        y_true=one_hots_truth, y_pred=one_hots_pred, average="macro", zero_division=1
    )
    f1_weighted = f1_score(
        y_true=one_hots_truth, y_pred=one_hots_pred, average="weighted", zero_division=1
    )

    results["accuracy"] = accuracy
    results["recall"] = recall
    results["precision"] = precision
    results["f1_micro"] = f1_micro
    results["f1_macro"] = f1_macro
    results["f1_weighted"] = f1_weighted

    return {k: round(v, 4) for k, v in results.items()}

In [11]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[transformers.EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=compute_metrics,
)
model.config.use_cache = False

In [12]:
trainer.train()



  0%|          | 0/11845 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.4054, 'learning_rate': 0.0002873364288729421, 'epoch': 0.21}
{'loss': 1.968, 'learning_rate': 0.0002746728577458843, 'epoch': 0.42}
{'loss': 1.8148, 'learning_rate': 0.0002620092866188265, 'epoch': 0.63}
{'loss': 1.7189, 'learning_rate': 0.00024934571549176863, 'epoch': 0.84}




  0%|          | 0/125 [00:00<?, ?it/s]

(1, 11389)
{'eval_loss': 1.340453863143921, 'eval_accuracy': 0.0, 'eval_recall': 0.0882, 'eval_precision': 1.0, 'eval_f1_micro': 0.1579, 'eval_f1_macro': 0.9972, 'eval_f1_weighted': 0.0882, 'eval_runtime': 218.6781, 'eval_samples_per_second': 4.564, 'eval_steps_per_second': 0.572, 'epoch': 1.0}




{'loss': 1.6581, 'learning_rate': 0.00023668214436471083, 'epoch': 1.06}
{'loss': 1.5984, 'learning_rate': 0.000224018573237653, 'epoch': 1.27}
{'loss': 1.5491, 'learning_rate': 0.00021135500211059518, 'epoch': 1.48}
{'loss': 1.5106, 'learning_rate': 0.00019869143098353735, 'epoch': 1.69}
{'loss': 1.4912, 'learning_rate': 0.00018602785985647952, 'epoch': 1.9}


  0%|          | 0/125 [00:00<?, ?it/s]

(1, 11389)
{'eval_loss': 1.1723272800445557, 'eval_accuracy': 0.0, 'eval_recall': 0.0588, 'eval_precision': 1.0, 'eval_f1_micro': 0.1026, 'eval_f1_macro': 0.9969, 'eval_f1_weighted': 0.0588, 'eval_runtime': 210.6145, 'eval_samples_per_second': 4.739, 'eval_steps_per_second': 0.594, 'epoch': 2.0}




{'loss': 1.4683, 'learning_rate': 0.00017336428872942168, 'epoch': 2.11}
{'loss': 1.4288, 'learning_rate': 0.00016070071760236385, 'epoch': 2.32}
{'loss': 1.4132, 'learning_rate': 0.000148037146475306, 'epoch': 2.53}
{'loss': 1.4065, 'learning_rate': 0.0001353735753482482, 'epoch': 2.74}
{'loss': 1.3848, 'learning_rate': 0.00012271000422119037, 'epoch': 2.95}


  0%|          | 0/125 [00:00<?, ?it/s]

(1, 11389)
{'eval_loss': 1.0993340015411377, 'eval_accuracy': 0.0, 'eval_recall': 0.0882, 'eval_precision': 1.0, 'eval_f1_micro': 0.1622, 'eval_f1_macro': 0.9973, 'eval_f1_weighted': 0.0882, 'eval_runtime': 214.9226, 'eval_samples_per_second': 4.644, 'eval_steps_per_second': 0.582, 'epoch': 3.0}




{'loss': 1.3734, 'learning_rate': 0.00011004643309413255, 'epoch': 3.17}
{'loss': 1.3753, 'learning_rate': 9.73828619670747e-05, 'epoch': 3.38}
{'loss': 1.3524, 'learning_rate': 8.471929084001688e-05, 'epoch': 3.59}
{'loss': 1.3319, 'learning_rate': 7.205571971295904e-05, 'epoch': 3.8}


  0%|          | 0/125 [00:00<?, ?it/s]

(1, 11389)
{'eval_loss': 1.0612291097640991, 'eval_accuracy': 0.0, 'eval_recall': 0.0882, 'eval_precision': 1.0, 'eval_f1_micro': 0.15, 'eval_f1_macro': 0.997, 'eval_f1_weighted': 0.0882, 'eval_runtime': 212.3929, 'eval_samples_per_second': 4.699, 'eval_steps_per_second': 0.589, 'epoch': 4.0}


RuntimeError: Loading a quantized checkpoint into non-quantized Linear8bitLt is not supported. Please call module.cuda() before module.load_state_dict()

In [21]:
text = "summarize: Minato Aqua and Hoshimachi Suisei, virtual youtubers from hololive are wearing a deep blue maid outfit with maid cap with pink and blue streaked hair styled in twintails"

In [22]:
summarizer = transformers.pipeline("summarization", model=model, tokenizer=tokenizer,max_length=60)
summarizer(text)

The model 'PeftModelForSeq2SeqLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].
Your max_length is set to 60, but you input_length is only 55. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=2

[{'summary_text': '1girl, blue_hair, deep_hair, maid, maid_cap, maid_cap, maid_cap, maid_cap, maid_cap, maid_cap, maid_cap, maid_cap, hololive,'}]

In [None]:
%load_ext tensorboard