In [None]:
!pip install -q bitsandbytes datasets accelerate loralib editdistance sentencepiece
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

In [2]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd

In [3]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Local checkpoint directory. Kept under its own name so that `model` only ever
# refers to the loaded model object, never to a path string.
model_path = r'classifiers/deberta-v3-base/'
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    return_dict=True,
    # torch_dtype=torch.float16,
).to(device)

# NOTE(review): the tokenizer comes from the hub checkpoint rather than from
# `model_path`; confirm that this matches the vocabulary the classifier was
# trained with.
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')

# Display the loaded configuration (label mapping, hidden sizes, ...).
model.config

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DebertaV2Config {
  "_name_or_path": "classifiers/deberta-v3-base/",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "1girl",
    "1": "solo",
    "2": "long_hair",
    "3": "breasts",
    "4": "blush",
    "5": "looking_at_viewer",
    "6": "smile",
    "7": "short_hair",
    "8": "open_mouth",
    "9": "bangs",
    "10": "blue_eyes",
    "11": "multiple_girls",
    "12": "blonde_hair",
    "13": "skirt",
    "14": "brown_hair",
    "15": "large_breasts",
    "16": "simple_background",
    "17": "black_hair",
    "18": "eyebrows_visible_through_hair",
    "19": "thighhighs",
    "20": "hair_ornament",
    "21": "hat",
    "22": "red_eyes",
    "23": "gloves",
    "24": "shirt",
    "25": "touhou",
    "26": "1boy",
    "27": "dress",
    "28": "white_background",
    "29": "original",
    "30": "ribbon",
    "31": "long_sleeves"

In [15]:
# Read the JSON test file through the `datasets` JSON loader and show the
# resulting DatasetDict summary.
test_data_file = r"dataset/test_data.json"
data = load_dataset("json", data_files=test_data_file)
data

Found cached dataset json (C:/Users/panta/.cache/huggingface/datasets/json/default-5cff6cb24ab66f93/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['caption_string', 'tag_string'],
        num_rows: 150
    })
})

In [5]:
# Wrap the classifier in a text-classification pipeline that reports a score
# for every label instead of only the best one.
# The pipeline reuses the notebook-level `device` so it always runs where the
# model was placed, rather than a second hard-coded device string.
# NOTE(review): recent transformers releases deprecate `return_all_scores` in
# favour of `top_k=None`; it is kept here because `get_preds` indexes the
# nested-list output (`preds[0]`) that this flag produces.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_all_scores=True,
)

# Smoke test on a single caption; the cell displays the per-label scores.
test = pipe("a single man in a black suit")
test



[[{'label': '1girl', 'score': 0.729422390460968},
  {'label': 'solo', 'score': 0.65225750207901},
  {'label': 'long_hair', 'score': 0.41337287425994873},
  {'label': 'breasts', 'score': 0.21657173335552216},
  {'label': 'blush', 'score': 0.2117224484682083},
  {'label': 'looking_at_viewer', 'score': 0.223398819565773},
  {'label': 'smile', 'score': 0.23560038208961487},
  {'label': 'short_hair', 'score': 0.23292262852191925},
  {'label': 'open_mouth', 'score': 0.16582940518856049},
  {'label': 'bangs', 'score': 0.07964548468589783},
  {'label': 'blue_eyes', 'score': 0.16772419214248657},
  {'label': 'multiple_girls', 'score': 0.13108181953430176},
  {'label': 'blonde_hair', 'score': 0.13179944455623627},
  {'label': 'skirt', 'score': 0.11498469114303589},
  {'label': 'brown_hair', 'score': 0.13563500344753265},
  {'label': 'large_breasts', 'score': 0.1010945588350296},
  {'label': 'simple_background', 'score': 0.12965784966945648},
  {'label': 'black_hair', 'score': 0.10829297453165054

In [7]:
def get_preds(data_point, threshold=0.1):
    """Add predicted and reference tag lists to a single row.

    Runs the notebook-level ``pipe`` on ``data_point['caption_string']`` and
    keeps the label of every entry whose score is strictly greater than
    ``threshold``. The row is updated in place and returned.

    Args:
        data_point: mapping with ``'caption_string'`` and ``'tag_string'``.
        threshold: minimum (exclusive) score for a label to be kept.

    Returns:
        The same ``data_point`` with two extra keys: ``'pred_tags'`` (labels
        above the threshold, in pipeline order) and ``'tags'``
        (``'tag_string'`` split on ``', '``).
    """
    # The pipeline returns one list of {'label', 'score'} dicts per input text.
    scored_labels = pipe(data_point['caption_string'])[0]

    kept_labels = []
    for entry in scored_labels:
        if entry['score'] > threshold:
            kept_labels.append(entry['label'])
    data_point['pred_tags'] = kept_labels

    reference_tags = data_point['tag_string'].split(', ')
    data_point['tags'] = reference_tags

    return data_point
    

In [18]:
# Run get_preds over every row, then show the updated dataset summary.
data = data.map(function=get_preds)
data

Map:   0%|          | 0/150 [00:00<?, ? examples/s]



DatasetDict({
    train: Dataset({
        features: ['caption_string', 'tag_string', 'pred_tags', 'tags'],
        num_rows: 150
    })
})

In [19]:
def evaluate_accuracy(data_point):
    """Score one row by the share of its reference tags that were predicted.

    Reads ``data_point['tags']`` (reference tags) and
    ``data_point['pred_tags']`` (predicted tags) and stores, under
    ``data_point['accuracy']``, the percentage (0-100) of distinct reference
    tags that also occur among the predictions. Predictions that are not
    reference tags are not penalised.

    Args:
        data_point: mapping with ``'tags'`` and ``'pred_tags'`` iterables.

    Returns:
        The same ``data_point`` with the ``'accuracy'`` key added.
    """
    # Deduplicate once so the numerator and denominator count the same things.
    reference_tags = set(data_point['tags'])

    if not reference_tags:
        # Nothing to match against: report 0 instead of dividing by zero.
        data_point['accuracy'] = 0.0
        return data_point

    correct_count = len(reference_tags.intersection(data_point['pred_tags']))
    data_point['accuracy'] = correct_count / len(reference_tags) * 100
    return data_point

In [20]:
# Add the per-row 'accuracy' column, then show the updated dataset summary.
data = data.map(function=evaluate_accuracy)
data

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['caption_string', 'tag_string', 'pred_tags', 'tags', 'accuracy'],
        num_rows: 150
    })
})

In [21]:
# Convert the 'train' split to a pandas DataFrame for inspection.
train_split = data['train']
df = train_split.to_pandas()
df

Unnamed: 0,caption_string,tag_string,pred_tags,tags,accuracy
0,"With her mesmerizing gaze and ethereal wings, ...","1girl, breasts, closed_mouth, gloves, horns, l...","[1girl, solo, long_hair, breasts, blush, looki...","[1girl, breasts, closed_mouth, gloves, horns, ...",47.058824
1,Kisaragi Ai's art features Garma Zabi from Gun...,"1boy, adjusting_hair, bow, bowtie, brown_eyes,...","[1girl, solo, long_hair, smile, short_hair, bl...","[1boy, adjusting_hair, bow, bowtie, brown_eyes...",22.222222
2,Rojer18's art depicts Oozora Hiro from Danball...,"1boy, ahoge, belt, blue_hair, closed_mouth, cr...","[1girl, solo, long_hair, breasts, blush, looki...","[1boy, ahoge, belt, blue_hair, closed_mouth, c...",30.000000
3,"Amidst the chaos of Danganronpa, Criis-chan ca...","2girls, :d, ahoge, arm_up, bangs, black_shirt,...","[1girl, solo, long_hair, breasts, blush, looki...","[2girls, :d, ahoge, arm_up, bangs, black_shirt...",23.809524
4,The artwork is a solo depiction of the charact...,"1boy, ^_^, ahoge, blush, closed_eyes, fate/gra...","[1girl, solo, long_hair, breasts, blush, looki...","[1boy, ^_^, ahoge, blush, closed_eyes, fate/gr...",28.571429
...,...,...,...,...,...
145,The artwork features the character Bokuto Kout...,"1boy, artist_name, bangs, black_shirt, collarb...","[1girl, solo, long_hair, breasts, blush, looki...","[1boy, artist_name, bangs, black_shirt, collar...",21.739130
146,"""Too_mizuguchi's artwork portrays Oswald Chest...","1boy, bags_under_eyes, batman_(series), black_...","[1girl, solo, long_hair, breasts, blush, looki...","[1boy, bags_under_eyes, batman_(series), black...",41.176471
147,Denjinq's artistic prowess comes to life with ...,"1boy, 5girls, aqua_hair, ass, bangs, black_hai...","[1girl, solo, long_hair, breasts, blush, looki...","[1boy, 5girls, aqua_hair, ass, bangs, black_ha...",25.925926
148,The artwork features a single female character...,"1girl, :o, bangs, black_shirt, boots, bronya_z...","[1girl, solo, long_hair, breasts, blush, looki...","[1girl, :o, bangs, black_shirt, boots, bronya_...",42.424242


In [14]:
# Average of the per-row 'accuracy' column.
mean_accuracy = df['accuracy'].mean()
mean_accuracy

25.293576628724896

In [None]:
# The pipeline takes raw text, so feed it the row's caption instead of the
# whole row dict. The row's own fields are kept and the full per-label score
# list is attached as 'raw_preds', so the following cell can read both
# ogey['tags'] and ogey['raw_preds'].
# NOTE(review): assumes 'raw_preds' was meant to hold the unthresholded
# pipeline output for this caption — confirm.
sample_row = data['train'][0]
ogey = dict(sample_row, raw_preds=pipe(sample_row['caption_string'])[0])

In [None]:
# Print the 'tags' entry of `ogey`, then its 'raw_preds' entry.
for field in ('tags', 'raw_preds'):
    print(ogey[field])