In [1]:
!pip install -U adapter-transformers
!pip install datasets

In [2]:
from datasets import load_dataset

# Identifier handed to `load_dataset`; kept in one place so it is easy to swap.
DATASET_ID = "junliang/symptom"

# No split is requested, so the result is indexed by split name further down
# in the notebook (e.g. dataset["train"]).
dataset = load_dataset(DATASET_ID)

# Bare last expression: the notebook displays the row counts.
dataset.num_rows

In [3]:
from transformers import BertTokenizer

# Load the BERT tokenizer from the relative path "./mcbert"; the same path is
# used for the model config and weights later in the notebook.
tokenizer = BertTokenizer.from_pretrained("./mcbert")

def encode_batch(batch):
  """Tokenize the "text" entries of a batch with the notebook-level tokenizer.

  Args:
    batch: Mapping with a "text" key holding the raw inputs to tokenize.

  Returns:
    Whatever the tokenizer returns for those inputs; every sequence is
    truncated and padded to exactly 64 tokens.
  """
  # Fixed-length encoding: cut longer inputs, pad shorter ones up to max_length.
  encoding_options = dict(max_length=64, truncation=True, padding="max_length")
  texts = batch["text"]
  return tokenizer(texts, **encoding_options)

# Tokenize the data in batches, then rename the target column to "labels",
# which is the name the transformers model expects.
# NOTE(review): this cell rebinds `dataset`, so re-running it without first
# reloading the data presumably fails (the "label" column is already gone).
dataset = dataset.map(encode_batch, batched=True).rename_column("label", "labels")

# Hand out PyTorch tensors and expose only the columns needed for training.
model_input_columns = ["input_ids", "attention_mask", "labels"]
dataset.set_format(type="torch", columns=model_input_columns)


In [4]:
from transformers import BertConfig, BertModelWithHeads
import torch

# Build the configuration and the heads-capable BERT model from the local
# "./mcbert" directory, declaring 36 target classes.
config = BertConfig.from_pretrained(
    "./mcbert",
    num_labels=36,
)
model = BertModelWithHeads.from_pretrained(
    "./mcbert",
    config=config,
)

# Overlay the weights stored in the checkpoint file on top of the model.
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; load_state_dict then copies the values into the model.
# NOTE(security): torch.load unpickles the file, so only load checkpoints
# from a trusted source.
checkpoint_path = '../input/bestsongever/best song ever.ckpt'
state_dict = torch.load(checkpoint_path, map_location="cpu")

# strict=False tolerates key mismatches between checkpoint and model. Because
# that can also hide a checkpoint that matches nothing at all, summarize what
# was actually applied instead of discarding the result.
load_result = model.load_state_dict(state_dict, strict=False)
matched_key_count = len(state_dict) - len(load_result.unexpected_keys)
print(
    f"Checkpoint keys applied: {matched_key_count}/{len(state_dict)} | "
    f"model keys missing from checkpoint: {len(load_result.missing_keys)} | "
    f"unexpected checkpoint keys: {len(load_result.unexpected_keys)}"
)
if matched_key_count == 0:
    # Best-effort loading is kept (no exception), but make the no-op visible.
    print(
        "WARNING: no checkpoint key matched the model; the weights are unchanged. "
        "Check the checkpoint layout and key prefixes."
    )

In [5]:
# Class id -> hospital department name, listed in id order so the mapping can
# be checked at a glance.
SYMPTOM_ID2LABEL = {
    0: "不孕不育",
    1: "中医综合",
    2: "中医骨伤科",
    3: "产科",
    4: "传染科",
    5: "儿科综合",
    6: "内分泌科",
    7: "口腔科",
    8: "呼吸内科",
    9: "妇科",
    10: "小儿内科",
    11: "小儿外科",
    12: "心内科",
    13: "心理科",
    14: "心胸外科",
    15: "性病科",
    16: "整形美容科",
    17: "普外科",
    18: "泌尿外科",
    19: "消化内科",
    20: "烧伤科",
    21: "男科",
    22: "皮肤科",
    23: "眼科",
    24: "神经内科",
    25: "神经外科",
    26: "精神科",
    27: "耳鼻喉科",
    28: "肛肠科",
    29: "肝胆外科",
    30: "肾内科",
    31: "肿瘤科",
    32: "血液科",
    33: "遗传病科",
    34: "风湿免疫科",
    35: "骨外科",
}
# The head size is derived from the mapping below, so make sure the ids really
# are 0..N-1 with no gaps before relying on len().
assert sorted(SYMPTOM_ID2LABEL) == list(range(len(SYMPTOM_ID2LABEL))), (
    "id2label keys must be the contiguous range 0..N-1"
)

# Add a new adapter
model.add_adapter("symptom_adapter")
# Add a matching classification head; num_labels comes from the mapping so the
# two can never drift apart (36 classes with the mapping above).
model.add_classification_head(
    "symptom_adapter",
    num_labels=len(SYMPTOM_ID2LABEL),
    id2label=SYMPTOM_ID2LABEL,
)
# Activate the adapter for training.
# NOTE(review): per the adapter-transformers docs, train_adapter also freezes
# the pre-trained model weights — confirm for the installed version.
model.train_adapter("symptom_adapter")

In [7]:
# These imports and definitions were previously commented out, which made the
# AdapterTrainer call below fail with NameError on a fresh kernel
# (Restart & Run All). They are restored so the cell is self-contained.
import numpy as np
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction

training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

def compute_accuracy(p: EvalPrediction):
    """Return the fraction of predictions whose arg-max class equals the label.

    Args:
        p: Evaluation output; `p.predictions` is reduced with arg-max over
            axis 1 (assumed to be the class axis — TODO confirm) and compared
            element-wise with `p.label_ids`.

    Returns:
        A dict with the single key "acc" holding the mean of the matches.
    """
    preds = np.argmax(p.predictions, axis=1)
    return {"acc": (preds == p.label_ids).mean()}

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    # NOTE(review): no eval_dataset is passed, so compute_metrics is presumably
    # only used if evaluation is run with an explicit dataset. Re-enable the
    # next line if the dataset provides a "validation" split.
#     eval_dataset=dataset["validation"],
    compute_metrics=compute_accuracy,
)

In [None]:
# Run training with the trainer built in the previous cell (model, training
# arguments and dataset["train"]). This is the long-running step of the notebook.
trainer.train()