In [29]:
import codecs
import json
import os

import cleanlab
import numpy as np
from datasets import load_dataset
from sklearn.metrics import classification_report
from transformers import (
    Trainer,
    BertTokenizer,
    BertForSequenceClassification,
    BertConfig,
    TrainingArguments,
    EarlyStoppingCallback,
    default_data_collator,
    DataCollatorWithPadding,
    TextClassificationPipeline,
)
# Directory that holds the hotel-comment JSONL splits ("trainl.json" / "testl.json").
# The HOTEL_COMMENT_DIR environment variable overrides it, so the notebook can run on
# another machine without editing code; the default is the original local path.
DIR = os.environ.get("HOTEL_COMMENT_DIR", "E:/datasets/hotel_comment/")

In [8]:
# Load both JSON splits from DIR into a single DatasetDict keyed by split name.
dataset = load_dataset(
    "json",
    data_files=dict(
        train=os.path.join(DIR, "trainl.json"),
        test=os.path.join(DIR, "testl.json"),
    ),
)
dataset

Using custom data configuration default-3ff479c2ba161dd9


Downloading and preparing dataset json/default to C:\Users\chris\.cache\huggingface\datasets\json\default-3ff479c2ba161dd9\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to C:\Users\chris\.cache\huggingface\datasets\json\default-3ff479c2ba161dd9\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1553
    })
})

In [9]:
# Distinct labels of the training split, sorted so the ids are deterministic.
label_list = sorted(dataset["train"].unique("label"))
num_labels = len(label_list)

# Two-way lookup tables between label strings and integer class ids.
label_2_id = {label: idx for idx, label in enumerate(label_list)}
id_2_label = {idx: label for idx, label in enumerate(label_list)}
label_2_id

{'不好': 0, '好': 1}

In [10]:
# Location of the pretrained Chinese BERT checkpoint. The BERT_MODEL_DIR environment
# variable overrides it; the default is the original local path.
MODEL = os.environ.get("BERT_MODEL_DIR", "E:\\pretrained_moel\\chinese-bert-wwm-ext")

# Tokenizer, config and classification model are all loaded from the same checkpoint;
# the config is sized to the number of labels found in the training split.
tokenizer = BertTokenizer.from_pretrained(MODEL)
bert_config = BertConfig.from_pretrained(MODEL, num_labels=num_labels)
model = BertForSequenceClassification.from_pretrained(MODEL, config=bert_config)

# Store the label <-> id maps on the model config so that saved checkpoints and the
# inference pipeline report label strings instead of generic class names.
model.config.label2id = label_2_id
model.config.id2label = id_2_label

Some weights of the model checkpoint at E:\pretrained_moel\chinese-bert-wwm-ext were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from t

In [11]:
def preprocess_function(examples):
    """Tokenize a batch of examples and turn their labels into integer ids.

    :param examples: Batch with a "text" column and a "label" column
    :return: The tokenizer output for the batch, with an added "label" entry
        holding one integer id per example
    """
    # Every text is truncated/padded to exactly 128 tokens.
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=128,
        padding="max_length",
    )

    label_ids = []
    for raw_label in examples["label"]:
        if raw_label == -1:
            # -1 is passed through untouched (presumably an "unlabeled" marker -- confirm).
            label_ids.append(-1)
        else:
            label_ids.append(label_2_id[raw_label])
    encoded["label"] = label_ids
    return encoded

# Run preprocess_function over every split in batches.
# NOTE(review): despite its name, `raw_datasets` holds the *tokenized* splits with
# integer labels; the untouched data stays in `dataset`.
raw_datasets = dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/7 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

In [22]:
def compute_metrics(p):
    """Compute macro-averaged precision, recall and F1 for one evaluation pass.

    :param p: Pair ``(predictions, labels)``; the predicted class is taken as the
        argmax over axis 1 of ``predictions``
    :return: Dict with the keys "precision", "recall" and "f1", each the
        "macro avg" value from sklearn's classification report
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=1)

    macro_avg = classification_report(
        y_true=gold,
        y_pred=predicted,
        output_dict=True,
    )["macro avg"]

    return {
        "precision": macro_avg["precision"],
        "recall": macro_avg["recall"],
        "f1": macro_avg["f1-score"],
    }

In [23]:
# Training configuration: evaluate and checkpoint once per epoch, and reload the
# best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="confident_learing",
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
)

# Collator that pads each batch to a multiple of 8.
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

# The test split doubles as the evaluation set; early stopping uses a patience of 3.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=raw_datasets["train"],
    eval_dataset=raw_datasets["test"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend


In [24]:
# Fine-tune the classifier; as the cell's last expression, the returned TrainOutput
# (global step, training loss, runtime metrics) is displayed below.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 6212
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7780
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1553
  Batch size = 16
Saving model checkpoint to confident_learing\checkpoint-389
Configuration saved in confident_learing\checkpoint-389\config.json
Model weights saved in confident_learing\checkpoint-389\pytorch_model.bin
tokenizer config file saved in confident_learing\checkpoint-389\tokenizer_config.json
Special tokens file saved in confident_learing\checkpoint-389\special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1553
  Batch size = 16
Saving model checkpoint to confident_learing\checkpoint-778
Configuration saved in confident_learing\checkpoint-778\config.json
Model weights saved in confident_learing\checkpoin

TrainOutput(global_step=1945, training_loss=0.10920466150906522, metrics={'train_runtime': 178.5984, 'train_samples_per_second': 695.639, 'train_steps_per_second': 43.561, 'total_flos': 2043057344870400.0, 'train_loss': 0.10920466150906522, 'epoch': 5.0})

In [31]:
def jsonl_reader(file: str) -> list:
    """Read a JSON Lines file into a list of decoded JSON values.

    Every non-blank line is parsed with ``json.loads``; blank (whitespace-only)
    lines, such as a trailing empty line, are skipped.

    :param file: Path to a UTF-8 encoded JSONL file
    :return: One decoded JSON value per non-blank line, in file order
    :raises OSError: If the file cannot be opened
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON
    """
    result = list()
    # Built-in open() only treats \n, \r and \r\n as line endings. The previous
    # codecs-based readlines() split with str.splitlines(), which also breaks on
    # characters such as U+2028/U+2029/U+0085 that may legally appear unescaped
    # inside a JSON string, cutting one record into two invalid fragments.
    with open(file, "r", encoding="utf_8") as f:
        # Iterate lazily instead of loading every line at once.
        for line in f:
            line = line.strip()
            if not line:
                continue
            result.append(json.loads(line))
    return result

# Re-read the raw test split as plain dicts (text + original string label).
list_test = jsonl_reader(os.path.join(DIR, "testl.json"))

In [36]:
# Wrap the fine-tuned model in an inference pipeline. With return_all_scores=True
# each result is a list holding one {'label', 'score'} dict per class.
# NOTE(review): device=0 is hard-coded (presumably the first GPU) -- confirm a GPU is
# available wherever this runs.
classifer = TextClassificationPipeline(
    model = model,
    tokenizer = tokenizer,
    return_all_scores = True,
    device=0
)
# NOTE(review): texts are cut to 510 *characters* here, whereas training truncated to
# max_length=128 *tokens*; long reviews are therefore scored on more context than the
# model saw during fine-tuning -- confirm this mismatch is intended.
list_text = [ele["text"][:510] for ele in list_test]
list_scores = classifer(list_text)
# Peek at the first three results only.
print(list_scores[:3])

[[{'label': '不好', 'score': 0.0025735984090715647}, {'label': '好', 'score': 0.9974263310432434}], [{'label': '不好', 'score': 0.9902770519256592}, {'label': '好', 'score': 0.009722927585244179}], [{'label': '不好', 'score': 0.002062177285552025}, {'label': '好', 'score': 0.9979377388954163}]]


In [41]:
# Confident learning on the test split: flag examples whose given label disagrees
# with the fine-tuned model's predicted probabilities.

# Given (possibly noisy) labels as integer ids, one per test example.
s = [label_2_id[ele["label"]] for ele in list_test]

# Predicted-probability matrix: one row per example, column j holds the score of
# label id j. Scores are placed by label name rather than by position, so this works
# for any number of classes and does not depend on the pipeline's output order.
psx = list()
for scores in list_scores:
    row = [0.0] * num_labels
    for entry in scores:
        row[label_2_id[entry["label"]]] = entry["score"]
    psx.append(row)

# Method 5: C+NR (prune_method='both'); indices are ordered by 'prob_given_label'.
cl_both = cleanlab.pruning.get_noise_indices(
    np.array(s),
    np.array(psx),
    prune_method='both',
    sorted_index_method='prob_given_label'
)
print(
    "The total Number of Test_Dataset is {} and there are {} bad data in Test_Dataset which accounts for {} %".format(
        len(list_test),
        len(cl_both),
        len(cl_both)*100/len(list_test)
    )
)
print(cl_both)

The total Number of Test_Dataset is 1553 and there are 99 bad data in Test_Dataset which accounts for 6.374758531873793 %
[ 965   60  338  940  458 1210 1459  631  316 1381  938   62 1386  987
 1374  776 1349 1194 1545  942  505 1521  153  119  131  986  335 1350
 1395  829  593  367 1415 1184  361 1401  659  184  777 1391 1506  366
  698  978 1526  103  122   84   92  369 1432 1281  665  362 1131   72
 1302 1291  163  446  809 1323 1118  239  990 1501  782  872  244 1257
  880 1249  642  647  571  192  892 1067  637   46  313 1054   93  983
 1420 1499  734  648 1198   63  416   82  127  677  855  493   87  716
 1537]


In [39]:
# Show every flagged test example followed by the model's per-class scores.
for noisy_idx in cl_both:
    print(list_test[noisy_idx], list_scores[noisy_idx], sep="\n")

{'text': '打车太不方便，我在门口等了40分钟；早餐品种少。宽带还比较快！房间空调调不了。苏州还有一个茉莉花假日，下次试试。补充点评2008年4月22日：刚接到酒店预订部的电话，对所提的意见进行了跟踪反馈，赞一个。与我14日入住的上海华港雅阁酒店（虹桥机场）比起来，真是天壤之别。24日又到苏州，不过这次我已经预订了另外一家酒店，下次到苏州，我会考虑再次入住友联。希望真的已经改进。', 'label': '不好'}
[{'label': '不好', 'score': 0.002219702349975705}, {'label': '好', 'score': 0.9977802634239197}]
{'text': '前台服务态度挺好。但酒店设施较简单陈旧，感觉连二星的都比不上，房间隔音效果也不好，外面卡拉ok的声音听的清清楚楚，晚上要是失眠了可以听听歌。宾馆反馈2007年10月30日：尊敬的宾客：非常感谢您的光临及提出宝贵的意见，本酒店目前的房间设施设备是以简约时尚为前提，关于隔音改造工程正在完善中，相信我们会以更优质的服务和舒适的客房、便利的商务环境，迎接您的再次光临！', 'label': '不好'}
[{'label': '不好', 'score': 0.0022937506437301636}, {'label': '好', 'score': 0.9977061748504639}]
{'text': '10月24日入住西楼,房间很宽敞,但设施旧了一些,它的房间有两道门,这点很少见,所以晚上很安静,走廊的声音会小很多.当时相同的价格,如果去附近的步行街上找一找,会有惊喜.还好,电视信号还有,那天嫦娥升空,真巧.', 'label': '不好'}
[{'label': '不好', 'score': 0.002881971187889576}, {'label': '好', 'score': 0.9971179962158203}]
{'text': '和山里的农庄比算是好的了吧。墙壁都开裂了，房间还算干净，离镇上有1公里吧，叫车只要2元。镇上有上海华联，还挺大的，买些吃的喝的用的都挺方便。', 'label': '不好'}
[{'label': '不好', 'score': 0.0030554654076695442}, {'label': '好',