In [29]:
import codecs
import json
import os

import cleanlab
import numpy as np
from datasets import load_dataset
from sklearn.metrics import classification_report
from transformers import (
    Trainer,
    BertTokenizer,
    BertForSequenceClassification,
    BertConfig,
    TrainingArguments,
    EarlyStoppingCallback,
    default_data_collator,
    DataCollatorWithPadding,
    TextClassificationPipeline,
)
# Folder that holds the train/test JSON files of the dataset. Set the
# HOTEL_COMMENT_DIR environment variable to run the notebook on another
# machine; when it is unset the original hard-coded location is used.
DIR = os.environ.get("HOTEL_COMMENT_DIR", "E:/datasets/hotel_comment/")

In [8]:
# Read the train and test splits from the JSON files stored under DIR.
dataset = load_dataset(
    "json",
    data_files=dict(
        train=os.path.join(DIR, "trainl.json"),
        test=os.path.join(DIR, "testl.json"),
    ),
)
dataset

Using custom data configuration default-3ff479c2ba161dd9


Downloading and preparing dataset json/default to C:\Users\chris\.cache\huggingface\datasets\json\default-3ff479c2ba161dd9\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to C:\Users\chris\.cache\huggingface\datasets\json\default-3ff479c2ba161dd9\0.0.0\ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1553
    })
})

In [9]:
# Distinct labels of the training split, sorted so that the label <-> id
# assignment is deterministic.
label_list = sorted(dataset["train"].unique("label"))
num_labels = len(label_list)

# Two lookup tables: label -> integer id and integer id -> label.
label_2_id = {label: idx for idx, label in enumerate(label_list)}
id_2_label = {idx: label for idx, label in enumerate(label_list)}
label_2_id

{'不好': 0, '好': 1}

In [46]:
# Local directory of the pretrained checkpoint (vocab, config and weights).
MODEL = "E:\\pretrained_moel\\chinese-bert-wwm-ext"

tokenizer = BertTokenizer.from_pretrained(MODEL)

# The classification head is sized from the number of labels found in the data.
bert_config = BertConfig.from_pretrained(MODEL, num_labels=num_labels)
model = BertForSequenceClassification.from_pretrained(MODEL, config=bert_config)

# Attach the label mappings to the model config.
model.config.label2id, model.config.id2label = label_2_id, id_2_label

Didn't find file E:\pretrained_moel\chinese-bert-wwm-ext\added_tokens.json. We won't load it.
Didn't find file E:\pretrained_moel\chinese-bert-wwm-ext\special_tokens_map.json. We won't load it.
Didn't find file E:\pretrained_moel\chinese-bert-wwm-ext\tokenizer_config.json. We won't load it.
loading file E:\pretrained_moel\chinese-bert-wwm-ext\vocab.txt
loading file None
loading file None
loading file None
loading file E:\pretrained_moel\chinese-bert-wwm-ext\tokenizer.json
loading configuration file E:\pretrained_moel\chinese-bert-wwm-ext\config.json
Model config BertConfig {
  "_name_or_path": "E:\\pretrained_moel\\chinese-bert-wwm-ext",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type"

In [47]:
def preprocess_function(examples):
    """Tokenize a batch of examples and replace its labels with integer ids.

    :param examples: batch mapping with a "text" list and a "label" list
    :return: the tokenizer output with an added "label" list of ids; a label
        equal to -1 is passed through as -1
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=128,
        padding="max_length",
    )

    label_ids = []
    for raw_label in examples["label"]:
        # -1 is kept as-is; every other label is looked up in label_2_id.
        label_ids.append(-1 if raw_label == -1 else label_2_id[raw_label])
    encoded["label"] = label_ids
    return encoded

# Run the preprocessing over every split of the dataset in batched mode.
raw_datasets = dataset.map(
    function=preprocess_function,
    batched=True,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/7 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

In [48]:
def compute_metrics(p):
    """Compute macro-averaged precision, recall and F1 for an evaluation pass.

    :param p: pair that unpacks into (model scores, reference labels)
    :return: dict with the "precision", "recall" and "f1" macro averages
    """
    scores, references = p
    # The predicted class is the index of the highest score in each row.
    predicted = np.argmax(scores, axis=1)

    macro_avg = classification_report(
        y_true=references,
        y_pred=predicted,
        output_dict=True,
    )["macro avg"]

    # The report names the F1 entry "f1-score"; expose it as "f1".
    return {
        "precision": macro_avg["precision"],
        "recall": macro_avg["recall"],
        "f1": macro_avg["f1-score"],
    }

In [49]:
# Collator that pads each batch to a length that is a multiple of 8.
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

# Evaluate and checkpoint once per epoch, and reload the best checkpoint when
# training ends.
training_args = TrainingArguments(
    output_dir="confident_learing",
    num_train_epochs=20,
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
)

# Training stops early after 3 evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=raw_datasets["train"],
    eval_dataset=raw_datasets["test"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend


In [50]:
# Fine-tune the model with the Trainer configured above; the value returned by
# train() is displayed as the cell result.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 6212
  Num Epochs = 20
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3900
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1553
  Batch size = 32
Saving model checkpoint to confident_learing\checkpoint-195
Configuration saved in confident_learing\checkpoint-195\config.json
Model weights saved in confident_learing\checkpoint-195\pytorch_model.bin
tokenizer config file saved in confident_learing\checkpoint-195\tokenizer_config.json
Special tokens file saved in confident_learing\checkpoint-195\special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1553
  Batch size = 32
Saving model checkpoint to confident_learing\checkpoint-390
Configuration saved in confident_learing\checkpoint-390\config.json
Model weights saved in confident_learing\checkpoin

TrainOutput(global_step=975, training_loss=0.1627717785957532, metrics={'train_runtime': 136.3001, 'train_samples_per_second': 911.518, 'train_steps_per_second': 28.613, 'total_flos': 2043057344870400.0, 'train_loss': 0.1627717785957532, 'epoch': 5.0})

In [51]:
def jsonl_reader(file: str) -> list:
    """Get Json_list from jsonl File

    Each non-blank line of the file is parsed as one JSON document.

    :param file: path of the UTF-8 encoded JSONL file
    :return: list with one parsed object per non-blank line, in file order
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    result = list()
    # Built-in open() splits lines on newline characters only. The codecs
    # stream reader also splits on Unicode separators such as U+2028, which
    # may appear unescaped inside a JSON string and would cut a record in two.
    with open(file, "r", encoding="utf-8") as f:
        # Iterate lazily instead of materialising every line with readlines().
        for line in f:
            # Skip blank lines (e.g. an empty line between or after records).
            if not line.strip():
                continue
            result.append(json.loads(line))
    return result

# Raw test records, read straight from the JSONL file under DIR.
list_test = jsonl_reader(os.path.join(DIR, "testl.json"))

In [52]:
# Wrap the trained model in a pipeline that returns the score of every label
# for each input; inference runs on device 0.
classifer = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=0,
    return_all_scores=True,
)

# Only the last 510 characters of each text are scored.
# NOTE(review): preprocessing for training truncated to max_length=128 tokens,
# so long texts are cut differently here — confirm this is intended.
list_text = [record["text"][-510:] for record in list_test]
list_scores = classifer(list_text)
print(list_scores[:3])

[[{'label': '不好', 'score': 0.005702445283532143}, {'label': '好', 'score': 0.9942975640296936}], [{'label': '不好', 'score': 0.990449845790863}, {'label': '好', 'score': 0.009550112299621105}], [{'label': '不好', 'score': 0.0070745195262134075}, {'label': '好', 'score': 0.9929255247116089}]]


In [53]:
# Given label ids of the test records, in the same order as list_scores.
s = [label_2_id[ele["label"]] for ele in list_test]

# Predicted-probability matrix: one row per record, one column per class.
# Each row is ordered by class id so that column k always holds the score of
# class k, whatever order the pipeline lists the labels in and however many
# labels there are.
psx = [
    [
        entry["score"]
        for entry in sorted(scores, key=lambda entry: label_2_id[entry["label"]])
    ]
    for scores in list_scores
]

# Method 5: C+NR
cl_both = cleanlab.pruning.get_noise_indices(
    np.array(s),
    np.array(psx),
    prune_method='both',
    sorted_index_method='prob_given_label'
)
print(
    "The total Number of Test_Dataset is {} and there are {} bad data in Test_Dataset which accounts for {} %".format(
        len(list_test),
        len(cl_both),
        len(cl_both) * 100 / len(list_test),
    )
)
print(cl_both)

The total Number of Test_Dataset is 1553 and there are 58 bad data in Test_Dataset which accounts for 3.734707018673535 %
[ 965 1184  987   62  296  369  338 1415   46  940 1302 1249   72 1499
   87  654  191  153  924  184  819  631  776  316  782 1008 1526   92
 1194  872 1386  446 1019   60  669  313 1070  880  103 1349  458  716
 1054  343 1165   93  956  637  498  938  655 1539 1131 1158  983 1208
 1331  214]


In [54]:
# Show every flagged test record followed by the scores the pipeline gave it.
for flagged in cl_both:
    print(list_test[flagged], list_scores[flagged], sep="\n")

{'text': '打车太不方便，我在门口等了40分钟；早餐品种少。宽带还比较快！房间空调调不了。苏州还有一个茉莉花假日，下次试试。补充点评2008年4月22日：刚接到酒店预订部的电话，对所提的意见进行了跟踪反馈，赞一个。与我14日入住的上海华港雅阁酒店（虹桥机场）比起来，真是天壤之别。24日又到苏州，不过这次我已经预订了另外一家酒店，下次到苏州，我会考虑再次入住友联。希望真的已经改进。', 'label': '不好'}
[{'label': '不好', 'score': 0.008587555028498173}, {'label': '好', 'score': 0.9914124011993408}]
{'text': '卫生间门是坏的，面巾纸也没有，房间内也不暖和，服务更不怎么样，还比不上当地4星的酒店，特别是前台和礼宾。住了就不想再住', 'label': '好'}
[{'label': '不好', 'score': 0.988831102848053}, {'label': '好', 'score': 0.011168954893946648}]
{'text': '传说中的四星挂牌，2星标准，倒也恰当。房间的地毯确实很脏，其他还行啦！卫生纸确实如传说中的少，两个人有点不够用。服务嘛，不知道，因为是跟携程团去的，一大早就走了，没机会领略。早餐，确实品种少，除了牛奶（是奶粉加糖价开水，亲眼所见）和粥，该冷的是冷的，该热的也还是冷的。稍微晚去就排队加排队。酒店周围都是卖土特产，水果店和小便利店（杂货店）也多。补充点评2007年10月22日：客房提供的居然不是袋泡茶，而是新鲜的绿茶，感觉很好。没有咖啡，有需要的得自带。没有免费矿泉水。', 'label': '不好'}
[{'label': '不好', 'score': 0.011512244120240211}, {'label': '好', 'score': 0.9884877800941467}]
{'text': '感觉环境,服务方面还不够位,房间里面的洗浴设施有点发黄,需要改善!!', 'label': '不好'}
[{'label': '不好', 'score': 0.014473974704742432}, {'label': '好', 'score': 0.9855259656906128}]
{