In [34]:
import os
import pandas as pd
from tqdm import tqdm
import torch
import jieba
import codecs
import json
from sklearn.utils import shuffle
from typing import List, Dict
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from transformers import (
    Trainer,
    BertTokenizer,
    TrainingArguments,
    BertForMaskedLM,
    EarlyStoppingCallback
)

os.environ["CUDA_VISIBLE_DEVICES"]="1"

source_data_path = "/data/christmas.wang/project/classification_base_project/data/origin/few_shot/"
bert_name_path = "/data/tmp/christmas.wang/chinese_wwm_ext_pytorch"

label_2_id = {"好":1, "差":0}
id_2_label = {"1":"好", "0":"差"}

data_dict = {
    "train": os.path.join(source_data_path, "hotel_review_few_shot_train.csv"),
    "test": os.path.join(source_data_path, "hotel_review_few_shot_test.csv")
}

# f->few shot; z->zero_shot
z_or_f = "f"

In [35]:
tokenizer = BertTokenizer.from_pretrained(bert_name_path)
model = BertForMaskedLM.from_pretrained(bert_name_path)

Didn't find file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/added_tokens.json. We won't load it.
Didn't find file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/special_tokens_map.json. We won't load it.
Didn't find file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/tokenizer_config.json. We won't load it.
Didn't find file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/tokenizer.json. We won't load it.
loading file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/vocab.txt
loading file None
loading file None
loading file None
loading file None
loading configuration file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/config.json
loading configuration file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/config.json
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02

In [36]:
def compute_metrics(pred):
    labels = pred.label_ids[:, 3]
    preds = pred.predictions[:, 3].argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}


In [37]:
df_train = pd.read_csv(data_dict["train"], encoding="utf-8")
df_test = pd.read_csv(data_dict["test"], encoding="utf-8")
df_train

Unnamed: 0.1,Unnamed: 0,label,review
0,5322,0,标准间太差房间还不如3星的而且设施非常陈旧.建议酒店把老的标准间从新改善.
1,5323,0,服务态度极其差，前台接待好象没有受过培训，连基本的礼貌都不懂，竟然同时接待几个客人；大堂副理...
2,5324,0,地理位置还不错，到哪里都比较方便，但是服务不象是豪生集团管理的，比较差。下午睡了一觉并洗了一...
3,5325,0,1。我住的是靠马路的标准间。房间内设施简陋，并且的房间玻璃窗户外还有一层幕墙玻璃，而且不能打...
4,5326,0,我这次是第5次住在长春的雁鸣湖大酒店。昨晚夜里停电。深夜我睡着了。我的钱包被内贼进入我的房间...
5,5327,0,前台checkin花了20分钟，checkout25分钟，这是服务态度和没有做到位。信用卡刷...
6,5328,0,"有或者很少房!梯部不吸,但是有一些吸者仍然有服!我是不抽的人,成二手的受害者!(中13人口中..."
7,5329,0,酒店服务态度极差，设施很差，建议还是不要到那儿去。
8,5330,0,我3.6预定好的180的标间，当我到的时候竟然说有会议房间满了，我订的房间没有了，太不讲信誉...
9,5331,0,"房间的环境非常差,而且房间还不隔音，住的不舒服。"


In [61]:
text = []
label = []
punc = "＂!＃＄％＆＇?（）()/＊＋，－／：；,.＜＝＞＠［＼］\"＾＿｀｛｜｝～｟｠｢｣､　、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·！？｡。"
for index, row in tqdm(iterable=df_train.iterrows(), total=df_train.shape[0]):
    sentence = row["review"]
    words = jieba.lcut(sentence)
    for i in range(len(words)):
        sentence_train = "".join(words[:i])+"，酒店[MASK]，"+"".join(words[i:])
        sentence_test = "".join(words[:i])+"，酒店"+id_2_label[str(row["label"])]+"，"+"".join(words[i:])
        text.append(sentence_train)
        label.append(sentence_test)
text, label = shuffle(text, label)
print(len(text))
print(text[:3])
print(label[:3])
        

100%|██████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 470.79it/s]

1355
['地理位置还不错，到哪里都比较方便，但是服务不象是豪生集团管理的，比较差。下午睡了一觉并洗了一个澡，本来想让酒店再来打扫，酒店[MASK]，一下，所以，打开了，请打扫的服务灯，可是到晚上回酒店，发现打扫得服务灯被关掉了，而房间还是没有打扫过。', '我这次是第5次住在长春的雁鸣湖大酒店。昨晚夜里停电。深夜我睡着了。我的钱包被内贼进入我的房间，偷了我近1000元和4张信用卡。。。我的证件和外币，数码相机等都在房间的保险箱里，原封不动。我打了好几个小时的长途电话来处理我的信用卡的冻结。我报案了，这个4星酒店的保安摄像探头竟然坏了，没有修理！保安还查房卡入门时间，就是没有其他人在深夜进入我的房间。难道内贼不会用其他高明的方式进入吗？我的羽绒服也被这个内贼放在地上！我醒来时没有多想！近中午时我才发觉钱包少了现金和信用卡！还有，这家酒店的态度很差！没有同情心！我之前授权的2000元，我打了国际电话，银行说两天前我入酒店的2000元授权了，可是酒店的财务不领情，说中国银行没有授权。我又打了国际电话，我的银行说通过了！这家4星级的酒店不负责，认为不可能发生，我报案了，我下次再也不住这个，酒店[MASK]，1星不到的服务态度，很可耻！我还要把这个事件说给那些想定这个酒店的住客。酒店为何停电，摄像头坏得也太凑巧了来让大家知道这种内贼行为是要强力打击的。好了，不说了！！！千元丢了小事。酒店的处理态度我很反感！我强力告诉大家和提醒其他人不要到该酒店！', '宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小，但加上低价位因素，还是无超所值的；环境不错，就在小胡同内，安静整洁，暖气好足-_-||。。。呵还有一大优势就是从宾馆出发，步行不到十分钟就可以到梅兰芳故居等等，京味小胡同，北海距离好近呢。总之，不错。推荐给节约消费的，酒店[MASK]，自助游朋友~比较划算，附近特色小吃很多~']
['地理位置还不错，到哪里都比较方便，但是服务不象是豪生集团管理的，比较差。下午睡了一觉并洗了一个澡，本来想让酒店再来打扫，酒店差，一下，所以，打开了，请打扫的服务灯，可是到晚上回酒店，发现打扫得服务灯被关掉了，而房间还是没有打扫过。', '我这次是第5次住在长春的雁鸣湖大酒店。昨晚夜里停电。深夜我睡着了。我的钱包被内贼进入我的房间，偷了我近1000元和




In [39]:
def dataset_builder(x: List[str], y: List[str], tokenizer: BertTokenizer, max_len: int) -> Dataset:
    data_dict = {'text': x, 'label_text': y}
    result = Dataset.from_dict(data_dict)
    def preprocess_function(examples):
        text_token = tokenizer(examples['text'], padding=True,truncation=True, max_length=max_len)
        text_token['labels'] = np.array(tokenizer(examples['label_text'], padding=True,truncation=True, max_length=max_len)["input_ids"])
        return text_token
    result = result.map(preprocess_function, batched=True)
    return result

In [62]:
eval_dataset = dataset_builder(text[:130], label[:130], tokenizer, 512)
train_dataset = dataset_builder(text[130:], label[130:], tokenizer, 512)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [66]:
args = TrainingArguments(
    output_dir="/data/christmas.wang/project/classification_base_project/output",
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-5,
    num_train_epochs=6,
    seed=20,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [67]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: label_text, text.
***** Running training *****
  Num examples = 1225
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 462
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,No log,0.0,1.0,1.0,1.0,1.0
200,No log,0.0,1.0,1.0,1.0,1.0
300,No log,0.0,1.0,1.0,1.0,1.0
400,No log,0.0,1.0,1.0,1.0,1.0


The following columns in the evaluation set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: label_text, text.
***** Running Evaluation *****
  Num examples = 130
  Batch size = 16
Saving model checkpoint to /data/christmas.wang/project/classification_base_project/output/checkpoint-100
Configuration saved in /data/christmas.wang/project/classification_base_project/output/checkpoint-100/config.json
Model weights saved in /data/christmas.wang/project/classification_base_project/output/checkpoint-100/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: label_text, text.
***** Running Evaluation *****
  Num examples = 130
  Batch size = 16
Saving model checkpoint to /data/christmas.wang/project/classification_base_project/output/checkpoint-200
Configuration saved in /data/christmas.wang/project/classification_base_project/output/checkpoint-200/config.jso

TrainOutput(global_step=462, training_loss=6.282679510839057e-05, metrics={'train_runtime': 321.0703, 'train_samples_per_second': 22.892, 'train_steps_per_second': 1.439, 'total_flos': 1839892910335200.0, 'train_loss': 6.282679510839057e-05, 'epoch': 6.0})

In [68]:
pred = []
true = []
external_words = []
df_test.dropna(how="any", axis=0, inplace=True)
for index, row in tqdm(iterable=df_test.iterrows(), total=df_test.shape[0]):
    text = "酒店[MASK]，" + row["review"]
    tokenized_text = tokenizer.tokenize(text)
    if len(tokenized_text) > 512:
        tokenized_text = tokenized_text[:512]
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Create the segments tensors.
    segments_ids = [0] * len(tokenized_text)
    
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')
    segments_tensors = torch.tensor([segments_ids]).to('cuda')
    
    masked_index = tokenized_text.index('[MASK]')
    
    # Predict all tokens
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)
    predicted_index = torch.argmax(predictions[0][0][masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    # print(predicted_token+str(row["label"]))
    if predicted_token not in ["差", "好"]:
        external_words.append(predicted_token)
        predicted_token = "差"
    y_pred = label_2_id[predicted_token]
    pred.append(y_pred)
    true.append(row["label"])
precision, recall, f1, _ = precision_recall_fscore_support(true, pred, average='binary')
acc = accuracy_score(true, pred)
print({'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall})
print(external_words)
print(len(external_words))

100%|███████████████████████████████████████████████████████████| 3999/3999 [00:48<00:00, 82.80it/s]

{'accuracy': 0.5356339084771192, 'f1': 0.5460767538499144, 'precision': 0.5341941654710665, 'recall': 0.5585}
['多', '多', '多', '多', '豪', '居', '重', '多', '多', '全', '多', '微', '居', '重', '远', '电', '重', '多', '多', '微', '明', '惠', '[PAD]', '[PAD]', '多', '微', '多', '店', '光', '台', '全', '高', '远', '全', '超', '微', '多', '多', '多', '高', '尾', '[PAD]', '高', '多', '多', '全', '多', '全', '店', '店', '提', '部', '多', '角', '店', '超', '全', '大', '多', '店', '[PAD]', '多', '店', '远', '高', '店', '装', '店', '结', '全', '高', '多', '明', '多', '多', '近', '多', '全', '超', '多', '微', '多', '全', '多', '微', '多', '多', '店', '多', '台', '大', '微', '家', '店', '店', '明', '内', '店', '多', '尾', '有', '店', '多', '多', '多', '大', '多', '豪', '超', '店', '高', '店', '多', '全', '店', '高', '有', '多', '[PAD]', '微', '[PAD]', '多', '面', '重', '多', '正', '全', '大', '响', '多', '角', '瑞', '多', '有', '全', '全', '多', '粗', '多', '店', '多', '庄', '惠', '装', '重', '檔', '装', '多', '店', '多', '高', '全', '高', '多', '微', '店', '店', '多', '微', '全', '高', '有', '重', '等', '惠', '多', '多', '多', '装', '店', '高', '多', '台', 




In [51]:
good_words = ["美", "舒", "服", "豪", "华", "丽", "亮", "错", "大", "宜", "明"]
def get_label(words: List[str]) -> int:
    for key, val in label_2_id.items():
        if key in words:
            return val
    for word in words:
        if word in good_words:
            return 0
    return 1

In [65]:
pred = []
true = []

for index, row in tqdm(iterable=df_test.iterrows(), total=df_test.shape[0]):
    text = "酒店[MASK]，" + row["review"]
    tokenized_text = tokenizer.tokenize(text)
    if len(tokenized_text) > 512:
        tokenized_text = tokenized_text[:512]
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Create the segments tensors.
    segments_ids = [0] * len(tokenized_text)
    
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')
    segments_tensors = torch.tensor([segments_ids]).to('cuda')
    
    masked_index = tokenized_text.index('[MASK]')
    
    # Predict all tokens
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)
    top_k = torch.topk(predictions[0][0][masked_index].flatten(), 5).indices.tolist()
    words = []
    for word in top_k:
        predicted_token = tokenizer.convert_ids_to_tokens([word])[0]
        words.append(predicted_token)
    pred.append(get_label(words))
    true.append(row["label"])
precision, recall, f1, _ = precision_recall_fscore_support(true, pred, average='binary')
acc = accuracy_score(true, pred)
print({'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall})

100%|███████████████████████████████████████████████████████████| 3999/3999 [00:48<00:00, 82.02it/s]

{'accuracy': 0.5253813453363341, 'f1': 0.6699130434782609, 'precision': 0.5136, 'recall': 0.963}





In [None]:
def add_mask(data: List[Dict])->None:
    for ele in tqdm(data):
        sentence = ele["text"].strip()
        new_label = "酒店"+id_2_label[str(ele["label"])]+"，"+sentence
        new_sentence = "酒店[MASK]，"+sentence
        ele["new_text"] = new_sentence
        ele["label_text"] = new_label
        

In [None]:
origin_data_path = "/data/tmp/nlp-data/open_source_data/classification/ChnSentiCorp_htl_all.csv"
df_origin_data = pd.read_csv(origin_data_path, encoding='utf-8')
df_good = df_origin_data[df_origin_data["label"] == 0].head(10)
df_bad = df_origin_data[df_origin_data["label"] == 1].head(10)

df_few = shuffle(pd.concat([df_good,df_bad]))
df_few

Unnamed: 0,label,review
6,1,价格比比较不错的酒店。这次免费升级了，感谢前台服务员。房子还好，地毯是新的，比上次的好些。早...
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
5326,0,我这次是第5次住在长春的雁鸣湖大酒店。昨晚夜里停电。深夜我睡着了。我的钱包被内贼进入我的房间...
5323,0,服务态度极其差，前台接待好象没有受过培训，连基本的礼貌都不懂，竟然同时接待几个客人；大堂副理...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"
7,1,不错，在同等档次酒店中应该是值得推荐的！
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
9,1,1。酒店比较新，装潢和设施还不错，只是房间有些油漆味。2。早餐还可以，只是品种不是很多。3。...
5327,0,前台checkin花了20分钟，checkout25分钟，这是服务态度和没有做到位。信用卡刷...


In [None]:
add_mask(train_json)
add_mask(eval_json)
add_mask(test_json)

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 4765/4765 [00:00<00:00, 271222.70it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 388265.61it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 437849.26it/s]


In [None]:
tokenizer = BertTokenizer.from_pretrained(bert_name_path)
model = BertForMaskedLM.from_pretrained(bert_name_path)

Didn't find file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/added_tokens.json. We won't load it.
Didn't find file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/special_tokens_map.json. We won't load it.
Didn't find file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/tokenizer_config.json. We won't load it.
Didn't find file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/tokenizer.json. We won't load it.
loading file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/vocab.txt
loading file None
loading file None
loading file None
loading file None
loading configuration file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/config.json
loading configuration file /data/tmp/christmas.wang/chinese_wwm_ext_pytorch/config.json
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02

In [None]:
def dataset_builder(data: List[Dict], tokenizer: BertTokenizer, max_len: int) -> Dataset:
    x = []
    y = []
    for ele in tqdm(data):
        x.append(ele["new_text"])
        y.append(ele["label_text"])
    data_dict = {'text': x, 'label_text': y}
    result = Dataset.from_dict(data_dict)
    def preprocess_function(examples):
        text_token = tokenizer(examples['text'], padding=True,truncation=True, max_length=max_len)
        text_token['labels'] = np.array(tokenizer(examples['label_text'], padding=True,truncation=True, max_length=max_len)["input_ids"])
        return text_token
    result = result.map(preprocess_function, batched=True)
    return result

In [None]:
train_dataset = dataset_builder(train_json, tokenizer, 64)
eval_dataset = dataset_builder(eval_json, tokenizer, 64)
test_dataset = dataset_builder(test_json, tokenizer, 64)
test_dataset

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 4765/4765 [00:00<00:00, 638566.64it/s]


  0%|          | 0/5 [00:00<?, ?ba/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 1064364.07it/s]


  0%|          | 0/2 [00:00<?, ?ba/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 587108.62it/s]


  0%|          | 0/2 [00:00<?, ?ba/s]

Dataset({
    features: ['attention_mask', 'input_ids', 'label_text', 'labels', 'text', 'token_type_ids'],
    num_rows: 1500
})

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids[:, 3]
    preds = pred.predictions[:, 3].argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}


In [None]:
args = TrainingArguments(
    output_dir="/data/christmas.wang/project/classification_base_project/output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-5,
    num_train_epochs=6,
    seed=20,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=test_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# import wandb
# wandb.init()

# trainer.train()

In [None]:
pred = []
true = []
external_words = []


counter = 0
for ele in tqdm(train_json):
    if counter > 2:
        break
    tokenized_text = tokenizer.tokenize(ele['new_text'])
    if len(tokenized_text) > 512:
        tokenized_text = tokenized_text[:512]
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Create the segments tensors.
    segments_ids = [0] * len(tokenized_text)
    
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')
    segments_tensors = torch.tensor([segments_ids]).to('cuda')
    
    masked_index = tokenized_text.index('[MASK]')
    
    # Predict all tokens
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)
    predicted_index = torch.argmax(predictions[0][0][masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    # print(predicted_token+str(row["label"]))
    if predicted_token not in ["差", "好"]:
        external_words.append(predicted_token)
        predicted_token = "差"
    y_pred = label_2_id[predicted_token]
    pred.append(y_pred)
    true.append(ele["label"])
precision, recall, f1, _ = precision_recall_fscore_support(true, pred, average='binary')
acc = accuracy_score(true, pred)
print({'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall})
# print(external_words)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 4765/4765 [00:48<00:00, 98.59it/s]

{'accuracy': 0.3246589716684155, 'f1': 0.024257125530624625, 'precision': 0.8695652173913043, 'recall': 0.012300123001230012}





In [None]:
good_words = ["美", "舒", "服", "豪", "华", "丽", "亮"]
def get_label(words: List[str]) -> int:
    for key, val in label_2_id.items():
        if key in words:
            return val
    for word in words:
        if word in good_words:
            return 0
    return 1

In [None]:
pred = []
true = []
external_words = []


counter = 0
for ele in tqdm(train_json):
    if counter > 2:
        break
    tokenized_text = tokenizer.tokenize(ele['new_text'])
    if len(tokenized_text) > 512:
        tokenized_text = tokenized_text[:512]
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Create the segments tensors.
    segments_ids = [0] * len(tokenized_text)
    
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')
    segments_tensors = torch.tensor([segments_ids]).to('cuda')
    
    masked_index = tokenized_text.index('[MASK]')
    
    # Predict all tokens
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)
    top_k = torch.topk(predictions[0][0][masked_index].flatten(), 5).indices.tolist()
    words = []
    for word in top_k:
        predicted_token = tokenizer.convert_ids_to_tokens([word])[0]
        words.append(predicted_token)
    pred.append(get_label(words))
    true.append(ele["label"])
precision, recall, f1, _ = precision_recall_fscore_support(true, pred, average='binary')
acc = accuracy_score(true, pred)
print({'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall})
print(external_words)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 4765/4765 [00:48<00:00, 98.47it/s]

{'accuracy': 0.683105981112277, 'f1': 0.8113914564076942, 'precision': 0.6832141354648716, 'recall': 0.998769987699877}
[]



