# Chinese NER Test Pipeline 
## By: Cameron B.

### Download model from ModelScope

In [1]:
from modelscope.hub.snapshot_download import snapshot_download

# Fetch the model snapshot from ModelScope into a local cache directory.
# Per the original note the actual download is only done the first time.
MODEL_ID = 'iic/nlp_raner_named-entity-recognition_chinese-base-generic'
MODEL_CACHE_DIR = '/scratch/ssd004/scratch/cambish/CNER'
model_dir = snapshot_download(MODEL_ID, cache_dir=MODEL_CACHE_DIR)



### Create NER pipeline and load data

In [2]:
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.models import Model

from datasets import load_dataset

# Tokenization reference: https://huggingface.co/docs/datasets/en/use_dataset
# No tokenizer is configured explicitly: the model seems to work fine without
# one, so the BertTokenizer.from_pretrained() call stays disabled.
cfp = load_dataset(
    '/scratch/ssd004/scratch/cambish/cmmlu-v1.0.1/cmmlu.py',
    "chinese_foreign_policy",
    split="dev",
)

# Disabled alternative loader:
#   AutoModel.from_pretrained(model_dir, local_files_only=True)
ner_model = Model.from_pretrained(model_dir)
ner_pipeline = pipeline('named-entity-recognition', model=ner_model)

# Test sample: one question stem followed by four answer options.
ner_input = [
    '尼克松在回顾1972年访华时说：“我知道，1954年在……时福斯特·杜勒斯（当时的美国国务卿）拒绝同周（周恩来）握手，使他身受侮辱。因此我走完梯级时决心伸出我的手，一边向他走去。当我们的手相握的时候，一个时代结束了，另一个时代开始了。”对上述材料理解正确的是',
    '尼克松与周恩来握手意味着美国彻底放弃了遏制中国政策',
    '尼克松否定了福斯特·杜勒斯的对华态度',
    '中美关系对20世纪70年代的国际关系有重要影响',
    '省略部分应是万隆会议',
]
# The pipeline is called with the whole list at once.
outputs = ner_pipeline(ner_input)
print(outputs)
    

2024-10-16 10:33:04,253 - modelscope - INFO - initialize model from /scratch/ssd004/scratch/cambish/CNER/iic/nlp_raner_named-entity-recognition_chinese-base-generic
2024-10-16 10:33:06,501 - modelscope - INFO - head has no _keys_to_ignore_on_load_missing
  state_dict = torch.load(ckpt_file, map_location='cpu')
2024-10-16 10:33:08,044 - modelscope - INFO - All model checkpoint weights were used when initializing ModelForTokenClassificationWithCRF.

2024-10-16 10:33:08,045 - modelscope - INFO - All the weights of ModelForTokenClassificationWithCRF were initialized from the model checkpoint If your task is similar to the task the model of the checkpoint was trained on, you can already use ModelForTokenClassificationWithCRF for predictions without further training.


[{'output': [{'type': 'PER', 'start': np.int64(0), 'end': np.int64(3), 'prob': np.float32(0.6087498), 'span': '尼克松'}, {'type': 'GPE', 'start': np.int64(12), 'end': np.int64(13), 'prob': np.float32(0.9466083), 'span': '华'}, {'type': 'PER', 'start': np.int64(30), 'end': np.int64(37), 'prob': np.float32(0.6117986), 'span': '福斯特·杜勒斯'}, {'type': 'GPE', 'start': np.int64(41), 'end': np.int64(43), 'prob': np.float32(0.6031834), 'span': '美国'}, {'type': 'PER', 'start': np.int64(50), 'end': np.int64(51), 'prob': np.float32(0.93963814), 'span': '周'}, {'type': 'PER', 'start': np.int64(52), 'end': np.int64(55), 'prob': np.float32(0.43081495), 'span': '周恩来'}]}, {'output': [{'type': 'PER', 'start': np.int64(0), 'end': np.int64(3), 'prob': np.float32(0.64598316), 'span': '尼克松'}, {'type': 'PER', 'start': np.int64(4), 'end': np.int64(7), 'prob': np.float32(0.32906184), 'span': '周恩来'}, {'type': 'GPE', 'start': np.int64(12), 'end': np.int64(14), 'prob': np.float32(0.58992296), 'span': '美国'}, {'type': 'GPE

### Visualize Output with spaCy

In [3]:
import spacy
from spacy.tokens import DocBin
from spacy import displacy

def visualize_ner(ner_input, ner_output):
    """Render NER predictions as highlighted entities with displaCy.

    Args:
        ner_input: Iterable of raw text strings.
        ner_output: Iterable of per-text results, aligned with ``ner_input``.
            Each item is a dict whose ``'output'`` list holds entity dicts
            with ``'start'``, ``'end'`` and ``'type'`` keys.

    Returns:
        None. The result of ``displacy.render`` is not returned.

    Entities whose character offsets cannot be mapped onto token boundaries
    are skipped instead of aborting the whole rendering.
    """
    nlp = spacy.blank("zh")
    # store annotations for rendering
    docs = []
    for text, annotations in zip(ner_input, ner_output):
        doc = nlp(text)
        ents = []
        # turn each annotation dict into a labelled spaCy span
        for annotation in annotations['output']:
            span = doc.char_span(annotation['start'], annotation['end'], label=annotation['type'])
            # char_span returns None when the offsets do not line up with
            # token boundaries; doc.ents does not accept None entries.
            if span is not None:
                ents.append(span)
        doc.ents = ents
        docs.append(doc)

    displacy.render(docs, style="ent")

# Render the entities predicted for the test sample (ner_input / outputs).
visualize_ner(ner_input, outputs)



In [23]:

# Tag every question of the loaded split, one pipeline call per text.
questions = [row["Question"] for row in cfp]
ner_questions = list(map(ner_pipeline, questions))

# row["Answer"] is used as the key of the answer text within the same row
# (presumably the letter of the correct option -- confirm against the dataset).
answers = [row[row["Answer"]] for row in cfp]
ner_answers = list(map(ner_pipeline, answers))

# Show the tagged questions first, then the tagged answers.
print("Questions:")
visualize_ner(questions, ner_questions)

print("Answers:")
visualize_ner(answers, ner_answers)

def filter_qa_pairs(dataset, ner=None):
    """Keep the QA pairs whose question and answer both contain an entity.

    Args:
        dataset: Iterable of row mappings that also provides
            ``add_column(name, values)`` returning a new iterable of rows
            (e.g. a Hugging Face ``Dataset``). Each row needs a
            ``"Question"`` field and an ``"Answer"`` field whose value is the
            key of the answer text within the same row.
        ner: Callable mapping a text to a dict with an ``"output"`` list of
            entities. When omitted, the notebook-level ``ner_pipeline`` is
            used, which is what the function always did before.

    Returns:
        A list of rows, each extended with ``"q_ner"`` and ``"a_ner"``,
        restricted to rows where both entity lists are non-empty.
    """
    # Resolve the tagger at call time so the default still tracks the global.
    tagger = ner_pipeline if ner is None else ner

    def entities(text):
        return tagger(text)["output"]

    question_entities = [entities(row["Question"]) for row in dataset]
    answer_entities = [entities(row[row["Answer"]]) for row in dataset]

    # The entity lists are attached as columns so that every returned row
    # carries its own annotations.
    tagged = dataset.add_column("q_ner", question_entities)
    tagged = tagged.add_column("a_ner", answer_entities)

    # An empty entity list is falsy, so this keeps rows with entities on both sides.
    return [row for row in tagged if row["q_ner"] and row["a_ner"]]

# Filter the loaded split down to pairs with entities in both question and answer.
# NOTE(review): the recorded run printed [] -- confirm whether no pair in this
# split really has entities on both sides, or whether the filter is too strict.
filtered_questions = filter_qa_pairs(cfp)
print(filtered_questions)


Questions:


Answers:


[]
