In [22]:


# Checkpoint identifier handed to every `from_pretrained` call in this notebook.
model_name = "ethanyt/guwen-quote"

# Unpunctuated classical Chinese sample used as input for every method below.
# Adjacent string literals are concatenated by Python into one string; the
# split points only follow the passages and add no characters.
text = (
    "子曰学而时习之不亦说乎有朋自远方来不亦乐乎人不知而不愠不亦君子乎"
    "有子曰其为人也孝弟而好犯上者鲜矣不好犯上而好作乱者未之有也君子务本本立而道生孝弟也者其为仁之本与"
    "子曰巧言令色鲜矣仁"
    "曾子曰吾日三省吾身为人谋而不忠乎与朋友交而不信乎传不习乎"
    "子曰道千乘之国敬事而信节用而爱人使民以时"
)


# 普通模型


In [23]:
from transformers import BertTokenizer, RobertaForTokenClassification


import torch

# Baseline: load the checkpoint into a plain token-classification head.
# The load warning captured below this cell reports that the checkpoint's
# `crf.*` weights are not used by this class.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = RobertaForTokenClassification.from_pretrained(model_name)

tokens = tokenizer(text, return_tensors='pt')
# Only predictions are needed, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    logits = model(**tokens).logits
# Highest-scoring tag id per position along the last of the three axes,
# converted to nested Python lists.
logit_result = logits.argmax(axis=2).tolist()
print(logit_result)

Some weights of the model checkpoint at ./models/guwen-quote were not used when initializing RobertaForTokenClassification: ['crf.start_transitions', 'crf.end_transitions', 'crf.transitions']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[[0, 0, 0, 1, 2, 0, 2, 2, 0, 0, 2, 2, 1, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 1, 2, 2, 2, 0, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2

# CRF模型

In [24]:
from transformers import BertTokenizer
from crf.crf_roberta import CRFRobertaForTokenClassification

import torch

# CRF variant: same checkpoint, loaded into the project's CRF-equipped class.
# NOTE: this cell rebinds `tokenizer`, `model`, `tokens` and `logits`; any later
# cell that reads those names sees the values produced here.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = CRFRobertaForTokenClassification.from_pretrained(model_name)

tokens = tokenizer(text, return_tensors='pt')
# Only predictions are needed, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    logits = model(**tokens).logits
# Decode a tag sequence from the scores with the model's own `decode`;
# `mask=None` is passed through unchanged (presumably "no padding to ignore" —
# confirm against crf.crf_roberta).
crf_result = model.decode(logits, mask=None)
print(crf_result)

[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0]]


# 贪心法

将 `1 2 0 0 2 0` 转换为 `1 2 2 2 2 0`：标签 1 表示引文开始，2 表示引文内部，0 表示引文之外。做法是从后向前扫描，把每个 2 与它之前最近的 1 之间的标签全部填成 2（若前面没有 1，则一直填到序列开头）。

In [25]:
def greedy(logits):
    """Fill the gaps inside predicted spans by scanning the tags backwards.

    Walking from the last tag to the first, a ``2`` switches "inside a span"
    on and a ``1`` switches it off. While the flag is on, every tag visited is
    emitted as ``2``; the ``1`` that turns the flag off is emitted unchanged.
    Example: ``[1, 2, 0, 0, 2, 0]`` becomes ``[1, 2, 2, 2, 2, 0]``.

    Args:
        logits: Reversible sequence of integer tags (despite the name, these
            are tag ids, not raw scores). The input is not modified.

    Returns:
        A new list of tags in the original order.
    """
    filled = []
    inside = False
    for label in reversed(logits):
        if label == 1:
            inside = False
        elif label == 2:
            inside = True
        filled.append(2 if inside else label)
    return filled[::-1]


In [26]:
# Post-process the plain model's per-position argmax tags, which are already
# stored in `logit_result`. Reading `logits` here would pick up whatever the
# most recently executed cell assigned to that name (the CRF cell rebinds it).
greedy_result = greedy(logit_result[0])

# 可视化

In [27]:

def visualization(tags, source=None):
    """Print the text with predicted quotation spans wrapped in 「」.

    Tags are interpreted as: ``1`` opens a span, ``2`` continues it, and ``0``
    lies outside any span. An open span is closed when the next ``0`` or ``1``
    is reached, or at the end if no such tag follows.

    Args:
        tags: Iterable of integer tags, one per character of the text,
            optionally followed by one extra trailing tag. Pairing stops at
            the shorter of the two sequences.
        source: String to annotate. Defaults to the notebook-level ``text`` so
            existing single-argument calls behave as before.

    Returns:
        None. The annotated string is written with ``print``.
    """
    chars = text if source is None else source
    in_span = False
    pieces = []
    # The single space of padding pairs with a possible extra trailing tag, so
    # a span that reaches the last character can be closed by that tag.
    for tag, token in zip(tags, chars + " "):
        if tag in (0, 1) and in_span:
            pieces.append("」")
            in_span = False
        if tag == 1:
            pieces.append("「")
            in_span = True
        pieces.append(token)
    if in_span:
        # The tags ended while a span was still open: close it so the
        # brackets in the output stay balanced.
        pieces.append("」")
    print(''.join(pieces))

In [28]:
# Render the plain model's tags. The first tag is dropped so the rest line up
# with the characters of `text` (presumably it belongs to a special token the
# tokenizer prepends — TODO confirm).
visualization(logit_result[0][1:])

子曰「学而」时习之不亦说乎「有朋」自远方来不亦乐乎人不知而不愠不亦君子乎有子曰「其」为人也孝弟而好犯上者鲜矣不好犯上而好作乱者未之有也君子务本本立而道生孝弟也者其为仁之本与子曰「巧言令色」鲜矣仁曾子曰「吾」日三省吾身为人谋而不忠乎与朋友交而不信乎传不习乎子曰「道」千乘之国敬事而信节用而爱人使民以时 


In [29]:
# Render the CRF-decoded tags. The first tag is dropped so the rest line up
# with the characters of `text` (presumably it belongs to a special token the
# tokenizer prepends — TODO confirm).
visualization(crf_result[0][1:])

子曰「学而时习之不亦说乎有朋自远方来不亦乐乎人不知而不愠不亦君子乎」有子曰「其为人也孝弟而好犯上者鲜矣不好犯上而好作乱者未之有也」君子务本本立而道生孝弟也者其为仁之本与子曰「巧言令色鲜矣仁」曾子曰「吾日三省吾身为人谋而不忠乎与朋友交而不信乎传不习乎」子曰「道千乘之国敬事而信节用而爱人使民以时」 


In [30]:
# Render the greedy-filled tags. The first tag is dropped so the rest line up
# with the characters of `text` (presumably it belongs to a special token the
# tokenizer prepends — TODO confirm).
visualization(greedy_result[1:])

子曰「学而时习之不亦说乎」「有朋自远方来不亦乐乎人不知而不愠不亦君子乎」有子曰「其为人也孝弟而好犯上者鲜矣不好犯上而好作乱者未之有也君子务本本立而道生孝弟也者其为仁之本与」子曰「巧言令色鲜矣仁」曾子曰「吾日三省吾身为人谋而不忠乎与朋友交而不信乎传不习乎」子曰「道千乘之国敬事而信节用而爱人使民以时」 
