In [1]:
%cd /home/aistudio/work/

/home/aistudio/work


In [2]:
!pip install --upgrade paddlenlp==2.3.4

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.1.2[0m[39;49m -> [0m[32;49m22.2.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
from scipy.special import softmax
from scipy import spatial
import os
import paddle
from paddle import inference
import paddlenlp as ppnlp
from paddlenlp.data import Tuple, Pad

In [4]:
def convert_example_recall_infer(example,
                    tokenizer,
                    max_seq_length=512,
                    pad_to_max_seq_len=False):
    """
    Builds model inputs from a sequence.
        
    A BERT sequence has the following format:
    - single sequence: ``[CLS] X [SEP]``
    Args:
        example(obj:`list(str)`): The list of text to be converted to ids.
        tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 
            which contains most of the methods. Users should refer to the superclass for more information regarding methods.
        max_seq_len(obj:`int`): The maximum total input sequence length after tokenization. 
            Sequences longer than this will be truncated, sequences shorter will be padded.
        is_test(obj:`False`, defaults to `False`): Whether the example contains label or not.
    Returns:
        input_ids(obj:`list[int]`): The list of query token ids.
        token_type_ids(obj: `list[int]`): List of query sequence pair mask.
    """

    result = []
    for key, text in example.items():
        encoded_inputs = tokenizer(
            text=text,
            max_seq_len=max_seq_length,
            pad_to_max_seq_len=pad_to_max_seq_len)
        input_ids = encoded_inputs["input_ids"]
        token_type_ids = encoded_inputs["token_type_ids"]
        result += [input_ids, token_type_ids]
    return result

In [5]:
tokenizer = ppnlp.transformers.AutoTokenizer.from_pretrained('roberta-wwm-ext-large')

[2022-08-06 11:14:03,000] [    INFO] - We are using <class 'paddlenlp.transformers.roberta.tokenizer.RobertaChineseTokenizer'> to load 'roberta-wwm-ext-large'.


In [12]:
class RecallPredictor(object):
    def __init__(self,
                 model_dir,
                 device="gpu",
                 max_seq_length=128,
                 batch_size=32,
                 use_tensorrt=False,
                 precision="fp32",
                 cpu_threads=10,
                 enable_mkldnn=False):
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size

        model_file = model_dir + "/model.get_pooled_embedding.pdmodel"
        params_file = model_dir + "/model.get_pooled_embedding.pdiparams"
        if not os.path.exists(model_file):
            raise ValueError("not find model file path {}".format(model_file))
        if not os.path.exists(params_file):
            raise ValueError("not find params file path {}".format(params_file))
        config = paddle.inference.Config(model_file, params_file)

        # 去除 Paddle Inference 运行中的 LOG
        config.disable_glog_info()

        if device == "gpu":
            # set GPU configs accordingly
            # such as intialize the gpu memory, enable tensorrt
            config.enable_use_gpu(100, 0)
            precision_map = {
                "fp16": inference.PrecisionType.Half,
                "fp32": inference.PrecisionType.Float32,
                "int8": inference.PrecisionType.Int8
            }
            precision_mode = precision_map[precision]

            if use_tensorrt:
                config.enable_tensorrt_engine(
                    max_batch_size=batch_size,
                    min_subgraph_size=30,
                    precision_mode=precision_mode)
        elif device == "cpu":
            # set CPU configs accordingly,
            # such as enable_mkldnn, set_cpu_math_library_num_threads
            config.disable_gpu()
        elif device == "xpu":
            # set XPU configs accordingly
            config.enable_xpu(100)

        config.switch_use_feed_fetch_ops(False)
        self.predictor = paddle.inference.create_predictor(config)
        self.input_handles = [
            self.predictor.get_input_handle(name)
            for name in self.predictor.get_input_names()
        ]
        self.output_handle = self.predictor.get_output_handle(
            self.predictor.get_output_names()[0])

    def extract_embedding(self, data, tokenizer):
        """
        Predicts the data labels.
        Args:
            data (obj:`List(str)`): The batch data whose each element is a raw text.
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 
                which contains most of the methods. Users should refer to the superclass for more information regarding methods.
        Returns:
            results(obj:`dict`): All the feature vectors.
        """

        examples = []
        for text in data:
            input_ids, segment_ids = convert_example_recall_infer(text, tokenizer)
            examples.append((input_ids, segment_ids))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        ): fn(samples)

        input_ids, segment_ids = batchify_fn(examples)
        print(input_ids)
        self.input_handles[0].copy_from_cpu(input_ids)
        self.input_handles[1].copy_from_cpu(segment_ids)
        self.predictor.run()
        logits = self.output_handle.copy_to_cpu()
        return logits

    def predict(self, data, tokenizer):
        """
        Predicts the data labels.
        Args:
            data (obj:`List(str)`): The batch data whose each element is a raw text.
            tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer` 
                which contains most of the methods. Users should refer to the superclass for more information regarding methods.
        Returns:
            results(obj:`dict`): All the predictions probs.
        """

        examples = []
        for idx, text in enumerate(data):
            input_ids, segment_ids = convert_example_recall_infer({idx: text[0]}, tokenizer)
            title_ids, title_segment_ids = convert_example_recall_infer({
                idx: text[1]
            }, tokenizer)
            examples.append(
                (input_ids, segment_ids, title_ids, title_segment_ids))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        ): fn(samples)


        query_ids, query_segment_ids, title_ids, title_segment_ids = batchify_fn(
            examples)
        self.input_handles[0].copy_from_cpu(query_ids)
        self.input_handles[1].copy_from_cpu(query_segment_ids)
        self.predictor.run()
        query_logits = self.output_handle.copy_to_cpu()

        self.input_handles[0].copy_from_cpu(title_ids)
        self.input_handles[1].copy_from_cpu(title_segment_ids)
        self.predictor.run()
        title_logits = self.output_handle.copy_to_cpu()

        result = [
            float(1 - spatial.distance.cosine(arr1, arr2))
            for arr1, arr2 in zip(query_logits, title_logits)
        ]
        return result

In [13]:
model_dir = 'roberta-wwm-ext-large/infer_model'
device='cpu'
max_seq_length=150
use_tensorrt = False
batch_size =32 
precision = 'fp32'
cpu_threads = 1
enable_mkldnn =False
predictor = RecallPredictor(model_dir, device, max_seq_length,
                          batch_size, use_tensorrt, precision,
                          cpu_threads, enable_mkldnn)


id2corpus = {0: '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
res = predictor.extract_embedding(corpus_list, tokenizer)
print('抽取向量')
print(res.shape)
print(res)

[[ 101 1744 3300  821  689 2471 1057 7478 1744 3300 6598 3315 2190 1158
  3173 5327 3126 4638 2512 1510  100  100 1825  754 1169 6863  689 1744
  3300  677 2356 1062 1385 4638 5307 7741 6395 2945  102]]
抽取向量
(1, 256)
[[ 2.96637956e-02 -1.14179980e-02 -4.56955563e-03  2.11348027e-01
  -9.85053331e-02 -5.04758134e-02  9.16355103e-03  1.75997019e-02
  -8.36932287e-02  1.30000217e-02  6.90231174e-02  7.55028892e-03
   3.53238732e-02  3.63998045e-03 -3.23725790e-02  1.47362212e-02
   1.06476955e-01 -1.23755224e-02  1.46541661e-02  3.39983441e-02
   1.49334326e-01  2.06963010e-02 -3.09278965e-02 -9.50645562e-03
  -1.06861949e-01 -4.77590822e-02 -3.50476848e-03  6.69689551e-02
   5.20728761e-04  1.76204350e-02  3.61470319e-02  6.13850094e-02
  -5.85609935e-02 -2.41857413e-02  1.31345419e-02 -3.66793014e-02
  -9.53514874e-03  1.46531686e-03  1.42472992e-02  2.42421478e-02
  -2.50277594e-02 -5.54643348e-02 -4.60097417e-02 -3.86831425e-02
   3.86080565e-03  1.68746207e-02 -1.63225515e-03 -5.9640