In [4]:
import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

In [46]:
class config(object):
    vocab_size = 5000
    hidden_size = 200
    pad_token_id = 0
    max_position_embeddings = 512
    type_vocab_size = 2
    layer_norm_eps = 200
    hidden_dropout_prob = 0.3
    pos_tags = 5
    pos_tags_pad_idx = 4

In [5]:
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
            
        '''
        part of speech embedding
        
        '''
        
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


In [43]:
myEmbedding = BertEmbeddings(config)

In [44]:
myEmbedding

BertEmbeddings(
  (word_embeddings): Embedding(5000, 200, padding_idx=0)
  (position_embeddings): Embedding(512, 200)
  (token_type_embeddings): Embedding(2, 200)
  (LayerNorm): LayerNorm((200,), eps=200, elementwise_affine=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [23]:
import numpy as np
inputs = np.array([[1,2,3],[4,5,6]])
inputs = torch.tensor(inputs)
myEmbedding(inputs)

tensor([[[ 0.2169, -0.0000,  0.0000,  ...,  0.1897,  0.1692,  0.0899],
         [ 0.2335, -0.0070,  0.0000,  ...,  0.1481, -0.0000,  0.0441],
         [ 0.0000, -0.2173,  0.1230,  ...,  0.4322, -0.0000, -0.0000]],

        [[ 0.1773, -0.0917,  0.0000,  ...,  0.0000, -0.0185,  0.0000],
         [ 0.3133,  0.1682,  0.0000,  ...,  0.1793, -0.0000, -0.0368],
         [ 0.0000, -0.0000, -0.0000,  ...,  0.3805,  0.0735, -0.0516]]],
       grad_fn=<MulBackward0>)

In [45]:
myEmbedding(inputs).shape # batch_size, seq_length, embedding_size

torch.Size([2, 3, 200])

## pos_tag_embedding

In [37]:
help(nn.Embedding)

Help on class Embedding in module torch.nn.modules.sparse:

class Embedding(torch.nn.modules.module.Module)
 |  A simple lookup table that stores embeddings of a fixed dictionary and size.
 |  
 |  This module is often used to store word embeddings and retrieve them using indices.
 |  The input to the module is a list of indices, and the output is the corresponding
 |  word embeddings.
 |  
 |  Args:
 |      num_embeddings (int): size of the dictionary of embeddings
 |      embedding_dim (int): the size of each embedding vector
 |      padding_idx (int, optional): If given, pads the output with the embedding vector at :attr:`padding_idx`
 |                                       (initialized to zeros) whenever it encounters the index.
 |      max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm`
 |                                  is renormalized to have norm :attr:`max_norm`.
 |      norm_type (float, optional): The p of the p-norm to comput

In [39]:
'''
词性标注类别
n / v / adj / adv / others
'''

pos_embedding = nn.Embedding(5, config.hidden_size, padding_idx=4)
print(pos_embedding)

Embedding(5, 200, padding_idx=4)


In [27]:
from hanlp_restful import HanLPClient
HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh')

In [28]:
HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'])["pos/ctb"]

[['NT',
  'NR',
  'P',
  'NN',
  'NN',
  'VV',
  'JJ',
  'NN',
  'AD',
  'JJ',
  'DEG',
  'CD',
  'NN',
  'NR',
  'NN',
  'PU']]

## 词性标注类别
n / v / adj / adv / others

使用nn.Embedding将标注词性后的list: [seq_len, 1] => 目标POS Embedding: [seq_len, embedding_size]


In [57]:
pos_tags_dict = {'n':0, 'v':1, 'adj':2, 'adv':3, 'others':4}

In [41]:
pos_embedding(inputs[0])

tensor([[-1.4300e+00, -5.3774e-01, -5.2510e-01,  1.0911e+00,  7.7005e-01,
         -1.0175e-01, -1.6265e+00,  1.9913e-01, -9.0708e-01,  1.1805e+00,
         -3.3585e-01, -9.4566e-01,  3.5788e-02,  6.7033e-01, -1.3948e+00,
          1.0559e+00, -3.5674e-01,  2.9011e-01, -2.1798e+00, -4.9722e-01,
         -4.8710e-01,  2.1732e+00, -3.1241e-01, -1.7443e-01,  2.3178e-01,
          8.7436e-01,  6.7179e-01,  3.7027e-01,  8.7233e-02, -8.9843e-01,
          5.6757e-01,  2.5561e+00, -5.1986e-01,  4.0348e-01,  1.0794e+00,
          9.5100e-01,  5.4640e-01, -8.3164e-01, -1.2846e+00,  2.3533e-01,
         -3.3951e-02, -7.9214e-01, -1.0669e+00,  2.6399e+00,  1.4651e+00,
         -6.2061e-01,  7.3026e-01, -6.5011e-01,  2.1966e+00, -1.6741e+00,
          8.6019e-01, -3.7664e-01,  5.3755e-02, -7.0222e-01, -9.2179e-01,
          1.1501e+00, -5.3511e-01, -1.3370e+00,  3.4710e-01, -1.2905e-01,
         -2.7979e+00,  1.1589e+00,  1.0360e+00,  9.6928e-02, -1.7484e+00,
         -9.7682e-01,  1.5360e+00,  6.

In [53]:
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        # pos_tags_embedding
        self.pos_embeddings = nn.Embedding(config.pos_tags, config.hidden_size,config.pos_tags_pad_idx)
        
        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

    def forward(
        self, input_ids=None, input_tags_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
            
        '''
        part of speech embedding
        
        '''
        embeddings += self.position_embeddings(input_tags_ids)
        
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


In [54]:
myEmbedding = BertEmbeddings(config)
myEmbedding

BertEmbeddings(
  (word_embeddings): Embedding(5000, 200, padding_idx=0)
  (position_embeddings): Embedding(512, 200)
  (token_type_embeddings): Embedding(2, 200)
  (pos_embeddings): Embedding(5, 200, padding_idx=4)
  (LayerNorm): LayerNorm((200,), eps=200, elementwise_affine=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [56]:
inputs_tags = np.array([[1,2,3],[2,1,3]])
inputs_tags = torch.tensor(inputs_tags)
'''
inputs:词索引
    [batch_size, seq_len, embedding_size]
inputs_tags:词性索引
    [batch_size, seq_len, embedding_size]
'''
myEmbedding(inputs,inputs_tags)

tensor([[[ 0.0104,  0.0000, -0.0660,  ..., -0.0044,  0.3370,  0.0000],
         [-0.1986, -0.1207,  0.1081,  ...,  0.3014,  0.0404,  0.1837],
         [ 0.0621, -0.1167,  0.1323,  ...,  0.0051, -0.0000, -0.0000]],

        [[-0.0000, -0.0507, -0.0000,  ...,  0.0000,  0.2103,  0.0416],
         [ 0.0000, -0.1439,  0.0000,  ...,  0.0078,  0.2333, -0.1386],
         [ 0.1539,  0.0104,  0.0000,  ..., -0.1067,  0.1080,  0.0215]]],
       grad_fn=<MulBackward0>)