<a href="https://colab.research.google.com/github/2hip3ng/lihang-code/blob/master/GAT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 挂载云盘

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [2]:
cd '/content/drive/My Drive/match-gat'

/content/drive/My Drive/match-gat


In [3]:
!pwd

/content/drive/My Drive/match-gat


## 导入相关包

In [0]:
import argparse
import glob
import json
import logging
import os
import random
import math
from collections import Counter
from tqdm import tqdm, trange

import pickle
import codecs

import numpy as np
import torch
from torch.autograd import Variable
from sklearn.metrics import f1_score, accuracy_score
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler

logger = logging.getLogger(__name__)
LayerNorm = torch.nn.LayerNorm

## 激活函数

In [0]:
def gelu(x):
    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

## 模型配置

In [0]:
class config():
    def __init__(
        self,
        vocab_size=50000,
        embedding_size=300,
        hidden_size=300,
        num_hidden_layers=5,
        num_attention_heads=5,
        intermediate_size=2048,
        hidden_act="relu",
        embedding_dropout_prob=0.2,
        num_labels = 3,
        hidden_dropout_prob=0.2,
        attention_probs_dropout_prob=0.2,
        max_position_embeddings_a=32,
        max_position_embeddings_b=16,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        norm_eps=1e-12,
        **kwargs
    ):
        super().__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_labels = num_labels
        self.embedding_dropout_prob = embedding_dropout_prob
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings_a = max_position_embeddings_a
        self.max_position_embeddings_b = max_position_embeddings_b
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.norm_eps = norm_eps

## Model

### CharacterEmbedding

In [0]:
class CharacterEmbedding(nn.Module):
    """
    Character embedding module.
    :param char_embedding_input_dim: The input dimension of character embedding layer.
    :param char_embedding_output_dim: The output dimension of character embedding layer.
    :param char_conv_filters: The filter size of character convolution layer.
    :param char_conv_kernel_size: The kernel size of character convolution layer.
    Examples:
        >>> import torch
        >>> character_embedding = CharacterEmbedding()
        >>> x = torch.ones(10, 32, 16, dtype=torch.long)
        >>> x.shape
        torch.Size([10, 32, 16])
        >>> character_embedding(x).shape
        torch.Size([10, 32, 100])
    """

    def __init__(
        self,
        char_embedding_input_dim: int = 100,
        char_embedding_output_dim: int = 8,
        char_conv_filters: int = 300,
        char_conv_kernel_size: int = 5
    ):
        """Init."""
        super().__init__()
        self.char_embedding = nn.Embedding(
            num_embeddings=char_embedding_input_dim,
            embedding_dim=char_embedding_output_dim
        )
        self.conv = nn.Conv1d(
            in_channels=char_embedding_output_dim,
            out_channels=char_conv_filters,
            kernel_size=char_conv_kernel_size
        )

    def forward(self, x):
        """Forward."""
        embed_x = self.char_embedding(x)

        batch_size, seq_len, word_len, embed_dim = embed_x.shape

        embed_x = embed_x.contiguous().view(-1, word_len, embed_dim)

        embed_x = self.conv(embed_x.transpose(1, 2))
        embed_x = torch.max(embed_x, dim=-1)[0]

        embed_x = embed_x.view(batch_size, seq_len, -1)
        return embed_x

### Embedding

In [0]:
class EmbeddingLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0)
        # self.max_position_embeddings = max(config.max_position_embeddings_a, config.max_position_embeddings_b)
        # self.position_embeddings = nn.Embedding(self.max_position_embeddings, config.embedding_size)
        self.dropout = nn.Dropout(config.embedding_dropout_prob)
        
        if not os.path.exists('word_embedding_snli.pkl'):
            print('load embedding ... ')
            with open('snli/vocab.txt', "r", encoding="utf-8") as f:
                lines = f.readlines()
            word2id = {}
            vocab = []
            for (index, line) in enumerate(lines):
                word = line.strip()
                vocab.append(word)
                word2id[word] = index

            embedding = np.zeros((config.vocab_size, 300))
            tar_count = 0
            glove_vocab = {}
            with open('glove.840B.300d.txt') as f:
                for line in f:
                    elems = line.rstrip().split()
                    if len(elems) != 300 + 1:
                        continue
                    token = elems[0]

                    # token = token.lower()
                    if token in vocab:
                        index = vocab.index(token)
                        vector = [float(x) for x in elems[1:]]
                        embedding[index] = vector
                        if token not in glove_vocab.keys():
                            tar_count += 1
                            glove_vocab[token] = 1
                    else:
                        token = token.lower()
                        if token in vocab and token not in glove_vocab.keys():
                            index = vocab.index(token)
                            vector = [float(x) for x in elems[1:]]
                            embedding[index] = vector
                            tar_count += 1
                            glove_vocab[token] = 1

            print('oov:', len(vocab) - tar_count, ' 比例：', (len(vocab) - tar_count) / len(vocab))
            
            with open('word_embedding_snli.pkl', 'wb') as f:
                pickle.dump(embedding,f)
        else:
            with open('word_embedding_snli.pkl', 'rb') as f:
                embedding = pickle.load(f)
        self.word_embeddings.weight.data.copy_(torch.from_numpy(embedding))

        


    def forward(self, input_ids, position=None):
        word_embeddings = self.word_embeddings(input_ids)
        word_embeddings = self.dropout(word_embeddings)    
        return word_embeddings

### Self Attention Layer

In [0]:
class SelfAttLayer(nn.Module):
    """docstring for GatLayer"""
    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.norm = LayerNorm(config.hidden_size, eps=config.norm_eps)

    def transpose_for_scores(self, x):
        # x : batch_size * max_seq * dim
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)  
        # new_x_shape: batch_size * max_seq * attention_heads * head_size
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)
        # return shape: batch_size * attention_heads * max_seq * head_size


    def forward(self, hidden_states, attention_mask):
        # hidden_states_a: batch_size * max_seq_a * embedding_dim
        # hidden_states_b: batch_size * max_seq_b * embedding_dim

        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        extended_attention_mask = attention_mask[:, None, None, :]
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        attention_mask = extended_attention_mask

        # Self-Attention
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) 
        # batch_size * attention_heads * max_seq_a * max_seq_b
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        attention_scores = attention_scores + attention_mask
        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        return context_layer

### FeedForward

In [0]:
class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense_1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.dense_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.norm = LayerNorm(config.hidden_size, eps=config.norm_eps)
    
    def forward(self, hidden_states):
        output = self.dense_1(hidden_states)
        # x = nn.functional.relu(hidden_states)
        output = gelu(output)
        output = self.dropout(output)
        output = self.dense_2(output)
        output = self.norm(hidden_states + output)
        
        return output

### Encoder

In [0]:
class Encoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.selfattlayer = SelfAttLayer(config)
        self.feedforward = FeedForward(config)
        self.norm = LayerNorm(config.hidden_size, eps=config.norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, attention_mask):
        output = self.selfattlayer(hidden_states, attention_mask)
        output = self.dropout(output)
        output = self.norm(output + hidden_states)
        output = self.feedforward(output)

        return output

### CrossAttLayer

In [0]:
class CrossAttLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.dense = nn.Linear(config.hidden_size * 4, config.hidden_size)    ### 87.9
        # self.dense = nn.Linear(config.hidden_size * 2, config.hidden_size)    ### 88.0 20epoch
        self.norm = LayerNorm(config.hidden_size, eps=config.norm_eps)

    def transpose_for_scores(self, x):
        # x : batch_size * max_seq * dim
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)  
        # new_x_shape: batch_size * max_seq * attention_heads * head_size
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)
        # return shape: batch_size * attention_heads * max_seq * head_size


    def forward(self, hidden_states_a, hidden_states_b, attention_mask_a, attention_mask_b):
        # hidden_states_a: batch_size * max_seq_a * embedding_dim
        # hidden_states_b: batch_size * max_seq_b * embedding_dim

        mixed_query_layer_a = hidden_states_a
        mixed_key_layer_a = hidden_states_a
        mixed_value_layer_a = hidden_states_a

        query_layer_a = self.transpose_for_scores(mixed_query_layer_a)
        key_layer_a = self.transpose_for_scores(mixed_key_layer_a)
        value_layer_a = self.transpose_for_scores(mixed_value_layer_a)

        mixed_query_layer_b = hidden_states_b
        mixed_key_layer_b = hidden_states_b
        mixed_value_layer_b = hidden_states_b

        query_layer_b = self.transpose_for_scores(mixed_query_layer_b)
        key_layer_b = self.transpose_for_scores(mixed_key_layer_b)
        value_layer_b = self.transpose_for_scores(mixed_value_layer_b)

        extended_attention_mask_a = attention_mask_a[:, None, None, :]
        extended_attention_mask_a = (1.0 - extended_attention_mask_a) * -10000.0
        attention_mask_a = extended_attention_mask_a

        extended_attention_mask_b = attention_mask_b[:, None, None, :]
        extended_attention_mask_b = (1.0 - extended_attention_mask_b) * -10000.0
        attention_mask_b = extended_attention_mask_b


        attention_scores_a2b = torch.matmul(query_layer_a, key_layer_b.transpose(-1, -2)) 
        # batch_size * attention_heads * max_seq_a * max_seq_b
        attention_scores_a2b = attention_scores_a2b / math.sqrt(self.attention_head_size)
        attention_scores_a2b = attention_scores_a2b + attention_mask_b
        attention_probs_a2b = nn.Softmax(dim=-1)(attention_scores_a2b)
        attention_probs_a2b = self.dropout(attention_probs_a2b)
        context_layer_a2b = torch.matmul(attention_probs_a2b, value_layer_b)
        context_layer_a2b = context_layer_a2b.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape_a2b = context_layer_a2b.size()[:-2] + (self.all_head_size,)
        context_layer_a2b = context_layer_a2b.view(*new_context_layer_shape_a2b)


        
        attention_scores_b2a = torch.matmul(query_layer_b, key_layer_a.transpose(-1, -2)) 
        # batch_size * attention_heads * max_seq_b * max_seq_a
        attention_scores_b2a = attention_scores_b2a / math.sqrt(self.attention_head_size)
        attention_scores_b2a = attention_scores_b2a + attention_mask_a
        attention_probs_b2a = nn.Softmax(dim=-1)(attention_scores_b2a)
        attention_probs_b2a = self.dropout(attention_probs_b2a)
        context_layer_b2a = torch.matmul(attention_probs_b2a, value_layer_a)
        context_layer_b2a = context_layer_b2a.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape_b2a = context_layer_b2a.size()[:-2] + (self.all_head_size,)
        context_layer_b2a = context_layer_b2a.view(*new_context_layer_shape_b2a)

        context_layer_a = torch.cat([hidden_states_a, context_layer_a2b, 
                    hidden_states_a - context_layer_a2b, hidden_states_a * context_layer_a2b], -1)  ## 87.9

        context_layer_b = torch.cat([hidden_states_b, context_layer_b2a, 
                    hidden_states_b - context_layer_b2a, hidden_states_b * context_layer_b2a], -1) ## 87.9
        
        # context_layer_a = torch.cat([hidden_states_a, context_layer_a2b], -1)     ### 88.0 20epoch

        # context_layer_b = torch.cat([hidden_states_b, context_layer_b2a], -1)
        

        context_layer_a = self.dense(context_layer_a)
        context_layer_a = gelu(context_layer_a)

        context_layer_b = self.dense(context_layer_b)
        context_layer_b = gelu(context_layer_b)

        context_layer_a = self.dropout(context_layer_a)
        context_layer_b = self.dropout(context_layer_b)

        context_layer_a = self.norm(hidden_states_a + context_layer_a)
        context_layer_b = self.norm(hidden_states_b + context_layer_b)
        
        outputs = (context_layer_a, context_layer_b)
        
        return outputs



### Block

In [0]:
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.encoder = Encoder(config)
        self.crossattlayer = CrossAttLayer(config)
        
        self.norm = LayerNorm(config.hidden_size, eps=config.norm_eps)
    
    def forward(self, hidden_states_a, hidden_states_b, attention_mask_a, attention_mask_b):
        out_a = self.encoder(hidden_states_a, attention_mask_a)
        out_b = self.encoder(hidden_states_b, attention_mask_b)
        out_a, out_b = self.crossattlayer(out_a, out_b, attention_mask_a, attention_mask_b)
        return out_a, out_b

### CNN

In [0]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        num_filters = 256
        filter_sizes = (2, 3, 4)
        embed = 300
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, num_filters, (k, embed)) for k in filter_sizes])
        
        self.dropout = nn.Dropout(0.2)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        out = x.unsqueeze(1)   # b * 1 * s * e
        out = torch.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)
        # out = self.dropout(0.2)
        return out


### Pooling & Prediction

In [0]:
class Pooling(nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn = CNN()

    def forward(self, x, mask):
        x = self.cnn(x)

        return x


        '''
        x_avg = torch.sum(x * mask.unsqueeze(1).transpose(2, 1), dim=1)\
                            / torch.sum(mask, dim=1, keepdim=True)
        # max pooling
        # print('mask:', mask)
        extended_mask = mask[:, :, None]
        extended_mask = (1.0 - extended_mask) * (-100000)
        mask = extended_mask
        # print('extended_mask:', mask)
        x = x + mask
        # return torch.cat([x.max(dim=1)[0], x_avg], -1)
        return x.max(dim=1)[0]
        '''
    
class Prediction(nn.Module):
    def __init__(self, config):
        super().__init__()
        
        self.dense_1 = nn.Linear(256 * 3 * 5, config.hidden_size * 2)
        # self.dense_1 = nn.Linear(config.hidden_size * 5, config.hidden_size * 2)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dense_2 = nn.Linear(config.hidden_size * 2, config.num_labels)
    def forward(self, a, b):
        outputs = torch.cat([a, b, a - b, a * b, torch.abs(a-b)], dim=-1)
        outputs = self.dropout(outputs)
        outputs = self.dense_1(outputs)
        outputs = self.dropout(outputs)
        outputs = self.dense_2(outputs)
        return outputs

### Focal Loss

 $ Loss(x, class) = - \alpha (1-softmax(x)[class])^ {gamma} \log(softmax(x)[class]) $

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class FocalLoss(nn.Module):
    """
        This criterion is a implemenation of Focal Loss, which is proposed in 
        Focal Loss for Dense Object Detection.

            Loss(x, class) = - \alpha (1-softmax(x)[class])^ {gamma} \log(softmax(x)[class])

        The losses are averaged across observations for each minibatch.

        Args:
            alpha(1D Tensor, Variable) : the scalar factor for this criterion
            gamma(float, double) : gamma > 0; reduces the relative loss for well-classiﬁed examples (p > .5), 
                                   putting more focus on hard, misclassiﬁed examples
            size_average(bool): By default, the losses are averaged over observations for each minibatch.
                                However, if the field size_average is set to False, the losses are
                                instead summed for each minibatch.


    """
    def __init__(self, class_num=3, alpha=None, gamma=2, size_average=True):
        super(FocalLoss, self).__init__()
        if alpha is None:
            self.alpha = Variable(torch.ones(class_num, 1))
        else:
            if isinstance(alpha, Variable):
                self.alpha = alpha
            else:
                self.alpha = Variable(alpha)
        self.gamma = gamma
        self.class_num = class_num
        self.size_average = size_average

    def forward(self, inputs, targets):
        N = inputs.size(0)
        C = inputs.size(1)
        P = F.softmax(inputs)

        class_mask = inputs.data.new(N, C).fill_(0)
        class_mask = Variable(class_mask)
        ids = targets.view(-1, 1)
        class_mask.scatter_(1, ids.data, 1.)
        #print(class_mask)


        if inputs.is_cuda and not self.alpha.is_cuda:
            self.alpha = self.alpha.cuda()
        alpha = self.alpha[ids.data.view(-1)]

        probs = (P*class_mask).sum(1).view(-1,1)

        log_p = probs.log()
        #print('probs size= {}'.format(probs.size()))
        #print(probs)

        batch_loss = -alpha*(torch.pow((1-probs), self.gamma))*log_p 
        #print('-----bacth_loss------')
        #print(batch_loss)


        if self.size_average:
            loss = batch_loss.mean()
        else:
            loss = batch_loss.sum()
        return loss

### MatchModel

In [0]:
class MatchModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embedding = EmbeddingLayer(config)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_hidden_layers)])
        self.pooling = Pooling()
        self.prediction = Prediction(config)

        # self.init_weights()
        self.loss_fct = CrossEntropyLoss()
        # self.loss_fct = FocalLoss()
         
        
        # self.init_weights()

    def init_weights(self, module):
        """ Initialize the weights """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def forward(self, input_ids_a, input_ids_b, attention_mask_a, attention_mask_b, labels):
        hidden_states_a = self.embedding(input_ids_a)
        hidden_states_b = self.embedding(input_ids_b)
        for i, layer in enumerate(self.blocks):
            hidden_states_a, hidden_states_b = layer(hidden_states_a, hidden_states_b, attention_mask_a, attention_mask_b)

        # print('hidden_states_a:', hidden_states_a, hidden_states_a.shape)
        # print('hidden_states_b:', hidden_states_b, hidden_states_b.shape)
        # os._exit(1)

        outputs_a = self.pooling(hidden_states_a, attention_mask_a)
        outputs_b = self.pooling(hidden_states_b, attention_mask_b)

        outputs = self.prediction(outputs_a, outputs_b)
        # loss = self.loss_fct(outputs.view(-1, config.num_labels), labels.view(-1))
        loss = self.loss_fct(outputs, labels)
        # print(loss)
        # print(outputs)
        outputs = (loss, outputs)

        return outputs

    """
    def init_weights(self, module):
        # Initialize the weights 
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, norm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
    """


## Utils

In [0]:
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

def sentence2ids(args, sentence, word2id):
    if args.do_lower_case:
        sentence = sentence.lower()
    ids = []
    for word in  sentence.strip().split():
        if word not in word2id.keys():
            ids.append(word2id['<UNK>'])
        else:
            ids.append(word2id[word])
    return ids


def load_vocab(args):
    import codecs
    vocab_path = os.path.join(args.data_dir, 'vocab.txt')

    if not os.path.exists(vocab_path):
        vocab = Counter()
        files = os.listdir(args.data_dir)
        for file in files:
            if not os.path.isdir(file) and file != '.DS_Store':
                # print('file:', file)
                f = codecs.open(os.path.join(args.data_dir, file), 'r')
                for line in f.readlines():
                    text_a, text_b, label = line.strip().split('\t')
                    if args.do_lower_case:
                        text_a = text_a.lower()
                        text_b = text_b.lower()
                    vocab.update(text_a.split())
                    vocab.update(text_b.split())
                f.close()
        f = codecs.open(os.path.join(args.data_dir+'/vocab.txt'), 'w')
        vocab = vocab.items()
        vocab = sorted(vocab, key=lambda x:x[1], reverse=True)
        f.write('<PAD>\n<UNK>\n')
        for _ in vocab:
            f.write(_[0] + '\n')   

    with open(vocab_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    word2id = {}
    vocab = []
    for (index, line) in enumerate(lines):
        word = line.strip()
        vocab.append(word)
        word2id[word] = index

    return vocab, word2id

def load_dataset(args, word2id, data_type):
    data_path = os.path.join(args.data_dir, data_type+'.txt')

    # Read Data
    with open(data_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    examples = []
    for (i, line) in enumerate(lines):
        if len(line.strip().split('\t')) == 3:
            text_a, text_b, label = line.strip().split('\t')
        examples.append((text_a, text_b, label))

    # Convert to features
    features = []
    for (ex_index, example) in enumerate(examples):
        len_examples = len(examples)
        if ex_index % 10000 == 0:
            logger.info("Writing example %d/%d" % (ex_index, len_examples))

        input_ids_a = sentence2ids(args, example[0], word2id)
        attention_mask_a = [1] * len(input_ids_a)
        padding_length_a = args.max_seq_length_a - len(input_ids_a)
        input_ids_a = input_ids_a + ([0] * padding_length_a)
        attention_mask_a = attention_mask_a + ([0] * padding_length_a)

        input_ids_b = sentence2ids(args, example[1], word2id)
        attention_mask_b = [1] * len(input_ids_b)
        padding_length_b = args.max_seq_length_b - len(input_ids_b)
        input_ids_b = input_ids_b + ([0] * padding_length_b)
        attention_mask_b = attention_mask_b + ([0] * padding_length_b)

        if example[2] not in ['0', '1', '2']:
            # print(example[0], example[1], example[2])
            continue
        label = int(example[2])

        input_ids_a = input_ids_a[:args.max_seq_length_a]
        attention_mask_a = attention_mask_a[:args.max_seq_length_a]

        input_ids_b = input_ids_b[:args.max_seq_length_b]
        attention_mask_b = attention_mask_b[:args.max_seq_length_b]
        features.append((input_ids_a, attention_mask_a, input_ids_b, attention_mask_b, label))
        
        # if ex_index == 1:
        #     print('input_sentence_a: ', example[0])
        #     print('input_ids_a: ', input_ids_a)
        #     print('attention_mask_a: ', attention_mask_a)
            
        #     print('input_sentence_b: ', example[1])
        #     print('input_ids_b: ', input_ids_b)
        #     print('attention_mask_b: ', attention_mask_b)
            
        #     print('input_label: ', example[2])
        #     print('label: ', label)
    
    all_input_ids_a = torch.tensor([f[0] for f in features], dtype=torch.long)
    all_attention_mask_a = torch.tensor([f[1] for f in features], dtype=torch.long)
    all_input_ids_b = torch.tensor([f[2] for f in features], dtype=torch.long)
    all_attention_mask_b = torch.tensor([f[3] for f in features], dtype=torch.long)
    all_labels = torch.tensor([f[4] for f in features], dtype=torch.long)

    # print('1:', len(all_labels))
    dataset = TensorDataset(all_input_ids_a, all_attention_mask_a, all_input_ids_b, all_attention_mask_b, all_labels)
    # print('return dataset')
    return dataset

## 训练 评估  测试

In [0]:
def train(args, train_dataset, model, tokenizer, word2id):
    """ Train the model """
    for module in model.modules():
        if isinstance(module, nn.Linear):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=0.02)
        elif isinstance(module, LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    """
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]


    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
        os.path.join(args.model_name_or_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
    """

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    """
    
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    """
    # optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.1)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps,
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    # print('cnm')
    logger.info("  Total optimization steps = %d", t_total)

    # print('nmsl')
    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    """
    if os.path.exists(args.model_name_or_path):
        # set global_step to gobal_step of last saved checkpoint from model path
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
    """
    tr_loss, logging_loss = 0.0, 0.0
    # print('model zero grad start')
    model.zero_grad()
#     train_iterator = trange(
#         epochs_trained, int(args.num_train_epochs), desc="Epoch",
#     )
    # print('model zero grad end')
    train_iterator = range(
        epochs_trained, int(args.num_train_epochs)
    )
    set_seed(args)  # Added here for reproductibility
    
    loss_show = []
    # print(' start training....')
    for epoch, _ in enumerate(train_iterator):
        epoch_loss = 0.0
        
        # epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        epoch_iterator = train_dataloader
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {"input_ids_a": batch[0], "attention_mask_a": batch[1], 
                "input_ids_b": batch[2], "attention_mask_b": batch[3],"labels": batch[4]}

            outputs = model(**inputs)
            # outputs = model(batch[0], batch[1], batch[2], batch[3], batch[4])
            loss = outputs[0]  # model outputs are always tuple 

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            epoch_loss += loss.item()
            
            loss_show.append(loss.item())
            if(step + 1) % 100 == 0:
                print('epochs:', epoch, 'train step:', step, 'total step:', len(epoch_iterator), 'loss:', loss.item())
            
            if (step + 1) % args.gradient_accumulation_steps == 0:
                # torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                # scheduler.step()  # Update learning rate schedule
                # print("Decaying learning rate to %g" % scheduler.get_lr()[0])

                base_ratio = args.min_learning_rate / args.learning_rate
                if global_step < args.warmup_steps:
                    ratio = base_ratio + (1. - base_ratio) / max(1., args.warmup_steps) * global_step
                else:
                    ratio = max(base_ratio, args.lr_decay_rate ** math.floor((global_step - args.warmup_steps) /
                                                                                args.lr_decay_steps))
                optimizer.param_groups[0]['lr'] = args.learning_rate * ratio

                model.zero_grad()
                global_step += 1

            

        logger.info(" train average loss = %s", epoch_loss / step)
        

        dev_dataset = load_dataset(args, word2id, 'test')
        f1, preds = evaluate(args, dev_dataset, model, tokenizer, word2id)

        f = codecs.open('snli/test.txt', 'r')
        f_out = codecs.open('snli_bad_case/bad_case_'+str(epoch) +'.txt', 'w')
        lines = f.readlines()
        for i, line in enumerate(lines):
            if int(line.strip()[-1]) != preds[i]:
                f_out.write(line.strip() + '\t' + str(preds[i]) + '\n')
        print('write bad case!!!')
        
    # show loss pic
    x = []
    y = loss_show
    for i in range(len(y)):
        x.append(i)
    
    import matplotlib
    import matplotlib.pyplot as plt
    from matplotlib.pyplot import MultipleLocator
    #从pyplot导入MultipleLocator类，这个类用于设置刻度间隔

    %matplotlib inline
    plt.plot(x, y)
    plt.title('Loss Change')
    plt.xlabel("Step")
    plt.ylabel("Loss")
    plt.grid(True)

#     x_major_locator=MultipleLocator(0.05)
#     #把x轴的刻度间隔设置为0.05，并存在变量里
#     y_major_locator=MultipleLocator(0.1)
#     #把y轴的刻度间隔设置为0.1，并存在变量里
#     ax=plt.gca()
#     #ax为两条坐标轴的实例
#     ax.xaxis.set_major_locator(x_major_locator)
#     #把x轴的主刻度设置为0.05的倍数
#     ax.yaxis.set_major_locator(y_major_locator)
#     #把y轴的主刻度设置为0.1的倍数
#     plt.xlim(0, 1)
#     #把x轴的刻度范围设置为0到1
#     plt.ylim(0, 5)
    #把y轴的刻度范围设置为0到5

    # plt.savefig('CrossEntropyLoss.png')
    plt.show()


    return global_step, tr_loss / global_step


def evaluate(args, eval_dataset, model, tokenizer, word2id):

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    # for batch in tqdm(eval_dataloader, desc="Evaluating"):
    for batch in eval_dataloader:
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids_a": batch[0], "attention_mask_a": batch[1], 
            "input_ids_b": batch[2], "attention_mask_b": batch[3],"labels": batch[4]}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    preds = np.argmax(preds, axis=1)
    print(preds[:20])


    # result = f1_score(out_label_ids, preds)
    result = accuracy_score(out_label_ids, preds)
    logger.info("eval average loss = %s, accuracy_score = %s", eval_loss, result)
        

    # output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    # with open(output_eval_file, "w") as writer:
    #     logger.info("***** Eval results {} *****".format(prefix))
    #     for key in sorted(result.keys()):
    #         logger.info("  %s = %s", key, str(result[key]))
    #         writer.write("%s = %s\n" % (key, str(result[key])))

    


    return result, list(preds)





## 运行

In [0]:
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--data_dir", default='snli', type=str)
    parser.add_argument("--output_dir", default='output', type=str)

    # Other parameters
    parser.add_argument("--max_seq_length_a", default=32, type=int)
    parser.add_argument("--max_seq_length_b", default=32, type=int)
    parser.add_argument("--num_train_epochs", default=25, type=float)
    parser.add_argument("--do_train", default=True, type=bool)
    parser.add_argument("--do_test", default=True, type=bool)
    parser.add_argument("--do_lower_case", default=True, type=bool)
    parser.add_argument("--per_gpu_train_batch_size", default=512, type=int)
    parser.add_argument("--per_gpu_eval_batch_size", default=512, type=int)
    parser.add_argument("--learning_rate", default=1e-4, type=float)
    parser.add_argument("--min_learning_rate", default=6e-5, type=float)
    parser.add_argument("--lr_decay_rate", default=0.95, type=float)
    parser.add_argument("--lr_decay_steps", default=10000, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--no_cuda", default=False, type=bool)
    parser.add_argument("--seed", default=42, type=int)
    
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.",)
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
    parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. Override num_train_epochs.",)
    parser.add_argument("--eval_all_checkpoints", action="store_true", help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",)
    

    # args = parser.parse_args()
    args = parser.parse_args(args=[])

    # Setup CUDA, GPU
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO,)
    logger.warning("device: %s, n_gpu: %s,", device, args.n_gpu)

    # Set seed
    set_seed(args)
    
    # Set Label
    label_list = ['0', '1', '2']
    
    # Set Vocab
    vocab, word2id = load_vocab(args)
    
    # Build Model
    model = MatchModel(config())
    model.to(args.device)

    logger.info("Training parameters %s", args)
    # print('do train')
    # Training
    if args.do_train:
        train_dataset = load_dataset(args, word2id, 'train')
        # print('load dataset finish')
        global_step, tr_loss = train(args, train_dataset, model, vocab, word2id)
        logger.info("global_step = %s, average loss = %s", global_step, tr_loss)
        
    
    
    """
    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
    

        
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)
        

    # Evaluation
    results = {}
    
    if args.do_eval:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)
    

    return results

    """
main()

04/21/2020 16:23:21 - INFO - __main__ -   Training parameters Namespace(adam_epsilon=1e-08, data_dir='snli', device=device(type='cuda'), do_lower_case=True, do_test=True, do_train=True, eval_all_checkpoints=False, gradient_accumulation_steps=1, learning_rate=0.0001, logging_steps=500, lr_decay_rate=0.95, lr_decay_steps=10000, max_grad_norm=1.0, max_seq_length_a=32, max_seq_length_b=32, max_steps=-1, min_learning_rate=6e-05, n_gpu=1, no_cuda=False, num_train_epochs=25, output_dir='output', per_gpu_eval_batch_size=512, per_gpu_train_batch_size=512, save_steps=500, seed=42, warmup_steps=0, weight_decay=0.01)
04/21/2020 16:23:23 - INFO - __main__ -   Writing example 0/549367
04/21/2020 16:23:23 - INFO - __main__ -   Writing example 10000/549367
04/21/2020 16:23:23 - INFO - __main__ -   Writing example 20000/549367
04/21/2020 16:23:23 - INFO - __main__ -   Writing example 30000/549367
04/21/2020 16:23:23 - INFO - __main__ -   Writing example 40000/549367
04/21/2020 16:23:23 - INFO - __main_

epochs: 0 train step: 99 total step: 1073 loss: 0.8616246581077576
epochs: 0 train step: 199 total step: 1073 loss: 0.6995905637741089
epochs: 0 train step: 299 total step: 1073 loss: 0.6369693279266357
epochs: 0 train step: 399 total step: 1073 loss: 0.619914174079895
epochs: 0 train step: 499 total step: 1073 loss: 0.5312556624412537
epochs: 0 train step: 599 total step: 1073 loss: 0.5963566899299622
epochs: 0 train step: 699 total step: 1073 loss: 0.6133249402046204
epochs: 0 train step: 799 total step: 1073 loss: 0.5435006618499756
epochs: 0 train step: 899 total step: 1073 loss: 0.5709443092346191
epochs: 0 train step: 999 total step: 1073 loss: 0.5392552614212036


04/21/2020 16:34:03 - INFO - __main__ -    train average loss = 0.6469453636056451
04/21/2020 16:34:03 - INFO - __main__ -   Writing example 0/9824
04/21/2020 16:34:03 - INFO - __main__ -   ***** Running evaluation *****
04/21/2020 16:34:03 - INFO - __main__ -     Num examples = 9824
04/21/2020 16:34:03 - INFO - __main__ -     Batch size = 512
04/21/2020 16:34:07 - INFO - __main__ -   eval average loss = 0.4546026736497879, accuracy_score = 0.8254275244299675


[2 0 2 1 1 1 0 1 0 1 0 1 1 0 2 2 0 1 1 2]
write bad case!!!
epochs: 1 train step: 99 total step: 1073 loss: 0.4869741201400757


一些想法:
1. bad case 里面 一些由于一个词不一样，导致label区别大；交互层不做a-b, a * b; 最后一层对每个词提取的交互特征结合源该词语义特征，进行一个线性层提取；
最后pooling层换textcnn


In [0]:
class EmbeddingLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0)
        # self.max_position_embeddings = max(config.max_position_embeddings_a, config.max_position_embeddings_b)
        # self.position_embeddings = nn.Embedding(self.max_position_embeddings, config.embedding_size)
        # self.norm = MatchGatNorm(config.embedding_size, eps=config.norm_eps)
        self.dropout = nn.Dropout(config.embedding_dropout_prob)
        # self.lstm = torch.nn.LSTM(config.embedding_size, config.embedding_size, 1, batch_first=True, dropout=0.2)

        # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # h_0 = Variable(torch.randn(1, batch_size, 512))
        # c_0 = Variable(torch.randn(1, batch_size, 512))
        # h_0 = h_0.to(device)
        # c_0 = c_0.to(device)

        if not os.path.exists('word_embedding_snli.pkl'):
            print('load embedding ... ')
            with open('snli/vocab.txt', "r", encoding="utf-8") as f:
                lines = f.readlines()
            word2id = {}
            vocab = []
            for (index, line) in enumerate(lines):
                word = line.strip()
                vocab.append(word)
                word2id[word] = index

            embedding = np.zeros((config.vocab_size, 300))
            tar_count = 0
            glove_vocab = {}
            with open('glove.840B.300d.txt') as f:
                for line in f:
                    elems = line.rstrip().split()
                    if len(elems) != 300 + 1:
                        continue
                    token = elems[0]

                    # token = token.lower()
                    if token in vocab:
                        index = vocab.index(token)
                        vector = [float(x) for x in elems[1:]]
                        embedding[index] = vector
                        if token not in glove_vocab.keys():
                            tar_count += 1
                            glove_vocab[token] = 1
                    else:
                        token = token.lower()
                        if token in vocab and token not in glove_vocab.keys():
                            index = vocab.index(token)
                            vector = [float(x) for x in elems[1:]]
                            embedding[index] = vector
                            tar_count += 1
                            glove_vocab[token] = 1

            print('oov:', len(vocab) - tar_count, ' 比例：', (len(vocab) - tar_count) / len(vocab))
            
            with open('word_embedding_snli.pkl', 'wb') as f:
                pickle.dump(embedding,f)
        else:
            with open('word_embedding_snli.pkl', 'rb') as f:
                embedding = pickle.load(f)
        self.word_embeddings.weight.data.copy_(torch.from_numpy(embedding))



    def forward(self, input_ids_a, input_ids_b, position_a=None, position_b=None):
        """
        input_shape_a = input_ids_a.size()
        input_shape_b = input_ids_b.size()
        seq_length_a = input_shape_a[1]
        seq_length_b = input_shape_b[1]

        device = input_ids_a.device

        batch_size = input_shape_a[0]

        h_0 = Variable(torch.randn(1, batch_size, 512))
        c_0 = Variable(torch.randn(1, batch_size, 512))
        h_0 = h_0.to(device)
        c_0 = c_0.to(device)

        input_a_word_embeddings = self.word_embeddings(input_ids_a)
        input_b_word_embeddings = self.word_embeddings(input_ids_b)

        if position_a:
            input_a_position_embeddings = self.position_embeddings(position_a)
            input_b_position_embeddings = self.position_embeddings(position_b)

            a_embeddings = input_a_word_embeddings + input_a_position_embeddings
            a_embeddings = self.norm(a_embeddings)
            a_embeddings = self.dropout(a_embeddings)

            b_embeddings = input_b_word_embeddings + input_b_position_embeddings
            b_embeddings = self.norm(b_embeddings)
            b_embeddings = self.dropout(b_embeddings)
        else:
            a_embeddings = input_a_word_embeddings

            # a_embeddings = self.norm(a_embeddings)
            a_embeddings = self.dropout(a_embeddings)

            b_embeddings = input_b_word_embeddings
            # b_embeddings = self.norm(b_embeddings)
            b_embeddings = self.dropout(b_embeddings)
            
            
            # # print('random h, c')
            # a_embeddings, (h_x, c_x) = self.lstm(a_embeddings, (h_0, c_0))
            # # print('lstm train')
            # b_embeddings, (h_x, c_x) = self.lstm(b_embeddings, (h_0, c_0))
        """
        input_a_word_embeddings = self.word_embeddings(input_ids_a)
        input_b_word_embeddings = self.word_embeddings(input_ids_b)

        a_embeddings = input_a_word_embeddings
        a_embeddings = self.dropout(a_embeddings)

        b_embeddings = input_b_word_embeddings
        b_embeddings = self.dropout(b_embeddings)
        return a_embeddings, b_embeddings

In [0]:
class CrossAttLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query_a = nn.Linear(config.hidden_size, self.all_head_size)
        self.key_a = nn.Linear(config.hidden_size, self.all_head_size)
        self.value_a = nn.Linear(config.hidden_size, self.all_head_size)

        self.query_b = nn.Linear(config.hidden_size, self.all_head_size)
        self.key_b = nn.Linear(config.hidden_size, self.all_head_size)
        self.value_b = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.dense = nn.Linear(config.hidden_size * 5, config.hidden_size)
        self.norm = MatchGatNorm(config.hidden_size, eps=config.norm_eps)

    def transpose_for_scores(self, x):
        # x : batch_size * max_seq * dim
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)  
        # new_x_shape: batch_size * max_seq * attention_heads * head_size
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)
        # return shape: batch_size * attention_heads * max_seq * head_size


    def forward(self, hidden_states_a, hidden_states_b, attention_mask_a, attention_mask_b):
        # hidden_states_a: batch_size * max_seq_a * embedding_dim
        # hidden_states_b: batch_size * max_seq_b * embedding_dim

        mixed_query_layer_a = self.query_a(hidden_states_a)
        mixed_key_layer_a = self.key_a(hidden_states_a)
        mixed_value_layer_a = self.value_a(hidden_states_a)

        query_layer_a = self.transpose_for_scores(mixed_query_layer_a)
        key_layer_a = self.transpose_for_scores(mixed_key_layer_a)
        value_layer_a = self.transpose_for_scores(mixed_value_layer_a)

        mixed_query_layer_b = self.query_b(hidden_states_b)
        mixed_key_layer_b = self.key_b(hidden_states_b)
        mixed_value_layer_b = self.value_b(hidden_states_b)

        query_layer_b = self.transpose_for_scores(mixed_query_layer_b)
        key_layer_b = self.transpose_for_scores(mixed_key_layer_b)
        value_layer_b = self.transpose_for_scores(mixed_value_layer_b)

        extended_attention_mask_a = attention_mask_a[:, None, None, :]
        extended_attention_mask_a = (1.0 - extended_attention_mask_a) * -10000.0
        attention_mask_a = extended_attention_mask_a

        extended_attention_mask_b = attention_mask_b[:, None, None, :]
        extended_attention_mask_b = (1.0 - extended_attention_mask_b) * -10000.0
        attention_mask_b = extended_attention_mask_b


        attention_scores_a2b = torch.matmul(query_layer_a, key_layer_b.transpose(-1, -2)) 
        # batch_size * attention_heads * max_seq_a * max_seq_b
        attention_scores_a2b = attention_scores_a2b / math.sqrt(self.attention_head_size)
        attention_scores_a2b = attention_scores_a2b + attention_mask_b
        attention_probs_a2b = nn.Softmax(dim=-1)(attention_scores_a2b)
        attention_probs_a2b = self.dropout(attention_probs_a2b)
        context_layer_a2b = torch.matmul(attention_probs_a2b, value_layer_b)
        context_layer_a2b = context_layer_a2b.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape_a2b = context_layer_a2b.size()[:-2] + (self.all_head_size,)
        context_layer_a2b = context_layer_a2b.view(*new_context_layer_shape_a2b)


        
        attention_scores_b2a = torch.matmul(query_layer_b, key_layer_a.transpose(-1, -2)) 
        # batch_size * attention_heads * max_seq_b * max_seq_a
        attention_scores_b2a = attention_scores_b2a / math.sqrt(self.attention_head_size)
        attention_scores_b2a = attention_scores_b2a + attention_mask_a
        attention_probs_b2a = nn.Softmax(dim=-1)(attention_scores_b2a)
        # attention_probs_b2a = self.dropout(attention_probs_b2a)
        context_layer_b2a = torch.matmul(attention_probs_b2a, value_layer_a)
        context_layer_b2a = context_layer_b2a.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape_b2a = context_layer_b2a.size()[:-2] + (self.all_head_size,)
        context_layer_b2a = context_layer_b2a.view(*new_context_layer_shape_b2a)

        '''
        context_layer_a = torch.cat([context_layer_a2a, context_layer_a2b], -1)
        context_layer_a = torch.cat([context_layer_a, context_layer_a2a - context_layer_a2b], -1)
        context_layer_a = torch.cat([context_layer_a, context_layer_a2a * context_layer_a2b], -1)
        context_layer_a = torch.cat([hidden_states_a, context_layer_a], -1)


        context_layer_b = torch.cat([context_layer_b2b, context_layer_b2a], -1)
        context_layer_b = torch.cat([context_layer_b, context_layer_b2b - context_layer_b2a], -1)
        context_layer_b = torch.cat([context_layer_b, context_layer_b2b * context_layer_b2a], -1)
        context_layer_b = torch.cat([hidden_states_b, context_layer_b], -1)


        # print('context_layer_a shape:', context_layer_a.shape)
        # print('context_layer_b shape:', context_layer_b.shape)


        context_layer_a = self.dense(context_layer_a)
        context_layer_a = gelu(context_layer_a)

        context_layer_b = self.dense(context_layer_b)
        context_layer_b = gelu(context_layer_b)

        context_layer_a = self.norm(hidden_states_a + context_layer_a)
        context_layer_b = self.norm(hidden_states_b + context_layer_b)
        '''

        context_layer_a = self.norm(hidden_states_a + context_layer_a2b)
        context_layer_b = self.norm(hidden_states_b + context_layer_b2a)
        
        outputs = (context_layer_a, context_layer_b)
        
        return outputs



In [0]:
"""match-gat model"""

class GatLayer(nn.Module):
    """docstring for GatLayer"""
    def __init__(self, config):
        super().__init__()
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.align = nn.Linear(config.hidden_size * 2, config.hidden_size)
        # self.align_a2b = nn.Linear(config.hidden_size * 2, config.hidden_size)
        # self.align_b2a = nn.Linear(config.hidden_size * 2, config.hidden_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        self.dense = nn.Linear(config.hidden_size * 5, config.hidden_size)

        self.norm = MatchGatNorm(config.hidden_size, eps=config.norm_eps)


    def transpose_for_scores(self, x):
        # x : batch_size * max_seq * dim
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)  
        # new_x_shape: batch_size * max_seq * attention_heads * head_size
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)
        # return shape: batch_size * attention_heads * max_seq * head_size


    def forward(self, hidden_states_a, hidden_states_b, attention_mask_a, attention_mask_b):
        # hidden_states_a: batch_size * max_seq_a * embedding_dim
        # hidden_states_b: batch_size * max_seq_b * embedding_dim

        mixed_query_layer_a = self.query(hidden_states_a)
        mixed_key_layer_a = self.key(hidden_states_a)
        mixed_value_layer_a = self.value(hidden_states_a)

        query_layer_a = self.transpose_for_scores(mixed_query_layer_a)
        key_layer_a = self.transpose_for_scores(mixed_key_layer_a)
        value_layer_a = self.transpose_for_scores(mixed_value_layer_a)

        mixed_query_layer_b = self.query(hidden_states_b)
        mixed_key_layer_b = self.key(hidden_states_b)
        mixed_value_layer_b = self.value(hidden_states_b)

        query_layer_b = self.transpose_for_scores(mixed_query_layer_b)
        key_layer_b = self.transpose_for_scores(mixed_key_layer_b)
        value_layer_b = self.transpose_for_scores(mixed_value_layer_b)

        extended_attention_mask_a = attention_mask_a[:, None, None, :]
        extended_attention_mask_a = (1.0 - extended_attention_mask_a) * -10000.0
        attention_mask_a = extended_attention_mask_a

        extended_attention_mask_b = attention_mask_b[:, None, None, :]
        extended_attention_mask_b = (1.0 - extended_attention_mask_b) * -10000.0
        attention_mask_b = extended_attention_mask_b


        
        

        attention_scores_a2b = torch.matmul(query_layer_a, key_layer_b.transpose(-1, -2)) 
        # batch_size * attention_heads * max_seq_a * max_seq_b
        attention_scores_a2b = attention_scores_a2b / math.sqrt(self.attention_head_size)
        attention_scores_a2b = attention_scores_a2b + attention_mask_b
        attention_probs_a2b = nn.Softmax(dim=-1)(attention_scores_a2b)
        attention_probs_a2b = self.dropout(attention_probs_a2b)
        context_layer_a2b = torch.matmul(attention_probs_a2b, value_layer_b)
        context_layer_a2b = context_layer_a2b.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape_a2b = context_layer_a2b.size()[:-2] + (self.all_head_size,)
        context_layer_a2b = context_layer_a2b.view(*new_context_layer_shape_a2b)


        
        attention_scores_b2a = torch.matmul(query_layer_b, key_layer_a.transpose(-1, -2)) 
        # batch_size * attention_heads * max_seq_b * max_seq_a
        attention_scores_b2a = attention_scores_b2a / math.sqrt(self.attention_head_size)
        attention_scores_b2a = attention_scores_b2a + attention_mask_a
        attention_probs_b2a = nn.Softmax(dim=-1)(attention_scores_b2a)
        # attention_probs_b2a = self.dropout(attention_probs_b2a)
        context_layer_b2a = torch.matmul(attention_probs_b2a, value_layer_a)
        context_layer_b2a = context_layer_b2a.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape_b2a = context_layer_b2a.size()[:-2] + (self.all_head_size,)
        context_layer_b2a = context_layer_b2a.view(*new_context_layer_shape_b2a)

        '''
        context_layer_a = torch.cat([context_layer_a2a, context_layer_a2b], -1)
        context_layer_a = torch.cat([context_layer_a, context_layer_a2a - context_layer_a2b], -1)
        context_layer_a = torch.cat([context_layer_a, context_layer_a2a * context_layer_a2b], -1)
        context_layer_a = torch.cat([hidden_states_a, context_layer_a], -1)


        context_layer_b = torch.cat([context_layer_b2b, context_layer_b2a], -1)
        context_layer_b = torch.cat([context_layer_b, context_layer_b2b - context_layer_b2a], -1)
        context_layer_b = torch.cat([context_layer_b, context_layer_b2b * context_layer_b2a], -1)
        context_layer_b = torch.cat([hidden_states_b, context_layer_b], -1)


        # print('context_layer_a shape:', context_layer_a.shape)
        # print('context_layer_b shape:', context_layer_b.shape)


        context_layer_a = self.dense(context_layer_a)
        context_layer_a = gelu(context_layer_a)

        context_layer_b = self.dense(context_layer_b)
        context_layer_b = gelu(context_layer_b)

        context_layer_a = self.norm(hidden_states_a + context_layer_a)
        context_layer_b = self.norm(hidden_states_b + context_layer_b)
        '''
        context_layer_a = self.norm(hidden_states_a  + context_layer_a2b)
        context_layer_b = self.norm(hidden_states_b  + context_layer_b2a)
        
        outputs = (context_layer_a, context_layer_b)
        # outputs = (context_layer_a2b, context_layer_b2a)
        
        ### 拼接原来的a 与 对a对b做过attention的结果
        """
        outputs_a = torch.cat([hidden_states_a, context_layer_a2b], dim = -1)
        outputs_a = self.align(outputs_a)

        outputs_b = torch.cat([hidden_states_b, context_layer_b2a], dim = -1)
        outputs_b = self.align(outputs_b)

        outputs = (outputs_a, outputs_b)
        """

        return outputs


In [0]:
class MatchModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embedding = EmbeddingLayer(config)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_hidden_layers)])
        self.pooling = Pooling()
        self.prediction = Prediction(config)

        # self.init_weights()
        self.loss_fct = CrossEntropyLoss()
         
        
        # self.init_weights()

    def init_weights(self, module):
        """ Initialize the weights """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, MatchGatNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def forward(self, input_ids_a, input_ids_b, attention_mask_a, attention_mask_b, labels):
        hidden_states_a, hidden_states_b = self.embedding(input_ids_a, input_ids_b)
        for i, layer in enumerate(self.blocks):
            hidden_states_a, hidden_states_b = layer(hidden_states_a, hidden_states_b, attention_mask_a, attention_mask_b)

        # print('hidden_states_a:', hidden_states_a, hidden_states_a.shape)
        # print('hidden_states_b:', hidden_states_b, hidden_states_b.shape)
        # os._exit(1)

        outputs_a = self.pooling(hidden_states_a, attention_mask_a)
        outputs_b = self.pooling(hidden_states_b, attention_mask_b)

        outputs = self.prediction(outputs_a, outputs_b)
        # loss = self.loss_fct(outputs.view(-1, config.num_labels), labels.view(-1))
        loss = self.loss_fct(outputs, labels)
        # print(loss)
        # print(outputs)
        outputs = (loss, outputs)

        return outputs

    """
    def init_weights(self, module):
        # Initialize the weights 
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, norm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
    """
