# init

In [1]:
# Importing the libraries needed
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

In [2]:
from __future__ import absolute_import, division, print_function

import argparse
import glob
import logging
import os
import random
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.utils.data.distributed import DistributedSampler


try:
    from torch.utils.tensorboard import SummaryWriter
except:
    from tensorboardX import SummaryWriter


from tqdm import tqdm, trange

#from transformers.configuration_albert import BertConfig as AlbertConfig
#from transformers.tokenization_albert import BertTokenizer as AlbertTokenizer

from transformers import AdamW, get_linear_schedule_with_warmup

In [3]:
logger = logging.getLogger(__name__)

#ALL_MODELS = sum((tuple(BertConfig.pretrained_config_archive_map.keys()), ()))


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

In [4]:
from transformers import BertPreTrainedModel,BertModel
import torch.nn as nn
import torch.nn.functional as F
class BertForSequenceClassification_CNN(BertPreTrainedModel):
    r"""
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

    """
    def __init__(self, config):
        super().__init__(config)
        print(args)
        self.num_labels = config.num_labels
        
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.convs = nn.ModuleList([nn.Conv2d(1, args.filter_num, (k, config.hidden_size)) for k in args.filter_sizes])
        self.fc_cnn = nn.Linear(args.filter_num * len(args.filter_sizes), self.config.num_labels)

        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None, labels=None, real_token_len=None):

        
        #print(input_ids.shape)  #torch.Size([2, 512])   [batch_size, max_seq_length]
        
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds)

        #outputs: `tuple`, [last_hidden_state, pooler_output]

        last_hidden_state = outputs[0]
        #last_hidden_state.shape: [batch_size, sequence_length, hidden_size]
        
        x = last_hidden_state.unsqueeze(1)
        #x.shape: [batch_size, 1, sequence_length, hidden_size]
        
    
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = [F.max_pool1d(item, item.size(2)).squeeze(2) for item in x]
        x = torch.cat(x, 1)
        x = self.dropout(x)
        logits = self.fc_cnn(x)

        #pooled_output = self.dropout(pooled_output)
        #logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here. outputs的最后两个是hidden_states和attention

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)


class BertForSequenceClassification_LSTM(BertPreTrainedModel):
    r"""
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

    """
    def __init__(self, config):
        super(BertForSequenceClassification_LSTM, self).__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.lstm = []
        for i in range(args.lstm_layers):
            self.lstm.append( nn.LSTM(config.hidden_size if i==0 else args.lstm_hidden_size*4, args.lstm_hidden_size,num_layers=1,bidirectional=True,batch_first=True).cuda() )
        self.lstm = nn.ModuleList(self.lstm)

        self.classifier = nn.Linear(args.lstm_hidden_size*2, self.config.num_labels)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):

        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds)

        last_hidden_state = outputs[0]
        #last_hidden_state.shape: [batch_size, sequence_length, hidden_size]

        for lstm in self.lstm:
            try:
                lstm.flatten_parameters() 
            except:
                pass
            output, (h_n, c_n) = lstm(last_hidden_state)
            #h_n.shape: [batch, num_layers*num_directions == 2, gru_hidden_size]    batch_size first
            
        x = h_n.permute(1,0,2).reshape(input_ids.size(0),-1).contiguous()
        #x.shape: [batch, 2 * gru_hidden_size]

        x = self.dropout(x)
        logits = self.classifier(x)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)     


class BertForSequenceClassification_GRU(BertPreTrainedModel):
    r"""
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

    """
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.gru = []
        for i in range(args.gru_layers):
            self.gru.append( nn.GRU(config.hidden_size if i==0 else args.gru_hidden_size*4, args.gru_hidden_size,num_layers=1,bidirectional=True,batch_first=True).cuda() )
        self.gru = nn.ModuleList(self.gru)

        self.classifier = nn.Linear(args.gru_hidden_size*2, self.config.num_labels)


        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None, labels=None):

        #print(input_ids.shape)  #torch.Size([2, 512])   [batch_size, max_seq_length]

        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds)

        last_hidden_state = outputs[0]
        #last_hidden_state.shape: [batch_size, sequence_length, hidden_size]

        for gru in self.gru:
            try:
                gru.flatten_parameters() 
            except:
                pass
            output, h_n = gru(last_hidden_state)
            #h_n.shape: [batch, num_layers*num_directions == 2, gru_hidden_size]    batch_size first
            
        x = h_n.permute(1,0,2).reshape(input_ids.size(0),-1).contiguous()
        #x.shape: [batch, 2 * gru_hidden_size]

        x = self.dropout(x)
        logits = self.classifier(x)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)        



In [5]:
from transformers import XLNetPreTrainedModel
class XLNetForSequenceClassification_LSTM(XLNetPreTrainedModel):
    r"""
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **mems**: (`optional`, returned when ``config.mem_len > 0``)
            list of ``torch.FloatTensor`` (one for each layer):
            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
            See details in the docstring of the `mems` input above.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
            When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.

    Examples::

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

    """
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = XLNetModel(config)
        self.sequence_summary = SequenceSummary(config)
        #self.logits_proj = nn.Linear(config.d_model, config.num_labels)

        self.lstm = []
        for i in range(args.lstm_layers):
            self.lstm.append( nn.GRU(config.d_model if i==0 else args.lstm_hidden_size*4, args.lstm_hidden_size,num_layers=1,bidirectional=True,batch_first=True).cuda() )
        self.lstm = nn.ModuleList(self.lstm)

        #self.classifier = nn.Linear(args.gru_hidden_size*2, self.config.num_labels)
        self.logits_proj = nn.Linear(args.lstm_hidden_size*2, config.num_labels)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
        transformer_outputs = self.transformer(input_ids,
                                               attention_mask=attention_mask,
                                               mems=mems,
                                               perm_mask=perm_mask,
                                               target_mapping=target_mapping,
                                               token_type_ids=token_type_ids,
                                               input_mask=input_mask,
                                               head_mask=head_mask,
                                               inputs_embeds=inputs_embeds)
        last_hidden_state = transformer_outputs[0]
        #last_hidden_state.shape: [batch_size, sequence_length, hidden_size]


        for lstm in self.lstm:
            try:
                lstm.flatten_parameters()
            except:
                pass
            output, h_n = lstm(last_hidden_state)
            #h_n.shape: [batch, num_layers*num_directions == 2, gru_hidden_size]    batch_size first
            
        x = h_n.permute(1,0,2).reshape(input_ids.size(0),-1).contiguous()
        #x.shape: [batch, 2 * gru_hidden_size]

        


        #x = self.sequence_summary(x)
        #print("==========================")
        #print(x.shape)
        #print("==========================")

        logits = self.logits_proj(x)

        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)

class XLNetForSequenceClassification_GRU(XLNetPreTrainedModel):
    r"""
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **mems**: (`optional`, returned when ``config.mem_len > 0``)
            list of ``torch.FloatTensor`` (one for each layer):
            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
            See details in the docstring of the `mems` input above.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
            When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.

    Examples::

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

    """
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = XLNetModel(config)
        self.sequence_summary = SequenceSummary(config)
        #self.logits_proj = nn.Linear(config.d_model, config.num_labels)

        self.gru = []
        for i in range(args.gru_layers):
            self.gru.append( nn.GRU(config.d_model if i==0 else args.gru_hidden_size*4, args.gru_hidden_size,num_layers=1,bidirectional=True,batch_first=True).cuda() )
        self.gru = nn.ModuleList(self.gru)

        #self.classifier = nn.Linear(args.gru_hidden_size*2, self.config.num_labels)
        self.logits_proj = nn.Linear(args.gru_hidden_size*2, config.num_labels)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
        transformer_outputs = self.transformer(input_ids,
                                               attention_mask=attention_mask,
                                               mems=mems,
                                               perm_mask=perm_mask,
                                               target_mapping=target_mapping,
                                               token_type_ids=token_type_ids,
                                               input_mask=input_mask,
                                               head_mask=head_mask,
                                               inputs_embeds=inputs_embeds)
        last_hidden_state = transformer_outputs[0]


        for gru in self.gru:
            try:
                gru.flatten_parameters() 
            except:
                pass
            output, h_n = gru(last_hidden_state)
            #h_n.shape: [batch, num_layers*num_directions == 2, gru_hidden_size]    batch_size first
            
        x = h_n.permute(1,0,2).reshape(input_ids.size(0),-1).contiguous()
        #x.shape: [batch, 2 * gru_hidden_size]

        


        #x = self.sequence_summary(x)
        logits = self.logits_proj(x)

        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if there are in it

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)


In [6]:
from transformers import (BertConfig,BertTokenizer,
                          BertForSequenceClassification, 
                          XLNetConfig,XLNetTokenizer,
                          XLNetForSequenceClassification,
                          AlbertConfig,AlbertTokenizer,
                          AlbertForSequenceClassification)

MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'bert_cnn': (BertConfig, BertForSequenceClassification_CNN, BertTokenizer),
    'bert_lstm': (BertConfig, BertForSequenceClassification_LSTM, BertTokenizer),
    'bert_gru': (BertConfig, BertForSequenceClassification_GRU, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlnet_lstm': (XLNetConfig, XLNetForSequenceClassification_LSTM, XLNetTokenizer),
    'xlnet_gru': (XLNetConfig, XLNetForSequenceClassification_GRU, XLNetTokenizer),
    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer)
    #'albert': (BertConfig, AlbertForSequenceClassification, BertTokenizer)
}

In [7]:
class Args:
    data_dir = "../dataset/policy"
    output_dir = "../results/policy/xlnet"
    model_name_or_path = "hfl/chinese-xlnet-base"
    model_type = "xlnet"
    task_name = "policy"
    output_mode = None

    ## Other parameters
    config_name=""
    tokenizer_name=""
    cache_dir=""
    max_seq_length=512
    do_train=True
    do_eval=True
    do_predict=True
    evaluate_during_training=True
    do_lower_case=True

    gradient_accumulation_steps=2
    per_gpu_train_batch_size=1
    per_gpu_eval_batch_size=16
    learning_rate=2e-5
    weight_decay=0.0
    adam_epsilon=1e-8
    max_grad_norm=1.0
    num_train_epochs=3.0
    max_steps=-1
    warmup_steps=0

    logging_steps=14923
    save_steps=14923
    eval_all_checkpoints=False
    no_cuda=False
    overwrite_output_dir=True
    overwrite_cache=False
    seed=42

    fp16=False
    fp16_opt_level='O1'
    local_rank=-1

    # Additional layer parameters
    # CNN
    filter_num=256
    filter_sizes=[3,4,5]

    # LSTM
    lstm_hidden_size=512
    lstm_layers=1
    lstm_dropout=0.1

    # GRU
    gru_hidden_size=512
    gru_layers=1
    gru_dropout=0.1

args = Args()
#是否覆盖输出目录
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
    raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend='nccl')
    args.n_gpu = 1
args.device = device

# Setup logging
logging.basicConfig(format = '%(asctime)s-%(levelname)s-%(name)s | %(message)s',
                    datefmt = '%Y/%m/%d %H:%M:%S',
                    level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

# Set seed
set_seed(args)



In [8]:
# Load pretrained model and tokenizer
if args.local_rank not in [-1, 0]:
    torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

# train

In [9]:
import os
import csv
import sys
import copy
import json
import logging

#from .utils import DataProcessor, InputExample, InputFeatures
from transformers.file_utils import is_tf_available

if is_tf_available():
    import tensorflow as tf

logger = logging.getLogger(__name__)


class InputExample(object):
    """
    仅仅将文本转化为类
    A single training/test example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single sequence tasks, only this sequence must be specified.
        text_b: (Optional) string. The untokenized text of the second sequence. Only must be specified for sequence pair tasks.
        label: (Optional) string. The label of the example. This should be specified for train and dev examples, but not for test examples.
    """
    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

    def __repr__(self):
        '''在打印InputExample或者对象时显示__repr__定义的信息'''
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)   #self.__dict__: 包含InputExample对象所有属性及其值的字典
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"


class InputFeatures(object):
    """
    A single set of features of data.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            Usually  ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        label: Label corresponding to the input
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, label, real_token_len):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.label = label
        self.real_token_len = real_token_len

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Gets an example from a dict with tensorflow tensors

        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        """
        raise NotImplementedError()

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    def tfds_map(self, example):
        """Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. 
        This method converts examples to the correct format."""
        if len(self.get_labels()) > 1:
            example.label = self.get_labels()[int(example.label)]
        return example

    @classmethod
    def _read_csv(cls, input_file, quotechar=None):
        """Reads a tab/comma separated value file.
        
        将数据以每行的形式转换为lines tuple
        """

        with open(input_file, "r", encoding="utf-8-sig") as f:
            reader = csv.reader(f, delimiter=",", quotechar=quotechar)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:    #检查python的版本是不是python2
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines



In [10]:
class THUNewsProcessor(DataProcessor):
    """Processor for the SST-2 data set (GLUE version)."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['sentence'].numpy().decode('utf-8'),
                            None,
                            str(tensor_dict['label'].numpy()))

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(os.path.join(data_dir, "train.csv"), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(os.path.join(data_dir, "dev.csv"), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._create_examples(os.path.join(data_dir, "test.csv"), "test")

    def get_labels(self):
        """设置当前数据集的标签"""
        return [
            "财税扶持",
            "资质认定",
            "资金补贴",
            "人才激励",
            "课题项目",
            "政府采购",
            "赛事活动",
        ]

    def _create_examples(self, path, set_type):
        """Creates examples for the training/dev/test sets."""
        import pandas as pd
        df = pd.read_csv(path)
        print(df)
        examples = []
        for i in range(len(df)):
            guid = "%s-%s" % (set_type, i)
            row = df.iloc[i]
            text_a = str(row["text"])
            if set_type == 'test':
                label = '财税扶持'
            else:
                label = str(row["label"])
            if text_a is None:
                continue
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples

In [11]:
tasks_num_labels = {
    "policy": 7,
}

processors = {
    "policy": THUNewsProcessor,
}

output_modes = {
    "policy": "classification",
}

In [12]:
# Prepare task
args.task_name = args.task_name.lower()
if args.task_name not in processors:
    raise ValueError("Task not found: %s" % (args.task_name))
processor = processors[args.task_name]()
args.output_mode = output_modes[args.task_name] #为classification
label_list = processor.get_labels() #label_list为标签列表
num_labels = len(label_list)

In [13]:
args.model_type = args.model_type.lower()

config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
                                      num_labels=num_labels,
                                      finetuning_task=args.task_name,
                                      cache_dir=args.cache_dir if args.cache_dir else None)
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
                                            do_lower_case=args.do_lower_case,
                                            cache_dir=args.cache_dir if args.cache_dir else None)
model = model_class.from_pretrained(args.model_name_or_path,
                                    from_tf=bool('.ckpt' in args.model_name_or_path),
                                    config=config,
                                    cache_dir=args.cache_dir if args.cache_dir else None)

#Log输出模型超参数
print("Model config %s", str(config))

if args.local_rank == 0:
    torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

model.to(args.device)

#log输出训练/评估超参数
print("==== Training/Evaluation Parameters: =====")
for attr, value in sorted(args.__dict__.items()):    
    print('\t{}={}'.format(attr, value))
print("==== Parameters End =====\n")

Some weights of the model checkpoint at hfl/chinese-xlnet-base were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-xlnet-base and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for p

Model config %s XLNetConfig {
  "_name_or_path": "hfl/chinese-xlnet-base",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "relu",
  "finetuning_task": "policy",
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "output_past": true,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type"

In [14]:
def convert_examples_to_features(examples, tokenizer,
                                      max_length=512,
                                      task=None,
                                      label_list=None,
                                      output_mode=None,
                                      pad_on_left=False,
                                      pad_token=0,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    """
    print(len(examples))
    print(examples[0])
    
    if task is not None:
        processor = processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    
    print("begin progress")
    for (ex_index, example) in enumerate(examples):
#         if ex_index % 10 == 0:
        print("Writing example %d" % (ex_index))

        #inputs: dict
        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        #input_ids: 输入数据token在词汇表中的索引
        #token_type_ids: 分段token索引，类似segment embedding
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        real_token_len = len(input_ids)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        if output_mode == "classification":
            label = label_map[example.label]    #label => index
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)


        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("real_token_len: %s" % (real_token_len))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              label=label,
                              real_token_len=real_token_len))

    return features

def load_and_cache_examples(args, task, tokenizer, evaluate=False, predict=False):
    '''
        将dataset转换为features，并保存在目录cached_features_file中。
    
    args:
        evaluate: False. 若为True，则对dev.csv进行转换
        predict: False. 若为True，则则对test.csv进行转换

    return:
        dataset
    '''

    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    processor = processors[task]()  #THUNewsProcessor()
    output_mode = output_modes[task]    #classification

    # Load data features from cache or dataset file
    #cached_features_file 为数据集的构造的特征的保存目录
    if evaluate:
        exec_model = 'dev'
    elif predict:
        exec_model = 'test'
    else:
        exec_model = 'train'

    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
        exec_model,
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(args.max_seq_length),
        str(task)))

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s\n", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir)
        elif predict:
            examples = processor.get_test_examples(args.data_dir)
        else:
            examples = processor.get_train_examples(args.data_dir)
        features = convert_examples_to_features(examples,
                                                tokenizer,
                                                label_list=label_list,
                                                max_length=args.max_seq_length,
                                                output_mode=output_mode,
                                                pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                                pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
 
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
    
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[3]}
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet', 'albert'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            
            if args.model_type in ['bert_cnn']:
                #inputs['real_token_len'] = batch[4]
                pass
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps  #每个batch都将loss除以gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            epoch_iterator.set_description("loss {}".format(round(loss.item(), 5)))

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:  #过gradient_accumulation_steps后才将梯度清零
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                #每logging_steps，进行evaluate
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = 'eval_{}'.format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs['learning_rate'] = learning_rate_scalar
                    logs['loss'] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{'step': global_step}}))

                #每save_steps保存checkpoint
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step

# Training
if args.do_train:
    train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
    global_step, tr_loss = train(args, train_dataset, model, tokenizer)
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

2021/09/29 21:13:37-INFO-__main__ | Creating features from dataset file at ../dataset/policy


     label                                               text
0     人才激励  各省、自治区、直辖市财政厅（局），新疆生产建设兵团财政局，中共中央直属机关事务管理局财务管理...
1     资质认定  中国对外经济贸易文告2008年第七十一期\n商务部规章及政策措施\n1、商务部关于公开征求对...
2     人才激励  各区动态\n您现在的位置\n首页\n>\n政务公开\n>\n政府信息公开\n>\n各区动态\...
3     资质认定  通知公告\n您现在的位置\n首页\n>\n政务公开\n>\n政府信息公开\n>\n通知公告\...
4     资金补贴  关于开展第九届北京市文学艺术奖（文学作品）评选工作的通知\n时间：2020-03-10\n来...
...    ...                                                ...
6109  资质认定  【公示】关于公示北京市2021年度第一批拟更名高新技术企业名单的通知\n时间：2021-04...
6110  资金补贴  关于2020年疫情期间“房租通”资金拟支持企业公示名单\n时间：2020-09-08\n来源...
6111  资金补贴  农业农村部种植业管理司关于印发《2020年第三届全国农业行业职业技能大赛（农作物植保员）实施...
6112  资金补贴  教育部\n科技部印发《关于规范高等学校SCI论文相关指标使用树立正确价导向的若干意见》的通知...
6113  资金补贴  宣传册页设计制作及投放项目遴选公告\n时间：2020-06-11\n来源：北京市文旅局\n点...

[6114 rows x 2 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
2021/09/29 21:13:38-INFO-__main__ | *** Example ***
2021/09/29 21:13:38-INFO-__main__ | guid: train-0
2021/09/29 21:13:38-INFO-__main__ | real_token_len: 168
2021/09/29 21:13:38-INFO-__main__ | input_ids: 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 

6114
{
  "guid": "train-0",
  "label": "\u4eba\u624d\u6fc0\u52b1",
  "text_a": "\u5404\u7701\u3001\u81ea\u6cbb\u533a\u3001\u76f4\u8f96\u5e02\u8d22\u653f\u5385\uff08\u5c40\uff09\uff0c\u65b0\u7586\u751f\u4ea7\u5efa\u8bbe\u5175\u56e2\u8d22\u653f\u5c40\uff0c\u4e2d\u5171\u4e2d\u592e\u76f4\u5c5e\u673a\u5173\u4e8b\u52a1\u7ba1\u7406\u5c40\u8d22\u52a1\u7ba1\u7406\u529e\u516c\u5ba4\uff0c\u56fd\u5bb6\u673a\u5173\u4e8b\u52a1\u7ba1\u7406\u5c40\u8d22\u52a1\u7ba1\u7406\u53f8\uff0c\u5317\u4eac\u3001\u4e0a\u6d77\u3001\u53a6\u95e8\u56fd\u5bb6\u4f1a\u8ba1\u5b66\u9662\uff1a\n\u7ecf\u7814\u7a76\u51b3\u5b9a\uff0c2020\u5e74\u5ea6\u56fd\u9645\u5316\u9ad8\u7aef\u4f1a\u8ba1\u4eba\u624d\u57f9\u517b\u9009\u62d4\u7b14\u8bd5\u5b9a\u4e8e2020\u5e749\u670812\u65e5\u4e3e\u884c\uff0c\u65f6\u95f4\u4e3a\u4e0a\u53488:30-12:00\uff0c\u5730\u70b9\u7531\u5404\u7701\u7ea7\u8d22\u653f\u90e8\u95e8\u548c\u4e2d\u592e\u6709\u5173\u4e3b\u7ba1\u5355\u4f4d\u5206\u522b\u786e\u5b9a\u5e76\u901a\u77e5\u53c2\u52a0\u8003\u8bd5\u4eba\u5458\u3

2021/09/29 21:15:04-INFO-__main__ | Saving features into cached file ../dataset/policy/cached_train_chinese-xlnet-base_512_policy


Writing example 6111
Writing example 6112
Writing example 6113


2021/09/29 21:15:07-INFO-__main__ | ***** Running training *****
2021/09/29 21:15:07-INFO-__main__ |   Num examples = 6114
2021/09/29 21:15:07-INFO-__main__ |   Num Epochs = 3
2021/09/29 21:15:07-INFO-__main__ |   Instantaneous batch size per GPU = 1
2021/09/29 21:15:07-INFO-__main__ |   Total train batch size (w. parallel, distributed & accumulation) = 8
2021/09/29 21:15:07-INFO-__main__ |   Gradient Accumulation steps = 2
2021/09/29 21:15:07-INFO-__main__ |   Total optimization steps = 2292
Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

loss 0.6881:   0%|          | 0/1529 [00:13<?, ?it/s][A
loss 0.6881:   0%|          | 1/1529 [00:13<5:49:07, 13.71s/it][A
loss 0.98416:   0%|          | 1/1529 [00:14<5:49:07, 13.71s/it][A
loss 0.98416:   0%|          | 2/1529 [00:14<4:07:10,  9.71s/it][A
loss 1.077:   0%|          | 2/1529 [00:14<4:07:10,  9.71s/it]  [A
loss 1.077:   0%|          | 3/1529 [00:14<2:55:20,  6.89s/it][A
loss 0.94112:   0%|          | 3/1529 [00:14<2:55:20,  6.89s/i

In [15]:
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
    # Create output directory if needed
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    logger.info("Saving model checkpoint to %s", args.output_dir)
    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    # Good practice: save your training arguments together with the trained model
    torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

    # Load a trained model and vocabulary that you have fine-tuned
    model = model_class.from_pretrained(args.output_dir)
    tokenizer = tokenizer_class.from_pretrained(args.output_dir)
    model.to(args.device)

2021/09/29 21:41:51-INFO-__main__ | Saving model checkpoint to ../results/policy/xlnet


# 测试

In [16]:
from metrics import acc_and_f1

In [23]:
def evaluate(args, model, tokenizer, prefix=""):
    results = {}

    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, drop_last=True)

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    print("***** Running evaluation {} *****".format(prefix))
    print("  Num examples = %d" % len(eval_dataset))
    print("  Batch size = %d" % args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None    #为预测值
    out_label_ids = None    #为真实标签
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[3]}
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet', 'albert'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    print(args.output_mode)
    if args.output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    elif args.output_mode == "regression":
        preds = np.squeeze(preds)
    result = acc_and_f1(preds, out_label_ids)
    results.update(result)

    output_eval_file = os.path.join(args.output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        print("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            print("  %s = %s"% (key, str(result[key])))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return results

# Evaluation
results = {}
if args.do_eval and args.local_rank in [-1, 0]:
    tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
    checkpoints = [args.output_dir]
    if args.eval_all_checkpoints:
        checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
        logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
    print("Evaluate the following checkpoints: %s"% checkpoints)
    for checkpoint in checkpoints:
        global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
        prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""

        model = model_class.from_pretrained(checkpoint)
        model.to(args.device)
        result = evaluate(args, model, tokenizer, prefix=prefix)
        result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
        results.update(result)

Evaluate the following checkpoints: ['../results/policy/xlnet']


2021/09/29 22:53:17-INFO-__main__ | Loading features from cached file ../dataset/policy/cached_dev_chinese-xlnet-base_512_policy

Evaluating:   0%|          | 0/23 [00:00<?, ?it/s]

***** Running evaluation  *****
  Num examples = 1529
  Batch size = 64


Evaluating: 100%|██████████| 23/23 [00:30<00:00,  1.33s/it]

classification
***** Eval results  *****
  acc = 0.8152173913043478
  acc_and_f1 = 0.7535155703290776
  f1 = 0.6918137493538074





# 预测

In [24]:
def predict(args, model, tokenizer, prefix=""):
    #results = {}

    pred_dataset = load_and_cache_examples(args, args.task_name, tokenizer, predict=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(pred_dataset)
    eval_dataloader = DataLoader(pred_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    print("***** Running predict {} *****".format(prefix))
    print("  Num examples = %d", len(pred_dataset))
    print("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None    #为预测值
    #out_label_ids = None    #为真实标签
    for batch in tqdm(eval_dataloader, desc="Predicting"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[3]}
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            #out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            #out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    if args.output_mode == "classification":
        preds = np.argmax(preds, axis=1)

    pd.DataFrame(preds).to_csv(os.path.join(args.output_dir, "predicted.csv"), index=False)
    #preds.to_csv(os.path.join(args.output_dir, "predicted.csv"))
    #print(preds)


    #elif args.output_mode == "regression":
    #    preds = np.squeeze(preds)
    #result = acc_and_f1(preds, out_label_ids)
    #results.update(result)

    '''
    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
    '''

#Predict
print("++++++++++++++++++++++++++=")
print("executing!")
if args.do_predict and args.local_rank in [-1, 0]:
    tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
    checkpoints = [args.output_dir]
    if args.eval_all_checkpoints:
        checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
        logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
    print("Evaluate the following checkpoints: %s", checkpoints)
    for checkpoint in checkpoints:
        global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
        prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""

        model = model_class.from_pretrained(checkpoint)
        model.to(args.device)
        print("Predict...")
        result = predict(args, model, tokenizer, prefix=prefix)
        #result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
        #results.update(result)

++++++++++++++++++++++++++=
executing!
Evaluate the following checkpoints: %s ['../results/policy/xlnet']


2021/09/29 23:00:21-INFO-__main__ | Creating features from dataset file at ../dataset/policy


Predict...
                                                    text
0      内蒙古自治区发展和改革委员会关于公开2020年度本级和所属各单位决算信息的函\n来源：\n内...
1      内蒙古自治区发展和改革委员会关于征求《内蒙古自治区关于加快推进快递包装绿色转型的若干措施（征...
2      鄂尔多斯市蒙元煤炭有限公司点石沟铁路专用线项目核准咨询评估服务中标（成交）结果公告\n来源：...
3      新建内蒙古吉林郭勒矿区铁路专用线项目咨询评估服务中标（成交）结果公告\n来源：\n内蒙古自治...
4      达拉特粮食综合物资仓储加工物流园铁路专用线项目咨询评估服务中标（成交）结果公告\n来源：\n...
...                                                  ...
13627  通知公告\n您现在的位置\n首页\n>\n政务公开\n>\n政府信息公开\n>\n通知公告\...
13628                                                NaN
13629  <!--<$[信息内容]>begin-->各有关单位：根据科技部《关于发布国家重点研发计划“...
13630  <!--<$[信息内容]>begin-->京妇发〔2021〕9号北京市妇联等八部门关于印发《...
13631  <!--<$[信息内容]>begin-->各有关单位：根据《关于建立实施中关村知识产权质押融...

[13632 rows x 1 columns]


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
2021/09/29 23:00:23-INFO-__main__ | *** Example ***
2021/09/29 23:00:23-INFO-__main__ | guid: test-0
2021/09/29 23:00:23-INFO-__main__ | real_token_len: 512
2021/09/29 23:00:23-INFO-__main__ | input_ids: 19 10162 868 24 1520 887 1549 3808 10311 1550 152 725 24 10446 291 2467 5171 2213 1638 20 4680 19 6348 30 19 10162 868 24 1520 887 19 2072 1109 30 162 286 13 3647 13 77 19 172 30 376 19 7591 64 30 19 23321 11319 16972 30 173 51 19 395 1016 152 7567 1683 36 10162 5659 2511 1549 17123 10311 22 154 5171 2213 24 3007 5171 2213 3808 8703 4867 31 10 194 6140 1177 13511 162 286 13664 69 1028 246 11 1854 553 17 2895 378 1341 34

13632
{
  "guid": "test-0",
  "label": "\u8d22\u7a0e\u6276\u6301",
  "text_a": "\u5185\u8499\u53e4\u81ea\u6cbb\u533a\u53d1\u5c55\u548c\u6539\u9769\u59d4\u5458\u4f1a\u5173\u4e8e\u516c\u5f002020\u5e74\u5ea6\u672c\u7ea7\u548c\u6240\u5c5e\u5404\u5355\u4f4d\u51b3\u7b97\u4fe1\u606f\u7684\u51fd\n\u6765\u6e90\uff1a\n\u5185\u8499\u53e4\u81ea\u6cbb\u533a\u53d1\u5c55\u548c\u6539\u9769\u59d4\u5458\u4f1a\n\u53d1\u5e03\u65f6\u95f4\uff1a2021-09-10\n16:24\n\u5206\u4eab\u5230\uff1a\n\u6d4f\u89c8\u6b21\u6570\uff1a183\n\u6253\u5370\u672c\u9875\n\u6839\u636e\u300a\u5185\u8499\u53e4\u81ea\u6cbb\u533a\u8d22\u653f\u5385\u5173\u4e8e\u505a\u597d2020\u5e74\u653f\u5e9c\u51b3\u7b97\u548c\u90e8\u95e8\u51b3\u7b97\u516c\u5f00\u5de5\u4f5c\u7684\u901a\u77e5\u300b\uff08\u5185\u8d22\u5e93\ufe5d2021\ufe5e870\u53f7\uff09\u6587\u4ef6\u8981\u6c42\uff0c\u73b0\u5728\u6211\u59d4\u95e8\u6237\u7f51\u7ad9\u7684\u201c\u9884\u51b3\u7b97\u516c\u5f00\u4e13\u680f\u201d\u548c\u201c\u901a\u77e5\u516c\u544a\u4e13\u680f\u201d\u4e0a\u516c\

2021/09/29 23:02:15-INFO-__main__ | Saving features into cached file ../dataset/policy/cached_test_chinese-xlnet-base_512_policy


Writing example 13631


Predicting:   0%|          | 0/213 [00:00<?, ?it/s]

***** Running predict  *****
  Num examples = %d 13632
  Batch size = %d 64


Predicting: 100%|██████████| 213/213 [04:44<00:00,  1.33s/it]
