In [1]:
import csv
import random
from nltk.tokenize import wordpunct_tokenize
dataset=[]
label_convert={'positive':1,'negative':0}
with open('IMDB Dataset.csv',encoding='utf-8')as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        if row==['review', 'sentiment']:
            continue
        dataset.append([row[0].replace('<br />',' '),label_convert[row[1]]])

In [2]:
text=[]
label=[]
for row in dataset:
    # 对文本 分解为逐个单词
    text.append(wordpunct_tokenize(row[0].lower()))
    label.append(row[1])

In [3]:
word_dict={'PADDING':0}
for sent in text:    
    for token in sent:        
        if token not in word_dict:
            word_dict[token]=len(word_dict)

In [4]:
from tqdm import tqdm 
import numpy as np
news_words = []
for sent in text:       
    sample=[]
    for token in sent:     
        sample.append(word_dict[token])
    sample = sample[:512]
    # 取前512个tokens 之后用 0 补充
    news_words.append(sample+[0]*(512-len(sample)))


In [5]:
print(news_words[0])

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 20, 3, 33, 34, 8, 35, 32, 36, 13, 37, 38, 39, 40, 41, 42, 2, 43, 24, 44, 45, 46, 23, 47, 3, 48, 49, 20, 50, 32, 24, 26, 27, 51, 52, 53, 54, 3, 55, 56, 57, 58, 20, 26, 53, 59, 60, 61, 31, 62, 63, 64, 24, 65, 57, 43, 20, 38, 27, 66, 24, 46, 3, 67, 68, 2, 3, 48, 20, 69, 27, 70, 13, 25, 8, 27, 3, 71, 72, 63, 3, 73, 74, 75, 76, 77, 20, 69, 78, 79, 80, 81, 82, 24, 83, 84, 85, 2, 3, 86, 87, 88, 3, 89, 90, 91, 92, 40, 93, 94, 24, 95, 96, 27, 51, 97, 80, 3, 98, 20, 99, 82, 27, 100, 63, 101, 102, 103, 24, 104, 24, 105, 24, 106, 24, 107, 24, 108, 24, 109, 40, 110, 111, 95, 112, 24, 113, 114, 24, 115, 116, 40, 117, 118, 22, 119, 120, 121, 20, 122, 123, 124, 3, 125, 126, 2, 3, 53, 27, 127, 63, 3, 128, 8, 69, 129, 87, 4, 130, 131, 16, 132, 133, 20, 134, 135, 136, 137, 54, 138, 139, 24, 134, 140, 24, 134, 141, 142, 13, 143, 16, 132, 144, 145, 20, 3, 33, 14, 122, 146, 147, 35, 32, 25

In [6]:

news_words=np.array(news_words,dtype='int32') 
label=np.array(label,dtype='int32') 

In [7]:
index=np.arange(len(label))
train_index=index[:25000]
np.random.shuffle(train_index)
test_index=index[25000:]

In [8]:
import os
import logging
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


In [9]:
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertSelfOutput, BertIntermediate, BertOutput
config=BertConfig.from_json_file('config.json')


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
import torch
import torch.nn as nn

class AttentionPooling(nn.Module):
    def __init__(self, config):
        self.config = config
        super(AttentionPooling, self).__init__()
        self.att_fc1 = nn.Linear(config.hidden_size, config.hidden_size)
        self.att_fc2 = nn.Linear(config.hidden_size, 1)
        self.apply(self.init_weights)
        
    def init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
            
                
    def forward(self, x, attn_mask=None):
        bz = x.shape[0]
        e = self.att_fc1(x)
        e = nn.Tanh()(e)
        alpha = self.att_fc2(e)
        alpha = torch.exp(alpha)
        if attn_mask is not None:
            alpha = alpha * attn_mask.unsqueeze(2)
        alpha = alpha / (torch.sum(alpha, dim=1, keepdim=True) + 1e-8)
        x = torch.bmm(x.permute(0, 2, 1), alpha)
        x = torch.reshape(x, (bz, -1))  
        return x


In [11]:
class FlipSelfAttention(nn.Module):
    def __init__(self, config, **kwargs):
        super(FlipSelfAttention, self).__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" %
                (config.hidden_size, config.num_attention_heads))

        self.attention_head_size = int(config.hidden_size /config.num_attention_heads)
        self.num_attention_heads = config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)

        self.softmax = nn.Softmax(dim=-1)

        self.length_flip_index_dict={}

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads,
                                       self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    

    def forward(self, hidden_states, attention_mask):
        # batch_size, seq_len, num_head * head_dim
        batch_size, seq_len, _ = hidden_states.shape 
        pading_tensor_length = int(2**np.ceil(np.log2(seq_len)))
        if pading_tensor_length not in self.length_flip_index_dict:
            tempindex=np.arange(pading_tensor_length)
            #bias = [0]+[random.randint(1,pading_tensor_length-1) for _ in range(3)]
            tensors=[]
            #for j in bias:
            if self.num_attention_heads<=int(np.ceil(np.log2(seq_len))):
                for i in range(self.num_attention_heads):
                    pp=int(2**(np.ceil(np.log2(seq_len))-1-i))
                    temp=np.reshape(tempindex,(pp,pading_tensor_length//pp))
                    tensors.append(np.flip((temp)%pading_tensor_length,axis=-1).flatten())
            else:
                for i in range(self.num_attention_heads):
                    pp=int(2**(np.ceil(np.log2(seq_len))-1-i%int(np.ceil(np.log2(seq_len)))))
                    temp=np.reshape(tempindex,(pp,pading_tensor_length//pp))
                    if i<np.ceil(np.log2(seq_len)) and i<self.num_attention_heads//2:
                        bias = 0
                    else:
                        bias = random.randint(1,pading_tensor_length-1)
                    tensors.append(np.flip((temp+bias)%pading_tensor_length,axis=-1).flatten())
            self.length_flip_index_dict[pading_tensor_length] = torch.LongTensor(np.array(tensors)).to(hidden_states.device)

        mixed_query_layer = self.query(hidden_states).view(-1,seq_len,self.num_attention_heads,self.attention_head_size).transpose(1, 2)
        mixed_key_layer = self.key(hidden_states).view(-1,seq_len,self.num_attention_heads,self.attention_head_size).transpose(1, 2)
      
        newten=[]
        for i in range(self.num_attention_heads):
            newten.append(torch.index_select(mixed_key_layer[:,i], 1, self.length_flip_index_dict[pading_tensor_length][i]))
        fliper=torch.stack(newten,dim=1)
        mixed_query_layer = mixed_query_layer*mixed_query_layer 
        mixed_query_layer = mixed_query_layer.transpose(1, 2).reshape(batch_size,seq_len,self.num_attention_heads*self.attention_head_size)

        return mixed_query_layer 

In [12]:

class FlipAttention(nn.Module):
    def __init__(self, config):
        super(FlipAttention, self).__init__()
        self.self = FlipSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask):
        self_output = self.self(input_tensor, attention_mask)
        attention_output = self.output(self_output, input_tensor)
        return attention_output

class FlipformerLayer(nn.Module):
    def __init__(self, config):
        super(FlipformerLayer, self).__init__()
        self.attention = FlipAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask):
        attention_output = self.attention(hidden_states, attention_mask)
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
    
class FlipformerEncoder(nn.Module):
    def __init__(self, config, pooler_count=1):
        super(FlipformerEncoder, self).__init__()
        self.config = config
        self.encoders = nn.ModuleList([FlipformerLayer(config) for _ in range(config.num_hidden_layers)])
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # support multiple different poolers with shared bert encoder.
        self.poolers = nn.ModuleList()
        if config.pooler_type == 'weightpooler':
            for _ in range(pooler_count):
                self.poolers.append(AttentionPooling(config))
        logging.info(f"This model has {len(self.poolers)} poolers.")

        self.apply(self.init_weights)

    def init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, (nn.Embedding)) and module.padding_idx is not None:
                with torch.no_grad():
                    module.weight[module.padding_idx].fill_(0)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def forward(self, 
                input_embs, 
                attention_mask, 
                pooler_index=0):
        #input_embs: batch_size, seq_len, emb_dim
        #attention_mask: batch_size, seq_len, emb_dim

        extended_attention_mask = attention_mask.unsqueeze(1)
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        batch_size, seq_length, emb_dim = input_embs.shape
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_embs.device)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
        position_embeddings = self.position_embeddings(position_ids)

        embeddings = input_embs + position_embeddings
        
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        
        
        
        all_hidden_states = [embeddings]
        for i, layer_module in enumerate(self.encoders):
            layer_outputs = layer_module(all_hidden_states[-1], extended_attention_mask)
            all_hidden_states.append(layer_outputs)
        assert len(self.poolers) > pooler_index
        output = self.poolers[pooler_index](all_hidden_states[-1], attention_mask)

        return output 


In [13]:
    
class Model(torch.nn.Module):

    def __init__(self,config):
        super(Model, self).__init__()
        self.config = config
        self.dense_linear = nn.Linear(config.hidden_size,2)
        self.word_embedding = nn.Embedding(len(word_dict),config.hidden_size,padding_idx=0)
        self.fastformer_model = FlipformerEncoder(config)
        self.criterion = nn.CrossEntropyLoss() 
        self.apply(self.init_weights)
        
    def init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, (nn.Embedding)) and module.padding_idx is not None:
                with torch.no_grad():
                    module.weight[module.padding_idx].fill_(0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
    
    def forward(self,input_ids,targets):
        mask=input_ids.bool().float()
        embds=self.word_embedding(input_ids)
        text_vec = self.fastformer_model(embds,mask)
        score = self.dense_linear(text_vec)
        loss = self.criterion(score, targets) 
        return loss, score

In [14]:
def acc(y_true, y_hat):
    y_hat = torch.argmax(y_hat, dim=-1)
    tot = y_true.shape[0]
    hit = torch.sum(y_true == y_hat)
    return hit.data.float() * 1.0 / tot

In [15]:
import math
model = Model(config)
import torch.optim as optim
optimizer = optim.Adam([ {'params': model.parameters(), 'lr': 1e-3}])
model.cuda()

Model(
  (dense_linear): Linear(in_features=256, out_features=2, bias=True)
  (word_embedding): Embedding(103893, 256, padding_idx=0)
  (fastformer_model): FlipformerEncoder(
    (encoders): ModuleList(
      (0-1): 2 x FlipformerLayer(
        (attention): FlipAttention(
          (self): FlipSelfAttention(
            (query): Linear(in_features=256, out_features=256, bias=True)
            (key): Linear(in_features=256, out_features=256, bias=True)
            (softmax): Softmax(dim=-1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=256, out_features=256, bias=True)
            (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=256, out_features=256, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): BertOutput(
          (dense): Linear

In [16]:
import time
step=0 
for epoch in range(1):
    loss = 0.0
    accuracy = 0.0
    for cnt in tqdm(range(len(train_index)//64)):

        log_ids=news_words[train_index][cnt*64:cnt*64+64,:512]
        targets= label[train_index][cnt*64:cnt*64+64]

        log_ids = torch.LongTensor(log_ids).cuda(non_blocking=True)
        targets = torch.LongTensor(targets).cuda(non_blocking=True)
        bz_loss, y_hat = model(log_ids, targets)
        loss += bz_loss.data.float()
        accuracy += acc(targets, y_hat)
        unified_loss=bz_loss
        optimizer.zero_grad()
        unified_loss.backward()
        optimizer.step() 
        step+=1
        if cnt % 100== 0:
            print( ' Ed: {}, train_loss: {:.5f}, acc: {:.5f}'.format(cnt * 64, loss.data / (cnt+1), accuracy / (cnt+1)))
    model.eval()
    allpred=[]
    for cnt in range(len(test_index)//64+1):
    
        log_ids=news_words[test_index][cnt*64:cnt*64+64,:512]
        targets= label[test_index][cnt*64:cnt*64+64]
        log_ids = torch.LongTensor(log_ids).cuda(non_blocking=True)
        targets = torch.LongTensor(targets).cuda(non_blocking=True)
    
        bz_loss2, y_hat2 = model(log_ids, targets)
        allpred+=y_hat2.to('cpu').detach().numpy().tolist()
        
    y_pred=np.argmax(allpred,axis=-1)
    y_true=label[test_index]
    from sklearn.metrics import *
    print(accuracy_score(y_true, y_pred))
    model.train()

  1%|          | 3/390 [00:04<08:14,  1.28s/it]

 Ed: 0, train_loss: 0.68936, acc: 0.56250


 27%|██▋       | 104/390 [00:09<00:14, 20.14it/s]

 Ed: 6400, train_loss: 0.45629, acc: 0.77816


 52%|█████▏    | 203/390 [00:14<00:07, 24.80it/s]

 Ed: 12800, train_loss: 0.39661, acc: 0.81810


 78%|███████▊  | 305/390 [00:18<00:04, 20.23it/s]

 Ed: 19200, train_loss: 0.36830, acc: 0.83524


100%|██████████| 390/390 [00:23<00:00, 16.94it/s]


0.88196
