# Transformer&Classifier

## define model

In [77]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy

'''Attention Is All You Need'''


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab-1)

        self.postion_embedding = Positional_Encoding(config.embed, config.pad_size, config.dropout, config.device)
        self.encoder = Encoder(config.dim_model, config.num_head, config.hidden, config.dropout)
        self.encoders = nn.ModuleList([
            copy.deepcopy(self.encoder)
            # Encoder(config.dim_model, config.num_head, config.hidden, config.dropout)
            for _ in range(config.num_encoder)])

        self.fc1 = nn.Linear(config.pad_size * config.dim_model, config.num_classes)
        # self.fc2 = nn.Linear(config.last_hidden, config.num_classes)
        # self.fc1 = nn.Linear(config.dim_model, config.num_classes)

    def forward(self, x):
        out = self.embedding(x)
        #return out
        out = self.postion_embedding(out)
        
        for encoder in self.encoders:
            out = encoder(out)
        out = out.view(out.size(0), -1)
        # out = torch.mean(out, 1)
        out = self.fc1(out)
        return out


class Encoder(nn.Module):
    def __init__(self, dim_model, num_head, hidden, dropout):
        super(Encoder, self).__init__()
        self.attention = Multi_Head_Attention(dim_model, num_head, dropout)
        self.feed_forward = Position_wise_Feed_Forward(dim_model, hidden, dropout)

    def forward(self, x):
        out = self.attention(x)
        out = self.feed_forward(out)
        return out


class Positional_Encoding(nn.Module):
    def __init__(self, embed, pad_size, dropout, device):
        super(Positional_Encoding, self).__init__()
        self.device = device
        self.pe = torch.tensor([[pos / (10000.0 ** (i // 2 * 2.0 / embed)) for i in range(embed)] for pos in range(pad_size)])
        self.pe[:, 0::2] = np.sin(self.pe[:, 0::2])
        self.pe[:, 1::2] = np.cos(self.pe[:, 1::2])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = x + nn.Parameter(self.pe, requires_grad=False).to(self.device)
        out = self.dropout(out)
        return out


class Scaled_Dot_Product_Attention(nn.Module):
    '''Scaled Dot-Product Attention '''
    def __init__(self):
        super(Scaled_Dot_Product_Attention, self).__init__()

    def forward(self, Q, K, V, scale=None):
        '''
        Args:
            Q: [batch_size, len_Q, dim_Q]
            K: [batch_size, len_K, dim_K]
            V: [batch_size, len_V, dim_V]
            scale: 缩放因子 论文为根号dim_K
        Return:
            self-attention后的张量，以及attention张量
        '''
        attention = torch.matmul(Q, K.permute(0, 2, 1))
        if scale:
            attention = attention * scale
        # if mask:  # TODO change this
        #     attention = attention.masked_fill_(mask == 0, -1e9)
        attention = F.softmax(attention, dim=-1)
        context = torch.matmul(attention, V)
        return context


class Multi_Head_Attention(nn.Module):
    def __init__(self, dim_model, num_head, dropout=0.0):
        super(Multi_Head_Attention, self).__init__()
        self.num_head = num_head
        assert dim_model % num_head == 0
        self.dim_head = dim_model // self.num_head
        self.fc_Q = nn.Linear(dim_model, num_head * self.dim_head)
        self.fc_K = nn.Linear(dim_model, num_head * self.dim_head)
        self.fc_V = nn.Linear(dim_model, num_head * self.dim_head)
        self.attention = Scaled_Dot_Product_Attention()
        self.fc = nn.Linear(num_head * self.dim_head, dim_model)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(dim_model)

    def forward(self, x):
        batch_size = x.size(0)
        Q = self.fc_Q(x)
        K = self.fc_K(x)
        V = self.fc_V(x)
        Q = Q.view(batch_size * self.num_head, -1, self.dim_head)
        K = K.view(batch_size * self.num_head, -1, self.dim_head)
        V = V.view(batch_size * self.num_head, -1, self.dim_head)
        # if mask:  # TODO
        #     mask = mask.repeat(self.num_head, 1, 1)  # TODO change this
        scale = K.size(-1) ** -0.5  # 缩放因子
        context = self.attention(Q, K, V, scale)

        context = context.view(batch_size, -1, self.dim_head * self.num_head)
        out = self.fc(context)
        out = self.dropout(out)
        out = out + x  # 残差连接
        out = self.layer_norm(out)
        return out


class Position_wise_Feed_Forward(nn.Module):
    def __init__(self, dim_model, hidden, dropout=0.0):
        super(Position_wise_Feed_Forward, self).__init__()
        self.fc1 = nn.Linear(dim_model, hidden)
        self.fc2 = nn.Linear(hidden, dim_model)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(dim_model)

    def forward(self, x):
        out = self.fc1(x)
        out = F.relu(out)
        out = self.fc2(out)
        out = self.dropout(out)
        out = out + x  # 残差连接
        out = self.layer_norm(out)
        return out


In [78]:
import torch
class Config(object):

    """配置参数"""
    def __init__(self):
        self.model_name = 'Transformer'
        self.embedding_pretrained = None                                # 预训练词向量
        self.device = torch.device("mps")                               # 设备
        self.dropout = 0.5                                              # 随机失活
        self.num_classes = 14                                           # 类别数
        self.num_epochs = 10                                            # epoch数
        self.batch_size = 128                                           # mini-batch大小
        self.pad_size = 32                                             # 每句话处理成的长度(短填长切)
        self.n_vocab = None                                             #这里需要读取数据的部分进行赋值
        self.learning_rate = 5e-4                                       # 学习率
        self.embed = 8                                                  # 词向量维度
        self.dim_model = 8
        self.hidden = 32
        self.last_hidden = 16
        self.num_head = 2
        self.num_encoder = 1
        self.n_splits = 2                                               #k折交叉验证
    

## load_data

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
#--------------------------加载数据----------------------------
def load_data(config):
    df = pd.read_csv('./dataset/train_set.csv',sep='\t')

    train = []
    targets = []
    label = df['label'].values
    text = df['text'].values
    id = 0
    vocabs_size = 0
    for val in tqdm(text):
        s = val.split(' ')
        single_data = []
        for i in range(len(s)):
            vocabs_size = max(vocabs_size,int(s[i])+1)
            single_data.append(int(s[i])+1)
            if len(single_data)>=config.pad_size:
                train.append(single_data)
                targets.append(int(label[id]))
                single_data = []
        if len(single_data)>=150:
            single_data = single_data + [0]*(config.pad_size-len(single_data))
            train.append(single_data)
            targets.append(int(label[id]))  
        id += 1
        


    train = np.array(train)
    targets = np.array(targets)
    return train,targets,vocabs_size


## train model

In [79]:
#---------------------------------------------------
import pandas as pd
from collections import Counter
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GroupKFold, KFold
import numpy as np
import torch
from    torch import autograd
import os
from tqdm import tqdm

config = Config()

In [80]:
train,targets,vocabs_size = load_data(config)#加载数据
config.n_vocab = vocabs_size + 1

batch_size = config.batch_size

kf = KFold(n_splits=config.n_splits, shuffle=True, random_state=2021)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 200000/200000 [01:01<00:00, 3250.69it/s]


In [None]:
for fold, (train_idx, test_idx) in enumerate(kf.split(train, targets)):
    print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)
    x_train, x_val = train[train_idx], train[test_idx]
    y_train, y_val = targets[train_idx], targets[test_idx]

    #print(x_train.shape)
    
    M_train = len(x_train)
    M_val = len(x_val)
    if M_train % batch_size == 1:#因为模型里面有层标准化，训练中不能出现单条数据，至少为2条
        M_train -= 1
    if M_val % batch_size == 1:
        M_val -= 1
    x_train = torch.from_numpy(x_train).to(torch.long).to(config.device)
    x_val = torch.from_numpy(x_val).to(torch.long).to(config.device)
    y_train = torch.from_numpy(y_train).to(torch.long).to(config.device)
    y_val = torch.from_numpy(y_val).to(torch.long).to(config.device)

    model = Model(config)#调用transformer的编码器
    model.to(config.device)
    optimizer = torch.optim.Adam(model.parameters(),lr=config.learning_rate)
    loss_func = nn.CrossEntropyLoss()#多分类的任务
    model.train()
    print('开始迭代....')
    #开始迭代
    for step in range(config.num_epochs):
        #print('step=',step+1)
        L_val = -batch_size
        with tqdm(np.arange(0,M_train,batch_size), desc='Training...') as tbar:
            for index in tbar:
                L = index
                R = min(M_train,index+batch_size)
                L_val += batch_size
                L_val %= M_val
                R_val = min(M_val,L_val + batch_size)
                #-----------------训练内容------------------
                #print(x_train[L:R].shape)
                train_pre = model(x_train[L:R])     # 喂给 model训练数据 x, 输出预测值
                train_loss = loss_func(train_pre, y_train[L:R])
                val_pre = model(x_val[L_val:R_val])#验证集也得分批次，不然数据量太大内存爆炸
                val_loss = loss_func(val_pre, y_val[L_val:R_val])

                #----------- -----计算准确率----------------
                train_acc = np.sum(np.argmax(np.array(train_pre.data.cpu()),axis=1) == np.array(y_train[L:R].data.cpu()))/(R-L)
                val_acc = np.sum(np.argmax(np.array(val_pre.data.cpu()),axis=1) == np.array(y_val[L_val:R_val].data.cpu()))/(R_val-L_val)

                #---------------打印在进度条上--------------
                tbar.set_postfix(train_loss=float(train_loss.data.cpu()),train_acc=train_acc,val_loss=float(val_loss.data.cpu()),val_acc=val_acc)
                tbar.update()  # 默认参数n=1，每update一次，进度+n

                #-----------------反向传播更新---------------
                optimizer.zero_grad()   # 清空上一步的残余更新参数值
                train_loss.backward()         # 以训练集的误差进行反向传播, 计算参数更新值
                optimizer.step()        # 将参数更新值施加到 net 的 parameters 上
    

--------------- > Fold 1 < ---------------
开始迭代....


Training...: 100%|███████████████████████████| 21771/21771 [05:12<00:00, 69.66it/s, train_acc=0.478, train_loss=1.77, val_acc=0.612, val_loss=1.6]
Training...: 100%|██████████████████████████| 21771/21771 [05:13<00:00, 69.38it/s, train_acc=0.507, train_loss=1.55, val_acc=0.627, val_loss=1.39]
Training...: 100%|██████████████████████████| 21771/21771 [05:07<00:00, 70.70it/s, train_acc=0.537, train_loss=1.48, val_acc=0.657, val_loss=1.37]
Training...:  66%|█████████████████▊         | 14397/21771 [03:13<01:41, 72.36it/s, train_acc=0.766, train_loss=0.9, val_acc=0.594, val_loss=1.28]

In [81]:
from thop import profile
model = Model(config)

print(model)

Model(
  (embedding): Embedding(7551, 8, padding_idx=7550)
  (postion_embedding): Positional_Encoding(
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (encoder): Encoder(
    (attention): Multi_Head_Attention(
      (fc_Q): Linear(in_features=8, out_features=8, bias=True)
      (fc_K): Linear(in_features=8, out_features=8, bias=True)
      (fc_V): Linear(in_features=8, out_features=8, bias=True)
      (attention): Scaled_Dot_Product_Attention()
      (fc): Linear(in_features=8, out_features=8, bias=True)
      (dropout): Dropout(p=0.5, inplace=False)
      (layer_norm): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
    )
    (feed_forward): Position_wise_Feed_Forward(
      (fc1): Linear(in_features=8, out_features=32, bias=True)
      (fc2): Linear(in_features=32, out_features=8, bias=True)
      (dropout): Dropout(p=0.5, inplace=False)
      (layer_norm): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
    )
  )
  (encoders): ModuleList(
    (0): Encoder(
      (attentio

In [82]:
params = list(model.parameters())
k = 0
for i in params:
    l = 1
    print("该层的结构：" + str(list(i.size())))
    for j in i.size():
        l *= j
    print("该层参数和：" + str(l))
    k = k + l
print("总参数数量和：" + str(k))

该层的结构：[7551, 8]
该层参数和：60408
该层的结构：[8, 8]
该层参数和：64
该层的结构：[8]
该层参数和：8
该层的结构：[8, 8]
该层参数和：64
该层的结构：[8]
该层参数和：8
该层的结构：[8, 8]
该层参数和：64
该层的结构：[8]
该层参数和：8
该层的结构：[8, 8]
该层参数和：64
该层的结构：[8]
该层参数和：8
该层的结构：[8]
该层参数和：8
该层的结构：[8]
该层参数和：8
该层的结构：[32, 8]
该层参数和：256
该层的结构：[32]
该层参数和：32
该层的结构：[8, 32]
该层参数和：256
该层的结构：[8]
该层参数和：8
该层的结构：[8]
该层参数和：8
该层的结构：[8]
该层参数和：8
该层的结构：[8, 8]
该层参数和：64
该层的结构：[8]
该层参数和：8
该层的结构：[8, 8]
该层参数和：64
该层的结构：[8]
该层参数和：8
该层的结构：[8, 8]
该层参数和：64
该层的结构：[8]
该层参数和：8
该层的结构：[8, 8]
该层参数和：64
该层的结构：[8]
该层参数和：8
该层的结构：[8]
该层参数和：8
该层的结构：[8]
该层参数和：8
该层的结构：[32, 8]
该层参数和：256
该层的结构：[32]
该层参数和：32
该层的结构：[8, 32]
该层参数和：256
该层的结构：[8]
该层参数和：8
该层的结构：[8]
该层参数和：8
该层的结构：[8]
该层参数和：8
该层的结构：[14, 256]
该层参数和：3584
该层的结构：[14]
该层参数和：14
总参数数量和：65750


# RNN

In [16]:
import pandas as pd
from collections import Counter
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GroupKFold, KFold
import numpy as np
import torch
from    torch import autograd
import os
from tqdm import tqdm
#from gensim.models.word2vec import Word2Vec

#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("mps")
#--------------------------加载数据----------------------------
df = pd.read_csv('./dataset/train_set.csv',sep='\t')

In [17]:
#mx_length = 900
vocabs_size = 0
n_class = 14
training_step = 20#迭代次数
batch_size = 256#每个批次的大小
train = []
targets = []
label = df['label'].values
text = df['text'].values
id = 0
for val in tqdm(text):
    s = val.split(' ')
    single_data = []
    for i in range(len(s)):
        vocabs_size = max(vocabs_size,int(s[i])+1)
        single_data.append(int(s[i])+1)
        if len(single_data)>=256:
            train.append(single_data)
            targets.append(int(label[id]))
            single_data = []
    if len(single_data)>=150:
        single_data = single_data + [0]*(256-len(single_data))
        train.append(single_data)
        targets.append(int(label[id]))  
    id += 1
    


train = np.array(train)
targets = np.array(targets)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200000/200000 [01:33<00:00, 2146.70it/s]


In [18]:
class Bi_Lstm(nn.Module):
    def __init__(self):
        super(Bi_Lstm,self).__init__() 
        self.embeding = nn.Embedding(vocabs_size+1,100)
        self.lstm = nn.LSTM(input_size = 100, hidden_size = 100,num_layers = 1,bidirectional = False,batch_first=True,dropout=0.5)#加了双向，输出的节点数翻2倍
        self.l1 = nn.BatchNorm1d(100)
        self.l2 = nn.ReLU()
        self.l3 = nn.Linear(100,n_class)#特征输入
        self.l4 = nn.Dropout(0.3)
        self.l5 = nn.BatchNorm1d(n_class)
    def forward(self, x):
        x = self.embeding(x)
        out,_ = self.lstm(x)
        #选择最后一个时间点的output
        out = self.l1(out[:,-1,:])
        out = self.l2(out)
        out = self.l3(out)
        out = self.l4(out)
        out = self.l5(out)
        return out


print(train.shape)
print(targets.shape)

(691522, 256)
(691522,)


In [19]:
kf = KFold(n_splits=2, shuffle=True, random_state=2021)#5折交叉验证
for fold, (train_idx, test_idx) in enumerate(kf.split(train, targets)):
    print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)
    x_train, x_val = train[train_idx], train[test_idx]
    y_train, y_val = targets[train_idx], targets[test_idx]
    
    M_train = len(x_train)-1
    M_val = len(x_val)

    x_train = torch.from_numpy(x_train).to(torch.long).to(device)
    x_val = torch.from_numpy(x_val).to(torch.long).to(device)
    y_train = torch.from_numpy(y_train).to(torch.long).to(device)
    y_val = torch.from_numpy(y_val).to(torch.long).to(device)

    model = Bi_Lstm()
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
    loss_func = nn.CrossEntropyLoss()#多分类的任务
    model.train()#模型中有BN和Droupout一定要添加这个说明
    
    #开始迭代
    for step in range(training_step):
        print('step=',step)
        L_val = -batch_size
        with tqdm(np.arange(0,M_train,batch_size), desc='Training...') as tbar:
            for index in tbar:
                L = index
                R = min(M_train,index+batch_size)
                L_val += batch_size
                L_val %= M_val
                R_val = min(M_val,L_val + batch_size)
                #-----------------训练内容------------------
                train_pre = model(x_train[L:R])     # 喂给 model训练数据 x, 输出预测值
                train_loss = loss_func(train_pre, y_train[L:R])
                val_pre = model(x_val[L_val:R_val])#验证集也得分批次，不然数据量太大内存爆炸
                val_loss = loss_func(val_pre, y_val[L_val:R_val])

                #----------- -----计算准确率----------------
                train_acc = np.sum(np.argmax(np.array(train_pre.data.cpu()),axis=1) == np.array(y_train[L:R].data.cpu()))/(R-L)
                val_acc = np.sum(np.argmax(np.array(val_pre.data.cpu()),axis=1) == np.array(y_val[L_val:R_val].data.cpu()))/(R_val-L_val)

                #---------------打印在进度条上--------------
                tbar.set_postfix(train_loss=float(train_loss.data.cpu()),train_acc=train_acc,val_loss=float(val_loss.data.cpu()),val_acc=val_acc)
                tbar.update()  # 默认参数n=1，每update一次，进度+n

                #-----------------反向传播更新---------------
                optimizer.zero_grad()   # 清空上一步的残余更新参数值
                train_loss.backward()         # 以训练集的误差进行反向传播, 计算参数更新值
                optimizer.step()        # 将参数更新值施加到 net 的 parameters 上
    del model

--------------- > Fold 1 < ---------------




step= 0


Training...:  50%|████████████████████████▉                         | 1079/2161 [01:48<01:48,  9.94it/s, train_acc=0.625, train_loss=1.36, val_acc=0.598, val_loss=1.3]


KeyboardInterrupt: 