## Define TextCNN

In [19]:
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, windows_size, max_len, feature_size, n_class, dropout=0.4):
        super(TextCNN, self).__init__()
        # embedding层
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        # 卷积层特征提取
        self.conv1 = nn.ModuleList([
            nn.Sequential(nn.Conv1d(in_channels=embedding_dim, out_channels=feature_size, kernel_size=h),
                          nn.LeakyReLU(),
                          nn.MaxPool1d(kernel_size=max_len-h+1),
                          )
            for h in windows_size]
        )
        # 全连接层
        self.fc = nn.Linear(feature_size*len(windows_size), n_class)
        # dropout防止过拟合
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embed(x) # [batch, seq_len, embed_dim]
        x = x.permute(0, 2, 1) # [batch, embed_dim, seq_len]
        x = [conv(x) for conv in self.conv1]
        x = torch.cat(x, 1)
        x = x.view(-1, x.size(1)) # [batch, feature_size*len(windows_size)]
        x = self.dropout(x)
        x = self.fc(x)# [batch, n_class]
        return x

## Train TextCNN

In [21]:
import pandas as pd
import numpy as np
import torch
import time
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam, AdamW
import jieba
import os
import random
from tqdm import tqdm

In [34]:
def seed_torch(seed=1029):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)  # 为了禁止hash随机化，使得实验可复现
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# 统计词库
def count_word(sentences, word_to_index):
    with open('./dataset/stopwords.txt', "r", encoding="utf-8") as f:
        stopwords = f.readlines()
    for i in range(len(stopwords)):
        stopwords[i] = stopwords[i].strip("\n")
    for sentence in sentences:
        for word in sentence:
            if word not in word_to_index and word not in stopwords:
                word_to_index[word] = len(word_to_index)

# 将word转换为token
def sentence_to_index(sentence, max_len, word_to_index):
    sentence = [word_to_index.get(word, 0) for word in sentence]
    if len(sentence) < max_len:
        sentence += (max_len - len(sentence)) * [0]
    else:
        sentence = sentence[:max_len]
    return sentence

In [25]:
# 定义数据类
class MyDataset(Dataset):

    def __init__(self, sent, label):
        self.sentence = np.array(sent).astype('float')
        self.label = np.array(label)

    def __getitem__(self, index):
        label, sentence = self.label[index], self.sentence[index]

        return {'label': label, 'sentence': torch.Tensor(sentence)}

    def __len__(self):
        return len(self.sentence)

In [27]:
# 加载训练数据
def load_train(path, type='char'):
    train = {}
    train['label'] = []
    train['sentence'] = []
    word_to_index = {'pad': 0}

    with open(path, encoding='utf-8') as f:
        for line in f:
            line = eval(line)
            train['label'].append(line['label'])
            train['sentence'].append(line['sentence'])

    train = pd.DataFrame(train)
    label_to_id = {}
    sents = []
    labels = []
    for i, label in enumerate(train['label'].unique()):
        label_to_id[label] = i
    if type == 'char':
        train['cut_sentence'] = train['sentence']
    else:
        train['cut_sentence'] = train['sentence'].map(jieba.lcut)
    count_word(train['cut_sentence'], word_to_index)

    for sent, label in zip(train['cut_sentence'], train['label']):
        sents.append(sentence_to_index(sent, max_len=max_len, word_to_index=word_to_index))
        labels.append(label_to_id[label])
    return sents, labels, label_to_id, word_to_index

# 加载验证数据
def load_val(val_path, label_to_id, word_to_index, type='char'):
    val = {}
    val['label'] = []
    val['sentence'] = []
    with open(val_path, encoding='utf-8') as f:
        for line in f:
            line = eval(line)
            val['label'].append(line['label'])
            val['sentence'].append(line['sentence'])

    val = pd.DataFrame(val)
    val_sents = []
    val_labels = []
    if type == 'char':
        val['cut_sentence'] = val['sentence']
    else:
        val['cut_sentence'] = val['sentence'].map(jieba.lcut)
    for sent, label in zip(val['cut_sentence'], val['label']):
        val_sents.append(sentence_to_index(sent, max_len=max_len, word_to_index=word_to_index))
        val_labels.append(label_to_id[label])

    return val_sents, val_labels

In [29]:
# 评估函数
def evaluate(val_loader, model, device):
    model.eval()
    corrects, avg_loss = 0, 0
    for data in val_loader:
        label = data['label']
        sentence = data['sentence']

        label = label.type(torch.LongTensor)
        sentence = sentence.type(torch.LongTensor)

        label = label.to(device)
        sentence = sentence.to(device)

        output = model(sentence)

        loss = criterion(output, label)

        avg_loss += loss.item()
        corrects += (output.argmax(1) == label).sum().item()

    size = len(val_dataset)
    avg_loss /= size
    accuracy = corrects / size
    print('\nEvaluation - loss: {:.3f} acc: {:.4f}\n'.format(avg_loss, accuracy))

    return accuracy

In [36]:
seed_torch()
train_path = './dataset/tnews_public/train.json'
val_path = './dataset/tnews_public/dev.json'
max_len = 64
# 加载数据
sents, labels, label_to_id, word_to_index = load_train(train_path, type='char')
val_sents, val_labels = load_val(val_path, label_to_id, word_to_index, type='char')
# 超参数
batch_size = 128
learn_rate = 3e-4
n_epochs = 64
embedding_dim = 300
windows_size = [2, 4, 3]
feature_size = 100
dropout = 0.5
vocab_size = len(word_to_index)
n_class = len(label_to_id)

In [58]:
# 训练
train_dataset = MyDataset(sents, labels)
val_dataset = MyDataset(val_sents, val_labels)
train_loader = DataLoader(train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=0)

val_loader = DataLoader(val_dataset,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=0)

device = torch.device("mps" if (torch.backends.mps.is_available()) else "cpu")
model = TextCNN(vocab_size, embedding_dim, windows_size, max_len, feature_size, n_class, dropout).to(device)
optimizer = Adam(model.parameters(), lr=learn_rate)
criterion = nn.CrossEntropyLoss()
best_acc = 0.0
early_times = 0
for epoch in range(1, n_epochs + 1):
    running_loss = 0.0
    model.train()
    start = time.time()
    for batch_i, data in enumerate(train_loader):
        start = time.time()
        label = data['label']
        sentence = data['sentence']

        label = label.type(torch.LongTensor)
        sentence = sentence.type(torch.LongTensor)

        label = label.to(device)
        sentence = sentence.to(device)

        output = model(sentence)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if batch_i % 100 == 99:
            print(
                'epoch: {}, batch: {}/{}, loss: {}'.format(epoch, batch_i + 1, len(train_loader),
                                                           round(running_loss / 100/64, 4)))
            running_loss = 0.0
    val_acc = evaluate(val_loader, model, device)
    if best_acc < val_acc:
        best_acc = val_acc
        early_times = 0
    else:
        early_times += 1
        if early_times > 5:
            print('EarlyStopping---best_acc: {}, '.format(best_acc))
            break

epoch: 1, batch: 100/417, loss: 0.0414
epoch: 1, batch: 200/417, loss: 0.0369
epoch: 1, batch: 300/417, loss: 0.0342
epoch: 1, batch: 400/417, loss: 0.0324

Evaluation - loss: 0.015 acc: 0.4347

epoch: 2, batch: 100/417, loss: 0.0302
epoch: 2, batch: 200/417, loss: 0.0295
epoch: 2, batch: 300/417, loss: 0.0289
epoch: 2, batch: 400/417, loss: 0.0282

Evaluation - loss: 0.013 acc: 0.4751

epoch: 3, batch: 100/417, loss: 0.0269
epoch: 3, batch: 200/417, loss: 0.0265
epoch: 3, batch: 300/417, loss: 0.0266
epoch: 3, batch: 400/417, loss: 0.0265

Evaluation - loss: 0.013 acc: 0.4902

epoch: 4, batch: 100/417, loss: 0.0247
epoch: 4, batch: 200/417, loss: 0.0252
epoch: 4, batch: 300/417, loss: 0.025
epoch: 4, batch: 400/417, loss: 0.0254

Evaluation - loss: 0.012 acc: 0.4992

epoch: 5, batch: 100/417, loss: 0.0239
epoch: 5, batch: 200/417, loss: 0.0237
epoch: 5, batch: 300/417, loss: 0.0235
epoch: 5, batch: 400/417, loss: 0.024

Evaluation - loss: 0.012 acc: 0.5053

epoch: 6, batch: 100/417, l

In [66]:
params = model.parameters()
#print(params.shape)
#print("shape of weight: ", np.array(params[0]).shape)
#print("shape of bias: ", np.array(params[1]).shape)

AttributeError: 'generator' object has no attribute 'shape'

# Transformer&Classifier

## define model

In [68]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy

'''Attention Is All You Need'''


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)

        self.postion_embedding = Positional_Encoding(config.embed, config.pad_size, config.dropout, config.device)
        self.encoder = Encoder(config.dim_model, config.num_head, config.hidden, config.dropout)
        self.encoders = nn.ModuleList([
            copy.deepcopy(self.encoder)
            # Encoder(config.dim_model, config.num_head, config.hidden, config.dropout)
            for _ in range(config.num_encoder)])

        self.fc1 = nn.Linear(config.pad_size * config.dim_model, config.num_classes)
        # self.fc2 = nn.Linear(config.last_hidden, config.num_classes)
        # self.fc1 = nn.Linear(config.dim_model, config.num_classes)

    def forward(self, x):
        out = self.embedding(x)
        #return out
        out = self.postion_embedding(out)
        for encoder in self.encoders:
            out = encoder(out)
        out = out.view(out.size(0), -1)
        # out = torch.mean(out, 1)
        out = self.fc1(out)
        return out


class Encoder(nn.Module):
    def __init__(self, dim_model, num_head, hidden, dropout):
        super(Encoder, self).__init__()
        self.attention = Multi_Head_Attention(dim_model, num_head, dropout)
        self.feed_forward = Position_wise_Feed_Forward(dim_model, hidden, dropout)

    def forward(self, x):
        out = self.attention(x)
        out = self.feed_forward(out)
        return out


class Positional_Encoding(nn.Module):
    def __init__(self, embed, pad_size, dropout, device):
        super(Positional_Encoding, self).__init__()
        self.device = device
        self.pe = torch.tensor([[pos / (10000.0 ** (i // 2 * 2.0 / embed)) for i in range(embed)] for pos in range(pad_size)])
        self.pe[:, 0::2] = np.sin(self.pe[:, 0::2])
        self.pe[:, 1::2] = np.cos(self.pe[:, 1::2])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = x + nn.Parameter(self.pe, requires_grad=False).to(self.device)
        out = self.dropout(out)
        return out


class Scaled_Dot_Product_Attention(nn.Module):
    '''Scaled Dot-Product Attention '''
    def __init__(self):
        super(Scaled_Dot_Product_Attention, self).__init__()

    def forward(self, Q, K, V, scale=None):
        '''
        Args:
            Q: [batch_size, len_Q, dim_Q]
            K: [batch_size, len_K, dim_K]
            V: [batch_size, len_V, dim_V]
            scale: 缩放因子 论文为根号dim_K
        Return:
            self-attention后的张量，以及attention张量
        '''
        attention = torch.matmul(Q, K.permute(0, 2, 1))
        if scale:
            attention = attention * scale
        # if mask:  # TODO change this
        #     attention = attention.masked_fill_(mask == 0, -1e9)
        attention = F.softmax(attention, dim=-1)
        context = torch.matmul(attention, V)
        return context


class Multi_Head_Attention(nn.Module):
    def __init__(self, dim_model, num_head, dropout=0.0):
        super(Multi_Head_Attention, self).__init__()
        self.num_head = num_head
        assert dim_model % num_head == 0
        self.dim_head = dim_model // self.num_head
        self.fc_Q = nn.Linear(dim_model, num_head * self.dim_head)
        self.fc_K = nn.Linear(dim_model, num_head * self.dim_head)
        self.fc_V = nn.Linear(dim_model, num_head * self.dim_head)
        self.attention = Scaled_Dot_Product_Attention()
        self.fc = nn.Linear(num_head * self.dim_head, dim_model)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(dim_model)

    def forward(self, x):
        batch_size = x.size(0)
        Q = self.fc_Q(x)
        K = self.fc_K(x)
        V = self.fc_V(x)
        Q = Q.view(batch_size * self.num_head, -1, self.dim_head)
        K = K.view(batch_size * self.num_head, -1, self.dim_head)
        V = V.view(batch_size * self.num_head, -1, self.dim_head)
        # if mask:  # TODO
        #     mask = mask.repeat(self.num_head, 1, 1)  # TODO change this
        scale = K.size(-1) ** -0.5  # 缩放因子
        context = self.attention(Q, K, V, scale)

        context = context.view(batch_size, -1, self.dim_head * self.num_head)
        out = self.fc(context)
        out = self.dropout(out)
        out = out + x  # 残差连接
        out = self.layer_norm(out)
        return out


class Position_wise_Feed_Forward(nn.Module):
    def __init__(self, dim_model, hidden, dropout=0.0):
        super(Position_wise_Feed_Forward, self).__init__()
        self.fc1 = nn.Linear(dim_model, hidden)
        self.fc2 = nn.Linear(hidden, dim_model)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(dim_model)

    def forward(self, x):
        out = self.fc1(x)
        out = F.relu(out)
        out = self.fc2(out)
        out = self.dropout(out)
        out = out + x  # 残差连接
        out = self.layer_norm(out)
        return out


In [86]:
import torch
class Config(object):

    """配置参数"""
    def __init__(self):
        self.model_name = 'Transformer'
        self.embedding_pretrained = None                                 # 预训练词向量
        self.device = torch.device("mps" if (torch.backends.mps.is_available()) else 'cpu')   # 设备
        self.dropout = 0.5                                              # 随机失活
        self.num_classes = 14                        # 类别数
        self.num_epochs = 20                                            # epoch数
        self.batch_size = 128                                           # mini-batch大小
        self.pad_size = 256                                              # 每句话处理成的长度(短填长切)
        self.n_vocab = None#这里需要读取数据的部分进行赋值
        self.learning_rate = 5e-4                                       # 学习率
        self.embed = 90           # 词向量维度
        self.dim_model = 90
        self.hidden = 1024
        self.last_hidden = 512
        self.num_head = 5
        self.num_encoder = 2
        self.n_splits = 5#k折交叉验证
    

## load_data

In [80]:
import pandas as pd
import numpy as np
from tqdm import tqdm
#--------------------------加载数据----------------------------
def load_data(config):
    df = pd.read_csv('./dataset/train_set.csv',sep='\t')

    train = []
    targets = []
    label = df['label'].values
    text = df['text'].values
    id = 0
    vocabs_size = 0
    for val in tqdm(text):
        s = val.split(' ')
        single_data = []
        for i in range(len(s)):
            vocabs_size = max(vocabs_size,int(s[i])+1)
            single_data.append(int(s[i])+1)
            if len(single_data)>=config.pad_size:
                train.append(single_data)
                targets.append(int(label[id]))
                single_data = []
        if len(single_data)>=150:
            single_data = single_data + [0]*(config.pad_size-len(single_data))
            train.append(single_data)
            targets.append(int(label[id]))  
        id += 1
        


    train = np.array(train)
    targets = np.array(targets)
    return train,targets,vocabs_size


## train model

In [None]:

#---------------------------------------------------
import pandas as pd
from collections import Counter
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GroupKFold, KFold
import numpy as np
import torch
from    torch import autograd
import os
from tqdm import tqdm

config = Config()
train,targets,vocabs_size = load_data(config)#加载数据
config.n_vocab = vocabs_size + 1

batch_size = config.batch_size

kf = KFold(n_splits=config.n_splits, shuffle=True, random_state=2021)
for fold, (train_idx, test_idx) in enumerate(kf.split(train, targets)):
    print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)
    x_train, x_val = train[train_idx], train[test_idx]
    y_train, y_val = targets[train_idx], targets[test_idx]
    
    M_train = len(x_train)
    M_val = len(x_val)
    if M_train % batch_size == 1:#因为模型里面有层标准化，训练中不能出现单条数据，至少为2条
        M_train -= 1
    if M_val % batch_size == 1:
        M_val -= 1
    x_train = torch.from_numpy(x_train).to(torch.long).to(config.device)
    x_val = torch.from_numpy(x_val).to(torch.long).to(config.device)
    y_train = torch.from_numpy(y_train).to(torch.long).to(config.device)
    y_val = torch.from_numpy(y_val).to(torch.long).to(config.device)

    model = Model(config)#调用transformer的编码器
    model.to(config.device)
    optimizer = torch.optim.Adam(model.parameters(),lr=config.learning_rate)
    loss_func = nn.CrossEntropyLoss()#多分类的任务
    model.train()
    print('开始迭代....')
    #开始迭代
    for step in range(config.num_epochs):
        print('step=',step+1)
        L_val = -batch_size
        with tqdm(np.arange(0,M_train,batch_size), desc='Training...') as tbar:
            for index in tbar:
                L = index
                R = min(M_train,index+batch_size)
                L_val += batch_size
                L_val %= M_val
                R_val = min(M_val,L_val + batch_size)
                #-----------------训练内容------------------
                train_pre = model(x_train[L:R])     # 喂给 model训练数据 x, 输出预测值
                train_loss = loss_func(train_pre, y_train[L:R])
                val_pre = model(x_val[L_val:R_val])#验证集也得分批次，不然数据量太大内存爆炸
                val_loss = loss_func(val_pre, y_val[L_val:R_val])

                #----------- -----计算准确率----------------
                train_acc = np.sum(np.argmax(np.array(train_pre.data.cpu()),axis=1) == np.array(y_train[L:R].data.cpu()))/(R-L)
                val_acc = np.sum(np.argmax(np.array(val_pre.data.cpu()),axis=1) == np.array(y_val[L_val:R_val].data.cpu()))/(R_val-L_val)

                #---------------打印在进度条上--------------
                tbar.set_postfix(train_loss=float(train_loss.data.cpu()),train_acc=train_acc,val_loss=float(val_loss.data.cpu()),val_acc=val_acc)
                tbar.update()  # 默认参数n=1，每update一次，进度+n

                #-----------------反向传播更新---------------
                optimizer.zero_grad()   # 清空上一步的残余更新参数值
                train_loss.backward()         # 以训练集的误差进行反向传播, 计算参数更新值
                optimizer.step()        # 将参数更新值施加到 net 的 parameters 上
    del model






100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200000/200000 [00:59<00:00, 3368.07it/s]


--------------- > Fold 1 < ---------------
开始迭代....
step= 1


Training...: 100%|███████████████████████████████████████████████| 4322/4322 [16:46<00:00,  4.29it/s, train_acc=0.852, train_loss=0.478, val_acc=0.781, val_loss=0.663]


step= 2


Training...: 100%|███████████████████████████████████████████████| 4322/4322 [14:48<00:00,  4.86it/s, train_acc=0.875, train_loss=0.362, val_acc=0.828, val_loss=0.563]


step= 3


Training...: 100%|████████████████████████████████████████████████| 4322/4322 [14:48<00:00,  4.86it/s, train_acc=0.953, train_loss=0.23, val_acc=0.859, val_loss=0.495]


step= 4


Training...: 100%|███████████████████████████████████████████████| 4322/4322 [14:49<00:00,  4.86it/s, train_acc=0.922, train_loss=0.251, val_acc=0.836, val_loss=0.453]


step= 5


Training...: 100%|███████████████████████████████████████████████| 4322/4322 [14:55<00:00,  4.83it/s, train_acc=0.961, train_loss=0.194, val_acc=0.844, val_loss=0.464]


step= 6


Training...: 100%|███████████████████████████████████████████████| 4322/4322 [15:00<00:00,  4.80it/s, train_acc=0.953, train_loss=0.166, val_acc=0.859, val_loss=0.412]


step= 7


Training...: 100%|████████████████████████████████████████████████| 4322/4322 [19:10<00:00,  3.76it/s, train_acc=0.93, train_loss=0.199, val_acc=0.852, val_loss=0.485]


step= 8


Training...: 100%|███████████████████████████████████████████████| 4322/4322 [18:53<00:00,  3.81it/s, train_acc=0.914, train_loss=0.214, val_acc=0.883, val_loss=0.377]


step= 9


Training...: 100%|███████████████████████████████████████████████| 4322/4322 [47:08<00:00,  1.53it/s, train_acc=0.961, train_loss=0.159, val_acc=0.852, val_loss=0.419]


step= 10


Training...:  82%|██████████████████████████████████████▋        | 3559/4322 [11:55<02:36,  4.86it/s, train_acc=0.867, train_loss=0.355, val_acc=0.828, val_loss=0.488]